camembertav2-base-ckpts / checkpoints /roleback-iter-checkpoints.sh
wissamantoun's picture
Upload checkpoints/roleback-iter-checkpoints.sh with huggingface_hub
c87716a verified
raw
history blame
2.4 kB
#!/bin/bash
# Rolls the per-rank `checkpoint` files back to an earlier step.
# Must be run from the main checkpoints directory, i.e. the directory that
# contains the iter_ckpt_rank_* sub-directories.
#
# Step number written into each file's model_checkpoint_path entry.
# Defaults to 91000; export TARGET_STEP before running to choose another step.
TARGET_STEP="${TARGET_STEP:-91000}"
#######################################
# Rewrite one checkpoint file in place so that it points at TARGET_STEP.
#
# Lines are classified by their prefix:
#   all_model_checkpoint_paths*       trailing <drop_count> entries are removed
#   all_model_checkpoint_timestamps*  trailing <drop_count> entries are removed
#   model_checkpoint_path*            replaced by
#       model_checkpoint_path: "iter_ckpt_rank_<rank>-<TARGET_STEP>"
#   anything else                     copied through unchanged
# <rank> is the 4th '_'-separated field of the name of the directory that
# holds the file (e.g. "03" for iter_ckpt_rank_03/checkpoint).
#
# Globals:
#   TARGET_STEP (read) - step number written into model_checkpoint_path
# Arguments:
#   $1 - path of the checkpoint file to rewrite
#   $2 - optional: number of trailing path/timestamp entries to drop.
#        Defaults to 1, which is what the original condition
#        (index > count - 1) removed.
# Returns:
#   0 on success, 1 on invalid arguments, missing file or I/O failure
#######################################
update_checkpoint_file() {
  local checkpoint_file="${1:-}"
  local drop_count="${2:-1}"
  local tmp_file line rank_dir rank
  # Counters are local so repeated calls never see each other's totals.
  local path_count=0 timestamp_count=0
  local new_path_count=0 new_timestamp_count=0

  if [[ ! -f "$checkpoint_file" ]]; then
    printf 'update_checkpoint_file: no such file: %s\n' "$checkpoint_file" >&2
    return 1
  fi
  if [[ -z "${TARGET_STEP:-}" ]]; then
    printf 'update_checkpoint_file: TARGET_STEP is not set\n' >&2
    return 1
  fi
  if [[ ! "$drop_count" =~ ^[0-9]+$ ]]; then
    printf 'update_checkpoint_file: invalid drop count: %s\n' "$drop_count" >&2
    return 1
  fi
  # Force base 10 so a value such as "08" is not parsed as invalid octal.
  drop_count=$((10#$drop_count))

  # Derive the rank from the containing directory's own name only, so that
  # underscores in parent directories cannot shift the field index.
  rank_dir=$(dirname -- "$checkpoint_file") || return 1
  rank_dir=${rank_dir##*/}
  rank=$(printf '%s\n' "$rank_dir" | cut -d'_' -f4) || return 1

  # Pass 1: count the path and timestamp entries.
  # "|| [[ -n $line ]]" keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ $line == all_model_checkpoint_paths* ]]; then
      path_count=$((path_count + 1))
    elif [[ $line == all_model_checkpoint_timestamps* ]]; then
      timestamp_count=$((timestamp_count + 1))
    fi
  done < "$checkpoint_file"

  tmp_file=$(mktemp) || return 1

  # Pass 2: emit the rewritten content into the temporary file.
  if ! while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ $line == all_model_checkpoint_paths* ]]; then
      new_path_count=$((new_path_count + 1))
      # Skip the trailing drop_count path entries.
      if (( new_path_count > path_count - drop_count )); then
        continue
      fi
    elif [[ $line == all_model_checkpoint_timestamps* ]]; then
      new_timestamp_count=$((new_timestamp_count + 1))
      # Skip the trailing drop_count timestamp entries.
      if (( new_timestamp_count > timestamp_count - drop_count )); then
        continue
      fi
    elif [[ $line == model_checkpoint_path* ]]; then
      # Point the current-checkpoint entry at the target step.
      line="model_checkpoint_path: \"iter_ckpt_rank_${rank}-${TARGET_STEP}\""
    fi
    # printf instead of echo: a line such as "-n" must be written verbatim.
    printf '%s\n' "$line"
  done < "$checkpoint_file" > "$tmp_file"; then
    rm -f -- "$tmp_file"
    return 1
  fi

  # Overwrite the content instead of mv-ing the temp file over the original:
  # the checkpoint file keeps its own permission bits rather than inheriting
  # the restrictive mode mktemp gives its files.
  if ! cat -- "$tmp_file" > "$checkpoint_file"; then
    printf 'update_checkpoint_file: cannot write %s\n' "$checkpoint_file" >&2
    rm -f -- "$tmp_file"
    return 1
  fi

  rm -f -- "$tmp_file"
}
# Process every iter_ckpt_rank_*/checkpoint file below the current directory.
# Each file is copied to <file>.bak before it is rewritten.
# NOTE: running the script again overwrites an existing .bak with the
# already-modified file.
updated_count=0
for checkpoint_file in iter_ckpt_rank_*/checkpoint; do
  # When the glob matches nothing, bash leaves the literal pattern in place;
  # skip it instead of operating on a file that does not exist.
  [[ -f "$checkpoint_file" ]] || continue

  # Never modify a file whose backup could not be written.
  if ! cp -- "$checkpoint_file" "${checkpoint_file}.bak"; then
    printf 'Failed to back up %s; aborting.\n' "$checkpoint_file" >&2
    exit 1
  fi

  # Update the checkpoint file; stop at the first failure so the problem is
  # not hidden behind a success message.
  if ! update_checkpoint_file "$checkpoint_file"; then
    printf 'Failed to update %s; aborting.\n' "$checkpoint_file" >&2
    exit 1
  fi
  updated_count=$((updated_count + 1))
done

# Only claim success when at least one file was actually rewritten.
if (( updated_count == 0 )); then
  printf 'No iter_ckpt_rank_*/checkpoint files found in %s\n' "$PWD" >&2
  exit 1
fi
echo "Checkpoint files have been updated."