#!/bin/bash
#
# Rewrites the `checkpoint` file found in every iter_ckpt_rank_*/ directory
# below the current working directory: a `.bak` copy is taken, the last
# all_model_checkpoint_paths / all_model_checkpoint_timestamps line is
# dropped, and model_checkpoint_path is pointed at step TARGET_STEP.
#
# Usage:
#   ./script.sh                  # uses the default step below
#   TARGET_STEP=88000 ./script.sh

# Step number written into every model_checkpoint_path line.
# Defaults to 91000; set TARGET_STEP in the environment to override it
# without editing the script.
TARGET_STEP="${TARGET_STEP:-91000}"

#######################################
# Roll a checkpoint index file back by one entry and repoint it at TARGET_STEP.
#
# The file is rewritten as follows:
#   * the last line starting with `all_model_checkpoint_paths` is dropped;
#   * the last line starting with `all_model_checkpoint_timestamps` is dropped;
#   * every line starting with `model_checkpoint_path` is replaced by
#       model_checkpoint_path: "iter_ckpt_rank_<rank>-<TARGET_STEP>"
#     where <rank> is the 4th `_`-separated field of the name of the
#     directory holding the file (e.g. `00` for iter_ckpt_rank_00/);
#   * all other lines are kept verbatim and in order.
# NOTE(review): the key names match TensorFlow's CheckpointState text
# format - presumably that is what these files are; confirm.
#
# Globals:
#   TARGET_STEP (read) - step number to point model_checkpoint_path at.
# Arguments:
#   $1 - path of the checkpoint index file; modified in place.
# Outputs:
#   Diagnostics on stderr when the update cannot be performed.
# Returns:
#   0 on success; 1 if TARGET_STEP is empty, $1 is not a regular file, or
#   the rewrite fails (the original file is then left as it was).
#######################################
update_checkpoint_file() {
  local checkpoint_file="$1"

  if [[ -z "${TARGET_STEP:-}" ]]; then
    printf 'update_checkpoint_file: TARGET_STEP is not set\n' >&2
    return 1
  fi
  if [[ ! -f "$checkpoint_file" ]]; then
    printf 'update_checkpoint_file: not a regular file: %s\n' \
      "$checkpoint_file" >&2
    return 1
  fi

  # Derive the rank once, from the containing directory's own name only, so
  # underscores in parent directories cannot shift the field index.
  local rank_dir rank new_pointer
  rank_dir=$(basename -- "$(dirname -- "$checkpoint_file")") || return 1
  rank=$(printf '%s\n' "$rank_dir" | cut -d'_' -f4) || return 1
  new_pointer="model_checkpoint_path: \"iter_ckpt_rank_${rank}-${TARGET_STEP}\""

  # Slurp the file; `|| [[ -n $line ]]` keeps a final line that lacks a
  # trailing newline instead of silently dropping it.
  local -a lines=()
  local line
  while IFS= read -r line || [[ -n "$line" ]]; do
    lines+=("$line")
  done < "$checkpoint_file"

  # Remember where the last entry of each list lives (-1: no such line).
  local i last_path_idx=-1 last_ts_idx=-1
  for ((i = 0; i < ${#lines[@]}; i++)); do
    case "${lines[i]}" in
      all_model_checkpoint_paths*) last_path_idx=$i ;;
      all_model_checkpoint_timestamps*) last_ts_idx=$i ;;
    esac
  done

  # Build the replacement next to the target so the final `mv` is a rename
  # on the same filesystem rather than a cross-device copy.
  local tmp_file
  tmp_file=$(mktemp "${checkpoint_file}.XXXXXX") || return 1

  # mktemp creates a 0600 file; clone the original's mode first so the
  # rewritten checkpoint file stays readable by whoever could read it before.
  if ! cp -p -- "$checkpoint_file" "$tmp_file" ||
    ! chmod u+w -- "$tmp_file"; then
    rm -f -- "$tmp_file"
    return 1
  fi

  local write_failed=0
  for ((i = 0; i < ${#lines[@]}; i++)); do
    if ((i == last_path_idx || i == last_ts_idx)); then
      continue
    fi
    line=${lines[i]}
    if [[ $line == model_checkpoint_path* ]]; then
      line=$new_pointer
    fi
    printf '%s\n' "$line" || write_failed=1
  done > "$tmp_file"

  if ((write_failed)) || ! mv -f -- "$tmp_file" "$checkpoint_file"; then
    printf 'update_checkpoint_file: failed to rewrite %s\n' \
      "$checkpoint_file" >&2
    rm -f -- "$tmp_file"
    return 1
  fi
}

#######################################
# Back up and rewrite the checkpoint file of every iter_ckpt_rank_*/
# directory in the current working directory.
#
# For each match a `<file>.bak` copy is written first and then
# update_checkpoint_file is called on the file. A file whose backup cannot
# be written is skipped rather than rewritten without a safety copy.
# NOTE: each run overwrites any `.bak` left by a previous run.
#
# Outputs:
#   "Checkpoint files have been updated." on stdout when at least one file
#   was processed and none failed; diagnostics on stderr otherwise.
# Returns:
#   0 on success, 1 if nothing matched or any file could not be processed.
#######################################
main() {
  local checkpoint_file
  local updated=0
  local failed=0

  for checkpoint_file in iter_ckpt_rank_*/checkpoint; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -f "$checkpoint_file" ]] || continue

    if ! cp -- "$checkpoint_file" "${checkpoint_file}.bak"; then
      printf 'Backup failed, skipping: %s\n' "$checkpoint_file" >&2
      failed=$((failed + 1))
      continue
    fi

    if update_checkpoint_file "$checkpoint_file"; then
      updated=$((updated + 1))
    else
      printf 'Update failed: %s\n' "$checkpoint_file" >&2
      failed=$((failed + 1))
    fi
  done

  if ((failed > 0)); then
    printf '%d checkpoint file(s) could not be updated.\n' "$failed" >&2
    return 1
  fi
  if ((updated == 0)); then
    printf 'No checkpoint files matched iter_ckpt_rank_*/checkpoint\n' >&2
    return 1
  fi

  echo "Checkpoint files have been updated."
}

main "$@"