#!/bin/bash
# Pretrain a multimodal model.
#
# Launches llava/train/train_mem.py through torchrun with the DeepSpeed
# config ./scripts/zero2.json, one torchrun invocation per node.
#
# Required environment variables (must be set by the caller):
#   ARNOLD_WORKER_NUM     passed to torchrun as --nnodes
#   ARNOLD_ID             passed to torchrun as --node_rank
#   ARNOLD_WORKER_GPU     passed to torchrun as --nproc_per_node
#   METIS_WORKER_0_HOST   passed to torchrun as --master_addr
#   METIS_WORKER_0_PORT   comma-separated port list; the first entry is
#                         passed to torchrun as --master_port

# Print a message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# ---------------------------------------------------------------------------
# Runtime / NCCL settings exported to every training process.
# See the NCCL and PyTorch distributed docs for the exact semantics of each.
# ---------------------------------------------------------------------------
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
export NCCL_DEBUG=INFO
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_BLOCKING_WAIT=1
export NCCL_ASYNC_ERROR_HANDLING=1
# NOTE(review): confirm NCCL_TIMEOUT is actually honoured by the NCCL/PyTorch
# versions in use.
export NCCL_TIMEOUT=500
export TORCH_DISTRIBUTED_DEBUG=DETAIL

# Timestamp of this launch (not referenced elsewhere in the visible script).
DATETIME=$(date +'%y-%m-%d-%H-%M-%S')

# ---------------------------------------------------------------------------
# Setting for multi nodes training.
# ---------------------------------------------------------------------------
# Fail fast with a clear message instead of handing torchrun empty flags.
: "${ARNOLD_WORKER_NUM:?ARNOLD_WORKER_NUM must be set}"
: "${ARNOLD_ID:?ARNOLD_ID must be set}"
: "${ARNOLD_WORKER_GPU:?ARNOLD_WORKER_GPU must be set}"
: "${METIS_WORKER_0_HOST:?METIS_WORKER_0_HOST must be set}"
: "${METIS_WORKER_0_PORT:?METIS_WORKER_0_PORT must be set}"

# Split the comma-separated port list without word-splitting or globbing;
# only the first port is used as the rendezvous port.
IFS=',' read -r -a ports <<< "${METIS_WORKER_0_PORT}"
port=${ports[0]}
[[ -n "${port}" ]] || die "no master port found in METIS_WORKER_0_PORT='${METIS_WORKER_0_PORT}'"

echo "total workers: ${ARNOLD_WORKER_NUM}"
echo "cur worker id: ${ARNOLD_ID}"
echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
echo "master ip: ${METIS_WORKER_0_HOST}"
echo "master port: ${port}"

# ---------------------------------------------------------------------------
# Environment and working directory.
# ---------------------------------------------------------------------------
# Common prefix of every absolute path used below.
work_root=/mnt/bn/tns-algo-video-public-my2/wangpeng.an

source "${work_root}/environment/anaconda3/bin/activate" multimodal \
  || die "failed to activate conda environment 'multimodal'"

# The launch below uses paths relative to the repository root, so a failed
# cd must stop the script.
cd "${work_root}/model/OmniFusion-main" \
  || die "cannot cd to ${work_root}/model/OmniFusion-main"

# Install necessary packages (best-effort: a failure is reported, not fatal).
for pkg in requests attrs aiofiles pynvml; do
  pip3 install "${pkg}" \
    || printf 'warning: pip3 install %s failed\n' "${pkg}" >&2
done

# Print Python executable path, torchrun, deepspeed and PYTHONPATH
echo "Python executable: $(command -v python)"
echo "torchrun executable: $(command -v torchrun)"
echo "deepspeed executable: $(command -v deepspeed)"
echo "PYTHONPATH before torchrun: $PYTHONPATH"

# NOTE(review): world-writable recursive permissions are very broad; the reason
# this directory needs them is not visible in this script. Consider narrowing
# the mode. Kept best-effort so a missing directory does not block training.
sudo chmod -R 777 /var/lib/fastrak \
  || printf 'warning: chmod of /var/lib/fastrak failed\n' >&2

# ---------------------------------------------------------------------------
# Launch training with DeepSpeed and torchrun
# ---------------------------------------------------------------------------
# torchrun (launcher) arguments.
launcher_args=(
  --nproc_per_node="${ARNOLD_WORKER_GPU}"
  --nnodes="${ARNOLD_WORKER_NUM}"
  --node_rank="${ARNOLD_ID}"
  --master_addr="${METIS_WORKER_0_HOST}"
  --master_port="${port}"
)

# Arguments forwarded to llava/train/train_mem.py.
train_args=(
  --deepspeed ./scripts/zero2.json

  # Base language model and conversation template version.
  --model_name_or_path "${work_root}/model/Meta-Llama-3.1-8B-Instruct"
  --version llama_3_1

  # Training data and per-modality data folders.
  --data_path "${work_root}/data/multidataset/video_image_asr_caption_pre_1208.json"
  --audio_asr_folder "${work_root}/data"
  --audio_caption_folder "${work_root}/data/audio_caption"
  --video_folder "${work_root}/data/video"
  --image_folder "${work_root}/data/video/Video-LLaVA"
  --X "Audio_asr" "Audio_caption" "Video" "Image"

  # Per-modality encoder checkpoints.
  --audio_tower "${work_root}/model/LanguageBind_Audio_Asr"
  --audio_caption_tower "${work_root}/model/pretrained_model/LanguageBind_Audio"
  --video_tower "${work_root}/model/pretrained_model/LanguageBind_Video_merge"
  --image_tower "${work_root}/model/pretrained_model/LanguageBind_Image"

  # Multimodal projector settings.
  --pretrain_mm_mlp_adapter checkpoints/Video-LLaVA-Pretrain-7B-1109/mm_projector.bin
  --mm_projector_type mlp2x_gelu
  --mm_vision_select_layer -2
  --mm_use_x_start_end False
  --mm_use_x_patch_token False
  --image_aspect_ratio pad
  --group_by_modality_length True

  # Precision, output location and schedule.
  --bf16 True
  --output_dir ./checkpoints/OmniFusion-8B-stage3-1208
  --num_train_epochs 1
  --per_device_train_batch_size 4
  --per_device_eval_batch_size 4
  --gradient_accumulation_steps 1
  --evaluation_strategy "no"
  --save_strategy "steps"
  --save_steps 3000
  --save_total_limit 4
  --learning_rate 2e-5
  --weight_decay 0.
  --warmup_ratio 0.03
  --lr_scheduler_type "cosine"
  --logging_steps 1
  --tf32 True
  --model_max_length 2048
  --tokenizer_model_max_length 3072
  --gradient_checkpointing True
  --dataloader_num_workers 8
  --lazy_preprocess True
  --report_to none
  --cache_dir "./cache_dir"
)

# torchrun is the last command, so its exit status becomes the script's.
ACCELERATE_CPU_AFFINITY=1 torchrun "${launcher_args[@]}" \
  llava/train/train_mem.py "${train_args[@]}"