diff --git a/README.md b/README.md
index 15709b2..fd51ad2 100644
--- a/README.md
+++ b/README.md
@@ -127,7 +127,7 @@ torchrun --nproc_per_node=8 \
 ## 150m on 8 DiLoCo Worker with 500 local steps
 In the `open_diloco` folder, run:
 ```bash
-./run_training.sh 8 $PEER \
+./run_training.sh 8 1 $PEER \
 --sharding-strategy NO_SHARD \
 --per-device-train-batch-size 8 \
 --precision bf16-mixed \
@@ -149,7 +149,7 @@ under the hood the `run_training.sh` script calls `train_fsdp.py` 8 times with t
 ## 150m on 8 DiLoCo Worker with 50 local steps
 In the `open_diloco` folder, run:
 ```bash
-./run_training.sh 8 $PEER \
+./run_training.sh 8 1 $PEER \
 --sharding-strategy NO_SHARD \
 --per-device-train-batch-size 8 \
 --total-batch-size 512 \
diff --git a/open_diloco/run_training.sh b/open_diloco/run_training.sh
index 9cebb69..7198cfa 100755
--- a/open_diloco/run_training.sh
+++ b/open_diloco/run_training.sh
@@ -1,33 +1,48 @@
 #!/bin/bash
 ## example usage
-# ./run_training.sh 2 /ip4/127.0.0.1/tcp/36593/p2p/12D3KooWEAyutJ1zFqhAbzDn1LSzraB3o1uS8GSHxQYM87QP4AHN --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny
+# ./run_training.sh 4 2 /ip4/127.0.0.1/tcp/36593/p2p/12D3KooWEAyutJ1zFqhAbzDn1LSzraB3o1uS8GSHxQYM87QP4AHN --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny
 # note that everything after the initial peer with will pass to all worker
+#
+## the command above will use a total of 8 GPUs and create 4 DiLoCo workers, each training on two GPUs (DDP/FSDP)
 
-# Check if at least two arguments were passed
-if [ "$#" -lt 2 ]; then
-    echo "Usage: $0 <N> <initial_peer> [additional_python_args]"
+# Function to get CUDA devices based on the number of GPUs and index
+function get_cuda_devices() {
+    local num_gpu=$1
+    local index=$2
+    local start_gpu=$((num_gpu * index))
+    local end_gpu=$((start_gpu + num_gpu - 1))
+
+    if [ "$num_gpu" -eq 1 ]; then
+        echo $start_gpu
+    else
+        echo $(seq -s ',' $start_gpu $end_gpu)
+    fi
+}
+
+# Check if at least three arguments were passed
+if [ "$#" -lt 3 ]; then
+    echo "Usage: $0 <N> <num_gpu> <initial_peer> [additional_python_args]"
     exit 1
 fi
 
 N=$1            # Set N from the first argument
-INITIAL_PEER=$2 # Set INITIAL_PEER from the second argument
-shift 2         # Remove the first two arguments so $@ contains only additional Python arguments
+NUM_GPU=$2
+INITIAL_PEER=$3 # Set INITIAL_PEER from the third argument
+shift 3         # Remove the first three arguments so $@ contains only additional Python arguments
 
 # Ensure the logs directory exists
 mkdir -p logs
 
-# Execute the command for the first device (CUDA_VISIBLE_DEVICES=0) and log the output, run in background
-echo "Command: CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py --initial-peers $INITIAL_PEER $@ "
-CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank 0 --hv.galaxy-size $N > logs/log0 2>&1 &
+# Execute the command for the first device and log the output, run in background
+CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU 0) torchrun --nproc_per_node=$NUM_GPU --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank 0 --hv.galaxy-size $N > logs/log0 2>&1 &
 
 # Wait for 1 second before continuing with the rest
 sleep 2
 
 # Loop from 1 to N-1 and execute the command with different CUDA_VISIBLE_DEVICES and seed values, logging each command's output, run each in background
 for i in $(seq 1 $(($N - 1)))
 do
-  echo "Command: CUDA_VISIBLE_DEVICES=$i torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py.py --initial-peers $INITIAL_PEER $@"
-  WANDB_MODE=disabled CUDA_VISIBLE_DEVICES=$i torchrun --nproc_per_node=1 --rdzv-endpoint localhost:123$i --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank $i --hv.galaxy-size $N > logs/log$i 2>&1 &
+  CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) torchrun --nproc_per_node=$NUM_GPU --rdzv-endpoint localhost:123$i --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank $i --hv.galaxy-size $N > logs/log$i 2>&1 &
 done
 
 tail -f logs/log0
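For reference, a minimal sketch of what the new `get_cuda_devices` helper prints for a few illustrative worker counts and indices (assuming the function is sourced into a shell; the numbers below are hypothetical examples, not part of the patch):

```bash
# get_cuda_devices <num_gpu> <worker_index> maps each worker to a contiguous,
# comma-separated range of GPU ids
get_cuda_devices 2 0   # prints "0,1" (worker 0 uses GPUs 0-1)
get_cuda_devices 2 3   # prints "6,7" (worker 3 uses GPUs 6-7)
get_cuda_devices 1 5   # prints "5"   (single-GPU worker uses GPU 5)
```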