
Commit

support multi gpu per diloco worker in the run training script
samsja committed Aug 9, 2024
1 parent a53b294 commit fdb1acd
Showing 2 changed files with 28 additions and 13 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -127,7 +127,7 @@ torchrun --nproc_per_node=8 \
## 150m on 8 DiLoCo Worker with 500 local steps
In the `open_diloco` folder, run:
```bash
-./run_training.sh 8 $PEER \
+./run_training.sh 8 1 $PEER \
--sharding-strategy NO_SHARD \
--per-device-train-batch-size 8 \
--precision bf16-mixed \
@@ -149,7 +149,7 @@ under the hood the `run_training.sh` script calls `train_fsdp.py` 8 times with t
## 150m on 8 DiLoCo Worker with 50 local steps
In the `open_diloco` folder, run:
```bash
-./run_training.sh 8 $PEER \
+./run_training.sh 8 1 $PEER \
--sharding-strategy NO_SHARD \
--per-device-train-batch-size 8 \
--total-batch-size 512 \
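For context on the README change: `run_training.sh` now takes the number of GPUs per DiLoCo worker as a new second positional argument, and both README examples pass `1` to keep the previous single-GPU-per-worker behaviour. A hypothetical multi-GPU variant of the same command could look like the sketch below; the `2` GPUs per worker and the shortened flag list are illustrative, not part of this commit, and the remaining flags from the README hunks would be appended as before:

```bash
# Illustrative only: 8 DiLoCo workers with 2 GPUs each, i.e. 16 GPUs total
./run_training.sh 8 2 $PEER \
    --sharding-strategy NO_SHARD \
    --per-device-train-batch-size 8 \
    --precision bf16-mixed
```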
37 changes: 26 additions & 11 deletions open_diloco/run_training.sh
@@ -1,33 +1,48 @@
#!/bin/bash

## example usage
-# ./run_training.sh 2 /ip4/127.0.0.1/tcp/36593/p2p/12D3KooWEAyutJ1zFqhAbzDn1LSzraB3o1uS8GSHxQYM87QP4AHN --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny
+# ./run_training.sh 4 2 /ip4/127.0.0.1/tcp/36593/p2p/12D3KooWEAyutJ1zFqhAbzDn1LSzraB3o1uS8GSHxQYM87QP4AHN --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny
# note that everything after the initial peer will be passed to all workers
+#
+## the command above will use a total of 8 GPUs and create 4 DiLoCo workers, each of them training with two GPUs DDP/FSDP-wise

-# Check if at least two arguments were passed
-if [ "$#" -lt 2 ]; then
-echo "Usage: $0 <N> <initial_peer> [additional_python_args]"
+# Function to get CUDA devices based on the number of GPUs and the worker index
+function get_cuda_devices() {
+    local num_gpu=$1
+    local index=$2
+    local start_gpu=$((num_gpu * index))
+    local end_gpu=$((start_gpu + num_gpu - 1))
+
+    if [ "$num_gpu" -eq 1 ]; then
+        echo $start_gpu
+    else
+        echo $(seq -s ',' $start_gpu $end_gpu)
+    fi
+}
+
+# Check if at least three arguments were passed
+if [ "$#" -lt 3 ]; then
+echo "Usage: $0 <N> <num_gpu> <initial_peer> [additional_python_args]"
exit 1
fi

N=$1 # Set N from the first argument
-INITIAL_PEER=$2 # Set INITIAL_PEER from the second argument
-shift 2 # Remove the first two arguments so $@ contains only additional Python arguments
+NUM_GPU=$2 # Set NUM_GPU (GPUs per worker) from the second argument
+INITIAL_PEER=$3 # Set INITIAL_PEER from the third argument
+shift 3 # Remove the first three arguments so $@ contains only additional Python arguments

# Ensure the logs directory exists
mkdir -p logs

-# Execute the command for the first device (CUDA_VISIBLE_DEVICES=0) and log the output, run in background
-echo "Command: CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py --initial-peers $INITIAL_PEER $@ "
-CUDA_VISIBLE_DEVICES=0 torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank 0 --hv.galaxy-size $N > logs/log0 2>&1 &
+# Execute the command for the first worker and log the output, run in background
+CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU 0) torchrun --nproc_per_node=$NUM_GPU --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank 0 --hv.galaxy-size $N > logs/log0 2>&1 &
# Wait for 2 seconds before continuing with the rest
sleep 2

# Loop from 1 to N-1 and launch the remaining workers with their own CUDA_VISIBLE_DEVICES values, logging each command's output, run each in background
for i in $(seq 1 $(($N - 1)))
do
echo "Command: CUDA_VISIBLE_DEVICES=$i torchrun --nproc_per_node=1 --nnodes=1 train_fsdp.py.py --initial-peers $INITIAL_PEER $@"
WANDB_MODE=disabled CUDA_VISIBLE_DEVICES=$i torchrun --nproc_per_node=1 --rdzv-endpoint localhost:123$i --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank $i --hv.galaxy-size $N > logs/log$i 2>&1 &
CUDA_VISIBLE_DEVICES=$(get_cuda_devices $NUM_GPU $i) torchrun --nproc_per_node=$NUM_GPU --rdzv-endpoint localhost:123$i --nnodes=1 train_fsdp.py --hv.initial-peers $INITIAL_PEER $@ --hv.world-rank $i --hv.galaxy-size $N > logs/log$i 2>&1 &
done

tail -f logs/log0
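To make the new device mapping easy to verify outside the launcher, here is a self-contained sketch that copies the `get_cuda_devices` helper from the diff above and prints the `CUDA_VISIBLE_DEVICES` string each worker would receive; the example invocations and the choice of 2 GPUs per worker are illustrative, not part of the commit:

```bash
#!/bin/bash
# Worker i is pinned to GPUs [i*NUM_GPU, (i+1)*NUM_GPU - 1], joined with commas.
function get_cuda_devices() {
    local num_gpu=$1
    local index=$2
    local start_gpu=$((num_gpu * index))
    local end_gpu=$((start_gpu + num_gpu - 1))

    if [ "$num_gpu" -eq 1 ]; then
        echo $start_gpu
    else
        echo $(seq -s ',' $start_gpu $end_gpu)
    fi
}

get_cuda_devices 2 0   # -> 0,1 (first DiLoCo worker)
get_cuda_devices 2 1   # -> 2,3 (second DiLoCo worker)
get_cuda_devices 2 3   # -> 6,7 (fourth DiLoCo worker)
get_cuda_devices 1 5   # -> 5   (single-GPU workers keep the old one-index behaviour)
```

Each worker still gets its own `torchrun` invocation: `--nproc_per_node=$NUM_GPU` spans the GPUs within a worker (DDP/FSDP-wise), while `--hv.world-rank` and `--hv.galaxy-size` remain per-worker rather than per-GPU, and each subsequent worker uses a distinct `--rdzv-endpoint localhost:123$i` so its rendezvous does not collide with worker 0's default.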
