diff --git a/open_diloco/run_training.sh b/open_diloco/run_training.sh
index 7198cfa..bc84335 100755
--- a/open_diloco/run_training.sh
+++ b/open_diloco/run_training.sh
@@ -6,6 +6,11 @@
 #
 ## the command above will use a total of 8 gpu and create 4 diloco workers each of them with two gpu training ddp/fsdp wise
 
+
+# you can either pass a fixed initial peer or set it to auto and the script will start a dht server for you
+## # ./run_training.sh 2 1 auto --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny
+
+
 # Function to get CUDA devices based on the number of GPUs and index
 function get_cuda_devices() {
     local num_gpu=$1
@@ -31,6 +36,25 @@ NUM_GPU=$2
 INITIAL_PEER=$3 # Set INITIAL_PEER from the second argument
 shift 3 # Remove the first three arguments so $@ contains only additional Python arguments
 
+mkdir -p logs
+echo "Initial peer: $INITIAL_PEER"
+
+# Check if INITIAL_PEER is set to 'auto' and adjust accordingly
+if [ "$INITIAL_PEER" = "auto" ]; then
+    # start the dht server
+    echo "Starting DHT server"
+    hivemind-dht --host_maddr /ip4/0.0.0.0/tcp/12345 --identity_path fixed_key.pem > logs/log_dht 2>&1 &
+
+    INITIAL_PEER=""
+    # get the initial peer from the logs, loop until the peer is found
+    while [ -z "$INITIAL_PEER" ]; do
+        sleep 1
+        INITIAL_PEER=$(awk '/Running a DHT instance/ {print $NF}' logs/log_dht)
+
+    done
+fi
+echo "Initial peer: $INITIAL_PEER"
+
 # Ensure the logs directory exists
 mkdir -p logs
 
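
For reference, the bootstrap flow this patch automates with "auto" can also be run by hand when you want to pass a fixed initial peer. The sketch below is a non-authoritative illustration, not part of the patch: it reuses the hivemind-dht flags and the "Running a DHT instance" log-scraping pattern from the hunk above, and assumes (as the awk filter implies) that the matched log line ends with the peer multiaddr; the final run_training.sh invocation mirrors the example in the added comment.

# Sketch: start a standalone DHT bootstrap node, wait for its multiaddr to
# appear in the log, then launch training with that fixed initial peer.
# (Illustration only; command and log pattern copied from the patch above.)
mkdir -p logs
hivemind-dht --host_maddr /ip4/0.0.0.0/tcp/12345 --identity_path fixed_key.pem > logs/log_dht 2>&1 &

PEER=""
while [ -z "$PEER" ]; do
    sleep 1
    # grab the last field of the log line announcing the DHT instance
    PEER=$(awk '/Running a DHT instance/ {print $NF}' logs/log_dht)
done
echo "Discovered initial peer: $PEER"

# 2 diloco workers, 1 gpu each, using the fixed peer instead of auto:
./run_training.sh 2 1 "$PEER" --per-device-train-batch-size 16 --batch-size 512 --local-steps 10 --total-steps 88000 --c4-tiny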