Distributed training setup #4

Draft · wants to merge 27 commits into base: main

Commits (27)
7833706
Adaptations of CLIP model class for ddp training 1.
MicPie Dec 18, 2021
88bb5d9
Draft ddp training 1.
MicPie Dec 18, 2021
f037153
Draft ddp training 2 incl. dummy dataset and bug fixes.
MicPie Dec 18, 2021
cf83155
Draft ddp training 3 incl. webdataset and fixes.
MicPie Dec 19, 2021
211b549
Fix data timing measurement point.
MicPie Dec 19, 2021
db3a65d
Add checkdataloading argument to debug dataloading.
MicPie Dec 19, 2021
0ac974e
Added seeding, use of all latents and minor cleanups.
MicPie Dec 21, 2021
9ad4fa6
Added code for testing with all latents for debugging.
MicPie Dec 21, 2021
286f850
Added code for testing with all latents for debugging 2.
MicPie Dec 21, 2021
4880436
Added distributed_backends from https://github.com/lucidrains/DALLE-p…
MicPie Dec 21, 2021
31f58ac
Backup all_gather backprop setups for transfer.
MicPie Dec 28, 2021
cf7327a
fix for merge
MicPie Dec 28, 2021
142d79d
Merge branch 'lucidrains-main' into hvd
MicPie Dec 28, 2021
874ca70
Added pytorch ddp backend.
MicPie Dec 29, 2021
c880877
Adapted pytorch ddp backend class name and added class to __init__.
MicPie Dec 29, 2021
dfb5c99
update pytorch ddp setup for multi-gpu tests
MicPie Jan 3, 2022
20c7797
update pytorch ddp setup for multi-gpu tests 2
MicPie Jan 3, 2022
c988f5f
update pytorch ddp setup for multi-gpu tests 2
MicPie Jan 3, 2022
50a994a
update pytorch ddp setup for multi-gpu tests 3
MicPie Jan 3, 2022
34558ea
update pytorch ddp setup for multi-gpu tests 4
MicPie Jan 3, 2022
117ba82
update pytorch ddp setup for multi-gpu tests 3
MicPie Jan 9, 2022
7feff0f
cleanup
MicPie Jan 9, 2022
e39cba5
setup for distributed abstraction training
MicPie Jan 10, 2022
6b086b8
Update x-clip to only use local samples for the loss and general clea…
MicPie Jan 11, 2022
095bd0e
big cleanup and grad cache test setup
MicPie Jan 12, 2022
ec28d1c
distributed training setup 1
MicPie Jan 12, 2022
5338ecb
Merge branch 'distributed_training_setup' of github.com:MicPie/x-clip…
MicPie Jan 12, 2022
4 changes: 4 additions & 0 deletions .gitignore
@@ -127,3 +127,7 @@ dmypy.json

# Pyre type checker
.pyre/

# vim
*~
*.swp
103 changes: 103 additions & 0 deletions train/dataset.py
@@ -0,0 +1,103 @@
# Based on:
# https://github.com/Zasder3/open_clip_juwels/blob/d36754b624a3eb5f0513ae3d0ee4030a420409e5/src/training/data.py
# https://github.com/Zasder3/open_clip_juwels/blob/50308cffdb4cf1b41c1fe95d8e8f8665c6a5c5d6/src/clip/clip.py

from PIL import Image
import torch
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, RandomResizedCrop
from x_clip.tokenizer import tokenizer
import os
import json  # needed by get_dataset_size for reading sizes.json
import braceexpand
import math
import webdataset as wds
from datetime import datetime


def preprocess_txt(text):
    return tokenizer.tokenize(text)


def _convert_to_rgb(image):
    return image.convert('RGB')


def transform_img(n_px: int, is_train: bool):
    normalize = Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    if is_train:
        return Compose([
            RandomResizedCrop(n_px, scale=(0.9, 1.0), interpolation=Image.BICUBIC),
            _convert_to_rgb,
            ToTensor(),
            normalize,
        ])
    else:
        return Compose([
            Resize(n_px, interpolation=Image.BICUBIC),
            CenterCrop(n_px),
            _convert_to_rgb,
            ToTensor(),
            normalize,
        ])


def get_dataset_size(shards):
    shards_list = list(braceexpand.braceexpand(shards))
    dir_path = os.path.dirname(shards)
    if 'sizes.json' in os.listdir(dir_path):
        sizes_filename = os.path.join(dir_path, 'sizes.json')
        sizes = json.load(open(sizes_filename, 'r'))
        total_size = sum(
            [int(sizes[os.path.basename(shard)]) for shard in shards_list])
    elif '__len__' in os.listdir(dir_path):
        total_size = eval(open(os.path.join(dir_path, '__len__'), 'r').read())
    else:
        raise ValueError(f'Could not find dataset size in {dir_path}')
    num_shards = len(shards_list)
    return total_size, num_shards


def get_wds_dataset(args, is_train=True, logger=None):
    input_shards = args.path_data_train
    assert input_shards is not None

    # The following code is adapted from https://github.com/tmbdev/webdataset-examples/blob/master/main-wds.py
    num_samples, num_shards = get_dataset_size(input_shards)

    max_shards_per_node = math.ceil(num_shards / args.world_size)
    num_samples = args.world_size * (num_samples * max_shards_per_node // num_shards)
    num_batches = num_samples // (args.bs * args.world_size)
    num_samples = num_batches * args.bs * args.world_size

    logger.info(f"{datetime.now()} rank: {args.rank} max_shards_per_node: {max_shards_per_node}")
    logger.info(f"{datetime.now()} rank: {args.rank} num_batches: {num_batches}")
    logger.info(f"{datetime.now()} rank: {args.rank} num_samples: {num_samples}")

    shardlist = wds.PytorchShardList(
        input_shards,
        epoch_shuffle=is_train,
        split_by_node=is_train  # NOTE: we do eval on a single GPU.
    )

    preprocess_img = transform_img(args.visual_image_size, True)

    dataset = (
        wds.WebDataset(shardlist)
        .decode("pil")
        .rename(image="jpg;png", text="txt")
        .map_dict(image=preprocess_img, text=preprocess_txt)
        .to_tuple("image", "text")
        #.batched(args.bs, partial=not is_train or not args.distributed)
        .batched(args.bs, partial=not is_train)
    )

    dataloader = wds.WebLoader(
        dataset, batch_size=None, shuffle=False, num_workers=args.numw,
    )

    # With DDP, we need to make sure that all nodes get the same number of batches;
    # we do that by reusing a little bit of data.
    dataloader = dataloader.repeat(2).slice(num_batches)
    dataloader.num_batches = num_batches
    dataloader.num_samples = num_samples

    return dataloader
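
For orientation, the sketch below shows how get_wds_dataset could be exercised on its own. The attribute names on args mirror exactly what the function reads; the shard path, the logger setup, and the presence of a sizes.json (or __len__ file) next to the shards are illustrative assumptions, not part of this PR.

# Minimal standalone sketch (hypothetical paths and values).
import logging
from types import SimpleNamespace

from dataset import get_wds_dataset  # assumes this is run from inside train/

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("wds_check")

# Only the attributes that get_wds_dataset actually reads are set here.
args = SimpleNamespace(
    path_data_train="/data/cc3m/train/{00000..00009}.tar",  # hypothetical shard pattern
    world_size=1,           # single process for this sketch
    rank=0,
    bs=64,                  # per-rank batch size
    numw=4,                 # dataloader workers
    visual_image_size=256,  # image side length passed to transform_img
)

dataloader = get_wds_dataset(args, is_train=True, logger=logger)
images, texts = next(iter(dataloader))
print(images.shape)  # e.g. torch.Size([64, 3, 256, 256]), plus the batched token ids in texts
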
66 changes: 66 additions & 0 deletions train/exp.sh
@@ -0,0 +1,66 @@
#!/bin/bash -x
#SBATCH --account=cstdl
#SBATCH --partition=develbooster
### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
### e.g. request 4 nodes with 1 GPU each, for a total of 4 GPUs (WORLD_SIZE==4)
### Note: --gres=gpu:x should equal ntasks-per-node
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --gres=gpu:2
#SBATCH --cpus-per-task=8
#SBATCH --job-name=exp1
###SBATCH --partition=gpu
#SBATCH --time=00:15:00

### e.g. request 4 nodes with 1 GPU each, for a total of 4 GPUs (WORLD_SIZE==4)
### Note: --gres=gpu:x should equal ntasks-per-node

### change the 5-digit MASTER_PORT as you wish; Slurm will raise an error if it collides with another job
### WORLD_SIZE should equal gpus/node * num_nodes
###export WORLD_SIZE=4

### get the first node name as master address - customized for vgg slurm
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
###echo "NODELIST="${SLURM_NODELIST}

export WORLD_SIZE=$SLURM_NTASKS
echo "WORLD_SIZE="$WORLD_SIZE

### The RANK/LOCAL_RANK exports below do not work here, because every process is handed 0.
#export RANK=$SLURM_PROCID
#echo "RANK="$RANK

#export LOCAL_RANK=${SLURM_LOCALID:-$OMPI_COMM_WORLD_LOCAL_RANK}
#echo "LOCAL_RANK="$LOCAL_RANK

export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
echo "MASTER_ADDR="$MASTER_ADDR

export MASTER_PORT=12370
echo "MASTER_PORT="$MASTER_PORT

export CUDA_VISIBLE_DEVICES=0,1

eval "$(/p/project/ccstdl/pieler1/miniconda3/bin/conda shell.bash hook)" # init conda
conda activate pytorch1.10
cd /p/project/ccstdl/pieler1/x-clip
export PYTHONPATH="$PYTHONPATH:$PWD/src"
srun python -u train/train_ddp.py \
--id "test_ddp/exp2" \
--path-data-train "/p/scratch/ccstdl/gordon2/CC3M/train/{00000..03318}.tar" \
--save-interval-step 10000 \
--bs 64 \
--lr 1e-4 \
--numw 8 \
--seed 42 \
--loss-over-ranks \
--distributed_backend "PyTorch DDP" \
#--checkdataloading \
#--tb-profiler


### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
###SBATCH --constraint=p40&gmem24G
###SBATCH --mem=64gb
###SBATCH --chdir=/scratch/shared/beegfs/your_dir/
###SBATCH --output=/scratch/shared/beegfs/your_dir/%x-%j.out
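
train_ddp.py itself is not part of this diff, so the following is only an illustrative sketch of how the environment prepared by exp.sh (WORLD_SIZE, MASTER_ADDR, MASTER_PORT, plus Slurm's per-process SLURM_PROCID and SLURM_LOCALID) is typically consumed to bring up the "PyTorch DDP" backend. The function name and exact wiring are assumptions, not the PR's code.

# Illustrative only: typical consumption of the exp.sh environment inside a training script.
import os
import torch
import torch.distributed as dist

def init_distributed():
    # Ranks are read from Slurm at runtime; exporting RANK/LOCAL_RANK in the
    # batch script does not work because every process would see 0 (see exp.sh).
    rank = int(os.environ["SLURM_PROCID"])
    local_rank = int(os.environ["SLURM_LOCALID"])
    world_size = int(os.environ["WORLD_SIZE"])

    dist.init_process_group(
        backend="nccl",
        init_method=f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}",
        rank=rank,
        world_size=world_size,
    )
    torch.cuda.set_device(local_rank)  # one GPU per task, matching the #SBATCH header
    return rank, local_rank, world_size
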
63 changes: 63 additions & 0 deletions train/exp_ddp_16nodes_gpu4.sh
@@ -0,0 +1,63 @@
#!/bin/bash -x
#SBATCH --account=cstdl
#SBATCH --partition=booster
### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
### e.g. request 4 nodes with 1 GPU each, for a total of 4 GPUs (WORLD_SIZE==4)
### Note: --gres=gpu:x should equal ntasks-per-node
#SBATCH --nodes=16
#SBATCH --ntasks-per-node=4
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=8
#SBATCH --job-name=gpu64
###SBATCH --partition=gpu
#SBATCH --time=00:05:00

### change the 5-digit MASTER_PORT as you wish; Slurm will raise an error if it collides with another job
### WORLD_SIZE should equal gpus/node * num_nodes
###export WORLD_SIZE=4

### get the first node name as master address - customized for vgg slurm
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
###echo "NODELIST="${SLURM_NODELIST}

export WORLD_SIZE=$SLURM_NTASKS
echo "WORLD_SIZE="$WORLD_SIZE

### The RANK/LOCAL_RANK exports below do not work here, because every process is handed 0.
#export RANK=$SLURM_PROCID
#echo "RANK="$RANK

#export LOCAL_RANK=${SLURM_LOCALID:-$OMPI_COMM_WORLD_LOCAL_RANK}
#echo "LOCAL_RANK="$LOCAL_RANK

export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
echo "MASTER_ADDR="$MASTER_ADDR

export MASTER_PORT=12370
echo "MASTER_PORT="$MASTER_PORT

export CUDA_VISIBLE_DEVICES=0,1,2,3

eval "$(/p/project/ccstdl/pieler1/miniconda3/bin/conda shell.bash hook)" # init conda
conda activate pytorch1.10
cd /p/project/ccstdl/pieler1/x-clip
export PYTHONPATH="$PYTHONPATH:$PWD/src"
srun python -u train/train_ddp.py \
--id "test_scaling_gpus_b/gpu64" \
--path-data-train "/p/scratch/ccstdl/gordon2/CC3M/train/{00000..03318}.tar" \
--save-interval-step 10000 \
--bs 128 \
--lr 1e-4 \
--numw 8 \
--seed 42 \
--loss-over-ranks \
--distributed_backend "PyTorch DDP" \
#--checkdataloading \
#--tb-profiler


### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
###SBATCH --constraint=p40&gmem24G
###SBATCH --mem=64gb
###SBATCH --chdir=/scratch/shared/beegfs/your_dir/
###SBATCH --output=/scratch/shared/beegfs/your_dir/%x-%j.out
63 changes: 63 additions & 0 deletions train/exp_ddp_2nodes_gpu4.sh
@@ -0,0 +1,63 @@
#!/bin/bash -x
#SBATCH --account=cstdl
#SBATCH --partition=booster
### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
### e.g. request 4 nodes with 1 GPU each, for a total of 4 GPUs (WORLD_SIZE==4)
### Note: --gres=gpu:x should equal ntasks-per-node
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=4
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=8
#SBATCH --job-name=gpu8
###SBATCH --partition=gpu
#SBATCH --time=00:05:00

### change the 5-digit MASTER_PORT as you wish; Slurm will raise an error if it collides with another job
### WORLD_SIZE should equal gpus/node * num_nodes
###export WORLD_SIZE=4

### get the first node name as master address - customized for vgg slurm
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
###echo "NODELIST="${SLURM_NODELIST}

export WORLD_SIZE=$SLURM_NTASKS
echo "WORLD_SIZE="$WORLD_SIZE

### The RANK/LOCAL_RANK exports below do not work here, because every process is handed 0.
#export RANK=$SLURM_PROCID
#echo "RANK="$RANK

#export LOCAL_RANK=${SLURM_LOCALID:-$OMPI_COMM_WORLD_LOCAL_RANK}
#echo "LOCAL_RANK="$LOCAL_RANK

export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
echo "MASTER_ADDR="$MASTER_ADDR

export MASTER_PORT=12370
echo "MASTER_PORT="$MASTER_PORT

export CUDA_VISIBLE_DEVICES=0,1,2,3

eval "$(/p/project/ccstdl/pieler1/miniconda3/bin/conda shell.bash hook)" # init conda
conda activate pytorch1.10
cd /p/project/ccstdl/pieler1/x-clip
export PYTHONPATH="$PYTHONPATH:$PWD/src"
srun python -u train/train_ddp.py \
--id "test_scaling_gpus_b/gpu8" \
--path-data-train "/p/scratch/ccstdl/gordon2/CC3M/train/{00000..03318}.tar" \
--save-interval-step 10000 \
--bs 128 \
--lr 1e-4 \
--numw 8 \
--seed 42 \
--loss-over-ranks \
--distributed_backend "PyTorch DDP" \
#--checkdataloading \
#--tb-profiler


### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
###SBATCH --constraint=p40&gmem24G
###SBATCH --mem=64gb
###SBATCH --chdir=/scratch/shared/beegfs/your_dir/
###SBATCH --output=/scratch/shared/beegfs/your_dir/%x-%j.out
65 changes: 65 additions & 0 deletions train/exp_ddp_32nodes_gpu4.sh
@@ -0,0 +1,65 @@
#!/bin/bash -x
#SBATCH --account=cstdl
#SBATCH --partition=booster
### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
### e.g. request 4 nodes with 1 GPU each, for a total of 4 GPUs (WORLD_SIZE==4)
### Note: --gres=gpu:x should equal ntasks-per-node
#SBATCH --nodes=32
#SBATCH --ntasks-per-node=4
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=8
#SBATCH --job-name=gpu128
###SBATCH --partition=gpu
#SBATCH --time=00:05:00

### change the 5-digit MASTER_PORT as you wish; Slurm will raise an error if it collides with another job
### WORLD_SIZE should equal gpus/node * num_nodes
###export WORLD_SIZE=4

### get the first node name as master address - customized for vgg slurm
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
###echo "NODELIST="${SLURM_NODELIST}

export WORLD_SIZE=$SLURM_NTASKS
echo "WORLD_SIZE="$WORLD_SIZE

### The RANK/LOCAL_RANK exports below do not work here, because every process is handed 0.
#export RANK=$SLURM_PROCID
#echo "RANK="$RANK

#export LOCAL_RANK=${SLURM_LOCALID:-$OMPI_COMM_WORLD_LOCAL_RANK}
#echo "LOCAL_RANK="$LOCAL_RANK

export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
### "Ross i fix": append "i" so the master address uses the node's InfiniBand hostname
export MASTER_ADDR=$MASTER_ADDR"i"
echo "MASTER_ADDR="$MASTER_ADDR

export MASTER_PORT=12370
echo "MASTER_PORT="$MASTER_PORT

export CUDA_VISIBLE_DEVICES=0,1,2,3

eval "$(/p/project/ccstdl/pieler1/miniconda3/bin/conda shell.bash hook)" # init conda
conda activate pytorch1.10
cd /p/project/ccstdl/pieler1/x-clip
export PYTHONPATH="$PYTHONPATH:$PWD/src"
srun python -u train/train_ddp.py \
--id "test_scaling_gpus_b/gpu128" \
--path-data-train "/p/scratch/ccstdl/gordon2/CC3M/train/{00000..03318}.tar" \
--save-interval-step 10000 \
--bs 32 \
--lr 1e-4 \
--numw 8 \
--seed 42 \
--loss-over-ranks \
--distributed_backend "PyTorch DDP" \
#--checkdataloading \
#--tb-profiler


### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
###SBATCH --constraint=p40&gmem24G
###SBATCH --mem=64gb
###SBATCH --chdir=/scratch/shared/beegfs/your_dir/
###SBATCH --output=/scratch/shared/beegfs/your_dir/%x-%j.out
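
Taking the flag values in the four launch scripts at face value, the global batch size per step is the per-rank --bs times WORLD_SIZE (nodes times ntasks-per-node), assuming no gradient accumulation; a quick sanity check:

# Global batch size implied by each launch script (per-rank bs * nodes * GPUs per node).
configs = {
    "exp.sh":                  (64,  1,  2),   # ->   128
    "exp_ddp_2nodes_gpu4.sh":  (128, 2,  4),   # ->  1024
    "exp_ddp_16nodes_gpu4.sh": (128, 16, 4),   # ->  8192
    "exp_ddp_32nodes_gpu4.sh": (32,  32, 4),   # ->  4096
}
for name, (bs, nodes, gpus_per_node) in configs.items():
    print(f"{name}: {bs * nodes * gpus_per_node} samples per step")
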