Distributed training setup #4

Draft · wants to merge 27 commits into base: main

Commits (27)
7833706
Adaptations of CLIP model class for ddp training 1.
MicPie Dec 18, 2021
88bb5d9
Draft ddp training 1.
MicPie Dec 18, 2021
f037153
Draft ddp training 2 incl. dummy dataset and bug fixes.
MicPie Dec 18, 2021
cf83155
Draft ddp training 3 incl. webdataset and fixes.
MicPie Dec 19, 2021
211b549
Fix data timing measurement point.
MicPie Dec 19, 2021
db3a65d
Add checkdataloading argument to debug dataloading.
MicPie Dec 19, 2021
0ac974e
Added seeding, use of all latents and minor cleanups.
MicPie Dec 21, 2021
9ad4fa6
Added code for testing with all latents for debugging.
MicPie Dec 21, 2021
286f850
Added code for testing with all latents for debugging 2.
MicPie Dec 21, 2021
4880436
Added distributed_backends from https://github.com/lucidrains/DALLE-p…
MicPie Dec 21, 2021
31f58ac
Backup all_gather backprop setups for transfer.
MicPie Dec 28, 2021
cf7327a
fix for merge
MicPie Dec 28, 2021
142d79d
Merge branch 'lucidrains-main' into hvd
MicPie Dec 28, 2021
874ca70
Added pytorch ddp backend.
MicPie Dec 29, 2021
c880877
Adapted pytorch ddp backend class name and added class to __init__.
MicPie Dec 29, 2021
dfb5c99
update pytorch ddp setup for multi-gpu tests
MicPie Jan 3, 2022
20c7797
update pytorch ddp setup for multi-gpu tests 2
MicPie Jan 3, 2022
c988f5f
update pytorch ddp setup for multi-gpu tests 2
MicPie Jan 3, 2022
50a994a
update pytorch ddp setup for multi-gpu tests 3
MicPie Jan 3, 2022
34558ea
update pytorch ddp setup for multi-gpu tests 4
MicPie Jan 3, 2022
117ba82
update pytorch ddp setup for multi-gpu tests 3
MicPie Jan 9, 2022
7feff0f
cleanup
MicPie Jan 9, 2022
e39cba5
setup for distributed abstraction training
MicPie Jan 10, 2022
6b086b8
Update x-clip to only use local samples for the loss and general clea…
MicPie Jan 11, 2022
095bd0e
big cleanup and grad cache test setup
MicPie Jan 12, 2022
ec28d1c
distributed training setup 1
MicPie Jan 12, 2022
5338ecb
Merge branch 'distributed_training_setup' of github.com:MicPie/x-clip…
MicPie Jan 12, 2022
4 changes: 4 additions & 0 deletions .gitignore
@@ -127,3 +127,7 @@ dmypy.json

# Pyre type checker
.pyre/

# vim
*~
*.swp
103 changes: 103 additions & 0 deletions train/dataset.py
@@ -0,0 +1,103 @@
# Based on:
# https://github.com/Zasder3/open_clip_juwels/blob/d36754b624a3eb5f0513ae3d0ee4030a420409e5/src/training/data.py
# https://github.com/Zasder3/open_clip_juwels/blob/50308cffdb4cf1b41c1fe95d8e8f8665c6a5c5d6/src/clip/clip.py

from PIL import Image
import torch
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, RandomResizedCrop
from x_clip.tokenizer import tokenizer
import os
import json  # needed by get_dataset_size for reading sizes.json
import braceexpand
import math
import webdataset as wds
from datetime import datetime


def preprocess_txt(text):
    return tokenizer.tokenize(text)


def _convert_to_rgb(image):
    return image.convert('RGB')


def transform_img(n_px: int, is_train: bool):
    normalize = Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
    if is_train:
        return Compose([
            RandomResizedCrop(n_px, scale=(0.9, 1.0), interpolation=Image.BICUBIC),
            _convert_to_rgb,
            ToTensor(),
            normalize,
        ])
    else:
        return Compose([
            Resize(n_px, interpolation=Image.BICUBIC),
            CenterCrop(n_px),
            _convert_to_rgb,
            ToTensor(),
            normalize,
        ])


def get_dataset_size(shards):
    shards_list = list(braceexpand.braceexpand(shards))
    dir_path = os.path.dirname(shards)
    if 'sizes.json' in os.listdir(dir_path):
        sizes_filename = os.path.join(dir_path, 'sizes.json')
        sizes = json.load(open(sizes_filename, 'r'))
        total_size = sum(
            [int(sizes[os.path.basename(shard)]) for shard in shards_list])
    elif '__len__' in os.listdir(dir_path):
        total_size = eval(open(os.path.join(dir_path, '__len__'), 'r').read())
    else:
        raise ValueError(f'Could not find dataset size in {dir_path}')
    num_shards = len(shards_list)
    return total_size, num_shards


def get_wds_dataset(args, is_train=True, logger=None):
    input_shards = args.path_data_train
    assert input_shards is not None

    # The following code is adapted from https://github.com/tmbdev/webdataset-examples/blob/master/main-wds.py
    num_samples, num_shards = get_dataset_size(input_shards)

    max_shards_per_node = math.ceil(num_shards / args.world_size)
    num_samples = args.world_size * (num_samples * max_shards_per_node // num_shards)
    num_batches = num_samples // (args.bs * args.world_size)
    num_samples = num_batches * args.bs * args.world_size

    logger.info(f"{datetime.now()} rank: {args.rank} max_shards_per_node: {max_shards_per_node}")
    logger.info(f"{datetime.now()} rank: {args.rank} num_batches: {num_batches}")
    logger.info(f"{datetime.now()} rank: {args.rank} num_samples: {num_samples}")

    shardlist = wds.PytorchShardList(
        input_shards,
        epoch_shuffle=is_train,
        split_by_node=is_train  # NOTE: we do eval on a single GPU.
    )

    preprocess_img = transform_img(args.visual_image_size, True)

    dataset = (
        wds.WebDataset(shardlist)
        .decode("pil")
        .rename(image="jpg;png", text="txt")
        .map_dict(image=preprocess_img, text=preprocess_txt)
        .to_tuple("image", "text")
        #.batched(args.bs, partial=not is_train or not args.distributed)
        .batched(args.bs, partial=not is_train)
    )

    dataloader = wds.WebLoader(
        dataset, batch_size=None, shuffle=False, num_workers=args.numw,
    )

    # With DDP, we need to make sure that all nodes get the same number of batches;
    # we do that by reusing a little bit of data.
    dataloader = dataloader.repeat(2).slice(num_batches)
    dataloader.num_batches = num_batches
    dataloader.num_samples = num_samples

    return dataloader
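
For orientation, the sketch below shows how get_wds_dataset could be exercised on its own. The attribute names on args mirror exactly what the function reads; the shard path, the logger setup, and the presence of a sizes.json (or __len__ file) next to the shards are illustrative assumptions, not part of this PR.

# Minimal standalone sketch (hypothetical paths and values).
import logging
from types import SimpleNamespace

from dataset import get_wds_dataset  # assumes this is run from inside train/

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("wds_check")

# Only the attributes that get_wds_dataset actually reads are set here.
args = SimpleNamespace(
    path_data_train="/data/cc3m/train/{00000..00009}.tar",  # hypothetical shard pattern
    world_size=1,           # single process for this sketch
    rank=0,
    bs=64,                  # per-rank batch size
    numw=4,                 # dataloader workers
    visual_image_size=256,  # image side length passed to transform_img
)

dataloader = get_wds_dataset(args, is_train=True, logger=logger)
images, texts = next(iter(dataloader))
print(images.shape)  # e.g. torch.Size([64, 3, 256, 256]), plus the batched token ids in texts
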
66 changes: 66 additions & 0 deletions train/exp.sh
@@ -0,0 +1,66 @@
#!/bin/bash -x
#SBATCH --account=cstdl
#SBATCH --partition=develbooster
### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
### e.g. request 4 nodes with 1 GPU each, for a total of 4 GPUs (WORLD_SIZE==4)
### Note: --gres=gpu:x should equal ntasks-per-node
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --gres=gpu:2
#SBATCH --cpus-per-task=8
#SBATCH --job-name=exp1
###SBATCH --partition=gpu
#SBATCH --time=00:15:00

### e.g. request 4 nodes with 1 GPU each, for a total of 4 GPUs (WORLD_SIZE==4)
### Note: --gres=gpu:x should equal ntasks-per-node

### change the 5-digit MASTER_PORT as you wish; Slurm will raise an error if it collides with another job
### WORLD_SIZE should equal gpus/node * num_nodes
###export WORLD_SIZE=4

### get the first node name as master address - customized for vgg slurm
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
###echo "NODELIST="${SLURM_NODELIST}

export WORLD_SIZE=$SLURM_NTASKS
echo "WORLD_SIZE="$WORLD_SIZE

### The RANK/LOCAL_RANK exports below do not work here, because every process is handed 0.
#export RANK=$SLURM_PROCID
#echo "RANK="$RANK

#export LOCAL_RANK=${SLURM_LOCALID:-$OMPI_COMM_WORLD_LOCAL_RANK}
#echo "LOCAL_RANK="$LOCAL_RANK

export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
echo "MASTER_ADDR="$MASTER_ADDR

export MASTER_PORT=12370
echo "MASTER_PORT="$MASTER_PORT

export CUDA_VISIBLE_DEVICES=0,1

eval "$(/p/project/ccstdl/pieler1/miniconda3/bin/conda shell.bash hook)" # init conda
conda activate pytorch1.10
cd /p/project/ccstdl/pieler1/x-clip
export PYTHONPATH="$PYTHONPATH:$PWD/src"
srun python -u train/train_ddp.py \
--id "test_ddp/exp2" \
--path-data-train "/p/scratch/ccstdl/gordon2/CC3M/train/{00000..03318}.tar" \
--save-interval-step 10000 \
--bs 64 \
--lr 1e-4 \
--numw 8 \
--seed 42 \
--loss-over-ranks \
--distributed_backend "PyTorch DDP" \
#--checkdataloading \
#--tb-profiler


### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
###SBATCH --constraint=p40&gmem24G
###SBATCH --mem=64gb
###SBATCH --chdir=/scratch/shared/beegfs/your_dir/
###SBATCH --output=/scratch/shared/beegfs/your_dir/%x-%j.out
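
train_ddp.py itself is not part of this diff, so the following is only an illustrative sketch of how the environment prepared by exp.sh (WORLD_SIZE, MASTER_ADDR, MASTER_PORT, plus Slurm's per-process SLURM_PROCID and SLURM_LOCALID) is typically consumed to bring up the "PyTorch DDP" backend. The function name and exact wiring are assumptions, not the PR's code.

# Illustrative only: typical consumption of the exp.sh environment inside a training script.
import os
import torch
import torch.distributed as dist

def init_distributed():
    # Ranks are read from Slurm at runtime; exporting RANK/LOCAL_RANK in the
    # batch script does not work because every process would see 0 (see exp.sh).
    rank = int(os.environ["SLURM_PROCID"])
    local_rank = int(os.environ["SLURM_LOCALID"])
    world_size = int(os.environ["WORLD_SIZE"])

    dist.init_process_group(
        backend="nccl",
        init_method=f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}",
        rank=rank,
        world_size=world_size,
    )
    torch.cuda.set_device(local_rank)  # one GPU per task, matching the #SBATCH header
    return rank, local_rank, world_size
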
63 changes: 63 additions & 0 deletions train/exp_ddp_16nodes_gpu4.sh
@@ -0,0 +1,63 @@
#!/bin/bash -x
#SBATCH --account=cstdl
#SBATCH --partition=booster
### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
### e.g. request 4 nodes with 1 GPU each, for a total of 4 GPUs (WORLD_SIZE==4)
### Note: --gres=gpu:x should equal ntasks-per-node
#SBATCH --nodes=16
#SBATCH --ntasks-per-node=4
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=8
#SBATCH --job-name=gpu64
###SBATCH --partition=gpu
#SBATCH --time=00:05:00

### change the 5-digit MASTER_PORT as you wish; Slurm will raise an error if it collides with another job
### WORLD_SIZE should equal gpus/node * num_nodes
###export WORLD_SIZE=4

### get the first node name as master address - customized for vgg slurm
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
###echo "NODELIST="${SLURM_NODELIST}

export WORLD_SIZE=$SLURM_NTASKS
echo "WORLD_SIZE="$WORLD_SIZE

### The RANK/LOCAL_RANK exports below do not work here, because every process is handed 0.
#export RANK=$SLURM_PROCID
#echo "RANK="$RANK

#export LOCAL_RANK=${SLURM_LOCALID:-$OMPI_COMM_WORLD_LOCAL_RANK}
#echo "LOCAL_RANK="$LOCAL_RANK

export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
echo "MASTER_ADDR="$MASTER_ADDR

export MASTER_PORT=12370
echo "MASTER_PORT="$MASTER_PORT

export CUDA_VISIBLE_DEVICES=0,1,2,3

eval "$(/p/project/ccstdl/pieler1/miniconda3/bin/conda shell.bash hook)" # init conda
conda activate pytorch1.10
cd /p/project/ccstdl/pieler1/x-clip
export PYTHONPATH="$PYTHONPATH:$PWD/src"
srun python -u train/train_ddp.py \
--id "test_scaling_gpus_b/gpu64" \
--path-data-train "/p/scratch/ccstdl/gordon2/CC3M/train/{00000..03318}.tar" \
--save-interval-step 10000 \
--bs 128 \
--lr 1e-4 \
--numw 8 \
--seed 42 \
--loss-over-ranks \
--distributed_backend "PyTorch DDP" \
#--checkdataloading \
#--tb-profiler


### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
###SBATCH --constraint=p40&gmem24G
###SBATCH --mem=64gb
###SBATCH --chdir=/scratch/shared/beegfs/your_dir/
###SBATCH --output=/scratch/shared/beegfs/your_dir/%x-%j.out
63 changes: 63 additions & 0 deletions train/exp_ddp_2nodes_gpu4.sh
@@ -0,0 +1,63 @@
#!/bin/bash -x
#SBATCH --account=cstdl
#SBATCH --partition=booster
### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
### e.g. request 4 nodes with 1 GPU each, for a total of 4 GPUs (WORLD_SIZE==4)
### Note: --gres=gpu:x should equal ntasks-per-node
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=4
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=8
#SBATCH --job-name=gpu8
###SBATCH --partition=gpu
#SBATCH --time=00:05:00

### change the 5-digit MASTER_PORT as you wish; Slurm will raise an error if it collides with another job
### WORLD_SIZE should equal gpus/node * num_nodes
###export WORLD_SIZE=4

### get the first node name as master address - customized for vgg slurm
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
###echo "NODELIST="${SLURM_NODELIST}

export WORLD_SIZE=$SLURM_NTASKS
echo "WORLD_SIZE="$WORLD_SIZE

### The RANK/LOCAL_RANK exports below do not work here, because every process is handed 0.
#export RANK=$SLURM_PROCID
#echo "RANK="$RANK

#export LOCAL_RANK=${SLURM_LOCALID:-$OMPI_COMM_WORLD_LOCAL_RANK}
#echo "LOCAL_RANK="$LOCAL_RANK

export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
echo "MASTER_ADDR="$MASTER_ADDR

export MASTER_PORT=12370
echo "MASTER_PORT="$MASTER_PORT

export CUDA_VISIBLE_DEVICES=0,1,2,3

eval "$(/p/project/ccstdl/pieler1/miniconda3/bin/conda shell.bash hook)" # init conda
conda activate pytorch1.10
cd /p/project/ccstdl/pieler1/x-clip
export PYTHONPATH="$PYTHONPATH:$PWD/src"
srun python -u train/train_ddp.py \
--id "test_scaling_gpus_b/gpu8" \
--path-data-train "/p/scratch/ccstdl/gordon2/CC3M/train/{00000..03318}.tar" \
--save-interval-step 10000 \
--bs 128 \
--lr 1e-4 \
--numw 8 \
--seed 42 \
--loss-over-ranks \
--distributed_backend "PyTorch DDP" \
#--checkdataloading \
#--tb-profiler


### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
###SBATCH --constraint=p40&gmem24G
###SBATCH --mem=64gb
###SBATCH --chdir=/scratch/shared/beegfs/your_dir/
###SBATCH --output=/scratch/shared/beegfs/your_dir/%x-%j.out
65 changes: 65 additions & 0 deletions train/exp_ddp_32nodes_gpu4.sh
@@ -0,0 +1,65 @@
#!/bin/bash -x
#SBATCH --account=cstdl
#SBATCH --partition=booster
### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
### e.g. request 4 nodes with 1 GPU each, for a total of 4 GPUs (WORLD_SIZE==4)
### Note: --gres=gpu:x should equal ntasks-per-node
#SBATCH --nodes=32
#SBATCH --ntasks-per-node=4
#SBATCH --gres=gpu:4
#SBATCH --cpus-per-task=8
#SBATCH --job-name=gpu128
###SBATCH --partition=gpu
#SBATCH --time=00:05:00

### change the 5-digit MASTER_PORT as you wish; Slurm will raise an error if it collides with another job
### WORLD_SIZE should equal gpus/node * num_nodes
###export WORLD_SIZE=4

### get the first node name as master address - customized for vgg slurm
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
###echo "NODELIST="${SLURM_NODELIST}

export WORLD_SIZE=$SLURM_NTASKS
echo "WORLD_SIZE="$WORLD_SIZE

### The RANK/LOCAL_RANK exports below do not work here, because every process is handed 0.
#export RANK=$SLURM_PROCID
#echo "RANK="$RANK

#export LOCAL_RANK=${SLURM_LOCALID:-$OMPI_COMM_WORLD_LOCAL_RANK}
#echo "LOCAL_RANK="$LOCAL_RANK

export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
### "Ross i fix": append "i" so the master address uses the node's InfiniBand hostname
export MASTER_ADDR=$MASTER_ADDR"i"
echo "MASTER_ADDR="$MASTER_ADDR

export MASTER_PORT=12370
echo "MASTER_PORT="$MASTER_PORT

export CUDA_VISIBLE_DEVICES=0,1,2,3

eval "$(/p/project/ccstdl/pieler1/miniconda3/bin/conda shell.bash hook)" # init conda
conda activate pytorch1.10
cd /p/project/ccstdl/pieler1/x-clip
export PYTHONPATH="$PYTHONPATH:$PWD/src"
srun python -u train/train_ddp.py \
--id "test_scaling_gpus_b/gpu128" \
--path-data-train "/p/scratch/ccstdl/gordon2/CC3M/train/{00000..03318}.tar" \
--save-interval-step 10000 \
--bs 32 \
--lr 1e-4 \
--numw 8 \
--seed 42 \
--loss-over-ranks \
--distributed_backend "PyTorch DDP" \
#--checkdataloading \
#--tb-profiler


### Based on: https://gist.github.com/TengdaHan/1dd10d335c7ca6f13810fff41e809904
###SBATCH --constraint=p40&gmem24G
###SBATCH --mem=64gb
###SBATCH --chdir=/scratch/shared/beegfs/your_dir/
###SBATCH --output=/scratch/shared/beegfs/your_dir/%x-%j.out
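
Taking the flag values in the four launch scripts at face value, the global batch size per step is the per-rank --bs times WORLD_SIZE (nodes times ntasks-per-node), assuming no gradient accumulation; a quick sanity check:

# Global batch size implied by each launch script (per-rank bs * nodes * GPUs per node).
configs = {
    "exp.sh":                  (64,  1,  2),   # ->   128
    "exp_ddp_2nodes_gpu4.sh":  (128, 2,  4),   # ->  1024
    "exp_ddp_16nodes_gpu4.sh": (128, 16, 4),   # ->  8192
    "exp_ddp_32nodes_gpu4.sh": (32,  32, 4),   # ->  4096
}
for name, (bs, nodes, gpus_per_node) in configs.items():
    print(f"{name}: {bs * nodes * gpus_per_node} samples per step")
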