From 230321b8430af823306d1cea6b4933f9296bec9c Mon Sep 17 00:00:00 2001 From: Verdi March Date: Fri, 3 May 2024 15:10:06 +0800 Subject: [PATCH 1/2] Bump and refactor pytorch dockerfile template --- .../pytorch/0.nvcr-pytorch-aws.dockerfile | 84 +++++++++---------- .../pytorch/1.xformers.fragment.dockerfile | 29 +++++++ .../containers/pytorch/README.md | 4 + 3 files changed, 75 insertions(+), 42 deletions(-) create mode 100644 2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile diff --git a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile index 49ab8c03..0616fe62 100644 --- a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile +++ b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile @@ -3,7 +3,8 @@ #################################################################################################### # This is a sample Dockerfile, with optional stanzas. Please read through this Dockerfile, -# understand what it does, then create your own Dockerfile. +# understand what it does, then create your own Dockerfile. Software versions are provided for +# illustration only. # # Sample build instructions: # @@ -19,13 +20,13 @@ # # Load image to local docker registry -> on head node, or new compute/build node. # docker load < /fsx/nvidia-pt-od__latest.tar #################################################################################################### -FROM nvcr.io/nvidia/pytorch:23.12-py3 +FROM nvcr.io/nvidia/pytorch:24.03-py3 ENV DEBIAN_FRONTEND=noninteractive # The three must-be-built packages. # Efa-installer>=1.29.1 required for nccl>=2.19.0 to avoid libfabric NCCL error. 
-ENV EFA_INSTALLER_VERSION=1.30.0 -ENV AWS_OFI_NCCL_VERSION=1.8.1-aws +ENV EFA_INSTALLER_VERSION=1.32.0 +ENV AWS_OFI_NCCL_VERSION=1.9.1-aws ENV NCCL_TESTS_VERSION=master ## Uncomment below when this Dockerfile builds a container image with efa-installer<1.29.1 and @@ -82,36 +83,44 @@ RUN apt-get update && \ rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH +ENV OPAL_PREFIX=/opt/amazon/openmpi #################################################################################################### # [CUSTOM_NCCL_OPTION_1] Uncomment below stanza to install another NCCL version using the official # binaries. # +# Please consult https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html to +# find out the prebuilt nccl version in the parent image. +# # NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the # aws-ofi-ccnl. #################################################################################################### -#ENV NCCL_VERSION=2.19.3-1 +#ENV NCCL_VERSION=2.21.5-1 #RUN cd /opt && \ # wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ # dpkg -i cuda-keyring_1.0-1_all.deb && \ # apt update && \ # apt install -y libnccl2==${NCCL_VERSION} libnccl-dev==${NCCL_VERSION} && \ -# echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf +# echo NCCL_SOCKET_IFNAME=^docker0,lo,veth_def_agent >> /etc/nccl.conf #################################################################################################### -# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones. The -# benefits of installing to the same location as the built-in version are: +# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones. 
+# +# Please consult https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html to +# find out the prebuilt nccl version in the parent image. +# +# Installation mechanics: # -# 1. There's only ever a single libnccl version offered by this image, preventing application from -# mistakenly chooses a wrong version. -# 2. No longer needing extra settings for LD_LIBRARY_PATH or LD_PRELOAD. +# 1. Remove pre-installed nccl to ensure there's only ever a single libnccl version offered by this +# image, preventing application from mistakenly chooses a wrong version. +# 2. Install to default location, so no more extra settings for LD_LIBRARY_PATH or LD_PRELOAD. # # NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the # aws-ofi-ccnl. #################################################################################################### -ENV NCCL_VERSION=2.19.3-1 +ENV NCCL_VERSION=2.21.5-1 RUN apt-get remove -y libnccl2 libnccl-dev \ && cd /tmp \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ @@ -120,7 +129,7 @@ RUN apt-get remove -y libnccl2 libnccl-dev \ # Build for p4 & p5. NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90, -gencode=arch=compute_80,code=sm_80" \ && rm -rf /tmp/nccl \ - && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf + && echo NCCL_SOCKET_IFNAME=^docker0,lo,veth_def_agent >> /etc/nccl.conf #################################################################################################### @@ -136,7 +145,7 @@ RUN apt-get remove -y libnccl2 libnccl-dev \ ENV OPEN_MPI_PATH=/opt/amazon/openmpi # OpenMPI build script claims PMIX_VERSION, and complains if we use it. 
-ENV CUSTOM_PMIX_VERSION=4.2.6 +ENV CUSTOM_PMIX_VERSION=4.2.7 RUN apt-get update && apt-get install -y libevent-dev \ && cd /tmp \ && wget https://github.com/openpmix/openpmix/releases/download/v${CUSTOM_PMIX_VERSION}/pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ @@ -151,10 +160,6 @@ RUN apt-get update && apt-get install -y libevent-dev \ && ldconfig \ && cd / \ && rm -fr /tmp/pmix-${CUSTOM_PMIX_VERSION}/ -# To silence this runtime error message: -# [p4de-st-p4de-2:110912] PMIX ERROR: ERROR in file gds_ds12_lock_pthread.c at line 168 -ENV PMIX_GDS_MODULE=^ds12 \ - PMIX_MCA_gds=^ds12 # Rebuild openmpi with DLC style (which it remarks as "without libfabric"), with the above pmix. ENV OMPI_VERSION=4.1.6 @@ -192,20 +197,31 @@ RUN mkdir -p /tmp && \ --enable-platform-aws \ --with-mpi=/opt/amazon/openmpi && \ make -j$(nproc) install && \ - rm -rf /tmp/aws-ofi/nccl + rm -rf /tmp/aws-ofi-nccl # Do this to minimize the ld path env vars that users need to define when running this image. RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \ echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \ ldconfig -ENV OMPI_MCA_pml=^cm,ucx \ - OMPI_MCA_btl=tcp,self \ - OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \ - OPAL_PREFIX=/opt/amazon/openmpi \ +ENV \ + # ----- BEGIN pmix env vars + # To silence this runtime error message: + # [p4de-st-p4de-2:110912] PMIX ERROR: ERROR in file gds_ds12_lock_pthread.c at line 168 + # https://github.com/open-mpi/ompi/issues/7516#issuecomment-599305327 + PMIX_GDS_MODULE=^ds12 \ + PMIX_MCA_gds=^ds12 \ + # https://github.com/open-mpi/ompi/issues/11557#issuecomment-1496245026 + PMIX_MCA_psec=^munge \ + # ----- BEGIN openmpi env vars + OMPI_MCA_pml=^cm,ucx \ + OMPI_MCA_btl=tcp,self \ + OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent \ + OPAL_PREFIX=/opt/amazon/openmpi \ # https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352 # https://github.com/pytorch/pytorch/issues/68893 - 
NCCL_SOCKET_IFNAME=^docker,lo + # NOTE: veth_def_agent is from SageMaker HyperPod + NCCL_SOCKET_IFNAME=^docker,lo,veth_def_agent ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" @@ -220,22 +236,6 @@ RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ #################################################################################################### -# Custom packages. Disable as you like. NOTE: always check `pip list` what's been installed. For -# example, the base container comes pre-installed with Transformer Engine, flash attention, triton -# (https://github.com/openai/triton/), etc. +# Add your custom build steps below. For example, from 1.partial-xformers.dockerfile #################################################################################################### -# Install the xformers dependency from source, because pip install either breaks or try to pull -# its own pt + cuda. -# -# Pre-requisite: build node has enough memory to compile xformers. More info on the stanza. -RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \ - # On p4de.24xlarge: - # - MAX_JOBS=16 => 145GB memory - # - MAX_JOBS=32 => 241GB memory - # - MAX_JOBS=48 => 243GB memory, 542.5s - # - # NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to - # work to prevent OOM. - export MAX_JOBS=32 && \ - export NVCC_PREPEND_FLAGS="-t 32" && \ - pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers +# This section is intentionally left empty by default. diff --git a/2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile b/2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile new file mode 100644 index 00000000..219b9f87 --- /dev/null +++ b/2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile @@ -0,0 +1,29 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +#################################################################################################### +# This is NOT a complete Dockerfile! Attempt to docker build this file is guaranteed to fail. +# +# This file provides a sample stanza to build xformers, that you can optionally add to +# 0.nvcr-pytorch-aws.dockerfile should you need a container image with xformers. +# +# NOTE: always check `pip list` what's been installed. The base container (specified in +# 0.nvcr-pytorch-aws.dockerfile) is already pre-installed with Transformer Engine, flash attention, +# triton (https://github.com/openai/triton/), etc. +#################################################################################################### + +# Install the xformers dependency from source, because pip install either breaks or tries to pull +# its own pt + cuda. +# +# Pre-requisite: build node has enough memory to compile xformers. More info on the stanza. +RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \ + # On p4de.24xlarge: + # - MAX_JOBS=16 => 145GB memory + # - MAX_JOBS=32 => 241GB memory + # - MAX_JOBS=48 => 243GB memory, 542.5s + # + # NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to + # work to prevent OOM. + export MAX_JOBS=32 && \ + export NVCC_PREPEND_FLAGS="-t 32" && \ + pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers diff --git a/2.ami_and_containers/containers/pytorch/README.md b/2.ami_and_containers/containers/pytorch/README.md index 79d65795..f0fc5c14 100644 --- a/2.ami_and_containers/containers/pytorch/README.md +++ b/2.ami_and_containers/containers/pytorch/README.md @@ -13,6 +13,10 @@ With that said, feel free to explore the example. Happy coding, and experimentin ## 1. Essential software +Please note that software versions in the template are provided for illustration only. 
For +well-tested combinations, please refer to the various Dockerfile files under `3.test_cases/` and +`4.validation_and_observability/0.nccl_tests/`. + In principle, the reference `Dockerfile` does the following: - Provide PyTorch built for NVidia CUDA devices, by using a recent NVidia PyTorch image as the From f2c8c9796b46c4c5f7c361c2cf2a37d868cc2088 Mon Sep 17 00:00:00 2001 From: Verdi March Date: Fri, 3 May 2024 20:12:43 +0800 Subject: [PATCH 2/2] Make custom pmix+openmpi optional, since nccl-tests works fine without it (as of 20240503 on SMHP). --- .../pytorch/0.nvcr-pytorch-aws.dockerfile | 54 ++----------------- .../1.pmix-openmpi.fragment.dockerfile | 53 ++++++++++++++++++ ...kerfile => 2.xformers.fragment.dockerfile} | 0 3 files changed, 57 insertions(+), 50 deletions(-) create mode 100644 2.ami_and_containers/containers/pytorch/1.pmix-openmpi.fragment.dockerfile rename 2.ami_and_containers/containers/pytorch/{1.xformers.fragment.dockerfile => 2.xformers.fragment.dockerfile} (100%) diff --git a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile index 0616fe62..3f830c1e 100644 --- a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile +++ b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile @@ -131,56 +131,10 @@ RUN apt-get remove -y libnccl2 libnccl-dev \ && rm -rf /tmp/nccl \ && echo NCCL_SOCKET_IFNAME=^docker0,lo,veth_def_agent >> /etc/nccl.conf - -#################################################################################################### -# Rebuild OpenMPI with custom PMIX version. E.g., to match what host's Slurm is built with (see -# /opt/pmix/ on host, or run pmix_info on host). +## When nccl-tests mysteriously crashes with pmix error, consider rebuilding pmix + openmpi by +## putting the stanza from 1.pmix-openmpi.fragment.dockerfile below. 
# -# May be needed on rare occassions when `srun --mpi=pmix --container-image=... ` -# mysteriously crashes. -# -# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the -# aws-ofi-ccnl. -#################################################################################################### -ENV OPEN_MPI_PATH=/opt/amazon/openmpi - -# OpenMPI build script claims PMIX_VERSION, and complains if we use it. -ENV CUSTOM_PMIX_VERSION=4.2.7 -RUN apt-get update && apt-get install -y libevent-dev \ - && cd /tmp \ - && wget https://github.com/openpmix/openpmix/releases/download/v${CUSTOM_PMIX_VERSION}/pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ - && tar -xzf pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ - && rm pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ - && cd pmix-${CUSTOM_PMIX_VERSION}/ \ - && ./autogen.pl \ - && ./configure --prefix=/opt/pmix \ - && make -j \ - && make install \ - && echo /opt/pmix/lib > /etc/ld.so.conf.d/pmix.conf \ - && ldconfig \ - && cd / \ - && rm -fr /tmp/pmix-${CUSTOM_PMIX_VERSION}/ - -# Rebuild openmpi with DLC style (which it remarks as "without libfabric"), with the above pmix. 
-ENV OMPI_VERSION=4.1.6 -RUN rm -fr ${OPEN_MPI_PATH} \ - && mkdir /tmp/openmpi \ - && cd /tmp/openmpi \ - && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \ - && tar zxf openmpi-${OMPI_VERSION}.tar.gz \ - && rm openmpi-${OMPI_VERSION}.tar.gz \ - && cd openmpi-${OMPI_VERSION} \ - && ./configure --enable-orterun-prefix-by-default --prefix=$OPEN_MPI_PATH --with-cuda=${CUDA_HOME} --with-slurm --with-pmix=/opt/pmix \ - && make -j $(nproc) all \ - && make install \ - && ldconfig \ - && cd / \ - && rm -rf /tmp/openmpi \ - && ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \ - # Verify pmix from /opt/pmix/ - && ldd /opt/amazon/openmpi/lib/openmpi/mca_pmix_ext3x.so | grep '/opt/pmix/lib/libpmix.so.* ' > /opt/amazon/openmpi-pmix.txt -#################################################################################################### - +# # NCCL EFA Plugin RUN mkdir -p /tmp && \ @@ -236,6 +190,6 @@ RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ #################################################################################################### -# Add your custom build steps below. For example, from 1.partial-xformers.dockerfile +# Add your custom build steps below. For example, from {2,...}.*.fragment.dockerfile files #################################################################################################### # This section is intentionally left empty by default. diff --git a/2.ami_and_containers/containers/pytorch/1.pmix-openmpi.fragment.dockerfile b/2.ami_and_containers/containers/pytorch/1.pmix-openmpi.fragment.dockerfile new file mode 100644 index 00000000..c4c1af8d --- /dev/null +++ b/2.ami_and_containers/containers/pytorch/1.pmix-openmpi.fragment.dockerfile @@ -0,0 +1,53 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +#################################################################################################### +# This is NOT a complete Dockerfile! Attempt to docker build this file is guaranteed to fail. +# +# This file provides a sample stanza to rebuild OpenMPI with custom PMIX version. E.g., to match +# what host's Slurm is built with (see /opt/pmix/ on host, or run pmix_info on host). +# +# You might need this only on rare occasions when `srun --mpi=pmix --container-image=... ` +# mysteriously crashes. +# +# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi **BEFORE** we build +# the aws-ofi-nccl. +#################################################################################################### + +ENV OPEN_MPI_PATH=/opt/amazon/openmpi + +# OpenMPI build script claims PMIX_VERSION, and complains if we use it. +ENV CUSTOM_PMIX_VERSION=4.2.7 +RUN apt-get update && apt-get install -y libevent-dev \ + && cd /tmp \ + && wget https://github.com/openpmix/openpmix/releases/download/v${CUSTOM_PMIX_VERSION}/pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ + && tar -xzf pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ + && rm pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ + && cd pmix-${CUSTOM_PMIX_VERSION}/ \ + && ./autogen.pl \ + && ./configure --prefix=/opt/pmix \ + && make -j \ + && make install \ + && echo /opt/pmix/lib > /etc/ld.so.conf.d/pmix.conf \ + && ldconfig \ + && cd / \ + && rm -fr /tmp/pmix-${CUSTOM_PMIX_VERSION}/ + +# Rebuild openmpi with DLC style (which it remarks as "without libfabric"), with the above pmix. 
+ENV OMPI_VERSION=4.1.6 +RUN rm -fr ${OPEN_MPI_PATH} \ + && mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \ + && tar zxf openmpi-${OMPI_VERSION}.tar.gz \ + && rm openmpi-${OMPI_VERSION}.tar.gz \ + && cd openmpi-${OMPI_VERSION} \ + && ./configure --enable-orterun-prefix-by-default --prefix=$OPEN_MPI_PATH --with-cuda=${CUDA_HOME} --with-slurm --with-pmix=/opt/pmix \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && cd / \ + && rm -rf /tmp/openmpi \ + && ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \ + # Verify pmix from /opt/pmix/ + && ldd /opt/amazon/openmpi/lib/openmpi/mca_pmix_ext3x.so | grep '/opt/pmix/lib/libpmix.so.* ' > /opt/amazon/openmpi-pmix.txt diff --git a/2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile b/2.ami_and_containers/containers/pytorch/2.xformers.fragment.dockerfile similarity index 100% rename from 2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile rename to 2.ami_and_containers/containers/pytorch/2.xformers.fragment.dockerfile