From 230321b8430af823306d1cea6b4933f9296bec9c Mon Sep 17 00:00:00 2001
From: Verdi March <marcverd@amazon.com>
Date: Fri, 3 May 2024 15:10:06 +0800
Subject: [PATCH 1/2] Bump and refactor pytorch dockerfile template

---
 .../pytorch/0.nvcr-pytorch-aws.dockerfile     | 84 +++++++++----------
 .../pytorch/1.xformers.fragment.dockerfile    | 29 +++++++
 .../containers/pytorch/README.md              |  4 +
 3 files changed, 75 insertions(+), 42 deletions(-)
 create mode 100644 2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile

diff --git a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
index 49ab8c03..0616fe62 100644
--- a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
+++ b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
@@ -3,7 +3,8 @@
 
 ####################################################################################################
 # This is a sample Dockerfile, with optional stanzas. Please read through this Dockerfile,
-# understand what it does, then create your own Dockerfile.
+# understand what it does, then create your own Dockerfile. Software versions are provided for
+# illustration only.
 #
 # Sample build instructions:
 #
@@ -19,13 +20,13 @@
 #     # Load image to local docker registry -> on head node, or new compute/build node.
 #     docker load < /fsx/nvidia-pt-od__latest.tar
 ####################################################################################################
-FROM nvcr.io/nvidia/pytorch:23.12-py3
+FROM nvcr.io/nvidia/pytorch:24.03-py3
 ENV DEBIAN_FRONTEND=noninteractive
 
 # The three must-be-built packages.
 # Efa-installer>=1.29.1 required for nccl>=2.19.0 to avoid libfabric NCCL error.
-ENV EFA_INSTALLER_VERSION=1.30.0
-ENV AWS_OFI_NCCL_VERSION=1.8.1-aws
+ENV EFA_INSTALLER_VERSION=1.32.0
+ENV AWS_OFI_NCCL_VERSION=1.9.1-aws
 ENV NCCL_TESTS_VERSION=master
 
 ## Uncomment below when this Dockerfile builds a container image with efa-installer<1.29.1 and
@@ -82,36 +83,44 @@ RUN apt-get update && \
     rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/*
 ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
 ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH
+ENV OPAL_PREFIX=/opt/amazon/openmpi
 
 
 ####################################################################################################
 # [CUSTOM_NCCL_OPTION_1] Uncomment below stanza to install another NCCL version using the official
 # binaries.
 #
+# Please consult https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html to
+# find out the prebuilt nccl version in the parent image.
+#
 # NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the
 # aws-ofi-ccnl.
 ####################################################################################################
-#ENV NCCL_VERSION=2.19.3-1
+#ENV NCCL_VERSION=2.21.5-1
 #RUN cd /opt && \
 #    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \
 #    dpkg -i cuda-keyring_1.0-1_all.deb && \
 #    apt update && \
 #    apt install -y libnccl2==${NCCL_VERSION} libnccl-dev==${NCCL_VERSION} && \
-#    echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf
+#    echo NCCL_SOCKET_IFNAME=^docker0,lo,veth_def_agent >> /etc/nccl.conf
 
 
 ####################################################################################################
-# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones. The
-# benefits of installing to the same location as the built-in version are:
+# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones.
+#
+# Please consult https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html to
+# find out the prebuilt nccl version in the parent image.
+#
+# Installation mechanics:
 #
-# 1. There's only ever a single libnccl version offered by this image, preventing application from
-#    mistakenly chooses a wrong version.
-# 2. No longer needing extra settings for LD_LIBRARY_PATH or LD_PRELOAD.
+# 1. Remove pre-installed nccl to ensure there's only ever a single libnccl version offered by this
+#    image, preventing applications from mistakenly choosing a wrong version.
+# 2. Install to default location, so no more extra settings for LD_LIBRARY_PATH or LD_PRELOAD.
 #
 # NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the
 # aws-ofi-ccnl.
 ####################################################################################################
-ENV NCCL_VERSION=2.19.3-1
+ENV NCCL_VERSION=2.21.5-1
 RUN apt-get remove -y libnccl2 libnccl-dev \
    && cd /tmp \
    && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
@@ -120,7 +129,7 @@ RUN apt-get remove -y libnccl2 libnccl-dev \
    # Build for p4 & p5.
    NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90, -gencode=arch=compute_80,code=sm_80" \
    && rm -rf /tmp/nccl \
-   && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf
+   && echo NCCL_SOCKET_IFNAME=^docker0,lo,veth_def_agent >> /etc/nccl.conf
 
 
 ####################################################################################################
@@ -136,7 +145,7 @@ RUN apt-get remove -y libnccl2 libnccl-dev \
 ENV OPEN_MPI_PATH=/opt/amazon/openmpi
 
 # OpenMPI build script claims PMIX_VERSION, and complains if we use it.
-ENV CUSTOM_PMIX_VERSION=4.2.6
+ENV CUSTOM_PMIX_VERSION=4.2.7
 RUN apt-get update && apt-get install -y libevent-dev \
     && cd /tmp \
     && wget https://github.com/openpmix/openpmix/releases/download/v${CUSTOM_PMIX_VERSION}/pmix-${CUSTOM_PMIX_VERSION}.tar.gz \
@@ -151,10 +160,6 @@ RUN apt-get update && apt-get install -y libevent-dev \
     && ldconfig \
     && cd / \
     && rm -fr /tmp/pmix-${CUSTOM_PMIX_VERSION}/
-# To silence this runtime error message:
-# [p4de-st-p4de-2:110912] PMIX ERROR: ERROR in file gds_ds12_lock_pthread.c at line 168
-ENV PMIX_GDS_MODULE=^ds12 \
-    PMIX_MCA_gds=^ds12
 
 # Rebuild openmpi with DLC style (which it remarks as "without libfabric"), with the above pmix.
 ENV OMPI_VERSION=4.1.6
@@ -192,20 +197,31 @@ RUN mkdir -p /tmp && \
         --enable-platform-aws \
         --with-mpi=/opt/amazon/openmpi && \
     make -j$(nproc) install && \
-    rm -rf /tmp/aws-ofi/nccl
+    rm -rf /tmp/aws-ofi-nccl
 
 # Do this to minimize the ld path env vars that users need to define when running this image.
 RUN echo "/usr/local/lib"      >> /etc/ld.so.conf.d/local.conf && \
     echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
     ldconfig
 
-ENV OMPI_MCA_pml=^cm,ucx            \
-    OMPI_MCA_btl=tcp,self           \
-    OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \
-    OPAL_PREFIX=/opt/amazon/openmpi \
+ENV \
+    # ----- BEGIN pmix env vars
+    # To silence this runtime error message:
+    # [p4de-st-p4de-2:110912] PMIX ERROR: ERROR in file gds_ds12_lock_pthread.c at line 168
+    # https://github.com/open-mpi/ompi/issues/7516#issuecomment-599305327
+    PMIX_GDS_MODULE=^ds12                                   \
+    PMIX_MCA_gds=^ds12                                      \
+    # https://github.com/open-mpi/ompi/issues/11557#issuecomment-1496245026
+    PMIX_MCA_psec=^munge                                    \
+    # ----- BEGIN openmpi env vars
+    OMPI_MCA_pml=^cm,ucx                                    \
+    OMPI_MCA_btl=tcp,self                                   \
+    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent   \
+    OPAL_PREFIX=/opt/amazon/openmpi                         \
     # https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
     # https://github.com/pytorch/pytorch/issues/68893
-    NCCL_SOCKET_IFNAME=^docker,lo
+    # NOTE: veth_def_agent is from SageMaker HyperPod
+    NCCL_SOCKET_IFNAME=^docker,lo,veth_def_agent
 
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
 
@@ -220,22 +236,6 @@ RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
 
 
 ####################################################################################################
-# Custom packages. Disable as you like. NOTE: always check `pip list` what's been installed. For
-# example, the base container comes pre-installed with Transformer Engine, flash attention, triton
-# (https://github.com/openai/triton/), etc.
+# Add your custom build steps below. For example, from 1.partial-xformers.dockerfile
 ####################################################################################################
-# Install the xformers dependency from source, because pip install either breaks or try to pull
-# its own pt + cuda.
-#
-# Pre-requisite: build node has enough memory to compile xformers. More info on the stanza.
-RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \
-    # On p4de.24xlarge:
-    # - MAX_JOBS=16 => 145GB memory
-    # - MAX_JOBS=32 => 241GB memory
-    # - MAX_JOBS=48 => 243GB memory, 542.5s
-    #
-    # NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to
-    #       work to prevent OOM.
-    export MAX_JOBS=32 && \
-    export NVCC_PREPEND_FLAGS="-t 32" && \
-    pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
+# This section is intentionally left empty by default.
diff --git a/2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile b/2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile
new file mode 100644
index 00000000..219b9f87
--- /dev/null
+++ b/2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile
@@ -0,0 +1,29 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+####################################################################################################
+# This is NOT a complete Dockerfile! Attempting to `docker build` this file is guaranteed to fail.
+#
+# This file provides a sample stanza to build xformers, that you can optionally add to
+# 0.nvcr-pytorch-aws.dockerfile should you need a container image with xformers.
+#
+# NOTE: always check `pip list` what's been installed. The base container (specified in
+# 0.nvcr-pytorch-aws.dockerfile) is already pre-installed with Transformer Engine, flash attention,
+# triton (https://github.com/openai/triton/), etc.
+####################################################################################################
+
+# Install the xformers dependency from source, because pip install either breaks or tries to pull
+# its own pt + cuda. --no-cache-dir keeps pip's download/wheel cache out of the image layer.
+#
+# Pre-requisite: build node has enough memory to compile xformers. More info on the stanza.
+RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \
+    # On p4de.24xlarge:
+    # - MAX_JOBS=16 => 145GB memory
+    # - MAX_JOBS=32 => 241GB memory
+    # - MAX_JOBS=48 => 243GB memory, 542.5s
+    #
+    # NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to
+    #       work to prevent OOM.
+    export MAX_JOBS=32 && \
+    export NVCC_PREPEND_FLAGS="-t 32" && \
+    pip install -v -U --no-cache-dir git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
diff --git a/2.ami_and_containers/containers/pytorch/README.md b/2.ami_and_containers/containers/pytorch/README.md
index 79d65795..f0fc5c14 100644
--- a/2.ami_and_containers/containers/pytorch/README.md
+++ b/2.ami_and_containers/containers/pytorch/README.md
@@ -13,6 +13,10 @@ With that said, feel free to explore the example. Happy coding, and experimentin
 
 ## 1. Essential software
 
+Please note that software versions in the template are provided for illustration only. For
+well-tested combinations, please refer to the various Dockerfile files under `3.test_cases/` and
+`4.validation_and_observability/0.nccl_tests/`.
+
 In principle, the reference `Dockerfile` does the following:
 
 - Provide PyTorch built for NVidia CUDA devices, by using a recent NVidia PyTorch image as the

From f2c8c9796b46c4c5f7c361c2cf2a37d868cc2088 Mon Sep 17 00:00:00 2001
From: Verdi March <marcverd@amazon.com>
Date: Fri, 3 May 2024 20:12:43 +0800
Subject: [PATCH 2/2] Make custom pmix+openmpi optional, since nccl-tests works
 fine without it (as of 20240503 on SMHP).

---
 .../pytorch/0.nvcr-pytorch-aws.dockerfile     | 54 ++-----------------
 .../1.pmix-openmpi.fragment.dockerfile        | 53 ++++++++++++++++++
 ...kerfile => 2.xformers.fragment.dockerfile} |  0
 3 files changed, 57 insertions(+), 50 deletions(-)
 create mode 100644 2.ami_and_containers/containers/pytorch/1.pmix-openmpi.fragment.dockerfile
 rename 2.ami_and_containers/containers/pytorch/{1.xformers.fragment.dockerfile => 2.xformers.fragment.dockerfile} (100%)

diff --git a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
index 0616fe62..3f830c1e 100644
--- a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
+++ b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
@@ -131,56 +131,10 @@ RUN apt-get remove -y libnccl2 libnccl-dev \
    && rm -rf /tmp/nccl \
    && echo NCCL_SOCKET_IFNAME=^docker0,lo,veth_def_agent >> /etc/nccl.conf
 
-
-####################################################################################################
-# Rebuild OpenMPI with custom PMIX version. E.g., to match what host's Slurm is built with (see
-# /opt/pmix/ on host, or run pmix_info on host).
+## When nccl-tests mysteriously crashes with pmix error, consider rebuilding pmix + openmpi by
+## putting the stanza from 1.pmix-openmpi.fragment.dockerfile below.
 #
-# May be needed on rare occassions when `srun --mpi=pmix --container-image=... <mpi_application>`
-# mysteriously crashes.
-#
-# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the
-# aws-ofi-ccnl.
-####################################################################################################
-ENV OPEN_MPI_PATH=/opt/amazon/openmpi
-
-# OpenMPI build script claims PMIX_VERSION, and complains if we use it.
-ENV CUSTOM_PMIX_VERSION=4.2.7
-RUN apt-get update && apt-get install -y libevent-dev \
-    && cd /tmp \
-    && wget https://github.com/openpmix/openpmix/releases/download/v${CUSTOM_PMIX_VERSION}/pmix-${CUSTOM_PMIX_VERSION}.tar.gz \
-    && tar -xzf pmix-${CUSTOM_PMIX_VERSION}.tar.gz \
-    && rm pmix-${CUSTOM_PMIX_VERSION}.tar.gz \
-    && cd pmix-${CUSTOM_PMIX_VERSION}/ \
-    && ./autogen.pl \
-    && ./configure --prefix=/opt/pmix \
-    && make -j \
-    && make install \
-    && echo /opt/pmix/lib > /etc/ld.so.conf.d/pmix.conf \
-    && ldconfig \
-    && cd / \
-    && rm -fr /tmp/pmix-${CUSTOM_PMIX_VERSION}/
-
-# Rebuild openmpi with DLC style (which it remarks as "without libfabric"), with the above pmix.
-ENV OMPI_VERSION=4.1.6
-RUN rm -fr ${OPEN_MPI_PATH} \
- && mkdir /tmp/openmpi \
- && cd /tmp/openmpi \
- && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
- && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
- && rm openmpi-${OMPI_VERSION}.tar.gz \
- && cd openmpi-${OMPI_VERSION} \
- && ./configure --enable-orterun-prefix-by-default --prefix=$OPEN_MPI_PATH --with-cuda=${CUDA_HOME} --with-slurm --with-pmix=/opt/pmix \
- && make -j $(nproc) all \
- && make install \
- && ldconfig \
- && cd / \
- && rm -rf /tmp/openmpi \
- && ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \
- # Verify pmix from /opt/pmix/
- && ldd /opt/amazon/openmpi/lib/openmpi/mca_pmix_ext3x.so | grep '/opt/pmix/lib/libpmix.so.* ' > /opt/amazon/openmpi-pmix.txt
-####################################################################################################
-
+# <content from 1.pmix-openmpi.fragment.dockerfile>
 
 # NCCL EFA Plugin
 RUN mkdir -p /tmp && \
@@ -236,6 +190,6 @@ RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
 
 
 ####################################################################################################
-# Add your custom build steps below. For example, from 1.partial-xformers.dockerfile
+# Add your custom build steps below. For example, from {2,...}.*.fragment.dockerfile files
 ####################################################################################################
 # This section is intentionally left empty by default.
diff --git a/2.ami_and_containers/containers/pytorch/1.pmix-openmpi.fragment.dockerfile b/2.ami_and_containers/containers/pytorch/1.pmix-openmpi.fragment.dockerfile
new file mode 100644
index 00000000..c4c1af8d
--- /dev/null
+++ b/2.ami_and_containers/containers/pytorch/1.pmix-openmpi.fragment.dockerfile
@@ -0,0 +1,53 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+####################################################################################################
+# This is NOT a complete Dockerfile! Attempting to `docker build` this file is guaranteed to fail.
+#
+# This file provides a sample stanza to rebuild OpenMPI with custom PMIX version. E.g., to match
+# what host's Slurm is built with (see /opt/pmix/ on host, or run pmix_info on host).
+#
+# You might need this only on rare occasions when `srun --mpi=pmix --container-image=... <mpi_app>`
+# mysteriously crashes.
+#
+# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi **BEFORE** we build
+# the aws-ofi-nccl.
+####################################################################################################
+
+ENV OPEN_MPI_PATH=/opt/amazon/openmpi
+
+# Not PMIX_VERSION: OpenMPI build script claims that name, and complains if we use it.
+ENV CUSTOM_PMIX_VERSION=4.2.7
+RUN apt-get update && apt-get install -y --no-install-recommends libevent-dev \
+    && cd /tmp \
+    && wget https://github.com/openpmix/openpmix/releases/download/v${CUSTOM_PMIX_VERSION}/pmix-${CUSTOM_PMIX_VERSION}.tar.gz \
+    && tar -xzf pmix-${CUSTOM_PMIX_VERSION}.tar.gz \
+    && rm pmix-${CUSTOM_PMIX_VERSION}.tar.gz \
+    && cd pmix-${CUSTOM_PMIX_VERSION}/ \
+    && ./autogen.pl \
+    && ./configure --prefix=/opt/pmix \
+    && make -j$(nproc) \
+    && make install \
+    && echo /opt/pmix/lib > /etc/ld.so.conf.d/pmix.conf \
+    && ldconfig \
+    && cd / \
+    && rm -fr /tmp/pmix-${CUSTOM_PMIX_VERSION}/ /var/lib/apt/lists/*
+
+# Rebuild openmpi with DLC style (which it remarks as "without libfabric"), with the above pmix.
+ENV OMPI_VERSION=4.1.6
+RUN rm -fr ${OPEN_MPI_PATH} \
+ && mkdir /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
+ && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
+ && rm openmpi-${OMPI_VERSION}.tar.gz \
+ && cd openmpi-${OMPI_VERSION} \
+ && ./configure --enable-orterun-prefix-by-default --prefix=$OPEN_MPI_PATH --with-cuda=${CUDA_HOME} --with-slurm --with-pmix=/opt/pmix \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && cd / \
+ && rm -rf /tmp/openmpi \
+ && ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \
+ # Verify pmix from /opt/pmix/
+ && ldd /opt/amazon/openmpi/lib/openmpi/mca_pmix_ext3x.so | grep '/opt/pmix/lib/libpmix.so.* ' > /opt/amazon/openmpi-pmix.txt
diff --git a/2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile b/2.ami_and_containers/containers/pytorch/2.xformers.fragment.dockerfile
similarity index 100%
rename from 2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile
rename to 2.ami_and_containers/containers/pytorch/2.xformers.fragment.dockerfile