From cf0eb3ca134a4a1e52e673bbd76145040f248a87 Mon Sep 17 00:00:00 2001
From: Verdi March <marcverd@amazon.com>
Date: Fri, 3 May 2024 15:10:06 +0800
Subject: [PATCH] Bump and refactor pytorch dockerfile template

---
 .../pytorch/0.nvcr-pytorch-aws.dockerfile     | 84 +++++++++----------
 .../pytorch/1.xformers.fragment.dockerfile    | 29 +++++++
 .../containers/pytorch/README.md              |  4 +
 3 files changed, 75 insertions(+), 42 deletions(-)
 create mode 100644 2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile

diff --git a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
index 49ab8c03..0616fe62 100644
--- a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
+++ b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile
@@ -3,7 +3,8 @@
 
 ####################################################################################################
 # This is a sample Dockerfile, with optional stanzas. Please read through this Dockerfile,
-# understand what it does, then create your own Dockerfile.
+# understand what it does, then create your own Dockerfile. Software versions are provided for
+# illustration only.
 #
 # Sample build instructions:
 #
@@ -19,13 +20,13 @@
 #     # Load image to local docker registry -> on head node, or new compute/build node.
 #     docker load < /fsx/nvidia-pt-od__latest.tar
 ####################################################################################################
-FROM nvcr.io/nvidia/pytorch:23.12-py3
+FROM nvcr.io/nvidia/pytorch:24.03-py3
 ENV DEBIAN_FRONTEND=noninteractive
 
 # The three must-be-built packages.
 # Efa-installer>=1.29.1 required for nccl>=2.19.0 to avoid libfabric NCCL error.
-ENV EFA_INSTALLER_VERSION=1.30.0
-ENV AWS_OFI_NCCL_VERSION=1.8.1-aws
+ENV EFA_INSTALLER_VERSION=1.32.0
+ENV AWS_OFI_NCCL_VERSION=1.9.1-aws
 ENV NCCL_TESTS_VERSION=master
 
 ## Uncomment below when this Dockerfile builds a container image with efa-installer<1.29.1 and
@@ -82,36 +83,44 @@ RUN apt-get update && \
     rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/*
 ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH
 ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH
+ENV OPAL_PREFIX=/opt/amazon/openmpi
 
 
 ####################################################################################################
 # [CUSTOM_NCCL_OPTION_1] Uncomment below stanza to install another NCCL version using the official
 # binaries.
 #
+# Please consult https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html to
+# find out the prebuilt nccl version in the parent image.
+#
 # NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the
 # aws-ofi-ccnl.
 ####################################################################################################
-#ENV NCCL_VERSION=2.19.3-1
+#ENV NCCL_VERSION=2.21.5-1
 #RUN cd /opt && \
 #    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \
 #    dpkg -i cuda-keyring_1.0-1_all.deb && \
 #    apt update && \
 #    apt install -y libnccl2==${NCCL_VERSION} libnccl-dev==${NCCL_VERSION} && \
-#    echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf
+#    echo NCCL_SOCKET_IFNAME=^docker0,lo,veth_def_agent >> /etc/nccl.conf
 
 
 ####################################################################################################
-# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones. The
-# benefits of installing to the same location as the built-in version are:
+# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones.
+#
+# Please consult https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html to
+# find out the prebuilt nccl version in the parent image.
+#
+# Installation mechanics:
 #
-# 1. There's only ever a single libnccl version offered by this image, preventing application from
-#    mistakenly chooses a wrong version.
-# 2. No longer needing extra settings for LD_LIBRARY_PATH or LD_PRELOAD.
+# 1. Remove pre-installed nccl to ensure there's only ever a single libnccl version offered by this
+#    image, preventing applications from mistakenly choosing a wrong version.
+# 2. Install to default location, so no more extra settings for LD_LIBRARY_PATH or LD_PRELOAD.
 #
 # NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the
 # aws-ofi-ccnl.
 ####################################################################################################
-ENV NCCL_VERSION=2.19.3-1
+ENV NCCL_VERSION=2.21.5-1
 RUN apt-get remove -y libnccl2 libnccl-dev \
    && cd /tmp \
    && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \
@@ -120,7 +129,7 @@ RUN apt-get remove -y libnccl2 libnccl-dev \
    # Build for p4 & p5.
    NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90, -gencode=arch=compute_80,code=sm_80" \
    && rm -rf /tmp/nccl \
-   && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf
+   && echo NCCL_SOCKET_IFNAME=^docker0,lo,veth_def_agent >> /etc/nccl.conf
 
 
 ####################################################################################################
@@ -136,7 +145,7 @@ RUN apt-get remove -y libnccl2 libnccl-dev \
 ENV OPEN_MPI_PATH=/opt/amazon/openmpi
 
 # OpenMPI build script claims PMIX_VERSION, and complains if we use it.
-ENV CUSTOM_PMIX_VERSION=4.2.6
+ENV CUSTOM_PMIX_VERSION=4.2.7
 RUN apt-get update && apt-get install -y libevent-dev \
     && cd /tmp \
     && wget https://github.com/openpmix/openpmix/releases/download/v${CUSTOM_PMIX_VERSION}/pmix-${CUSTOM_PMIX_VERSION}.tar.gz \
@@ -151,10 +160,6 @@ RUN apt-get update && apt-get install -y libevent-dev \
     && ldconfig \
     && cd / \
     && rm -fr /tmp/pmix-${CUSTOM_PMIX_VERSION}/
-# To silence this runtime error message:
-# [p4de-st-p4de-2:110912] PMIX ERROR: ERROR in file gds_ds12_lock_pthread.c at line 168
-ENV PMIX_GDS_MODULE=^ds12 \
-    PMIX_MCA_gds=^ds12
 
 # Rebuild openmpi with DLC style (which it remarks as "without libfabric"), with the above pmix.
 ENV OMPI_VERSION=4.1.6
@@ -192,20 +197,31 @@ RUN mkdir -p /tmp && \
         --enable-platform-aws \
         --with-mpi=/opt/amazon/openmpi && \
     make -j$(nproc) install && \
-    rm -rf /tmp/aws-ofi/nccl
+    rm -rf /tmp/aws-ofi-nccl
 
 # Do this to minimize the ld path env vars that users need to define when running this image.
 RUN echo "/usr/local/lib"      >> /etc/ld.so.conf.d/local.conf && \
     echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
     ldconfig
 
-ENV OMPI_MCA_pml=^cm,ucx            \
-    OMPI_MCA_btl=tcp,self           \
-    OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \
-    OPAL_PREFIX=/opt/amazon/openmpi \
+ENV \
+    # ----- BEGIN pmix env vars
+    # To silence this runtime error message:
+    # [p4de-st-p4de-2:110912] PMIX ERROR: ERROR in file gds_ds12_lock_pthread.c at line 168
+    # https://github.com/open-mpi/ompi/issues/7516#issuecomment-599305327
+    PMIX_GDS_MODULE=^ds12                                   \
+    PMIX_MCA_gds=^ds12                                      \
+    # https://github.com/open-mpi/ompi/issues/11557#issuecomment-1496245026
+    PMIX_MCA_psec=^munge                                    \
+    # ----- BEGIN openmpi env vars
+    OMPI_MCA_pml=^cm,ucx                                    \
+    OMPI_MCA_btl=tcp,self                                   \
+    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent   \
+    OPAL_PREFIX=/opt/amazon/openmpi                         \
     # https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
     # https://github.com/pytorch/pytorch/issues/68893
-    NCCL_SOCKET_IFNAME=^docker,lo
+    # NOTE: veth_def_agent is from SageMaker HyperPod
+    NCCL_SOCKET_IFNAME=^docker,lo,veth_def_agent
 
 ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
 
@@ -220,22 +236,6 @@ RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
 
 
 ####################################################################################################
-# Custom packages. Disable as you like. NOTE: always check `pip list` what's been installed. For
-# example, the base container comes pre-installed with Transformer Engine, flash attention, triton
-# (https://github.com/openai/triton/), etc.
+# Add your custom build steps below. For example, from 1.xformers.fragment.dockerfile
 ####################################################################################################
-# Install the xformers dependency from source, because pip install either breaks or try to pull
-# its own pt + cuda.
-#
-# Pre-requisite: build node has enough memory to compile xformers. More info on the stanza.
-RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \
-    # On p4de.24xlarge:
-    # - MAX_JOBS=16 => 145GB memory
-    # - MAX_JOBS=32 => 241GB memory
-    # - MAX_JOBS=48 => 243GB memory, 542.5s
-    #
-    # NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to
-    #       work to prevent OOM.
-    export MAX_JOBS=32 && \
-    export NVCC_PREPEND_FLAGS="-t 32" && \
-    pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
+# This section is intentionally left empty by default.
diff --git a/2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile b/2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile
new file mode 100644
index 00000000..219b9f87
--- /dev/null
+++ b/2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile
@@ -0,0 +1,29 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+####################################################################################################
+# This is NOT a complete Dockerfile! Attempting to docker build this file is guaranteed to fail.
+#
+# This file provides a sample stanza to build xformers, that you can optionally add to
+# 0.nvcr-pytorch-aws.dockerfile should you need a container image with xformers.
+#
+# NOTE: always check `pip list` to see what's been installed. The base container (specified in
+# 0.nvcr-pytorch-aws.dockerfile) is already pre-installed with Transformer Engine, flash attention,
+# triton (https://github.com/openai/triton/), etc.
+####################################################################################################
+
+# Install the xformers dependency from source, because pip install either breaks or tries to pull
+# its own pt + cuda.
+#
+# Pre-requisite: build node has enough memory to compile xformers. See the notes in the stanza.
+RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \
+    # On p4de.24xlarge:
+    # - MAX_JOBS=16 => 145GB memory
+    # - MAX_JOBS=32 => 241GB memory
+    # - MAX_JOBS=48 => 243GB memory, 542.5s
+    #
+    # NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to
+    #       work to prevent OOM.
+    export MAX_JOBS=32 && \
+    export NVCC_PREPEND_FLAGS="-t 32" && \
+    pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
diff --git a/2.ami_and_containers/containers/pytorch/README.md b/2.ami_and_containers/containers/pytorch/README.md
index 79d65795..f0fc5c14 100644
--- a/2.ami_and_containers/containers/pytorch/README.md
+++ b/2.ami_and_containers/containers/pytorch/README.md
@@ -13,6 +13,10 @@ With that said, feel free to explore the example. Happy coding, and experimentin
 
 ## 1. Essential software
 
+Please note that software versions in the template are provided for illustration only. For
+well-tested combinations, please refer to the various Dockerfiles under `3.test_cases/` and
+`4.validation_and_observability/0.nccl_tests/`.
+
 In principle, the reference `Dockerfile` does the following:
 
 - Provide PyTorch built for NVidia CUDA devices, by using a recent NVidia PyTorch image as the