From f2c8c9796b46c4c5f7c361c2cf2a37d868cc2088 Mon Sep 17 00:00:00 2001 From: Verdi March Date: Fri, 3 May 2024 20:12:43 +0800 Subject: [PATCH] Make custom pmix+openmpi optional, since nccl-tests works fine without it (as of 20240503 on SMHP). --- .../pytorch/0.nvcr-pytorch-aws.dockerfile | 54 ++----------------- .../1.pmix-openmpi.fragment.dockerfile | 53 ++++++++++++++++++ ...kerfile => 2.xformers.fragment.dockerfile} | 0 3 files changed, 57 insertions(+), 50 deletions(-) create mode 100644 2.ami_and_containers/containers/pytorch/1.pmix-openmpi.fragment.dockerfile rename 2.ami_and_containers/containers/pytorch/{1.xformers.fragment.dockerfile => 2.xformers.fragment.dockerfile} (100%) diff --git a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile index 0616fe62..3f830c1e 100644 --- a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile +++ b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile @@ -131,56 +131,10 @@ RUN apt-get remove -y libnccl2 libnccl-dev \ && rm -rf /tmp/nccl \ && echo NCCL_SOCKET_IFNAME=^docker0,lo,veth_def_agent >> /etc/nccl.conf - -#################################################################################################### -# Rebuild OpenMPI with custom PMIX version. E.g., to match what host's Slurm is built with (see -# /opt/pmix/ on host, or run pmix_info on host). +## When nccl-tests mysteriously crashes with a pmix error, consider rebuilding pmix + openmpi by +## putting the stanza from 1.pmix-openmpi.fragment.dockerfile below. # -# May be needed on rare occassions when `srun --mpi=pmix --container-image=... ` -# mysteriously crashes. -# -# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the -# aws-ofi-ccnl. 
-#################################################################################################### -ENV OPEN_MPI_PATH=/opt/amazon/openmpi - -# OpenMPI build script claims PMIX_VERSION, and complains if we use it. -ENV CUSTOM_PMIX_VERSION=4.2.7 -RUN apt-get update && apt-get install -y libevent-dev \ - && cd /tmp \ - && wget https://github.com/openpmix/openpmix/releases/download/v${CUSTOM_PMIX_VERSION}/pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ - && tar -xzf pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ - && rm pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ - && cd pmix-${CUSTOM_PMIX_VERSION}/ \ - && ./autogen.pl \ - && ./configure --prefix=/opt/pmix \ - && make -j \ - && make install \ - && echo /opt/pmix/lib > /etc/ld.so.conf.d/pmix.conf \ - && ldconfig \ - && cd / \ - && rm -fr /tmp/pmix-${CUSTOM_PMIX_VERSION}/ - -# Rebuild openmpi with DLC style (which it remarks as "without libfabric"), with the above pmix. -ENV OMPI_VERSION=4.1.6 -RUN rm -fr ${OPEN_MPI_PATH} \ - && mkdir /tmp/openmpi \ - && cd /tmp/openmpi \ - && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \ - && tar zxf openmpi-${OMPI_VERSION}.tar.gz \ - && rm openmpi-${OMPI_VERSION}.tar.gz \ - && cd openmpi-${OMPI_VERSION} \ - && ./configure --enable-orterun-prefix-by-default --prefix=$OPEN_MPI_PATH --with-cuda=${CUDA_HOME} --with-slurm --with-pmix=/opt/pmix \ - && make -j $(nproc) all \ - && make install \ - && ldconfig \ - && cd / \ - && rm -rf /tmp/openmpi \ - && ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \ - # Verify pmix from /opt/pmix/ - && ldd /opt/amazon/openmpi/lib/openmpi/mca_pmix_ext3x.so | grep '/opt/pmix/lib/libpmix.so.* ' > /opt/amazon/openmpi-pmix.txt -#################################################################################################### - +# # NCCL EFA Plugin RUN mkdir -p /tmp && \ @@ -236,6 +190,6 @@ RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ 
#################################################################################################### -# Add your custom build steps below. For example, from 1.partial-xformers.dockerfile +# Add your custom build steps below. For example, from {2,...}.*.fragment.dockerfile files #################################################################################################### # This section is intentionally left empty by default. diff --git a/2.ami_and_containers/containers/pytorch/1.pmix-openmpi.fragment.dockerfile b/2.ami_and_containers/containers/pytorch/1.pmix-openmpi.fragment.dockerfile new file mode 100644 index 00000000..c4c1af8d --- /dev/null +++ b/2.ami_and_containers/containers/pytorch/1.pmix-openmpi.fragment.dockerfile @@ -0,0 +1,53 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#################################################################################################### +# This is NOT a complete Dockerfile! Attempting to docker build this file is guaranteed to fail. +# +# This file provides a sample stanza to rebuild OpenMPI with a custom PMIX version. E.g., to match +# what host's Slurm is built with (see /opt/pmix/ on host, or run pmix_info on host). +# +# You might need this only on rare occasions when `srun --mpi=pmix --container-image=... ` +# mysteriously crashes. +# +# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi **BEFORE** we build +# the aws-ofi-nccl. +#################################################################################################### + +ENV OPEN_MPI_PATH=/opt/amazon/openmpi + +# OpenMPI build script claims PMIX_VERSION, and complains if we use it. 
+ENV CUSTOM_PMIX_VERSION=4.2.7
+RUN apt-get update && apt-get install -y --no-install-recommends libevent-dev \
+    && cd /tmp \
+    && wget --quiet https://github.com/openpmix/openpmix/releases/download/v${CUSTOM_PMIX_VERSION}/pmix-${CUSTOM_PMIX_VERSION}.tar.gz \
+    && tar -xzf pmix-${CUSTOM_PMIX_VERSION}.tar.gz && rm pmix-${CUSTOM_PMIX_VERSION}.tar.gz \
+    && cd pmix-${CUSTOM_PMIX_VERSION}/ \
+    && ./autogen.pl \
+    && ./configure --prefix=/opt/pmix \
+    # Cap build parallelism at the CPU count; a bare `make -j` sets no limit on concurrent jobs.
+    && make -j $(nproc) \
+    && make install \
+    && echo /opt/pmix/lib > /etc/ld.so.conf.d/pmix.conf \
+    && ldconfig \
+    && cd / \
+    && rm -fr /tmp/pmix-${CUSTOM_PMIX_VERSION}/
+
+# Rebuild openmpi in DLC style (which it remarks as "without libfabric") against the pmix built above.
+ENV OMPI_VERSION=4.1.6
+RUN rm -fr ${OPEN_MPI_PATH} \
+    && mkdir /tmp/openmpi \
+    && cd /tmp/openmpi \
+    && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
+    && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
+    && rm openmpi-${OMPI_VERSION}.tar.gz \
+    && cd openmpi-${OMPI_VERSION} \
+    && ./configure --enable-orterun-prefix-by-default --prefix=$OPEN_MPI_PATH --with-cuda=${CUDA_HOME} --with-slurm --with-pmix=/opt/pmix \
+    && make -j $(nproc) all \
+    && make install \
+    && ldconfig \
+    && cd / \
+    && rm -rf /tmp/openmpi \
+    && ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \
+    # Verify openmpi's pmix component links against /opt/pmix/; the build fails if grep finds no match.
+    && ldd /opt/amazon/openmpi/lib/openmpi/mca_pmix_ext3x.so | grep '/opt/pmix/lib/libpmix.so.* ' > /opt/amazon/openmpi-pmix.txt
diff --git a/2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile b/2.ami_and_containers/containers/pytorch/2.xformers.fragment.dockerfile
similarity index 100%
rename from 2.ami_and_containers/containers/pytorch/1.xformers.fragment.dockerfile
rename to 2.ami_and_containers/containers/pytorch/2.xformers.fragment.dockerfile