diff --git a/3.test_cases/torchtitan/README.md b/3.test_cases/torchtitan/README.md new file mode 100644 index 00000000..8acb60ae --- /dev/null +++ b/3.test_cases/torchtitan/README.md @@ -0,0 +1,8 @@ + +**Torchtitan** is a pioneering library for large-scale LLM training utilizing native PyTorch. It highlights PyTorch's latest distributed training features through a clean, minimalistic codebase. + +Characteristics of Torchtitan include: + +* User-friendly design, making it easy to understand, use, and extend for various training purposes. +* Minimal modifications required to the model code for applying 1D, 2D, or upcoming 3D parallelism. +* A modular approach over a monolithic codebase, facilitating quick start-ups. \ No newline at end of file diff --git a/3.test_cases/torchtitan/pretrain.sbatch b/3.test_cases/torchtitan/pretrain.sbatch new file mode 100644 index 00000000..3d2d3252 --- /dev/null +++ b/3.test_cases/torchtitan/pretrain.sbatch @@ -0,0 +1,85 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=pretrain +#SBATCH --nodes=2 +#SBATCH --ntasks=2 +#SBATCH --gpus-per-node=8 # Number of GPU per node +#SBATCH --output=logs/%x_%j.out # logfile for stdout +#SBATCH --error=logs/%x_%j.err # logfile for stderr, remove it to merge both outputs +#SBATCH --wait-all-nodes=1 +#SBATCH --exclusive +set -euxo pipefail + +################################################################## +############# Load environment variables ######################### +################################################################## +# Load environment variables +if [ ! -f .env ] +then + echo "Please create a .env file with the required environment variables" + exit 1 +else + source .env +fi + +################################################################## +######### Define EFA/NCCL/Slurm environment variables ############ +################################################################## +## EFA settings +export FI_LOG_LEVEL=1 +export FI_PROVIDER=efa # change to eth if you want to use ENA for comparisons +export FI_EFA_USE_HUGE_PAGE=0 +# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352 +# https://github.com/pytorch/pytorch/issues/68893 +export NCCL_SOCKET_IFNAME=en +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_DEBUG=INFO +export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` +export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) +export NODES_ARRAY=($NODES) +export HEAD_NODE=${NODES_ARRAY[0]} +export MASTER_ADDR=$(hostname --ip-address) +export MASTER_PORT=$RANDOM +export NNODES=$SLURM_JOB_NUM_NODES +export NPROC=$SLURM_GPUS_PER_NODE +export WORLD_SIZE=$(( $NNODES * $NPROC )) + +################################################################## +############### Create train config ############################## +################################################################## + +if [ ! 
-d ${FSX_PATH}/tmp ]; then + mkdir -p ${FSX_PATH}/tmp +fi +cat ${PWD}/train_configs/pretrain_llama3_70b.toml | envsubst > ${FSX_PATH}/tmp/pretrain_llama3_70b.toml + +################################################################## +################# Set arguments ################################## +################################################################## + +: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}" +declare -a SRUN_ARGS=( + --container-image $ENROOT_IMAGE + --container-mounts $CONTAINER_MOUNT +) +declare -a TORCHRUN_ARGS=( + # change this to match the number of gpus per node: + --master_addr $MASTER_ADDR + --master_port $RANDOM + --nproc_per_node=8 + --nnodes $NNODES + --nnodes=$SLURM_JOB_NUM_NODES + --rdzv_backend=c10d + --rdzv_endpoint=$(hostname) +) +declare -a TRAIN_ARGS=( + --job.config_file ${FSX_PATH}/tmp/pretrain_llama3_70b.toml +) + +srun -l "${SRUN_ARGS[@]}" \ + torchrun "${TORCHRUN_ARGS[@]}" ${PWD}/../torchtitan/train.py "${TRAIN_ARGS[@]}" \ No newline at end of file diff --git a/3.test_cases/torchtitan/slurm/torchtitan.dockerfile b/3.test_cases/torchtitan/slurm/torchtitan.dockerfile new file mode 100644 index 00000000..a279321a --- /dev/null +++ b/3.test_cases/torchtitan/slurm/torchtitan.dockerfile @@ -0,0 +1,234 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#################################################################################################### +# This is a sample Dockerfile, with optional stanzas. Please read through this Dockerfile, +# understand what it does, then create your own Dockerfile. +# +# Sample build instructions: +# +# docker build --progress=plain -t nvidia-pt-od:latest -f 0.nvcr-pytorch-aws.dockerfile . +# rm /fsx/nvidia-pt-od__latest.sqsh ; enroot import -o /fsx/nvidia-pt-od__latest.sqsh dockerd://nvidia-pt-od:latest +# +# Compute nodes (aka build nodes) are transient, so we need to keep the docker image on shared fs, +# which head node can load into its local registry. +# +# # Build node: save image to file +# docker save nvidia-pt-od:latest > /fsx/nvidia-pt-od__latest.tar +# +# # Load image to local docker registry -> on head node, or new compute/build node. +# docker load < /fsx/nvidia-pt-od__latest.tar +#################################################################################################### +FROM nvcr.io/nvidia/pytorch:24.04-py3 +ENV DEBIAN_FRONTEND=noninteractive + +# The three must-be-built packages. +# Efa-installer>=1.29.0 required for nccl>=2.19.0 to avoid libfabric NCCL error. +ARG EFA_INSTALLER_VERSION=1.31.0 +ARG AWS_OFI_NCCL_VERSION=v1.8.1-aws +ARG NCCL_TESTS_VERSION=2.13.9 +ARG NCCL_VERSION=2.20.3-1 + +RUN apt-get update -y +RUN apt-get remove -y --allow-change-held-packages \ + libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 + +# We noticed that since 23.09, we can't just delete the whole /opt/hpcx/, otherwise `import torch` +# complains about missing libuc?.so. 
+RUN rm -rf /opt/hpcx/ompi \ + && rm -rf /usr/local/mpi \ + && rm -rf /opt/hpcx/nccl_rdma_sharp_plugin \ + && ldconfig +ENV OPAL_PREFIX= +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + git \ + gcc \ + vim \ + kmod \ + openssh-client \ + openssh-server \ + build-essential \ + curl \ + autoconf \ + libtool \ + gdb \ + automake \ + cmake \ + apt-utils \ + libhwloc-dev \ + aptitude && \ + DEBIAN_FRONTEND=noninteractive apt autoremove -y + +# EFA +RUN apt-get update && \ + cd /tmp && \ + curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ + tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ + cd aws-efa-installer && \ + # ONLY add `--skip-kmod`, `--no-verify` and `--skip-limit-conf` flags to container image. + # Those three flags must NOT be used on the host. + # + # Explanations: + # - to build EFA in the Dockerfile, we added --skip-kmod and --no-verify. Without these flags, + # the Dockerfile will fail to build. If installing EFA on the host and not in a container, + # please remove these flags. + # - The --skip-limit-conf can be retained in Dockerfile, but it's redundant as the host already + # has these limits set by efa_installer. + ./efa_installer.sh -y -g -d --skip-kmod --no-verify --skip-limit-conf && \ + ldconfig && \ + rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* +ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH +ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH + + +#################################################################################################### +# [CUSTOM_NCCL_OPTION_1] Uncomment below stanza to install another NCCL version using the official +# binaries. +# +# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the +# aws-ofi-ccnl. +#################################################################################################### +# RUN cd /opt && \ +# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ +# dpkg -i cuda-keyring_1.0-1_all.deb && \ +# apt update && \ +# apt install -y libnccl2==${NCCL_VERSION} libnccl-dev==${NCCL_VERSION} && \ +# echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf + + +#################################################################################################### +# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones. The +# benefits of installing to the same location as the built-in version are: +# +# 1. There's only ever a single libnccl version offered by this image, preventing application from +# mistakenly chooses a wrong version. +# 2. No longer needing extra settings for LD_LIBRARY_PATH or LD_PRELOAD. +# +# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the +# aws-ofi-ccnl. +#################################################################################################### +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j src.build BUILDDIR=/usr \ + # Build for p4 & p5. + NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90, -gencode=arch=compute_80,code=sm_80" \ + && rm -rf /tmp/nccl \ + && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf + + +#################################################################################################### +# Rebuild OpenMPI with custom PMIX version. 
E.g., to match what host's Slurm is built with (see +# /opt/pmix/ on host, or run pmix_info on host). +# +# May be needed on rare occassions when `srun --mpi=pmix --container-image=... ` +# mysteriously crashes. +# +# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the +# aws-ofi-ccnl. +#################################################################################################### +ENV OPEN_MPI_PATH=/opt/amazon/openmpi + +# OpenMPI build script claims PMIX_VERSION, and complains if we use it. +ENV CUSTOM_PMIX_VERSION=4.2.6 +RUN apt-get update && apt-get install -y libevent-dev \ + && cd /tmp \ + && wget https://github.com/openpmix/openpmix/releases/download/v${CUSTOM_PMIX_VERSION}/pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ + && tar -xzf pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ + && rm pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ + && cd pmix-${CUSTOM_PMIX_VERSION}/ \ + && ./autogen.pl \ + && ./configure --prefix=/opt/pmix \ + && make -j \ + && make install \ + && echo /opt/pmix/lib > /etc/ld.so.conf.d/pmix.conf \ + && ldconfig \ + && cd / \ + && rm -fr /tmp/pmix-${CUSTOM_PMIX_VERSION}/ +# To silence this runtime error message: +# [p4de-st-p4de-2:110912] PMIX ERROR: ERROR in file gds_ds12_lock_pthread.c at line 168 +ENV PMIX_GDS_MODULE=^ds12 \ + PMIX_MCA_gds=^ds12 + +# Rebuild openmpi with DLC style (which it remarks as "without libfabric"), with the above pmix. +ENV OMPI_VERSION=4.1.6 +RUN rm -fr ${OPEN_MPI_PATH} \ + && mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \ + && tar zxf openmpi-${OMPI_VERSION}.tar.gz \ + && rm openmpi-${OMPI_VERSION}.tar.gz \ + && cd openmpi-${OMPI_VERSION} \ + && ./configure --enable-orterun-prefix-by-default --prefix=$OPEN_MPI_PATH --with-cuda=${CUDA_HOME} --with-slurm --with-pmix=/opt/pmix \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && cd / \ + && rm -rf /tmp/openmpi \ + && ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \ + # Verify pmix from /opt/pmix/ + && ldd /opt/amazon/openmpi/lib/openmpi/mca_pmix_ext3x.so | grep '/opt/pmix/lib/libpmix.so.* ' > /opt/amazon/openmpi-pmix.txt +#################################################################################################### + + +## NCCL EFA Plugin +#RUN mkdir -p /tmp && \ +# cd /tmp && \ +# curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ +# tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ +# rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ +# mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \ +# cd /tmp/aws-ofi-nccl && \ +# ./autogen.sh && \ +# ./configure --prefix=/opt/amazon/efa \ +# --with-libfabric=/opt/amazon/efa \ +# --with-cuda=/usr/local/cuda \ +# --enable-platform-aws \ +# --with-mpi=/opt/amazon/openmpi && \ +# make -j$(nproc) install && \ +# rm -rf /tmp/aws-ofi/nccl + +################################################### +## Install AWS-OFI-NCCL plugin +RUN apt-get install libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -y +RUN export OPAL_PREFIX="" \ + && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \ + && cd /opt/aws-ofi-nccl \ + && git checkout ${AWS_OFI_NCCL_VERSION} \ + && ./autogen.sh \ + && ./configure 
--prefix=/opt/aws-ofi-nccl/install \
+        --with-mpi=/opt/amazon/openmpi \
+        --with-libfabric=/opt/amazon/efa \
+        --with-cuda=/usr/local/cuda \
+        --enable-platform-aws \
+    && make -j $(nproc) && make install
+
+
+# Do this to minimize the ld path env vars that users need to define when running this image.
+RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \
+    echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
+    ldconfig
+
+ENV OMPI_MCA_pml=^cm,ucx \
+    OMPI_MCA_btl=tcp,self \
+    OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \
+    OPAL_PREFIX=/opt/amazon/openmpi \
+    # https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352
+    # https://github.com/pytorch/pytorch/issues/68893
+    NCCL_SOCKET_IFNAME=^docker,lo
+
+ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
+
+# NCCL-tests: always good to include this as a diagnostic tool.
+RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
+    && cd /opt/nccl-tests \
+    && git checkout v${NCCL_TESTS_VERSION} \
+    && make MPI=1 \
+       MPI_HOME=/opt/amazon/openmpi \
+       CUDA_HOME=/usr/local/cuda \
+       NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80"
+
+
+RUN pip install accelerate appdirs loralib bitsandbytes datasets fire peft transformers>=4.40.0 sentencepiece wandb vllm gradio openai
+RUN pip install hydra-core huggingface_hub safetensors tiktoken blobfile>=2 tqdm torchao==0.1 lm_eval==0.4.*
+RUN pip uninstall -y transformer-engine
\ No newline at end of file
diff --git a/3.test_cases/torchtune/.gitignore b/3.test_cases/torchtune/.gitignore
new file mode 100644
index 00000000..6fe66831
--- /dev/null
+++ b/3.test_cases/torchtune/.gitignore
@@ -0,0 +1,2 @@
+torchtune
+.env
\ No newline at end of file
diff --git a/3.test_cases/torchtune/README.md b/3.test_cases/torchtune/README.md
new file mode 100644
index 00000000..4c2a1b85
--- /dev/null
+++ b/3.test_cases/torchtune/README.md
@@ -0,0 +1,28 @@
+# End-to-End LLM Model Development with Torchtune
+
+This guide demonstrates the comprehensive process of developing a Large Language Model (LLM) from start to finish using [Torchtune](https://github.com/pytorch/torchtune). The journey of creating an LLM encompasses five pivotal steps:
+
+![LLMOps](docs/LLMOps.png)
+
+1. **(Continuous) Pretraining the Language Model**: The language model undergoes pretraining on a vast corpus of text data. This step can be bypassed if starting with an already pretrained model. Pretraining is essential for the model to learn the general patterns and structures of language. Refer to the `torchtitan` test case for large-scale pretraining with the latest techniques, such as 3D parallelism and `torch.compile`.
+
+2. **Instruction Tuning**: The pretrained model is then fine-tuned to cater to specific tasks by updating its parameters with a new dataset. This process involves partially retraining the model with samples that exemplify the desired behavior, thus refining the model weights for the particular application.
+
+3. **Alignment**: The instruction-tuned model is further adjusted so that its outputs follow human preferences, typically by training on preference data with techniques such as RLHF or DPO. This step steers the model toward responses that are helpful and safe rather than merely plausible.
+
+4. **Evaluation**: Evaluating the LLM's performance is a critical step. It involves using various metrics to assess the model's accuracy and effectiveness. 
This step is vital for validating new techniques and objectively comparing different model releases. + +5. **Deployment**: Upon achieving the desired performance, the model is deployed as an API. This deployment enables the model's integration into applications, making it accessible to users and other systems. + +Following these steps allows for the iterative development and refinement of a Large Language Model to meet specific needs and ensure its successful deployment. This guide specifically addresses all steps except the initial data preparation. The pretraining phase is facilitated by Torchtitan, while Torchtune manages the fine-tuning and evaluation phases. + +**Torchtune** emerges as a PyTorch-native library dedicated to the easy authoring, fine-tuning, and experimentation with LLMs, proudly announcing its alpha release. + +Features of Torchtune encompass: + +* Native-PyTorch implementations of renowned LLMs using composable and modular building blocks. +* Straightforward and adaptable training recipes for popular fine-tuning techniques such as LoRA and QLoRA, emphasizing a PyTorch-centric approach without the need for trainers or frameworks. +* YAML configurations for simplifying the setup of training, evaluation, quantization, or inference recipes. +* Comprehensive support for numerous popular dataset formats and prompt templates, ensuring a smooth start to training endeavors. + +This case study provides examples for two schedulers, Slurm and Kubernetes, with detailed instructions available in the `slurm` or `kubernetes` subdirectories. \ No newline at end of file diff --git a/3.test_cases/torchtune/docs/LLMOps.png b/3.test_cases/torchtune/docs/LLMOps.png new file mode 100644 index 00000000..fbdb82cd Binary files /dev/null and b/3.test_cases/torchtune/docs/LLMOps.png differ diff --git a/3.test_cases/torchtune/kubernetes/.gitkeep b/3.test_cases/torchtune/kubernetes/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/3.test_cases/torchtune/slurm/README.md b/3.test_cases/torchtune/slurm/README.md new file mode 100644 index 00000000..7c9a0053 --- /dev/null +++ b/3.test_cases/torchtune/slurm/README.md @@ -0,0 +1,131 @@ +# End-to-End LLM Model Development with Torchtune on Slurm + +This test case illustrates the setup and execution of each step in LLMOps on Slurm using the Torchtune environment. This README provides a detailed guide for configuring the necessary environment. For hands-on LLMOps examples, refer to the [tutorials](./tutorials) section. + +## 1. Prerequisites + +Before proceeding with each step of this test case, ensure you have the following prerequisites: + +* A Slurm cluster configured as specified +* Access tokens and keys for Hugging Face and Weights & Biases (W&B) + +Further setup details are provided below. + +### Slurm Cluster + +For this guide, you should have: + +* An operational Slurm cluster on AWS. +* Docker, [Pyxis](https://github.com/NVIDIA/pyxis), and [Enroot](https://github.com/NVIDIA/enroot) installed. +* An FSx for Lustre filesystem mounted at `/fsx`. + +It's recommended to establish your Slurm cluster using the templates found in the [architectures directory](../../../1.architectures). + +### Token and Access Key + +Create an access token at [Hugging Face Tokens](https://huggingface.co/settings/tokens) (`HF_TOKEN`). + +For monitoring model training and computational resource usage, [Weights & Biases](https://wandb.ai/) will be utilized. 
Create an account and retrieve your `WANDB_API_KEY` from the Weights & Biases [Settings](https://wandb.ai/settings). For comprehensive setup instructions, consult the Weights & Biases [Quickstart Guide](https://docs.wandb.ai/quickstart). + +## 2. Preparing the Environment + +This section outlines the steps to acquire the necessary codebases and configure your development environment. + +### Cloning Repositories + +Start by cloning the required repository on the cluster's head/login node, then navigate to the specific test case directory: + +```bash +git clone https://github.com/aws-samples/awsome-distributed-training /fsx/${USER}/awsome-distributed-training +cd /fsx/${USER}/awsome-distributed-training/3.test_cases/torchtune/slurm +``` + +For demonstration purposes, we will proceed with `USER=ubuntu`. + +Following that, clone the torchtune repository: + +```bash +git clone https://github.com/pytorch/torchtune.git torchtune +``` + +### Setting Up Environment Variables + +Initiate the configuration of your environment by executing the `configure-env-vars.sh` script. This action generates a `.env` file, which is crucial for defining the environment variables needed by all subsequent job files: + +```bash +bash configure-env-vars.sh +``` + +During this setup, you'll be prompted to provide your `WANDB_API_KEY` and `HF_KEY`. These keys are essential for integrating with the Weights & Biases and Hugging Face platforms, respectively: + +```bash +Setting up environment variables +Please enter your WANDB_API_KEY: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +Please enter your HF_KEY: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +.env file created successfully +Please run the following command to set the environment variables +source .env +``` + +The `.env` file will include the following predefined variables. You have the flexibility to modify the `configure-env-vars.sh` script to better suit your project's specific needs: + +```bash +cat .env +``` + +```bash +export FSX_PATH=/fsx/ubuntu +export APPS_PATH=/fsx/ubuntu/apps +export ENROOT_IMAGE=/fsx/ubuntu/apps/torchtune.sqsh +export MODEL_PATH=/fsx/ubuntu/models/torchtune +export TEST_CASE_PATH=/fsx/ubuntu/awsome-distributed-training/3.test_cases/torchtune/slurm +export HF_HOME=/fsx/ubuntu/.cache/huggingface +export WANDB_CACHE_DIR=/fsx/ubuntu/.cache/wandb +export WANDB_DIR=/fsx/ubuntu/models/torchtune/wandb +export WANDB_API_KEY=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +``` + +## 3. Building the Torchtune Container + +Before initiating any training jobs, it's essential to prepare a Docker container image. We'll utilize [Enroot](https://github.com/NVIDIA/enroot), a tool designed to convert Docker images into unprivileged sandboxes. This is particularly useful for running containers within Slurm-managed clusters. 
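+
+Conceptually, preparing the container image boils down to three steps: build the Docker image from the Dockerfile in this directory, convert it into a squashfs image with Enroot, and place the result on the shared FSx filesystem so every compute node can use it. The `build-image.sbatch` job described in the next section runs these commands for you; the minimal sketch below assumes the `.env` file created by `configure-env-vars.sh` has been sourced so that `ENROOT_IMAGE` is defined:
+
+```bash
+# Build the Docker image from the torchtune Dockerfile in this directory
+docker build -t torchtune -f torchtune.dockerfile .
+# Convert the Docker image into an Enroot squashfs image
+enroot import -o torchtune.sqsh dockerd://torchtune:latest
+# Move the image onto the shared filesystem so all compute nodes can access it
+mv torchtune.sqsh ${ENROOT_IMAGE}
+```
+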
+ +### Submitting the Build Job + +To start the build process, submit the `build-image.sbatch` script to Slurm: + +```bash +sbatch build-image.sbatch +``` + +#### Monitoring the Build Progress + +Keep an eye on the build progress by tailing the log files: + +```bash +tail -f logs/build-image_* +``` + +A successful build process is indicated by the following lines at the end of the log file: + +```bash +Number of fifo nodes 0 +Number of socket nodes 0 +Number of directories 41628 +Number of ids (unique uids + gids) 1 +Number of uids 1 + root (0) +Number of gids 1 + root (0) + +==> logs/build-image_xxx.out <== +Image built and saved as /fsx/ubuntu/apps/torchtune.sqsh +``` +These lines confirm that the image has been successfully built and is now stored at the specified location, ready for use. + +## 4. Proceeding to Experiments + +With the torchtune container now built and ready, you're all set to dive into the actual experiments. Navigate to the [tutorials](./tutorials) directory to explore various examples. + + + + diff --git a/3.test_cases/torchtune/slurm/build-image.sbatch b/3.test_cases/torchtune/slurm/build-image.sbatch new file mode 100644 index 00000000..72e4751e --- /dev/null +++ b/3.test_cases/torchtune/slurm/build-image.sbatch @@ -0,0 +1,29 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#SBATCH -N 1 # number of nodes to use +#SBATCH --job-name=build-image # name of your job +#SBATCH --output=logs/%x_%j.out # logfile for stdout +#SBATCH --error=logs/%x_%j.err # logfile for stderr, remove it to merge both outputs + +set -euo pipefail + +if [ ! -f .env ] +then + echo "Please create a .env file with the required environment variables" + exit 1 +else + source .env +fi + +docker build -t torchtune -f torchtune.dockerfile . + +# Remove old sqsh file if exists +if [ -f torchtune.sqsh ] ; then + rm torchtune.sqsh +fi +enroot import -o torchtune.sqsh dockerd://torchtune:latest +mv torchtune.sqsh ${ENROOT_IMAGE} +echo "Image built and saved as ${ENROOT_IMAGE}" \ No newline at end of file diff --git a/3.test_cases/torchtune/slurm/configure-env-vars.sh b/3.test_cases/torchtune/slurm/configure-env-vars.sh new file mode 100755 index 00000000..294d6838 --- /dev/null +++ b/3.test_cases/torchtune/slurm/configure-env-vars.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +echo "Setting up environment variables" + +### READ INPUTS +WANDB_API_KEY=$(bash -c 'read -p "Please enter your WANDB_API_KEY: " && echo $REPLY') +HF_KEY=$(bash -c 'read -p "Please enter your HF_KEY: " && echo $REPLY') + +### SET TEST CASE PATH +echo "export FSX_PATH=/fsx/${USER}" > .env +source .env +echo "export APPS_PATH=${FSX_PATH}/apps" >> .env +source .env +if [ ! -d "${APPS_PATH}" ]; then + mkdir -p ${APPS_PATH} +fi +echo "export ENROOT_IMAGE=${APPS_PATH}/torchtune.sqsh" >> .env +source .env +echo "export MODEL_PATH=${FSX_PATH}/models/torchtune" >> .env +source .env +echo "export TEST_CASE_PATH=${FSX_PATH}/awsome-distributed-training/3.test_cases/torchtune/slurm" >> .env +source .env + +### Configure HF_HOME +# https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables +export HF_HOME=${FSX_PATH}/.cache/huggingface +if [ ! 
-d "${HF_HOME}" ]; then + mkdir -p ${HF_HOME} +fi +echo "export HF_HOME=${HF_HOME}" >> .env +source .env +echo ${HF_KEY} > ${HF_HOME}/token + +### Configure WANDB +# https://docs.wandb.ai/ja/guides/track/environment-variables +export WANDB_CACHE_DIR=${FSX_PATH}/.cache/wandb +if [ ! -d "${WANDB_CACHE_DIR}" ]; then + mkdir -p ${WANDB_CACHE_DIR} +fi +echo "export WANDB_CACHE_DIR=${WANDB_CACHE_DIR}" >> .env +source .env +export WANDB_DIR=${MODEL_PATH}/wandb +if [ ! -d "${WANDB_DIR}" ]; then + mkdir -p ${WANDB_DIR} +fi +echo "export WANDB_DIR=${WANDB_DIR}" >> .env +source .env +echo "export WANDB_API_KEY=${WANDB_API_KEY}" >> .env +source .env + +### Epilogue +echo ".env file created successfully" +echo "Please run the following command to set the environment variables" +echo "source .env" \ No newline at end of file diff --git a/3.test_cases/torchtune/slurm/download_hf_model.sh b/3.test_cases/torchtune/slurm/download_hf_model.sh new file mode 100755 index 00000000..dcc40739 --- /dev/null +++ b/3.test_cases/torchtune/slurm/download_hf_model.sh @@ -0,0 +1,81 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +set -eo pipefail + +if [ ! -f .env ] +then + echo "Please create a .env file with the required environment variables" + exit 1 +else + source .env +fi + +declare -a HELP=( + "Download a Hugging Face model to ${MODEL_PATH} using torchtune" + "Usage: download_hf_model.sh [options]" + "" + "Options:" + " -h, --help" + " Print this help message" + " -m, --model" + " Hugging Face model name" +) +parse_args() { + while [ $# -gt 0 ]; do + case "$1" in + -h|--help) + for line in "${HELP[@]}"; do + echo "$line" + done + exit 0 + ;; + -m|--model) + shift + HF_MODEL=$1 + ;; + *) + echo "[ERROR] Unknown argument: $1" + exit 1 + ;; + esac + shift + done +} +parse_args "$@" + +if [ -z ${HF_MODEL} ]; then + echo "[ERROR] -m|--model not provided." + exit 1 +fi + +if [ ! -d ${MODEL_PATH} ] +then + mkdir -p ${MODEL_PATH} +fi + +: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}" +declare -a SRUN_ARGS=( + --container-image $ENROOT_IMAGE + --container-mounts $CONTAINER_MOUNT +) +declare -a TORCHTUNE_ARGS=( + --output-dir ${MODEL_PATH}/${HF_MODEL} + ${HF_MODEL} +) +if [ ${HF_MODEL} = "meta-llama/Meta-Llama-3-70B" ] +then + # https://github.com/pytorch/torchtune#multi-gpu + TORCHTUNE_ARGS+=("--ignore-patterns" "original/consolidated*") +fi +echo "Executing following command:" +echo "torchtune download ${TORCHTUNE_ARGS[@]}" + + +export TORCHTUNE=${PWD}/torchtune/torchtune/_cli/tune.py +enroot start --env NVIDIA_VISIBLE_DEVICES=void --env PYTHONPATH=${PWD}/torchtune --env HF_HOME=${HF_HOME} \ + --mount ${FSX_PATH}:${FSX_PATH} ${ENROOT_IMAGE} \ + python ${TORCHTUNE} download \ + ${TORCHTUNE_ARGS[@]} + diff --git a/3.test_cases/torchtune/slurm/torchtune.dockerfile b/3.test_cases/torchtune/slurm/torchtune.dockerfile new file mode 100644 index 00000000..6f688356 --- /dev/null +++ b/3.test_cases/torchtune/slurm/torchtune.dockerfile @@ -0,0 +1,234 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#################################################################################################### +# This is a sample Dockerfile, with optional stanzas. Please read through this Dockerfile, +# understand what it does, then create your own Dockerfile. +# +# Sample build instructions: +# +# docker build --progress=plain -t nvidia-pt-od:latest -f 0.nvcr-pytorch-aws.dockerfile . 
+# rm /fsx/nvidia-pt-od__latest.sqsh ; enroot import -o /fsx/nvidia-pt-od__latest.sqsh dockerd://nvidia-pt-od:latest +# +# Compute nodes (aka build nodes) are transient, so we need to keep the docker image on shared fs, +# which head node can load into its local registry. +# +# # Build node: save image to file +# docker save nvidia-pt-od:latest > /fsx/nvidia-pt-od__latest.tar +# +# # Load image to local docker registry -> on head node, or new compute/build node. +# docker load < /fsx/nvidia-pt-od__latest.tar +#################################################################################################### +FROM nvcr.io/nvidia/pytorch:24.05-py3 +ENV DEBIAN_FRONTEND=noninteractive + +# The three must-be-built packages. +# Efa-installer>=1.29.0 required for nccl>=2.19.0 to avoid libfabric NCCL error. +ARG EFA_INSTALLER_VERSION=1.31.0 +ARG AWS_OFI_NCCL_VERSION=v1.8.1-aws +ARG NCCL_TESTS_VERSION=2.13.9 +ARG NCCL_VERSION=2.20.3-1 + +RUN apt-get update -y +RUN apt-get remove -y --allow-change-held-packages \ + libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 + +# We noticed that since 23.09, we can't just delete the whole /opt/hpcx/, otherwise `import torch` +# complains about missing libuc?.so. +RUN rm -rf /opt/hpcx/ompi \ + && rm -rf /usr/local/mpi \ + && rm -rf /opt/hpcx/nccl_rdma_sharp_plugin \ + && ldconfig +ENV OPAL_PREFIX= +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + git \ + gcc \ + vim \ + kmod \ + openssh-client \ + openssh-server \ + build-essential \ + curl \ + autoconf \ + libtool \ + gdb \ + automake \ + cmake \ + apt-utils \ + libhwloc-dev \ + aptitude && \ + DEBIAN_FRONTEND=noninteractive apt autoremove -y + +# EFA +RUN apt-get update && \ + cd /tmp && \ + curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ + tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \ + cd aws-efa-installer && \ + # ONLY add `--skip-kmod`, `--no-verify` and `--skip-limit-conf` flags to container image. + # Those three flags must NOT be used on the host. + # + # Explanations: + # - to build EFA in the Dockerfile, we added --skip-kmod and --no-verify. Without these flags, + # the Dockerfile will fail to build. If installing EFA on the host and not in a container, + # please remove these flags. + # - The --skip-limit-conf can be retained in Dockerfile, but it's redundant as the host already + # has these limits set by efa_installer. + ./efa_installer.sh -y -g -d --skip-kmod --no-verify --skip-limit-conf && \ + ldconfig && \ + rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/* +ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH +ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH + + +#################################################################################################### +# [CUSTOM_NCCL_OPTION_1] Uncomment below stanza to install another NCCL version using the official +# binaries. +# +# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the +# aws-ofi-ccnl. 
+#################################################################################################### +# RUN cd /opt && \ +# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ +# dpkg -i cuda-keyring_1.0-1_all.deb && \ +# apt update && \ +# apt install -y libnccl2==${NCCL_VERSION} libnccl-dev==${NCCL_VERSION} && \ +# echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf + + +#################################################################################################### +# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones. The +# benefits of installing to the same location as the built-in version are: +# +# 1. There's only ever a single libnccl version offered by this image, preventing application from +# mistakenly chooses a wrong version. +# 2. No longer needing extra settings for LD_LIBRARY_PATH or LD_PRELOAD. +# +# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the +# aws-ofi-ccnl. +#################################################################################################### +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ + && cd nccl \ + && make -j src.build BUILDDIR=/usr \ + # Build for p4 & p5. + NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90, -gencode=arch=compute_80,code=sm_80" \ + && rm -rf /tmp/nccl \ + && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf + + +#################################################################################################### +# Rebuild OpenMPI with custom PMIX version. E.g., to match what host's Slurm is built with (see +# /opt/pmix/ on host, or run pmix_info on host). +# +# May be needed on rare occassions when `srun --mpi=pmix --container-image=... ` +# mysteriously crashes. +# +# NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the +# aws-ofi-ccnl. +#################################################################################################### +ENV OPEN_MPI_PATH=/opt/amazon/openmpi + +# OpenMPI build script claims PMIX_VERSION, and complains if we use it. +ENV CUSTOM_PMIX_VERSION=4.2.6 +RUN apt-get update && apt-get install -y libevent-dev \ + && cd /tmp \ + && wget https://github.com/openpmix/openpmix/releases/download/v${CUSTOM_PMIX_VERSION}/pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ + && tar -xzf pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ + && rm pmix-${CUSTOM_PMIX_VERSION}.tar.gz \ + && cd pmix-${CUSTOM_PMIX_VERSION}/ \ + && ./autogen.pl \ + && ./configure --prefix=/opt/pmix \ + && make -j \ + && make install \ + && echo /opt/pmix/lib > /etc/ld.so.conf.d/pmix.conf \ + && ldconfig \ + && cd / \ + && rm -fr /tmp/pmix-${CUSTOM_PMIX_VERSION}/ +# To silence this runtime error message: +# [p4de-st-p4de-2:110912] PMIX ERROR: ERROR in file gds_ds12_lock_pthread.c at line 168 +ENV PMIX_GDS_MODULE=^ds12 \ + PMIX_MCA_gds=^ds12 + +# Rebuild openmpi with DLC style (which it remarks as "without libfabric"), with the above pmix. 
+ENV OMPI_VERSION=4.1.6 +RUN rm -fr ${OPEN_MPI_PATH} \ + && mkdir /tmp/openmpi \ + && cd /tmp/openmpi \ + && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \ + && tar zxf openmpi-${OMPI_VERSION}.tar.gz \ + && rm openmpi-${OMPI_VERSION}.tar.gz \ + && cd openmpi-${OMPI_VERSION} \ + && ./configure --enable-orterun-prefix-by-default --prefix=$OPEN_MPI_PATH --with-cuda=${CUDA_HOME} --with-slurm --with-pmix=/opt/pmix \ + && make -j $(nproc) all \ + && make install \ + && ldconfig \ + && cd / \ + && rm -rf /tmp/openmpi \ + && ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \ + # Verify pmix from /opt/pmix/ + && ldd /opt/amazon/openmpi/lib/openmpi/mca_pmix_ext3x.so | grep '/opt/pmix/lib/libpmix.so.* ' > /opt/amazon/openmpi-pmix.txt +#################################################################################################### + + +## NCCL EFA Plugin +#RUN mkdir -p /tmp && \ +# cd /tmp && \ +# curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ +# tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ +# rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ +# mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \ +# cd /tmp/aws-ofi-nccl && \ +# ./autogen.sh && \ +# ./configure --prefix=/opt/amazon/efa \ +# --with-libfabric=/opt/amazon/efa \ +# --with-cuda=/usr/local/cuda \ +# --enable-platform-aws \ +# --with-mpi=/opt/amazon/openmpi && \ +# make -j$(nproc) install && \ +# rm -rf /tmp/aws-ofi/nccl + +################################################### +## Install AWS-OFI-NCCL plugin +RUN apt-get install libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -y +RUN export OPAL_PREFIX="" \ + && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \ + && cd /opt/aws-ofi-nccl \ + && git checkout ${AWS_OFI_NCCL_VERSION} \ + && ./autogen.sh \ + && ./configure --prefix=/opt/aws-ofi-nccl/install \ + --with-mpi=/opt/amazon/openmpi \ + --with-libfabric=/opt/amazon/efa \ + --with-cuda=/usr/local/cuda \ + --enable-platform-aws \ + && make -j $(nproc) && make install + + +# Do this to minimize the ld path env vars that users need to define when running this image. +RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \ + echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \ + ldconfig + +ENV OMPI_MCA_pml=^cm,ucx \ + OMPI_MCA_btl=tcp,self \ + OMPI_MCA_btl_tcp_if_exclude=lo,docker0 \ + OPAL_PREFIX=/opt/amazon/openmpi \ + # https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352 + # https://github.com/pytorch/pytorch/issues/68893 + NCCL_SOCKET_IFNAME=^docker,lo + +ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" + +# NCCL-tests: always good to include this as a diagnostic tool. 
+RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
+    && cd /opt/nccl-tests \
+    && git checkout v${NCCL_TESTS_VERSION} \
+    && make MPI=1 \
+       MPI_HOME=/opt/amazon/openmpi \
+       CUDA_HOME=/usr/local/cuda \
+       NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_80,code=sm_80"
+
+
+RUN pip install accelerate appdirs loralib bitsandbytes datasets fire peft transformers>=4.40.0 sentencepiece wandb vllm gradio openai
+RUN pip install hydra-core huggingface_hub safetensors tiktoken blobfile>=2 tqdm torchao==0.1 lm_eval==0.4.*
+RUN pip uninstall -y transformer-engine
\ No newline at end of file
diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/README.md b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/README.md
new file mode 100644
index 00000000..6b5f5e84
--- /dev/null
+++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/README.md
@@ -0,0 +1,287 @@
+# End-to-End Llama3-70B Model Development with Torchtune
+
+This tutorial guides you through the following LLM model development steps using Llama3-70B:
+
+* Continual Pretraining
+* Instruction Finetuning
+* Alignment
+* Evaluation
+* Deployment
+
+For details on each step, refer to the [overview documentation](../../README.md).
+
+## 1. Prerequisites
+Before starting, ensure you have requested access to Meta-Llama-3-70B by visiting [Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) on Hugging Face and following the access request instructions. Additionally, make sure all prerequisites described in the [slurm](..) directory are set up.
+
+## 2. Download the Llama3 model
+
+To begin working with the Llama3-70B model, follow these steps to download the model weights and tokenizer:
+
+### Setting Up Your Environment
+
+Navigate to the [test case path](..) and prepare your environment by sourcing the `.env` file. This step is essential for setting up the paths and credentials needed to access and interact with the Llama3-70B model:
+
+```bash
+source .env
+```
+
+### Fetching the Model Weights and Tokenizer
+
+Execute the `download_hf_model.sh` script with the model identifier as an argument to download the model weights and tokenizer:
+
+```bash
+bash download_hf_model.sh --model meta-llama/Meta-Llama-3-70B
+```
+
+Upon successful execution, the script will output messages indicating the progress of the download. Here's what you can expect to see:
+
+```bash
+Executing following command:
+torchtune download --output-dir /fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B meta-llama/Meta-Llama-3-70B --ignore-patterns original/consolidated*
+
+=============
+== PyTorch ==
+=============
+
+NVIDIA Release 24.04 (build 88113656)
+...
+Downloading builder script: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5.67k/5.67k [00:00<00:00, 29.6MB/s] +No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda' +Ignoring files matching the following patterns: original/consolidated* +USE_POLICY.md: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.70k/4.70k [00:00<00:00, 16.3MB/s] +generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 177/177 [00:00<00:00, 1.96MB/s] +.gitattributes: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.52k/1.52k [00:00<00:00, 17.3MB/s] +README.md: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36.6k/36.6k [00:00<00:00, 181MB/s] +LICENSE: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7.80k/7.80k [00:00<00:00, 77.7MB/s] +... +/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B/model-00007-of-00030.safetensors +/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B/model-00016-of-00030.safetensors +/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B/model-00010-of-00030.safetensors +/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B/model-00001-of-00030.safetensors +/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B/model-00028-of-00030.safetensors +/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B/model-00023-of-00030.safetensors +``` + +This output confirms that the `torchtune download` command has been executed within the container, successfully downloading the safetensors for `meta-llama/Meta-Llama-3-70B` into the specified `${MODEL_PATH}`. +By following these steps, you ensure that the necessary model components are in place, setting the stage for subsequent tasks such as pretraining, finetuning, evaluation, and deployment. + + +## 3. Continuous Pretraining + +In this step, you will fine-tune the Llama3 model starting from the original checkpoint using the WikiText dataset. This process, known as Full-Parameter Finetuning, updates all the parameters in the original model. The configuration file used for this process is `./tutorials/e2e-llama3-70b-development/full_finetune_distributed.yaml`. + +### Memory Consumption Challenges +One of the primary challenges during such training is memory consumption. A typical model trained in mixed precision with AdamW requires 18 bytes per model parameter plus activation memory (6 bytes for parameters in mixed precision training, 8 bytes for AdamW, and 4 bytes for other overheads). For more details on the anatomy, see the [Hugging Face blog post](https://huggingface.co/docs/transformers/model_memory_anatomy) blog post. 
This means that training a 70B parameter model would require more than 1.12 TB of accelerated memory, which far exceeds the 80 GB capacity of H100 accelerated memory. To address this issue, torchtune integrates PyTorch Fully Sharded Data Parallel (FSDP). + +### Basic concepts and relevant configuration + +**FSDP** is a distributed training feature designed to efficiently handle large model training by sharding model parameters, gradients, and optimizer states across multiple devices. This approach significantly reduces memory consumption and optimizes resource utilization, making it possible to train models that are too large to fit on a single GPU. In `torchtune` users can launch FSDP training job with command `tune run full_finetune_distributed`. + +**The WikiText language modeling dataset** is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. `torchtune` has a module preconfigured for this dataset. The configuration file preconfigures the WikiText dataset as follows: + +```yaml +dataset: + _component_: torchtune.datasets.wikitext_dataset +``` + +### Submit the training job + +Submit the job with the following command: + +```bash +sbatch tutorials/e2e-llama3-70b-development/full_finetune_distributed.sbatch +``` + +By default, this script launches the FSDP training job with two instances. Once the job has been scheduled, you will see the following outputs in the log file named `logs/full-finetuning*`: + +```bash +# tail -f logs/full-finetuning* +Executing following command: +tune run --master_addr 10.1.62.14 --master_port 28415 --nproc_per_node=8 --nnodes 2 --rdzv_backend=c10d --rdzv_endpoint=p5-st-p5-1 full_finetune_distributed --config /fsx/ubuntu/awsome-distributed-training/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/full_finetune_distributed.yaml tokenizer.path=/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B/original/tokenizer.model checkpointer.checkpoint_dir=/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B checkpointer.output_dir=/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B-tuned output_dir=/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B-tuned/log metric_logger.log_dir=/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B-tuned/log/metrics +... +0: wandb: Currently logged in as: . Use `wandb login --relogin` to force relogin +0: wandb: Tracking run with wandb version 0.17.0 +0: wandb: Run data is saved locally in /fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B-tuned/log/metrics/wandb/run-20240527_001350-oziekm6j +0: wandb: Run `wandb offline` to turn off syncing. +0: wandb: Syncing run helpful-surf-1 +0: wandb: ⭐️ View project at https://wandb.ai//torchtune +0: wandb: 🚀 View run at https://wandb.ai//torchtune/runs/oziekm6j +0: 2024-05-27:00:13:50,919 INFO [metric_logging.py:225] Logging /fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B/torchtune_config.yaml to W&B under Files +... +``` + +Notice that the job is being tracked by WANDB because of the following section in the config file: + +```yaml +metric_logger: + _component_: torchtune.utils.metric_logging.WandBLogger + log_dir: None +``` + +On the WANDB dashboard (`https://wandb.ai//torchtune`), you can monitor the learning curve, compute resource utilization, log outputs, and more. + + +## 4. Instruction-tuning + +In this step, you will fine-tune the Llama model using Low-Rank Adaptation (LoRA) with the Alpaca dataset. 
We will first cover the basic concepts and relevant configurations found in the [config file](configs/lora_finetune_distributed.yaml), followed by a detailed fine-tuning tutorial.
+
+
+### Basic Concepts and Relevant Configurations
+
+**Low-Rank Adaptation (LoRA)** is a method for fine-tuning large language models efficiently. It is a Parameter-Efficient Fine-Tuning (PEFT) technique that modifies a small, low-rank subset of a model's parameters, significantly reducing the computational cost and time required for fine-tuning. LoRA operates on the principle that large models, despite their size, inherently possess a low-dimensional structure, allowing significant changes to be represented with fewer parameters. The method decomposes the weight updates into pairs of much smaller matrices, drastically reducing the number of trainable parameters and making the adaptation process faster and less resource-intensive.
+![lora](./docs/lora.png)
+In the config, we have the following relevant section:
+
+```yaml
+model:
+  _component_: torchtune.models.llama3.lora_llama3_70b
+  lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
+  apply_lora_to_mlp: False
+  apply_lora_to_output: False
+  lora_rank: 16
+  lora_alpha: 32
+```
+This config can be read as follows:
+
+* `lora_attn_modules` specifies that LoRA adapters are created only in the attention heads, specifically for their Query, Key, and Value projection matrices.
+* `lora_alpha` is a hyper-parameter that controls the scaling of the LoRA update.
+* `lora_rank` is the rank of the LoRA parameters. The smaller the `lora_rank`, the fewer parameters LoRA has.
+
+**The Stanford Alpaca dataset** is a synthetic dataset created by Stanford researchers to fine-tune large language models (LLMs) for instruction-following tasks. It contains 52,000 unique instruction-output pairs generated using OpenAI's text-davinci-003 model.
+
+In the config we have the following relevant section:
+
+```yaml
+dataset:
+  _component_: torchtune.datasets.alpaca_dataset
+  train_on_input: True
+```
+
+As the config suggests, we use a predefined dataset class prepared in torchtune.
+
+
+### Submit the Finetuning Job
+
+You can submit the finetuning job with the following command:
+
+
+```bash
+sbatch tutorials/e2e-llama3-70b-development/lora_finetune_distributed.sbatch
+```
+
+Once the job has been scheduled, you will see the following outputs in the log files under `logs/`:
+
+```bash
+...
+Executing following command:
+tune run --master_addr 10.1.28.89 --master_port 14280 --nproc_per_node=8 --nnodes 1 --nnodes=1 --rdzv_backend=c10d --rdzv_endpoint=p5-st-p5-2 lora_finetune_distributed
+...
+0: wandb: Currently logged in as: . Use `wandb login --relogin` to force relogin
+0: wandb: Tracking run with wandb version 0.17.0
+0: wandb: Run data is saved locally in /fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B-tuned/log/metrics/wandb/run-20240527_001350-oziekm6j
+0: wandb: Run `wandb offline` to turn off syncing.
+0: wandb: Syncing run helpful-surf-1
+0: wandb: ⭐️ View project at https://wandb.ai//torchtune
+0: wandb: 🚀 View run at https://wandb.ai//torchtune/runs/oziekm6j
+0: 2024-05-27:00:13:50,919 INFO [metric_logging.py:225] Logging /fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B/torchtune_config.yaml to W&B under Files
+```
+
+As the output indicates, we run a single-node distributed training job with 8 GPUs here.
+
+```bash
+tune run --master_addr 10.1.28.89 --master_port 14280 --nproc_per_node=8 --nnodes 1 --nnodes=1 --rdzv_backend=c10d --rdzv_endpoint=p5-st-p5-2 lora_finetune_distributed
+```
+
+
+After the training, checkpoints are saved as below:
+
+```bash
+$ ls /fsx/models/torchtitan-torchtune/meta-llama/Meta-Llama-3-70B-tuned/
+adapter_0.pt        hf_model_0002_0.pt  hf_model_0005_0.pt  hf_model_0008_0.pt  hf_model_0011_0.pt  hf_model_0014_0.pt  hf_model_0017_0.pt  hf_model_0020_0.pt  hf_model_0023_0.pt  hf_model_0026_0.pt  hf_model_0029_0.pt
+config.json         hf_model_0003_0.pt  hf_model_0006_0.pt  hf_model_0009_0.pt  hf_model_0012_0.pt  hf_model_0015_0.pt  hf_model_0018_0.pt  hf_model_0021_0.pt  hf_model_0024_0.pt  hf_model_0027_0.pt  hf_model_0030_0.pt
+hf_model_0001_0.pt  hf_model_0004_0.pt  hf_model_0007_0.pt  hf_model_0010_0.pt  hf_model_0013_0.pt  hf_model_0016_0.pt  hf_model_0019_0.pt  hf_model_0022_0.pt  hf_model_0025_0.pt  hf_model_0028_0.pt
+```
+
+Notice that you have `adapter_0.pt`, which stores the weights for the LoRA adapter.
+
+
+## 5. Evaluate Llama3 model with lm-evaluation harness
+
+In this section, you will evaluate the finetuned Llama models. `torchtune` makes use of [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) to conduct various benchmarks.
+
+### Basic Concepts and Relevant Configurations
+
+**The lm-evaluation-harness** is a tool designed to evaluate large language models (LLMs) on various natural language processing (NLP) tasks, ensuring objective and reproducible results. It supports multiple tasks, including text classification, question answering, and commonsense reasoning, allowing for comprehensive model evaluation. The tool provides detailed metrics such as F1-score, accuracy, balanced accuracy, and Matthews correlation coefficient (MCC) for specific tasks. Users can also evaluate models on custom datasets by setting up the environment and creating new task files. The primary goal is to standardize evaluations, making results comparable across different models and implementations. This is crucial for validating new techniques and approaches in LLM development.
+
+We specify the evaluation task in the [config](./configs/evaluate_llama3.yaml) as below:
+
+```yaml
+# EleutherAI specific eval args
+tasks: ["truthfulqa_mc2"]
+```
+
+In this default setting, we use the TruthfulQA benchmark in MC2 mode.
+
+**TruthfulQA** is a benchmark designed to evaluate the truthfulness of language models in generating answers to questions. It consists of 817 questions across 38 topics, including health, law, finance, and politics, specifically targeting common misconceptions that humans might incorrectly answer due to false beliefs or misinformation. TruthfulQA features two evaluation modes: MC1 and MC2. In MC2 (Multi-true), given a question and multiple true/false reference answers, the score is the normalized total probability assigned to the set of true answers. For more details, please refer to [the project repository](https://github.com/sylinrl/TruthfulQA).
+
+You can submit a sample evaluation job with:
+
+
+```bash
+sbatch evaluate.sbatch
+```
+
+## 6. Quantization
+
+In a production setting, it is often not feasible to deploy a large model as-is; serving it efficiently typically requires reducing its memory footprint, for example through quantization.
+
+`torchao` is a PyTorch library focused on architecture optimization, specifically targeting quantization and sparsity to enhance model performance.
+* __Quantization__: Provides tools to reduce model size and improve inference speed, with settings to optimize performance using PyTorch's torch.compile function. 
+* __Sparsity__: Supports the identification and utilization of sparse subnetworks within models, leading to more efficient computations by reducing active parameters. Tools like WeightNormSparsifier are included to facilitate this process.
+
+In this example, we use the quantization feature of `torchao`. In the config file:
+
+```yaml
+quantizer:
+  _component_: torchtune.utils.quantization.Int4WeightOnlyQuantizer
+  groupsize: 256
+```
+
+`Int4WeightOnlyQuantizer` performs per-axis group quantization, which means it quantizes weights in groups rather than individually. By adjusting the `groupsize`, one can control the trade-off between compression ratio and accuracy. Smaller group sizes typically lead to higher accuracy but lower compression, while larger group sizes achieve higher compression at the potential cost of accuracy.
+
+Submit the quantization job with:
+
+```bash
+sbatch quentize.sbatch
+```
+
+Once the job starts, the log shows the command being executed:
+
+```bash
+Executing following command:
+torchtune run quantize --config /fsx/ubuntu/awsome-distributed-training/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/quantize.yaml tokenizer.path=/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B/original/tokenizer.model checkpointer.checkpoint_dir=/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B-tuned checkpointer.output_dir=/fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B-quantized
+```
+
+The resulting quantized weights are saved as follows:
+
+```bash
+0: 2024-05-31:02:10:46,964 DEBUG [seed.py:60] Setting manual seed to local seed 1234. Local seed is seed + rank = 1234 + 0
+0: 2024-05-31:02:18:17,728 INFO [quantize.py:90] Model is initialized with precision torch.bfloat16.
+0: 2024-05-31:02:20:33,576 INFO [quantize.py:98] Time for quantization: 133.08 sec
+0: 2024-05-31:02:20:33,577 INFO [quantize.py:99] Memory used: 40.03 GB
+0: 2024-05-31:02:21:18,609 INFO [quantize.py:112] Model checkpoint of size 37.94 GB saved to /fsx/ubuntu/models/torchtune/meta-llama/Meta-Llama-3-70B-quantized/hf_model_0001_0-4w.pt
+```
+
+
+## 7. Generation
+
+Now that you have a production-ready quantized model, this last step tests text generation using the model.
+
+```bash
+sbatch 7.generate.sbatch --config configs/generate_llama3.yaml --prompt "Hello, my name is"
+```
+
+```
+[generate.py:122] Hello, my name is Sarah and I am a busy working mum of two young children, living in the North East of England.
+...
+[generate.py:135] Time for inference: 10.88 sec total, 18.94 tokens/sec +[generate.py:138] Bandwidth achieved: 346.09 GB/s +[generate.py:139] Memory used: 18.31 GB +``` \ No newline at end of file diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/evaluate.yaml b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/evaluate.yaml new file mode 100644 index 00000000..28c1b3c0 --- /dev/null +++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/evaluate.yaml @@ -0,0 +1,66 @@ +# Config for EleutherEvalRecipe in eleuther_eval.py +# +# To launch, run the following command from root torchtune directory: +# tune run eleuther_eval --config eleuther_evaluation tasks=["truthfulqa_mc2","hellaswag"] + +# Model Arguments +model: + _component_: torchtune.models.llama3.llama3_70b + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: None + checkpoint_files: [ + hf_model_0001_0.pt, + hf_model_0002_0.pt, + hf_model_0003_0.pt, + hf_model_0004_0.pt, + hf_model_0005_0.pt, + hf_model_0006_0.pt, + hf_model_0007_0.pt, + hf_model_0007_0.pt, + hf_model_0008_0.pt, + hf_model_0009_0.pt, + hf_model_0010_0.pt, + hf_model_0011_0.pt, + hf_model_0012_0.pt, + hf_model_0013_0.pt, + hf_model_0014_0.pt, + hf_model_0015_0.pt, + hf_model_0016_0.pt, + hf_model_0017_0.pt, + hf_model_0018_0.pt, + hf_model_0019_0.pt, + hf_model_0020_0.pt, + hf_model_0021_0.pt, + hf_model_0022_0.pt, + hf_model_0023_0.pt, + hf_model_0024_0.pt, + hf_model_0025_0.pt, + hf_model_0026_0.pt, + hf_model_0027_0.pt, + hf_model_0028_0.pt, + hf_model_0029_0.pt, + hf_model_0030_0.pt, + ] + output_dir: /tmp/Llama3-70b-hf + model_type: LLAMA3 + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /fsx/models/torchtitan-torchtune/meta-llama/Meta-Llama-3-70B/original/tokenizer.model + +# Environment +device: cpu +dtype: bf16 +seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed + +# EleutherAI specific eval args +tasks: ["truthfulqa_mc2"] +limit: null +max_seq_length: 4096 +batch_size: 8 + +# Quantization specific args +quantizer: null \ No newline at end of file diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/full_finetune_distributed.yaml b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/full_finetune_distributed.yaml new file mode 100644 index 00000000..4d40e4d9 --- /dev/null +++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/full_finetune_distributed.yaml @@ -0,0 +1,108 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Llama3 8B Instruct model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-8B-Instruct --output-dir /tmp/Meta-Llama-3-8B-Instruct --hf-token +# +# To launch on 4 devices, run the following command from root: +# tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. 
It's +# best to use 8B_full_single_device.yaml for those cases + + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model + +# Dataset +dataset: + _component_: torchtune.datasets.wikitext_dataset +seed: null +shuffle: False + +# Model Arguments +model: + _component_: torchtune.models.llama3.llama3_70b + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3-70B-Instruct + checkpoint_files: [ + model-00001-of-00030.safetensors, + model-00002-of-00030.safetensors, + model-00003-of-00030.safetensors, + model-00004-of-00030.safetensors, + model-00005-of-00030.safetensors, + model-00006-of-00030.safetensors, + model-00007-of-00030.safetensors, + model-00008-of-00030.safetensors, + model-00009-of-00030.safetensors, + model-00010-of-00030.safetensors, + model-00011-of-00030.safetensors, + model-00012-of-00030.safetensors, + model-00013-of-00030.safetensors, + model-00014-of-00030.safetensors, + model-00015-of-00030.safetensors, + model-00016-of-00030.safetensors, + model-00017-of-00030.safetensors, + model-00018-of-00030.safetensors, + model-00019-of-00030.safetensors, + model-00020-of-00030.safetensors, + model-00021-of-00030.safetensors, + model-00022-of-00030.safetensors, + model-00023-of-00030.safetensors, + model-00024-of-00030.safetensors, + model-00025-of-00030.safetensors, + model-00026-of-00030.safetensors, + model-00027-of-00030.safetensors, + model-00028-of-00030.safetensors, + model-00029-of-00030.safetensors, + model-00030-of-00030.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3-70B-Instruct + model_type: LLAMA3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 1 +epochs: 1 + +optimizer: + _component_: torch.optim.AdamW + lr: 2e-5 + foreach: False + +loss: + _component_: torch.nn.CrossEntropyLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 + + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True +memory_efficient_fsdp_wrap: True +fsdp_cpu_offload: True + +# Reduced precision +dtype: bf16 + +# Logging +output_dir: None +metric_logger: + _component_: torchtune.utils.metric_logging.WandBLogger + log_dir: None +log_every_n_steps: 1 +log_peak_memory_stats: False \ No newline at end of file diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/generate_llama3.yaml b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/generate_llama3.yaml new file mode 100644 index 00000000..18ad40e8 --- /dev/null +++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/generate_llama3.yaml @@ -0,0 +1,66 @@ +# Config for running the InferenceRecipe in generate.py to generate output from an LLM +# +# To launch, run the following command from root torchtune directory: +# tune run generate --config generation + +# Model arguments +model: + _component_: torchtune.models.llama3.llama3_70b + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: ${MODEL_PATH}/${HF_MODEL} + checkpoint_files: [ + model-00001-of-00030.safetensors, + model-00002-of-00030.safetensors, + model-00003-of-00030.safetensors, + model-00004-of-00030.safetensors, + model-00005-of-00030.safetensors, + model-00006-of-00030.safetensors, + model-00007-of-00030.safetensors, + model-00008-of-00030.safetensors, + model-00009-of-00030.safetensors, + model-00010-of-00030.safetensors, + 
model-00011-of-00030.safetensors, + model-00012-of-00030.safetensors, + model-00013-of-00030.safetensors, + model-00014-of-00030.safetensors, + model-00015-of-00030.safetensors, + model-00016-of-00030.safetensors, + model-00017-of-00030.safetensors, + model-00018-of-00030.safetensors, + model-00019-of-00030.safetensors, + model-00020-of-00030.safetensors, + model-00021-of-00030.safetensors, + model-00022-of-00030.safetensors, + model-00023-of-00030.safetensors, + model-00024-of-00030.safetensors, + model-00025-of-00030.safetensors, + model-00026-of-00030.safetensors, + model-00027-of-00030.safetensors, + model-00028-of-00030.safetensors, + model-00029-of-00030.safetensors, + model-00030-of-00030.safetensors, + ] + recipe_checkpoint: null + output_dir: ${MODEL_PATH}/${HF_MODEL}-quantized + model_type: LLAMA3 + +device: cuda +dtype: bf16 + +seed: 1234 + +# Tokenizer arguments +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: ${MODEL_PATH}/${HF_MODEL}/original/tokenizer.model + + +# Generation arguments; defaults taken from gpt-fast +prompt: "Hello, my name is" +max_new_tokens: 300 +temperature: 0.6 # 0.8 and 0.6 are popular values to try +top_k: 300 + +quantizer: null diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/lora_finetune_distributed-alignment.yaml b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/lora_finetune_distributed-alignment.yaml new file mode 100644 index 00000000..0ace5e1b --- /dev/null +++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/lora_finetune_distributed-alignment.yaml @@ -0,0 +1,101 @@ +# Config for multi-device LoRA in lora_finetune_distributed.py +# using a Llama3 70B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-70B-Instruct --hf-token --output-dir /tmp/Meta-Llama-3-70b --ignore-patterns "original/consolidated*" +# +# This config needs 8 GPUs to run +# # tune run --nproc_per_node 8 lora_finetune_distributed --config recipes/configs/llama3/70B_lora.yaml +# + +# Model Arguments +model: + _component_: torchtune.models.llama3.lora_llama3_70b + lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 16 + lora_alpha: 32 + +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: None + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: None + checkpoint_files: [ + model-00001-of-00030.safetensors, + model-00002-of-00030.safetensors, + model-00003-of-00030.safetensors, + model-00004-of-00030.safetensors, + model-00005-of-00030.safetensors, + model-00006-of-00030.safetensors, + model-00007-of-00030.safetensors, + model-00008-of-00030.safetensors, + model-00009-of-00030.safetensors, + model-00010-of-00030.safetensors, + model-00011-of-00030.safetensors, + model-00012-of-00030.safetensors, + model-00013-of-00030.safetensors, + model-00014-of-00030.safetensors, + model-00015-of-00030.safetensors, + model-00016-of-00030.safetensors, + model-00017-of-00030.safetensors, + model-00018-of-00030.safetensors, + model-00019-of-00030.safetensors, + model-00020-of-00030.safetensors, + model-00021-of-00030.safetensors, + model-00022-of-00030.safetensors, + model-00023-of-00030.safetensors, + model-00024-of-00030.safetensors, + model-00025-of-00030.safetensors, + model-00026-of-00030.safetensors, + model-00027-of-00030.safetensors, + 
model-00028-of-00030.safetensors, + model-00029-of-00030.safetensors, + model-00030-of-00030.safetensors, + ] + recipe_checkpoint: null + output_dir: None + model_type: LLAMA3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_dataset + train_on_input: True +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 1 + +# Logging +output_dir: None +metric_logger: + _component_: torchtune.utils.metric_logging.WandBLogger + log_dir: None +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/lora_finetune_distributed.yaml b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/lora_finetune_distributed.yaml new file mode 100644 index 00000000..0ace5e1b --- /dev/null +++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/lora_finetune_distributed.yaml @@ -0,0 +1,101 @@ +# Config for multi-device LoRA in lora_finetune_distributed.py +# using a Llama3 70B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-70B-Instruct --hf-token --output-dir /tmp/Meta-Llama-3-70b --ignore-patterns "original/consolidated*" +# +# This config needs 8 GPUs to run +# # tune run --nproc_per_node 8 lora_finetune_distributed --config recipes/configs/llama3/70B_lora.yaml +# + +# Model Arguments +model: + _component_: torchtune.models.llama3.lora_llama3_70b + lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 16 + lora_alpha: 32 + +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: None + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: None + checkpoint_files: [ + model-00001-of-00030.safetensors, + model-00002-of-00030.safetensors, + model-00003-of-00030.safetensors, + model-00004-of-00030.safetensors, + model-00005-of-00030.safetensors, + model-00006-of-00030.safetensors, + model-00007-of-00030.safetensors, + model-00008-of-00030.safetensors, + model-00009-of-00030.safetensors, + model-00010-of-00030.safetensors, + model-00011-of-00030.safetensors, + model-00012-of-00030.safetensors, + model-00013-of-00030.safetensors, + model-00014-of-00030.safetensors, + model-00015-of-00030.safetensors, + model-00016-of-00030.safetensors, + model-00017-of-00030.safetensors, + model-00018-of-00030.safetensors, + model-00019-of-00030.safetensors, + model-00020-of-00030.safetensors, + model-00021-of-00030.safetensors, + model-00022-of-00030.safetensors, + model-00023-of-00030.safetensors, + model-00024-of-00030.safetensors, + model-00025-of-00030.safetensors, + model-00026-of-00030.safetensors, + model-00027-of-00030.safetensors, + model-00028-of-00030.safetensors, + model-00029-of-00030.safetensors, + model-00030-of-00030.safetensors, + ] + recipe_checkpoint: null + output_dir: None + model_type: LLAMA3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_dataset + train_on_input: 
True +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 1 + +# Logging +output_dir: None +metric_logger: + _component_: torchtune.utils.metric_logging.WandBLogger + log_dir: None +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/pretrain_llama3_70b.toml b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/pretrain_llama3_70b.toml new file mode 100644 index 00000000..5325d88f --- /dev/null +++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/pretrain_llama3_70b.toml @@ -0,0 +1,53 @@ +# torchtitan Config.toml +# NOTE: this toml config is a preset for 16 H100 GPUs. +# TODO: find out how to set FSDP full-shard + +[job] +dump_folder = "${MODEL_PATH}/torchtitan-outputs" +description = "Llama 3 70B training" + +[profiling] +enable_profiling = true +save_traces_folder = "profile_trace" +profile_freq = 100 + +[metrics] +log_freq = 10 +enable_tensorboard = true +save_tb_folder = "${MODEL_PATH}/torchtitan-tb" + +[model] +name = "llama3" +flavor = "70B" +norm_type = "rmsnorm" # [layernorm, np_layernorm, rmsnorm, fused_rmsnorm] +tokenizer_path = "${MODEL_PATH}/${HF_MODEL}/tokenizer.model" + +[optimizer] +name = "AdamW" +lr = 1.5e-4 + +[training] +batch_size = 1 +seq_len = 8192 +warmup_steps = 200 # lr scheduler warm up, normally 20% of the train steps +max_norm = 1.0 # grad norm clipping +steps = 1000 +data_parallel_degree = -1 +tensor_parallel_degree = 8 # 8-way TP +pipeline_parallel_degree = 1 +fp8_linear = "" +compile = false +dataset = "c4" +reshard_after_forward = true + +[checkpoint] +enable_checkpoint = true +folder = "${MODEL_PATH}/torchtitan-checkpoint" +interval_type = "steps" +interval = 500 +model_weights_only = false +export_dtype = "float32" +async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"] + +[activation_checkpoint] +mode = 'full' diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/quantize.yaml b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/quantize.yaml new file mode 100644 index 00000000..1060a081 --- /dev/null +++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/quantize.yaml @@ -0,0 +1,57 @@ +# Config for QuantizationRecipe in quantize.py +# +# To launch, run the following command from root torchtune directory: +# tune run quantize --config quantization + +# +# Model arguments +model: + _component_: torchtune.models.llama3.llama3_70b + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: ${MODEL_PATH}/${HF_MODEL} + checkpoint_files: [ + hf_model_0001_0.pt, + hf_model_0002_0.pt, + hf_model_0003_0.pt, + hf_model_0004_0.pt, + hf_model_0005_0.pt, + hf_model_0006_0.pt, + hf_model_0007_0.pt, + hf_model_0007_0.pt, + hf_model_0008_0.pt, + hf_model_0009_0.pt, + hf_model_0010_0.pt, + hf_model_0011_0.pt, + hf_model_0012_0.pt, + hf_model_0013_0.pt, + hf_model_0014_0.pt, + hf_model_0015_0.pt, + hf_model_0016_0.pt, + hf_model_0017_0.pt, + hf_model_0018_0.pt, + hf_model_0019_0.pt, + hf_model_0020_0.pt, + 
hf_model_0021_0.pt, + hf_model_0022_0.pt, + hf_model_0023_0.pt, + hf_model_0024_0.pt, + hf_model_0025_0.pt, + hf_model_0026_0.pt, + hf_model_0027_0.pt, + hf_model_0028_0.pt, + hf_model_0029_0.pt, + hf_model_0030_0.pt, + ] + recipe_checkpoint: null + output_dir: ${MODEL_PATH}/${HF_MODEL}-quantized + model_type: LLAMA3 + +device: cpu +dtype: bf16 +seed: 1234 + +quantizer: + _component_: torchtune.utils.quantization.Int4WeightOnlyQuantizer + groupsize: 256 diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/docs/lora.png b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/docs/lora.png new file mode 100644 index 00000000..d319efd2 Binary files /dev/null and b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/docs/lora.png differ diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/evaluate.sbatch b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/evaluate.sbatch new file mode 100644 index 00000000..8c564cb4 --- /dev/null +++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/evaluate.sbatch @@ -0,0 +1,82 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=eleuther_eval +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --gpus-per-node=8 # Number of GPU per node +#SBATCH --output=logs/%x_%j.out # logfile for stdout +#SBATCH --error=logs/%x_%j.err # logfile for stderr, remove it to merge both outputs +#SBATCH --wait-all-nodes=1 +#SBATCH --exclusive +set -euxo pipefail + +################################################################## +########### Check current working directory ###################### +################################################################## +if [ $(basename $(pwd)) != "slurm" ] +then + echo "Please run this script from the slurm directory" + exit 1 +fi +################################################################## +############# Load environment variables ######################### +################################################################## +# Load environment variables +if [ ! 
-f .env ] +then + echo "Please create a .env file with the required environment variables" + exit 1 +else + source .env +fi + +################################################################## +######### Define EFA/NCCL/Slurm environment variables ############ +################################################################## +## EFA settings +export FI_LOG_LEVEL=1 +export FI_PROVIDER=efa # change to eth if you want to use ENA for comparisons +export FI_EFA_USE_HUGE_PAGE=0 +# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352 +# https://github.com/pytorch/pytorch/issues/68893 +export NCCL_SOCKET_IFNAME=en +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_DEBUG=INFO +export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` +export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) +export NODES_ARRAY=($NODES) +export HEAD_NODE=${NODES_ARRAY[0]} +export MASTER_ADDR=$(hostname --ip-address) +export MASTER_PORT=$RANDOM +export NNODES=$SLURM_JOB_NUM_NODES +export NPROC=$SLURM_GPUS_PER_NODE +export WORLD_SIZE=$(( $NNODES * $NPROC )) + +################################################################## +############# Set training arguments ############################# +################################################################## +export HF_MODEL="meta-llama/Meta-Llama-3-70B" +: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}" +declare -a SRUN_ARGS=( + --container-image $ENROOT_IMAGE + --container-mounts $CONTAINER_MOUNT +) +declare -a TRAIN_ARGS=( + --config ${PWD}/tutorials/e2e-llama3-70b-development/configs/evaluate.yaml + tokenizer.path=${MODEL_PATH}/${HF_MODEL}/original/tokenizer.model + checkpointer.checkpoint_dir=${MODEL_PATH}/${HF_MODEL}-tuned +) +################################################################## +################# Run torchtune ################################## +################################################################## +export PYTHONPATH=${PWD}/torchtune +export TORCHTUNE=${PWD}/torchtune/torchtune/_cli/tune.py +export TORCHTUNE_COMMAND="eleuther_eval" +echo "Executing following command:" +echo "tune" "run" "${TORCHTUNE_COMMAND}" "${TRAIN_ARGS[@]}" +srun -l "${SRUN_ARGS[@]}" python ${TORCHTUNE} run "${TORCHTUNE_COMMAND}" "${TRAIN_ARGS[@]}" diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/full_finetune_distributed.sbatch b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/full_finetune_distributed.sbatch new file mode 100644 index 00000000..33e5bcd0 --- /dev/null +++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/full_finetune_distributed.sbatch @@ -0,0 +1,95 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=full-finetuning +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --gpus-per-node=8 # Number of GPU per node +#SBATCH --output=logs/%x_%j.out # logfile for stdout +#SBATCH --error=logs/%x_%j.err # logfile for stderr, remove it to merge both outputs +#SBATCH --wait-all-nodes=1 +#SBATCH --exclusive +set -euxo pipefail + +################################################################## +########### Check current working directory ###################### +################################################################## +if [ $(basename $(pwd)) != "slurm" ] +then + echo "Please run this script from the slurm directory" + exit 1 +fi +################################################################## +############# Load environment variables ######################### +################################################################## +# Load environment variables +if [ ! -f .env ] +then + echo "Please create a .env file with the required environment variables" + exit 1 +else + source .env +fi + +################################################################## +######### Define EFA/NCCL/Slurm environment variables ############ +################################################################## +## EFA settings +export FI_LOG_LEVEL=1 +export FI_PROVIDER=efa # change to eth if you want to use ENA for comparisons +export FI_EFA_USE_HUGE_PAGE=0 +# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352 +# https://github.com/pytorch/pytorch/issues/68893 +export NCCL_SOCKET_IFNAME=en +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_DEBUG=INFO +export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` +export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) +export NODES_ARRAY=($NODES) +export HEAD_NODE=${NODES_ARRAY[0]} +export MASTER_ADDR=$(hostname --ip-address) +export MASTER_PORT=$RANDOM +export NNODES=$SLURM_JOB_NUM_NODES +export NPROC=$SLURM_GPUS_PER_NODE +export WORLD_SIZE=$(( $NNODES * $NPROC )) + +################################################################## +############# Set training arguments ############################# +################################################################## +export HF_MODEL="meta-llama/Meta-Llama-3-70B" +: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}" +declare -a SRUN_ARGS=( + --container-image $ENROOT_IMAGE + --container-mounts $CONTAINER_MOUNT +) +declare -a TORCHRUN_ARGS=( + # change this to match the number of gpus per node: + --master_addr $MASTER_ADDR + --master_port $RANDOM + --nproc_per_node=8 + --nnodes $NNODES + --nnodes=$SLURM_JOB_NUM_NODES + --rdzv_backend=c10d + --rdzv_endpoint=$(hostname) +) +declare -a TRAIN_ARGS=( + --config ${PWD}/tutorials/e2e-llama3-70b-development/configs/full_finetune_distributed.yaml + tokenizer.path=${MODEL_PATH}/${HF_MODEL}/original/tokenizer.model + checkpointer.checkpoint_dir=${MODEL_PATH}/${HF_MODEL} + checkpointer.output_dir=${MODEL_PATH}/${HF_MODEL}-tuned + output_dir=${MODEL_PATH}/${HF_MODEL}-tuned/log + metric_logger.log_dir=${MODEL_PATH}/${HF_MODEL}-tuned/log/metrics +) +################################################################## +################# Run torchtune ################################## +################################################################## +export PYTHONPATH=${PWD}/torchtune +export 
TORCHTUNE=${PWD}/torchtune/torchtune/_cli/tune.py +export TORCHTUNE_COMMAND="full_finetune_distributed" +echo "Executing following command:" +echo "tune" "run" "${TORCHRUN_ARGS[@]}" "${TORCHTUNE_COMMAND}" "${TORCHTUNE_ARGS[@]}" "${TRAIN_ARGS[@]}" +srun -l "${SRUN_ARGS[@]}" python ${TORCHTUNE} run "${TORCHRUN_ARGS[@]}" "${TORCHTUNE_COMMAND}" "${TRAIN_ARGS[@]}" diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/generate.sbatch b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/generate.sbatch new file mode 100644 index 00000000..f0306e75 --- /dev/null +++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/generate.sbatch @@ -0,0 +1,75 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=generate +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --gpus-per-node=8 # Number of GPU per node +#SBATCH --output=logs/%x_%j.out # logfile for stdout +#SBATCH --error=logs/%x_%j.err # logfile for stderr, remove it to merge both outputs +#SBATCH --wait-all-nodes=1 +#SBATCH --exclusive +set -euxo pipefail + +################################################################## +############# Load environment variables ######################### +################################################################## +# Load environment variables +if [ ! -f .env ] +then + echo "Please create a .env file with the required environment variables" + exit 1 +else + source .env +fi + +################################################################## +######### Define EFA/NCCL/Slurm environment variables ############ +################################################################## +## EFA settings +export FI_LOG_LEVEL=1 +export FI_PROVIDER=efa # change to eth if you want to use ENA for comparisons +export FI_EFA_USE_HUGE_PAGE=0 +# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352 +# https://github.com/pytorch/pytorch/issues/68893 +export NCCL_SOCKET_IFNAME=en +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_DEBUG=INFO +export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` +export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) +export NODES_ARRAY=($NODES) +export HEAD_NODE=${NODES_ARRAY[0]} +export MASTER_ADDR=$(hostname --ip-address) +export MASTER_PORT=$RANDOM +export NNODES=$SLURM_JOB_NUM_NODES +export NPROC=$SLURM_GPUS_PER_NODE +export WORLD_SIZE=$(( $NNODES * $NPROC )) + +################################################################## +############### Create train config ############################## +################################################################## +if [ ! 
-d ${FSX_PATH}/tmp ]; then + mkdir -p ${FSX_PATH}/tmp +fi +cat ${PWD}/train_configs/generate_llama3.yaml | envsubst > ${FSX_PATH}/tmp/generate_llama3.yaml +################################################################## +################# Set arguments ################################## +################################################################## +: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}" +declare -a SRUN_ARGS=( + --container-image $ENROOT_IMAGE + --container-mounts $CONTAINER_MOUNT +) +declare -a TUNE_ARGS=( + --config ${FSX_PATH}/tmp/generate_llama3.yaml +) + +export TORCHTUNE=${PWD}/torchtune/torchtune/_cli/tune.py +export PYTHONPATH=${PWD}/torchtune + +#srun -l "${SRUN_ARGS[@]}" python ${TORCHTUNE} cp generation /fsx/tmp/generate_llama3.yaml +srun -l "${SRUN_ARGS[@]}" python ${TORCHTUNE} run generate "${TUNE_ARGS[@]}" diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/lora_finetune_distributed.sbatch b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/lora_finetune_distributed.sbatch new file mode 100644 index 00000000..256046b2 --- /dev/null +++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/lora_finetune_distributed.sbatch @@ -0,0 +1,95 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=lora-finetuning +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --gpus-per-node=8 # Number of GPU per node +#SBATCH --output=logs/%x_%j.out # logfile for stdout +#SBATCH --error=logs/%x_%j.err # logfile for stderr, remove it to merge both outputs +#SBATCH --wait-all-nodes=1 +#SBATCH --exclusive +set -euxo pipefail + +################################################################## +########### Check current working directory ###################### +################################################################## +if [ $(basename $(pwd)) != "slurm" ] +then + echo "Please run this script from the slurm directory" + exit 1 +fi +################################################################## +############# Load environment variables ######################### +################################################################## +# Load environment variables +if [ ! 
-f .env ] +then + echo "Please create a .env file with the required environment variables" + exit 1 +else + source .env +fi + +################################################################## +######### Define EFA/NCCL/Slurm environment variables ############ +################################################################## +## EFA settings +export FI_LOG_LEVEL=1 +export FI_PROVIDER=efa # change to eth if you want to use ENA for comparisons +export FI_EFA_USE_HUGE_PAGE=0 +# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352 +# https://github.com/pytorch/pytorch/issues/68893 +export NCCL_SOCKET_IFNAME=en +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_DEBUG=INFO +export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` +export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) +export NODES_ARRAY=($NODES) +export HEAD_NODE=${NODES_ARRAY[0]} +export MASTER_ADDR=$(hostname --ip-address) +export MASTER_PORT=$RANDOM +export NNODES=$SLURM_JOB_NUM_NODES +export NPROC=$SLURM_GPUS_PER_NODE +export WORLD_SIZE=$(( $NNODES * $NPROC )) + +################################################################## +############# Set training arguments ############################# +################################################################## +export HF_MODEL="meta-llama/Meta-Llama-3-70B" +: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}" +declare -a SRUN_ARGS=( + --container-image $ENROOT_IMAGE + --container-mounts $CONTAINER_MOUNT +) +declare -a TORCHRUN_ARGS=( + # change this to match the number of gpus per node: + --master_addr $MASTER_ADDR + --master_port $RANDOM + --nproc_per_node=8 + --nnodes $NNODES + --nnodes=$SLURM_JOB_NUM_NODES + --rdzv_backend=c10d + --rdzv_endpoint=$(hostname) +) +declare -a TRAIN_ARGS=( + --config ${PWD}/tutorials/e2e-llama3-70b-development/configs/lora_finetune_distributed.yaml + tokenizer.path=${MODEL_PATH}/${HF_MODEL}/original/tokenizer.model + checkpointer.checkpoint_dir=${MODEL_PATH}/${HF_MODEL} + checkpointer.output_dir=${MODEL_PATH}/${HF_MODEL}-tuned + output_dir=${MODEL_PATH}/${HF_MODEL}-tuned/log + metric_logger.log_dir=${MODEL_PATH}/${HF_MODEL}-tuned/log/metrics +) +################################################################## +################# Run torchtune ################################## +################################################################## +export PYTHONPATH=${PWD}/torchtune +export TORCHTUNE=${PWD}/torchtune/torchtune/_cli/tune.py +export TORCHTUNE_COMMAND="lora_finetune_distributed" +echo "Executing following command:" +echo "torchtune" "run" "${TORCHRUN_ARGS[@]}" "${TORCHTUNE_COMMAND}" "${TORCHTUNE_ARGS[@]}" +srun -l "${SRUN_ARGS[@]}" python ${TORCHTUNE} run "${TORCHRUN_ARGS[@]}" "${TORCHTUNE_COMMAND}" "${TRAIN_ARGS[@]}" diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/quantize.sbatch b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/quantize.sbatch new file mode 100644 index 00000000..c094e87b --- /dev/null +++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/quantize.sbatch @@ -0,0 +1,83 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=quantize +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --gpus-per-node=8 # Number of GPU per node +#SBATCH --output=logs/%x_%j.out # logfile for stdout +#SBATCH --error=logs/%x_%j.err # logfile for stderr, remove it to merge both outputs +#SBATCH --wait-all-nodes=1 +#SBATCH --exclusive +set -euxo pipefail + +################################################################## +########### Check current working directory ###################### +################################################################## +if [ $(basename $(pwd)) != "slurm" ] +then + echo "Please run this script from the slurm directory" + exit 1 +fi +################################################################## +############# Load environment variables ######################### +################################################################## +# Load environment variables +if [ ! -f .env ] +then + echo "Please create a .env file with the required environment variables" + exit 1 +else + source .env +fi + +################################################################## +######### Define EFA/NCCL/Slurm environment variables ############ +################################################################## +## EFA settings +export FI_LOG_LEVEL=1 +export FI_PROVIDER=efa # change to eth if you want to use ENA for comparisons +export FI_EFA_USE_HUGE_PAGE=0 +# https://discuss.pytorch.org/t/nccl-network-is-unreachable-connection-refused-when-initializing-ddp/137352 +# https://github.com/pytorch/pytorch/issues/68893 +export NCCL_SOCKET_IFNAME=en +export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 +export NCCL_DEBUG=INFO +export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` +export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) +export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` +export NODES=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) +export NODES_ARRAY=($NODES) +export HEAD_NODE=${NODES_ARRAY[0]} +export MASTER_ADDR=$(hostname --ip-address) +export MASTER_PORT=$RANDOM +export NNODES=$SLURM_JOB_NUM_NODES +export NPROC=$SLURM_GPUS_PER_NODE +export WORLD_SIZE=$(( $NNODES * $NPROC )) + +################################################################## +############# Set training arguments ############################# +################################################################## +export HF_MODEL="meta-llama/Meta-Llama-3-70B" +: "${CONTAINER_MOUNT:=$FSX_PATH:$FSX_PATH}" +declare -a SRUN_ARGS=( + --container-image $ENROOT_IMAGE + --container-mounts $CONTAINER_MOUNT +) +declare -a TRAIN_ARGS=( + --config ${PWD}/tutorials/e2e-llama3-70b-development/configs/quantize.yaml + tokenizer.path=${MODEL_PATH}/${HF_MODEL}/original/tokenizer.model + checkpointer.checkpoint_dir=${MODEL_PATH}/${HF_MODEL}-tuned + checkpointer.output_dir=${MODEL_PATH}/${HF_MODEL}-quantized +) +################################################################## +################# Run torchtune ################################## +################################################################## +export PYTHONPATH=${PWD}/torchtune +export TORCHTUNE=${PWD}/torchtune/torchtune/_cli/tune.py +export TORCHTUNE_COMMAND="quantize" +echo "Executing following command:" +echo "torchtune" "run" "${TORCHTUNE_COMMAND}" "${TRAIN_ARGS[@]}" +srun -l "${SRUN_ARGS[@]}" python ${TORCHTUNE} run "${TORCHTUNE_COMMAND}" "${TRAIN_ARGS[@]}"