From 1350f8ae6ae854713d705c0fd2d39261f821936b Mon Sep 17 00:00:00 2001
From: Anh Uong
Date: Thu, 26 Sep 2024 10:51:05 -0600
Subject: [PATCH 1/2] build: install additional fms-acceleration plugins (#350)

* deps: add fms-acceleration fast kernels and padding plugins

Signed-off-by: Anh Uong

* docs: using fms-acceleration flags as json

Signed-off-by: Anh Uong

* remove experimental from padding free and multipack

Signed-off-by: Anh Uong

* remove experimental from readme

Signed-off-by: Anh Uong

---------

Signed-off-by: Anh Uong
---
 README.md                            | 15 +++++++++++++--
 build/Dockerfile                     |  5 +++++
 .../acceleration_framework_config.py |  4 ++--
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 22e0f4312..9a509106a 100644
--- a/README.md
+++ b/README.md
@@ -647,10 +647,10 @@ The list of configurations for various `fms_acceleration` plugins:
 - [quantized_lora_config](./tuning/config/acceleration_configs/quantized_lora_config.py): For quantized 4bit LoRA training
     - `--auto_gptq`: 4bit GPTQ-LoRA with AutoGPTQ
     - `--bnb_qlora`: 4bit QLoRA with bitsandbytes
-- [fused_ops_and_kernels](./tuning/config/acceleration_configs/fused_ops_and_kernels.py) (experimental):
+- [fused_ops_and_kernels](./tuning/config/acceleration_configs/fused_ops_and_kernels.py):
     - `--fused_lora`: fused lora for more efficient LoRA training.
     - `--fast_kernels`: fast cross-entropy, rope, rms loss kernels.
-- [attention_and_distributed_packing](./tuning/config/acceleration_configs/attention_and_distributed_packing.py) (experimental):
+- [attention_and_distributed_packing](./tuning/config/acceleration_configs/attention_and_distributed_packing.py):
     - `--padding_free`: technique to process multiple examples in single batch without adding padding tokens that waste compute.
     - `--multipack`: technique for *multi-gpu training* to balance out number of tokens processed in each device, to minimize waiting time.
 
@@ -663,6 +663,7 @@ Notes:
     - pass `--fast_kernels True True True` for full finetuning/LoRA
     - pass `--fast_kernels True True True --auto_gptq triton_v2 --fused_lora auto_gptq True` for GPTQ-LoRA
     - pass `--fast_kernels True True True --bitsandbytes nf4 --fused_lora bitsandbytes True` for QLoRA
+    - Note the list of supported models [here](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/fused-ops-and-kernels/README.md#supported-models).
 * Notes on Padding Free
     - works for both *single* and *multi-gpu*.
     - works on both *pretokenized* and *untokenized* datasets
@@ -671,6 +672,16 @@ Notes:
     - works only for *multi-gpu*.
     - currently only includes the version of *multipack* optimized for linear attention implementations like *flash-attn*.
 
+Note: To pass the above flags via a JSON config, each of the flags expects the value to be a mixed type list, so the values must be a list. For example:
+```json
+{
+    "fast_kernels": [true, true, true],
+    "padding_free": ["huggingface"],
+    "multipack": [16],
+    "auto_gptq": ["triton_v2"]
+}
+```
+
 Activate `TRANSFORMERS_VERBOSITY=info` to see the huggingface trainer printouts and verify that `AccelerationFramework` is activated!
 ```

diff --git a/build/Dockerfile b/build/Dockerfile
index ffae818da..507dd4f52 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -137,9 +137,14 @@ RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
     python -m pip install --user "$(head bdist_name)" && \
     python -m pip install --user "$(head bdist_name)[flash-attn]"
 
+# fms_acceleration_peft = PEFT-training, e.g., 4bit QLoRA
+# fms_acceleration_foak = Fused LoRA and triton kernels
+# fms_acceleration_aadp = Padding-Free Flash Attention Computation
 RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
     python -m pip install --user "$(head bdist_name)[fms-accel]"; \
     python -m fms_acceleration.cli install fms_acceleration_peft; \
+    python -m fms_acceleration.cli install fms_acceleration_foak; \
+    python -m fms_acceleration.cli install fms_acceleration_aadp; \
     fi
 
 RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
diff --git a/tuning/config/acceleration_configs/acceleration_framework_config.py b/tuning/config/acceleration_configs/acceleration_framework_config.py
index 46fbe6b03..76fef1a78 100644
--- a/tuning/config/acceleration_configs/acceleration_framework_config.py
+++ b/tuning/config/acceleration_configs/acceleration_framework_config.py
@@ -103,7 +103,7 @@ class AccelerationFrameworkConfig:
         PaddingFree,
         ConfigAnnotation(
             path="training.attention",
-            experimental=True,
+            experimental=False,
             required_packages=["aadp"],
         ),
     ] = None
@@ -112,7 +112,7 @@ class AccelerationFrameworkConfig:
         MultiPack,
         ConfigAnnotation(
             path="training.dataloader",
-            experimental=True,
+            experimental=False,
             required_packages=["aadp"],
         ),
     ] = None

From 0c6a0628ef17fac513a297f084adaa3a61117e0d Mon Sep 17 00:00:00 2001
From: Abhishek Maurya <124327945+Abhishek-TAMU@users.noreply.github.com>
Date: Thu, 26 Sep 2024 13:31:27 -0400
Subject: [PATCH 2/2] fix: unable to find output_dir in multi-GPU during resume_from_checkpoint check (#352)

* fix: output_dir doesn't exist during resume_from_checkpoint

Signed-off-by: Abhishek

* fix: fmt

Signed-off-by: Abhishek

---------

Signed-off-by: Abhishek
Signed-off-by: Anh Uong
---
 build/accelerate_launch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py
index 0a0cfa755..6cbc7d252 100644
--- a/build/accelerate_launch.py
+++ b/build/accelerate_launch.py
@@ -98,6 +98,8 @@ def main():
     # ##########
 
     output_dir = job_config.get("output_dir")
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
     try:
         # checkpoints outputted to tempdir, only final checkpoint copied to output dir
         launch_command(args)
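
For context on the JSON note added to the README in PATCH 1/2, the sketch below shows one way such a job config could be assembled in Python, with every fms-acceleration flag value given as a list. It is illustrative only and not part of either patch; the model name, data path, and output file name are assumptions, and `output_dir` is simply a directory that `accelerate_launch.py` now creates if it is missing (PATCH 2/2).

```python
# Illustrative sketch (not part of these patches): write a job config JSON in
# which each fms-acceleration flag value is a mixed-type list, as described in
# the README note from PATCH 1/2. Paths and the model name are assumptions.
import json

job_config = {
    "model_name_or_path": "/models/my-model",   # assumption: any supported model
    "training_data_path": "/data/train.jsonl",  # assumption
    "output_dir": "/data/output",               # created by accelerate_launch.py if missing (PATCH 2/2)
    # fms-acceleration flags: values must be lists, even when conceptually scalar
    "fast_kernels": [True, True, True],
    "padding_free": ["huggingface"],
    "multipack": [16],
}

with open("job_config.json", "w") as f:
    json.dump(job_config, f, indent=2)  # True/16 serialize to JSON true/16 as in the README example
```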