From 1350f8ae6ae854713d705c0fd2d39261f821936b Mon Sep 17 00:00:00 2001
From: Anh Uong
Date: Thu, 26 Sep 2024 10:51:05 -0600
Subject: [PATCH 1/2] build: install additional fms-acceleration plugins (#350)

* deps: add fms-acceleration fast kernels and padding plugins

Signed-off-by: Anh Uong

* docs: using fms-acceleration flags as json

Signed-off-by: Anh Uong

* remove experimental from padding free and multipack

Signed-off-by: Anh Uong

* remove experimental from readme

Signed-off-by: Anh Uong

---------

Signed-off-by: Anh Uong
---
 README.md                            | 15 +++++++++++++--
 build/Dockerfile                     |  5 +++++
 .../acceleration_framework_config.py |  4 ++--
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 22e0f4312..9a509106a 100644
--- a/README.md
+++ b/README.md
@@ -647,10 +647,10 @@ The list of configurations for various `fms_acceleration` plugins:
 - [quantized_lora_config](./tuning/config/acceleration_configs/quantized_lora_config.py): For quantized 4bit LoRA training
     - `--auto_gptq`: 4bit GPTQ-LoRA with AutoGPTQ
     - `--bnb_qlora`: 4bit QLoRA with bitsandbytes
-- [fused_ops_and_kernels](./tuning/config/acceleration_configs/fused_ops_and_kernels.py) (experimental):
+- [fused_ops_and_kernels](./tuning/config/acceleration_configs/fused_ops_and_kernels.py):
     - `--fused_lora`: fused lora for more efficient LoRA training.
     - `--fast_kernels`: fast cross-entropy, rope, rms loss kernels.
-- [attention_and_distributed_packing](./tuning/config/acceleration_configs/attention_and_distributed_packing.py) (experimental):
+- [attention_and_distributed_packing](./tuning/config/acceleration_configs/attention_and_distributed_packing.py):
     - `--padding_free`: technique to process multiple examples in single batch without adding padding tokens that waste compute.
     - `--multipack`: technique for *multi-gpu training* to balance out number of tokens processed in each device, to minimize waiting time.
 
@@ -663,6 +663,7 @@ Notes:
     - pass `--fast_kernels True True True` for full finetuning/LoRA
     - pass `--fast_kernels True True True --auto_gptq triton_v2 --fused_lora auto_gptq True` for GPTQ-LoRA
     - pass `--fast_kernels True True True --bitsandbytes nf4 --fused_lora bitsandbytes True` for QLoRA
+    - Note the list of supported models [here](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/fused-ops-and-kernels/README.md#supported-models).
 * Notes on Padding Free
     - works for both *single* and *multi-gpu*.
     - works on both *pretokenized* and *untokenized* datasets
@@ -671,6 +672,16 @@ Notes:
     - works only for *multi-gpu*.
     - currently only includes the version of *multipack* optimized for linear attention implementations like *flash-attn*.
 
+Note: To pass the above flags via a JSON config, each of the flags expects the value to be a mixed type list, so the values must be a list. For example:
+```json
+{
+    "fast_kernels": [true, true, true],
+    "padding_free": ["huggingface"],
+    "multipack": [16],
+    "auto_gptq": ["triton_v2"]
+}
+```
+
 Activate `TRANSFORMERS_VERBOSITY=info` to see the huggingface trainer printouts and verify that `AccelerationFramework` is activated!
 ```

diff --git a/build/Dockerfile b/build/Dockerfile
index ffae818da..507dd4f52 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -137,9 +137,14 @@ RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
     python -m pip install --user "$(head bdist_name)" && \
     python -m pip install --user "$(head bdist_name)[flash-attn]"
 
+# fms_acceleration_peft = PEFT-training, e.g., 4bit QLoRA
+# fms_acceleration_foak = Fused LoRA and triton kernels
+# fms_acceleration_aadp = Padding-Free Flash Attention Computation
 RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
     python -m pip install --user "$(head bdist_name)[fms-accel]"; \
     python -m fms_acceleration.cli install fms_acceleration_peft; \
+    python -m fms_acceleration.cli install fms_acceleration_foak; \
+    python -m fms_acceleration.cli install fms_acceleration_aadp; \
     fi
 
 RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
diff --git a/tuning/config/acceleration_configs/acceleration_framework_config.py b/tuning/config/acceleration_configs/acceleration_framework_config.py
index 46fbe6b03..76fef1a78 100644
--- a/tuning/config/acceleration_configs/acceleration_framework_config.py
+++ b/tuning/config/acceleration_configs/acceleration_framework_config.py
@@ -103,7 +103,7 @@ class AccelerationFrameworkConfig:
         PaddingFree,
         ConfigAnnotation(
             path="training.attention",
-            experimental=True,
+            experimental=False,
             required_packages=["aadp"],
         ),
     ] = None
@@ -112,7 +112,7 @@ class AccelerationFrameworkConfig:
         MultiPack,
         ConfigAnnotation(
             path="training.dataloader",
-            experimental=True,
+            experimental=False,
             required_packages=["aadp"],
         ),
     ] = None

From 0c6a0628ef17fac513a297f084adaa3a61117e0d Mon Sep 17 00:00:00 2001
From: Abhishek Maurya <124327945+Abhishek-TAMU@users.noreply.github.com>
Date: Thu, 26 Sep 2024 13:31:27 -0400
Subject: [PATCH 2/2] fix: unable to find output_dir in multi-GPU during resume_from_checkpoint check (#352)

* fix: output_dir doesn't exist during resume_from_checkpoint

Signed-off-by: Abhishek

* fix: fmt

Signed-off-by: Abhishek

---------

Signed-off-by: Abhishek
Signed-off-by: Anh Uong
---
 build/accelerate_launch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py
index 0a0cfa755..6cbc7d252 100644
--- a/build/accelerate_launch.py
+++ b/build/accelerate_launch.py
@@ -98,6 +98,8 @@ def main():
     # ##########
 
     output_dir = job_config.get("output_dir")
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
     try:
         # checkpoints outputted to tempdir, only final checkpoint copied to output dir
         launch_command(args)
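
For context on the JSON note added to the README in PATCH 1/2, the sketch below shows one way such a job config could be assembled in Python, with every fms-acceleration flag value given as a list. It is illustrative only and not part of either patch; the model name, data path, and output file name are assumptions, and `output_dir` is simply a directory that `accelerate_launch.py` now creates if it is missing (PATCH 2/2).

```python
# Illustrative sketch (not part of these patches): write a job config JSON in
# which each fms-acceleration flag value is a mixed-type list, as described in
# the README note from PATCH 1/2. Paths and the model name are assumptions.
import json

job_config = {
    "model_name_or_path": "/models/my-model",   # assumption: any supported model
    "training_data_path": "/data/train.jsonl",  # assumption
    "output_dir": "/data/output",               # created by accelerate_launch.py if missing (PATCH 2/2)
    # fms-acceleration flags: values must be lists, even when conceptually scalar
    "fast_kernels": [True, True, True],
    "padding_free": ["huggingface"],
    "multipack": [16],
}

with open("job_config.json", "w") as f:
    json.dump(job_config, f, indent=2)  # True/16 serialize to JSON true/16 as in the README example
```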