From a36fd88a1e382c39fbe4dcb226306e64ab4fbc05 Mon Sep 17 00:00:00 2001
From: Aswin Zayasankaran <156493059+Aswinmcw@users.noreply.github.com>
Date: Tue, 26 Nov 2024 17:20:06 +0530
Subject: [PATCH] Add all gather perf to pipeline for TG (#15001)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Ticket
#14406

Adds all-gather perf tests to the TG model perf pipeline.

https://github.com/tenstorrent/tt-metal/actions/runs/12006139751

### Checklist
- [ ] Post commit CI passes
- [ ] Blackhole Post commit (if applicable)
- [ ] Model regression CI testing passes (if applicable)
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] New/Existing tests provide coverage for changes
---
 .github/workflows/pipeline-select-galaxy.yaml |  1 +
 .../workflows/tg-model-perf-tests-impl.yaml   | 56 +++++++++++--
 .github/workflows/tg-model-perf-tests.yaml    |  5 +-
 tests/scripts/run_tests.sh                    |  2 +
 .../operations/ccl/perf/perf_csv.py           |  6 +-
 .../ccl/perf/run_all_gather_profile.sh        |  6 +-
 .../operations/ccl/perf/test_ccl_perf.py      | 66 ++++++++++++++++
 .../ccl/test_all_gather_TG_post_commit.py     | 79 +++++++++++++++++--
 8 files changed, 200 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/pipeline-select-galaxy.yaml b/.github/workflows/pipeline-select-galaxy.yaml
index 5e947a69bad..69e09c900f7 100644
--- a/.github/workflows/pipeline-select-galaxy.yaml
+++ b/.github/workflows/pipeline-select-galaxy.yaml
@@ -37,6 +37,7 @@ on:
         type: boolean
         default: false
       tg-model-perf:
+        description: "TG model perf tests (requires tracy build)"
         required: false
         type: boolean
         default: false
diff --git a/.github/workflows/tg-model-perf-tests-impl.yaml b/.github/workflows/tg-model-perf-tests-impl.yaml
index 300f16f8e14..8565fc8f93f 100644
--- a/.github/workflows/tg-model-perf-tests-impl.yaml
+++ b/.github/workflows/tg-model-perf-tests-impl.yaml
@@ -23,6 +23,13 @@ jobs:
            runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"],
            cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tg_device --dispatch-mode ""'
          },
+         { name: "tg CCL all_gather perf tests",
+           arch: wormhole_b0,
+           cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type ccl_all_gather_perf_tg_device --dispatch-mode ""',
+           timeout: 75,
+           tracy: true,
+           runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"],
+           owner_id: ULMEPM2MA}, # Sean Nijjar
        ]
     name: ${{ matrix.test-group.name }}
     env:
@@ -41,9 +48,10 @@ jobs:
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
          echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
-     - uses: actions/download-artifact@v4
+     - name: Download profiler build artifact
+       uses: actions/download-artifact@v4
        with:
-         name: TTMetal_build_${{ matrix.test-group.arch }}
+         name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
      - name: Extract files
        run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
      - uses: ./.github/actions/install-python-deps
@@ -58,16 +66,48 @@ jobs:
        id: check-perf-report
        if: ${{ !cancelled() }}
        run: |
-         ls -hal
-         export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv
-         ls -hal $PERF_REPORT_FILENAME
-         echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
-     - name: Upload perf report
-       if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
+         TODAY=$(date +%Y_%m_%d)
+         PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
+         PERF_REPORT_FILENAME_CCL_ALL_GATHER="CCL_all_gather_Perf_${TODAY}.csv"
+         PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER="CCL_reduce_scatter_Perf_${TODAY}.csv"
+         if [ "${{ matrix.test-group.tracy }}" == "true" ]; then
+           found_reports=false
+           if [ -f "$PERF_REPORT_FILENAME_CCL_ALL_GATHER" ]; then
+             echo "Found CCL AllGather Perf report: $PERF_REPORT_FILENAME_CCL_ALL_GATHER"
+             echo "perf_report_filename_all_gather=$PERF_REPORT_FILENAME_CCL_ALL_GATHER" >> "$GITHUB_OUTPUT"
+             found_reports=true
+           fi
+           if [ -f "$PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER" ]; then
+             echo "Found CCL ReduceScatter Perf report: $PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER"
+             echo "perf_report_filename_reduce_scatter=$PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER" >> "$GITHUB_OUTPUT"
+             found_reports=true
+           fi
+           if [ "$found_reports" = false ]; then
+             echo "No CCL perf report found for today."
+             exit 1
+           fi
+         else
+           if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
+             echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
+             echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
+           else
+             echo "No Models perf report found for today."
+             exit 1
+           fi
+         fi
+     - name: Upload Models perf report
+       if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' && !matrix.test-group.tracy }}
        uses: actions/upload-artifact@v4
        with:
          name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.machine-type }}
          path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
+     - name: Upload CCL perf report
+       if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' && matrix.test-group.tracy }}
+       uses: actions/upload-artifact@v4
+       with:
+         name: perf-report-csv-ccl-all-gather-${{ matrix.test-group.arch }}-bare-metal
+         path:
+           ${{ steps.check-perf-report.outputs.perf_report_filename_all_gather }}
      - name: Disable performance mode
        if: always()
        run: |
diff --git a/.github/workflows/tg-model-perf-tests.yaml b/.github/workflows/tg-model-perf-tests.yaml
index a813b763602..4202cc46ad3 100644
--- a/.github/workflows/tg-model-perf-tests.yaml
+++ b/.github/workflows/tg-model-perf-tests.yaml
@@ -6,12 +6,13 @@ on:
     - cron: "0 */12 * * *" # This cron schedule runs the workflow every 12 hours

 jobs:
-  build-artifact:
+  build-artifact-profiler:
     uses: ./.github/workflows/build-artifact.yaml
     with:
       arch: '["wormhole_b0"]'
+      tracy: true
     secrets: inherit
   tg-model-perf-tests:
-    needs: build-artifact
+    needs: build-artifact-profiler
     secrets: inherit
     uses: ./.github/workflows/tg-model-perf-tests-impl.yaml
diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh
index 6662f2f7b2c..f71fdb7f39f 100755
--- a/tests/scripts/run_tests.sh
+++ b/tests/scripts/run_tests.sh
@@ -321,6 +321,8 @@ run_pipeline_tests() {
         demos_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
     elif [[ $pipeline_type == *"model_perf_tg_device" ]]; then
         model_perf_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
+    elif [[ $pipeline_type == "ccl_all_gather_perf_tg_device" ]]; then
+        ./tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t tg
     # TGG pipelines
     elif [[ $pipeline_type == "unit_tgg_device" ]]; then
         unit_tgg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
index e955c09feaf..b1cdb0b2782 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
@@ -180,10 +180,12 @@ def calculate_bandwidth(row):
group_df.rename(columns={"INPUT_0_LAYOUT": "Layout", "INPUT_0_DATATYPE": "Data Type"}, inplace=True) group_df["Input Shape"] = group_df.apply( - lambda row: f"[{row['INPUT_0_W']}, {row['INPUT_0_Z']}, {row['INPUT_0_Y']}, {row['INPUT_0_X']}]", axis=1 + lambda row: f"[{int(row['INPUT_0_W'])}, {int(row['INPUT_0_Z'])}, {int(row['INPUT_0_Y'])}, {int(row['INPUT_0_X'])}]", + axis=1, ) group_df["Output Shape"] = group_df.apply( - lambda row: f"[{row['OUTPUT_0_W']}, {row['OUTPUT_0_Z']}, {row['OUTPUT_0_Y']}, {row['OUTPUT_0_X']}]", axis=1 + lambda row: f"[{int(row['OUTPUT_0_W'])}, {int(row['OUTPUT_0_Z'])}, {int(row['OUTPUT_0_Y'])}, {int(row['OUTPUT_0_X'])}]", + axis=1, ) group_df["Cycles Count"] = group_df["DEVICE FW END CYCLE"] - group_df["DEVICE FW START CYCLE"] group_df[["Op BW [GB/s]", "Link BW [GB/s]"]] = group_df.apply(calculate_bandwidth, axis=1, result_type="expand") diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh index 1d6bedd49ac..9b80d6dbc86 100755 --- a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh +++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh @@ -11,7 +11,7 @@ show_help() { echo echo "Options:" echo " -d, --debug Enable debug mode to show real-time output." - echo " -t, --target Specify the target configuration (t3000 or n300). Default is n300." + echo " -t, --target Specify the target configuration (t3000 or n300 or tg). Default is n300." echo " -h, --help Display this help message." echo echo "Example:" @@ -42,8 +42,8 @@ while [ $# -gt 0 ]; do shift 2 # Validate the target value - if [ "$TARGET" != "t3000" ] && [ "$TARGET" != "n300" ]; then - echo "Error: Invalid target configuration: $TARGET. Must be either 't3000' or 'n300'." + if [ "$TARGET" != "t3000" ] && [ "$TARGET" != "tg" ] && [ "$TARGET" != "n300" ]; then + echo "Error: Invalid target configuration: $TARGET. Must be 't3000' or 'n300' or 'tg'." 
        exit 1
      fi
      ;;
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
index 800d25befb8..0a729b88f84 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
@@ -12,6 +12,9 @@
 from tests.ttnn.unit_tests.operations.ccl.test_reduce_scatter_post_commit import (
     run_reduce_scatter_test,
 )
+from tests.ttnn.unit_tests.operations.ccl.test_all_gather_TG_post_commit import (
+    run_line_all_gather_on_TG_with_mesh_tensor_along_rows,
+)


 @skip_for_grayskull("Requires eth connected devices to run")
@@ -266,3 +269,66 @@ def test_reduce_scatter_on_n300(
         enable_async=enable_async,
         trace_mode=True,
     )
+
+
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.parametrize(
+    "num_devices, num_links, per_chip_output_shape, dim, layout",
+    [
+        (4, 3, [4, 1, 32, 1280], 0, ttnn.TILE_LAYOUT),
+        (4, 3, [1, 1, 32, 16384 * 4], 3, ttnn.TILE_LAYOUT),
+        (4, 3, [1, 4, 32, 6656], 1, ttnn.TILE_LAYOUT),
+    ],
+)
+@pytest.mark.parametrize(
+    "input_dtype",
+    [
+        ttnn.bfloat16,
+        ttnn.bfloat8_b,
+    ],
+)
+@pytest.mark.parametrize(
+    "buffer_type",
+    [
+        ttnn.BufferType.DRAM,
+        ttnn.BufferType.L1,
+    ],
+)
+@pytest.mark.parametrize("replication_factor", [8])
+@pytest.mark.parametrize("num_iters", [20])
+@pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 532480}], indirect=True)
+def test_all_gather_on_tg(
+    mesh_device,
+    num_devices,
+    per_chip_output_shape,
+    dim,
+    num_links,
+    input_dtype,
+    layout,
+    buffer_type,
+    use_program_cache,
+    function_level_defaults,
+    enable_async,
+    replication_factor,
+    num_iters,
+):
+    run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
+        mesh_device,
+        num_devices,
+        per_chip_output_shape,
+        ttnn.TensorMemoryLayout.INTERLEAVED,
+        dim,
+        num_links,
+        input_dtype,
+        layout,
+        buffer_type,
+        use_program_cache,
+        function_level_defaults,
+        enable_async=enable_async,
+        num_iters=num_iters,
+        num_all_gather_instances=replication_factor,
+        cluster_axis=1,
+        trace_mode=True,
+    )
diff --git a/tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py b/tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py
index dd416923e2c..03222c33b8f 100644
--- a/tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py
+++ b/tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py
@@ -45,6 +45,59 @@ def print_tile_corners_of_tensor(t):
         print(f"{str_vals}")


+def run_with_trace(
+    mesh_device,
+    all_gather_topology,
+    input_tensor,
+    dim,
+    num_links,
+    cluster_axis,
+    output_mem_config,
+    n_worker=None,
+    n_buffer=None,
+    num_iter=20,
+):
+    # Compile Run
+    logger.info("Compiling model")
+    tt_out_tensor = ttnn.all_gather(
+        input_tensor,
+        dim=dim,
+        cluster_axis=cluster_axis,
+        mesh_device=mesh_device,
+        num_links=num_links,
+        memory_config=output_mem_config,
+        topology=all_gather_topology,
+    )
+    for d in mesh_device.get_devices():
+        ttnn.synchronize_device(d)
+
+    # Capture trace
+    logger.info("Capturing trace")
+    trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
+    for i in range(num_iter):
+        tt_out_tensor = ttnn.all_gather(
+            input_tensor,
+            dim=dim,
+            cluster_axis=cluster_axis,
+            mesh_device=mesh_device,
+            num_links=num_links,
+            memory_config=output_mem_config,
+            topology=all_gather_topology,
+        )
+    ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0)
+    for d in mesh_device.get_devices():
+        ttnn.synchronize_device(d)
+
+    # Run the op
+    logger.info("Starting Trace perf test...")
+    ttnn.execute_trace(mesh_device, trace_id, blocking=False)
+    ttnn.release_trace(mesh_device, trace_id)
+    for d in mesh_device.get_devices():
+        ttnn.synchronize_device(d)
+
+    return tt_out_tensor
+
+
 def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
     mesh_device,
     num_devices_per_line,
@@ -63,6 +116,8 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
     num_iters: int = 1,
     cluster_axis: int = 0,
     tile=(32, 32),
+    trace_mode=False,
+    debug=False,
 ):
     if len(mesh_device.get_devices()) != 32:
         pytest.skip("Not TG!")
@@ -120,16 +175,28 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
     ttnn_tensor = ttnn.to_device(ttnn_tensor, mesh_device)
     # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor)

-    for _ in range(num_iters):
-        ttnn_tensor_out = ttnn.all_gather(
-            ttnn_tensor,
+    if trace_mode:
+        ttnn_tensor_out = run_with_trace(
+            input_tensor=ttnn_tensor,
             dim=dim,
             cluster_axis=cluster_axis,
             mesh_device=mesh_device,
             num_links=num_links,
-            memory_config=output_mem_config,
-            topology=ttnn.Topology.Linear,
+            output_mem_config=output_mem_config,
+            all_gather_topology=ttnn.Topology.Linear,
+            num_iter=num_iters,
         )
+    else:
+        for _ in range(num_iters):
+            ttnn_tensor_out = ttnn.all_gather(
+                ttnn_tensor,
+                dim=dim,
+                cluster_axis=cluster_axis,
+                mesh_device=mesh_device,
+                num_links=num_links,
+                memory_config=output_mem_config,
+                topology=ttnn.Topology.Linear,
+            )

     # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor_out)
     tt_output_tensor = ttnn.to_torch(
@@ -150,7 +217,7 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
         if not eq and debug is True:
             logger.error(f"found mismatches")
             report_mismatches(tt_output_tensor, output_golden, 100)
-            print_tile_corners_of_tensor(output_tensor)
+            print_tile_corners_of_tensor(tt_output_tensor)
         else:
             eq, output = comp_pcc(tt_output_tensor, output_golden)
         if not eq:
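
### Notes

To reproduce the new pipeline step locally, the entry point added to `run_tests.sh` reduces to a single script call, so on a TG machine with a profiler (tracy) build either of these should work: `./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type ccl_all_gather_perf_tg_device --dispatch-mode ""`, or directly `./tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t tg`.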
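The `run_with_trace` helper added to `test_all_gather_TG_post_commit.py` follows the usual ttnn trace recipe: one eager launch to compile and cache the program, `num_iter` launches recorded into a trace, then a replay of the trace so the profiled iterations carry no per-iteration host dispatch overhead. Below is a condensed sketch of that skeleton, using only ttnn calls that appear in the diff; `my_op` is a hypothetical stand-in for the bound `ttnn.all_gather` call:

```python
import ttnn


def profile_with_trace(mesh_device, my_op, num_iter=20):
    # Compile run: the first launch builds and caches the program.
    out = my_op()
    for d in mesh_device.get_devices():
        ttnn.synchronize_device(d)

    # Record num_iter back-to-back launches into a trace on command queue 0.
    trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
    for _ in range(num_iter):
        out = my_op()
    ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0)
    for d in mesh_device.get_devices():
        ttnn.synchronize_device(d)

    # Replay the captured launches without re-dispatching from the host.
    ttnn.execute_trace(mesh_device, trace_id, blocking=False)
    ttnn.release_trace(mesh_device, trace_id)
    for d in mesh_device.get_devices():
        ttnn.synchronize_device(d)
    return out
```

This is also why `test_all_gather_on_tg` parametrizes `device_params` with `{"trace_region_size": 532480}`: the recorded command stream has to fit in the device's trace region.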
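The `int()` casts added in `perf_csv.py` matter because the dimension columns come out of pandas as floats (CSV parsing plus aggregation promotes them), so the shape strings would otherwise render as `[1.0, 1.0, 32.0, 1280.0]`. A minimal self-contained sketch with toy data; the column names mirror the patch, the values are illustrative only:

```python
import pandas as pd

# Dimension columns read back from the profiler CSV surface as floats.
df = pd.DataFrame({"INPUT_0_W": [1.0], "INPUT_0_Z": [1.0], "INPUT_0_Y": [32.0], "INPUT_0_X": [1280.0]})

without_cast = df.apply(
    lambda row: f"[{row['INPUT_0_W']}, {row['INPUT_0_Z']}, {row['INPUT_0_Y']}, {row['INPUT_0_X']}]",
    axis=1,
)
with_cast = df.apply(
    lambda row: f"[{int(row['INPUT_0_W'])}, {int(row['INPUT_0_Z'])}, {int(row['INPUT_0_Y'])}, {int(row['INPUT_0_X'])}]",
    axis=1,
)
print(without_cast[0])  # [1.0, 1.0, 32.0, 1280.0]
print(with_cast[0])     # [1, 1, 32, 1280]
```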