From a36fd88a1e382c39fbe4dcb226306e64ab4fbc05 Mon Sep 17 00:00:00 2001
From: Aswin Zayasankaran <156493059+Aswinmcw@users.noreply.github.com>
Date: Tue, 26 Nov 2024 17:20:06 +0530
Subject: [PATCH] Add all gather perf to pipeline for TG (#15001)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Ticket
#14406

Adds all-gather perf tests to the TG model perf pipeline.

https://github.com/tenstorrent/tt-metal/actions/runs/12006139751

### Checklist
- [ ] Post commit CI passes
- [ ] Blackhole Post commit (if applicable)
- [ ] Model regression CI testing passes (if applicable)
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] New/Existing tests provide coverage for changes
---
 .github/workflows/pipeline-select-galaxy.yaml |  1 +
 .../workflows/tg-model-perf-tests-impl.yaml   | 56 +++++++++++--
 .github/workflows/tg-model-perf-tests.yaml    |  5 +-
 tests/scripts/run_tests.sh                    |  2 +
 .../operations/ccl/perf/perf_csv.py           |  6 +-
 .../ccl/perf/run_all_gather_profile.sh        |  6 +-
 .../operations/ccl/perf/test_ccl_perf.py      | 66 ++++++++++++++++
 .../ccl/test_all_gather_TG_post_commit.py     | 79 +++++++++++++++++--
 8 files changed, 200 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/pipeline-select-galaxy.yaml b/.github/workflows/pipeline-select-galaxy.yaml
index 5e947a69bad..69e09c900f7 100644
--- a/.github/workflows/pipeline-select-galaxy.yaml
+++ b/.github/workflows/pipeline-select-galaxy.yaml
@@ -37,6 +37,7 @@ on:
         type: boolean
         default: false
       tg-model-perf:
+        description: "TG model perf tests (requires tracy build)"
         required: false
         type: boolean
         default: false
diff --git a/.github/workflows/tg-model-perf-tests-impl.yaml b/.github/workflows/tg-model-perf-tests-impl.yaml
index 300f16f8e14..8565fc8f93f 100644
--- a/.github/workflows/tg-model-perf-tests-impl.yaml
+++ b/.github/workflows/tg-model-perf-tests-impl.yaml
@@ -23,6 +23,13 @@ jobs:
            runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"],
            cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tg_device --dispatch-mode ""'
          },
+         { name: "tg CCL all_gather perf tests",
+           arch: wormhole_b0,
+           cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type ccl_all_gather_perf_tg_device --dispatch-mode ""',
+           timeout: 75,
+           tracy: true,
+           runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"],
+           owner_id: ULMEPM2MA}, # Sean Nijjar
        ]
     name: ${{ matrix.test-group.name }}
     env:
@@ -41,9 +48,10 @@ jobs:
        run: |
          echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
          echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
-     - uses: actions/download-artifact@v4
+     - name: Download profiler build artifact
+       uses: actions/download-artifact@v4
        with:
-         name: TTMetal_build_${{ matrix.test-group.arch }}
+         name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
      - name: Extract files
        run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
      - uses: ./.github/actions/install-python-deps
@@ -58,16 +66,48 @@ jobs:
        id: check-perf-report
        if: ${{ !cancelled() }}
        run: |
-         ls -hal
-         export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv
-         ls -hal $PERF_REPORT_FILENAME
-         echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
-     - name: Upload perf report
-       if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
+         TODAY=$(date +%Y_%m_%d)
+         PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
+         PERF_REPORT_FILENAME_CCL_ALL_GATHER="CCL_all_gather_Perf_${TODAY}.csv"
+         PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER="CCL_reduce_scatter_Perf_${TODAY}.csv"
+         if [ "${{ matrix.test-group.tracy }}" == "true" ]; then
+           found_reports=false
+           if [ -f "$PERF_REPORT_FILENAME_CCL_ALL_GATHER" ]; then
+             echo "Found CCL AllGather Perf report: $PERF_REPORT_FILENAME_CCL_ALL_GATHER"
+             echo "perf_report_filename_all_gather=$PERF_REPORT_FILENAME_CCL_ALL_GATHER" >> "$GITHUB_OUTPUT"
+             found_reports=true
+           fi
+           if [ -f "$PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER" ]; then
+             echo "Found CCL ReduceScatter Perf report: $PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER"
+             echo "perf_report_filename_reduce_scatter=$PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER" >> "$GITHUB_OUTPUT"
+             found_reports=true
+           fi
+           if [ "$found_reports" = false ]; then
+             echo "No CCL perf report found for today."
+             exit 1
+           fi
+         else
+           if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
+             echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
+             echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
+           else
+             echo "No Models perf report found for today."
+             exit 1
+           fi
+         fi
+     - name: Upload Models perf report
+       if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' && !matrix.test-group.tracy }}
        uses: actions/upload-artifact@v4
        with:
          name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.machine-type }}
          path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
+     - name: Upload CCL perf report
+       if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' && matrix.test-group.tracy }}
+       uses: actions/upload-artifact@v4
+       with:
+         name: perf-report-csv-ccl-all-gather-${{ matrix.test-group.arch }}-bare-metal
+         path:
+           ${{ steps.check-perf-report.outputs.perf_report_filename_all_gather }}
      - name: Disable performance mode
        if: always()
        run: |
diff --git a/.github/workflows/tg-model-perf-tests.yaml b/.github/workflows/tg-model-perf-tests.yaml
index a813b763602..4202cc46ad3 100644
--- a/.github/workflows/tg-model-perf-tests.yaml
+++ b/.github/workflows/tg-model-perf-tests.yaml
@@ -6,12 +6,13 @@ on:
     - cron: "0 */12 * * *" # This cron schedule runs the workflow every 12 hours

 jobs:
-  build-artifact:
+  build-artifact-profiler:
     uses: ./.github/workflows/build-artifact.yaml
     with:
       arch: '["wormhole_b0"]'
+      tracy: true
     secrets: inherit
   tg-model-perf-tests:
-    needs: build-artifact
+    needs: build-artifact-profiler
     secrets: inherit
     uses: ./.github/workflows/tg-model-perf-tests-impl.yaml
diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh
index 6662f2f7b2c..f71fdb7f39f 100755
--- a/tests/scripts/run_tests.sh
+++ b/tests/scripts/run_tests.sh
@@ -321,6 +321,8 @@ run_pipeline_tests() {
         demos_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
     elif [[ $pipeline_type == *"model_perf_tg_device" ]]; then
         model_perf_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
+    elif [[ $pipeline_type == "ccl_all_gather_perf_tg_device" ]]; then
+        ./tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t tg
     # TGG pipelines
     elif [[ $pipeline_type == "unit_tgg_device" ]]; then
         unit_tgg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
index e955c09feaf..b1cdb0b2782 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
@@ -180,10 +180,12 @@ def calculate_bandwidth(row):
group_df.rename(columns={"INPUT_0_LAYOUT": "Layout", "INPUT_0_DATATYPE": "Data Type"}, inplace=True) group_df["Input Shape"] = group_df.apply( - lambda row: f"[{row['INPUT_0_W']}, {row['INPUT_0_Z']}, {row['INPUT_0_Y']}, {row['INPUT_0_X']}]", axis=1 + lambda row: f"[{int(row['INPUT_0_W'])}, {int(row['INPUT_0_Z'])}, {int(row['INPUT_0_Y'])}, {int(row['INPUT_0_X'])}]", + axis=1, ) group_df["Output Shape"] = group_df.apply( - lambda row: f"[{row['OUTPUT_0_W']}, {row['OUTPUT_0_Z']}, {row['OUTPUT_0_Y']}, {row['OUTPUT_0_X']}]", axis=1 + lambda row: f"[{int(row['OUTPUT_0_W'])}, {int(row['OUTPUT_0_Z'])}, {int(row['OUTPUT_0_Y'])}, {int(row['OUTPUT_0_X'])}]", + axis=1, ) group_df["Cycles Count"] = group_df["DEVICE FW END CYCLE"] - group_df["DEVICE FW START CYCLE"] group_df[["Op BW [GB/s]", "Link BW [GB/s]"]] = group_df.apply(calculate_bandwidth, axis=1, result_type="expand") diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh index 1d6bedd49ac..9b80d6dbc86 100755 --- a/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh +++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh @@ -11,7 +11,7 @@ show_help() { echo echo "Options:" echo " -d, --debug Enable debug mode to show real-time output." - echo " -t, --target Specify the target configuration (t3000 or n300). Default is n300." + echo " -t, --target Specify the target configuration (t3000 or n300 or tg). Default is n300." echo " -h, --help Display this help message." echo echo "Example:" @@ -42,8 +42,8 @@ while [ $# -gt 0 ]; do shift 2 # Validate the target value - if [ "$TARGET" != "t3000" ] && [ "$TARGET" != "n300" ]; then - echo "Error: Invalid target configuration: $TARGET. Must be either 't3000' or 'n300'." + if [ "$TARGET" != "t3000" ] && [ "$TARGET" != "tg" ] && [ "$TARGET" != "n300" ]; then + echo "Error: Invalid target configuration: $TARGET. Must be 't3000' or 'n300' or 'tg'." 
        exit 1
      fi
      ;;
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
index 800d25befb8..0a729b88f84 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
@@ -12,6 +12,9 @@
 from tests.ttnn.unit_tests.operations.ccl.test_reduce_scatter_post_commit import (
     run_reduce_scatter_test,
 )
+from tests.ttnn.unit_tests.operations.ccl.test_all_gather_TG_post_commit import (
+    run_line_all_gather_on_TG_with_mesh_tensor_along_rows,
+)


 @skip_for_grayskull("Requires eth connected devices to run")
@@ -266,3 +269,66 @@ def test_reduce_scatter_on_n300(
         enable_async=enable_async,
         trace_mode=True,
     )
+
+
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.parametrize(
+    "num_devices, num_links, per_chip_output_shape, dim, layout",
+    [
+        (4, 3, [4, 1, 32, 1280], 0, ttnn.TILE_LAYOUT),
+        (4, 3, [1, 1, 32, 16384 * 4], 3, ttnn.TILE_LAYOUT),
+        (4, 3, [1, 4, 32, 6656], 1, ttnn.TILE_LAYOUT),
+    ],
+)
+@pytest.mark.parametrize(
+    "input_dtype",
+    [
+        ttnn.bfloat16,
+        ttnn.bfloat8_b,
+    ],
+)
+@pytest.mark.parametrize(
+    "buffer_type",
+    [
+        ttnn.BufferType.DRAM,
+        ttnn.BufferType.L1,
+    ],
+)
+@pytest.mark.parametrize("replication_factor", [8])
+@pytest.mark.parametrize("num_iters", [20])
+@pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 532480}], indirect=True)
+def test_all_gather_on_tg(
+    mesh_device,
+    num_devices,
+    per_chip_output_shape,
+    dim,
+    num_links,
+    input_dtype,
+    layout,
+    buffer_type,
+    use_program_cache,
+    function_level_defaults,
+    enable_async,
+    replication_factor,
+    num_iters,
+):
+    run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
+        mesh_device,
+        num_devices,
+        per_chip_output_shape,
+        ttnn.TensorMemoryLayout.INTERLEAVED,
+        dim,
+        num_links,
+        input_dtype,
+        layout,
+        buffer_type,
+        use_program_cache,
+        function_level_defaults,
+        enable_async=enable_async,
+        num_iters=num_iters,
+        num_all_gather_instances=replication_factor,
+        cluster_axis=1,
+        trace_mode=True,
+    )
diff --git a/tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py b/tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py
index dd416923e2c..03222c33b8f 100644
--- a/tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py
+++ b/tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py
@@ -45,6 +45,59 @@ def print_tile_corners_of_tensor(t):
         print(f"{str_vals}")


+def run_with_trace(
+    mesh_device,
+    all_gather_topology,
+    input_tensor,
+    dim,
+    num_links,
+    cluster_axis,
+    output_mem_config,
+    n_worker=None,
+    n_buffer=None,
+    num_iter=20,
+):
+    # Compile Run
+    logger.info("Compiling model")
+    tt_out_tensor = ttnn.all_gather(
+        input_tensor,
+        dim=dim,
+        cluster_axis=cluster_axis,
+        mesh_device=mesh_device,
+        num_links=num_links,
+        memory_config=output_mem_config,
+        topology=all_gather_topology,
+    )
+    for d in mesh_device.get_devices():
+        ttnn.synchronize_device(d)
+
+    # Capture trace
+    logger.info("Capturing trace")
+    trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
+    for i in range(num_iter):
+        tt_out_tensor = ttnn.all_gather(
+            input_tensor,
+            dim=dim,
+            cluster_axis=cluster_axis,
+            mesh_device=mesh_device,
+            num_links=num_links,
+            memory_config=output_mem_config,
+            topology=all_gather_topology,
+        )
+    ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0)
+    for d in mesh_device.get_devices():
+        ttnn.synchronize_device(d)
+
+    # Run the op
+    logger.info("Starting Trace perf test...")
+    ttnn.execute_trace(mesh_device, trace_id, blocking=False)
+    ttnn.release_trace(mesh_device, trace_id)
+    for d in mesh_device.get_devices():
+        ttnn.synchronize_device(d)
+
+    return tt_out_tensor
+
+
 def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
     mesh_device,
     num_devices_per_line,
@@ -63,6 +116,8 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
     num_iters: int = 1,
     cluster_axis: int = 0,
     tile=(32, 32),
+    trace_mode=False,
+    debug=False,
 ):
     if len(mesh_device.get_devices()) != 32:
         pytest.skip("Not TG!")
@@ -120,16 +175,28 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
     ttnn_tensor = ttnn.to_device(ttnn_tensor, mesh_device)
     # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor)

-    for _ in range(num_iters):
-        ttnn_tensor_out = ttnn.all_gather(
-            ttnn_tensor,
+    if trace_mode:
+        ttnn_tensor_out = run_with_trace(
+            input_tensor=ttnn_tensor,
             dim=dim,
             cluster_axis=cluster_axis,
             mesh_device=mesh_device,
             num_links=num_links,
-            memory_config=output_mem_config,
-            topology=ttnn.Topology.Linear,
+            output_mem_config=output_mem_config,
+            all_gather_topology=ttnn.Topology.Linear,
+            num_iter=num_iters,
         )
+    else:
+        for _ in range(num_iters):
+            ttnn_tensor_out = ttnn.all_gather(
+                ttnn_tensor,
+                dim=dim,
+                cluster_axis=cluster_axis,
+                mesh_device=mesh_device,
+                num_links=num_links,
+                memory_config=output_mem_config,
+                topology=ttnn.Topology.Linear,
+            )

     # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor_out)
     tt_output_tensor = ttnn.to_torch(
@@ -150,7 +217,7 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
         if not eq and debug is True:
             logger.error(f"found mismatches")
             report_mismatches(tt_output_tensor, output_golden, 100)
-            print_tile_corners_of_tensor(output_tensor)
+            print_tile_corners_of_tensor(tt_output_tensor)
         else:
             eq, output = comp_pcc(tt_output_tensor, output_golden)
         if not eq:
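
### Notes

To reproduce the new pipeline step locally, the entry point added to `run_tests.sh` reduces to a single script call, so on a TG machine with a profiler (tracy) build either of these should work: `./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type ccl_all_gather_perf_tg_device --dispatch-mode ""`, or directly `./tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t tg`.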
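The `run_with_trace` helper added to `test_all_gather_TG_post_commit.py` follows the usual ttnn trace recipe: one eager launch to compile and cache the program, `num_iter` launches recorded into a trace, then a replay of the trace so the profiled iterations carry no per-iteration host dispatch overhead. Below is a condensed sketch of that skeleton, using only ttnn calls that appear in the diff; `my_op` is a hypothetical stand-in for the bound `ttnn.all_gather` call:

```python
import ttnn


def profile_with_trace(mesh_device, my_op, num_iter=20):
    # Compile run: the first launch builds and caches the program.
    out = my_op()
    for d in mesh_device.get_devices():
        ttnn.synchronize_device(d)

    # Record num_iter back-to-back launches into a trace on command queue 0.
    trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
    for _ in range(num_iter):
        out = my_op()
    ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0)
    for d in mesh_device.get_devices():
        ttnn.synchronize_device(d)

    # Replay the captured launches without re-dispatching from the host.
    ttnn.execute_trace(mesh_device, trace_id, blocking=False)
    ttnn.release_trace(mesh_device, trace_id)
    for d in mesh_device.get_devices():
        ttnn.synchronize_device(d)
    return out
```

This is also why `test_all_gather_on_tg` parametrizes `device_params` with `{"trace_region_size": 532480}`: the recorded command stream has to fit in the device's trace region.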
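The `int()` casts added in `perf_csv.py` matter because the dimension columns come out of pandas as floats (CSV parsing plus aggregation promotes them), so the shape strings would otherwise render as `[1.0, 1.0, 32.0, 1280.0]`. A minimal self-contained sketch with toy data; the column names mirror the patch, the values are illustrative only:

```python
import pandas as pd

# Dimension columns read back from the profiler CSV surface as floats.
df = pd.DataFrame({"INPUT_0_W": [1.0], "INPUT_0_Z": [1.0], "INPUT_0_Y": [32.0], "INPUT_0_X": [1280.0]})

without_cast = df.apply(
    lambda row: f"[{row['INPUT_0_W']}, {row['INPUT_0_Z']}, {row['INPUT_0_Y']}, {row['INPUT_0_X']}]",
    axis=1,
)
with_cast = df.apply(
    lambda row: f"[{int(row['INPUT_0_W'])}, {int(row['INPUT_0_Z'])}, {int(row['INPUT_0_Y'])}, {int(row['INPUT_0_X'])}]",
    axis=1,
)
print(without_cast[0])  # [1.0, 1.0, 32.0, 1280.0]
print(with_cast[0])     # [1, 1, 32, 1280]
```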