From 96fd763cab1813c1ad2502a73bcd96855a462141 Mon Sep 17 00:00:00 2001
From: Aswinmcw
Date: Wed, 27 Nov 2024 10:24:07 +0000
Subject: [PATCH] Add reduce scatter to pipeline

---
 .../workflows/tg-model-perf-tests-impl.yaml |  4 +--
 tests/scripts/run_tests.sh                  |  3 +-
 .../operations/ccl/perf/test_ccl_perf.py    |  2 +-
 .../ccl/test_reduce_scatter_TG_nightly.py   | 32 ++++++-------------
 4 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/tg-model-perf-tests-impl.yaml b/.github/workflows/tg-model-perf-tests-impl.yaml
index 8565fc8f93f2..304cfa616d4a 100644
--- a/.github/workflows/tg-model-perf-tests-impl.yaml
+++ b/.github/workflows/tg-model-perf-tests-impl.yaml
@@ -23,9 +23,9 @@ jobs:
           runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"],
           cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tg_device --dispatch-mode ""'
         },
-        { name: "t3k CCL all_gather perf tests",
+        { name: "t3k CCL perf tests",
           arch: wormhole_b0,
-          cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type ccl_all_gather_perf_tg_device --dispatch-mode ""',
+          cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type ccl_perf_tg_device --dispatch-mode ""',
           timeout: 75,
           tracy: true,
           runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"],
diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh
index f71fdb7f39f0..4eae8141a2ed 100755
--- a/tests/scripts/run_tests.sh
+++ b/tests/scripts/run_tests.sh
@@ -321,8 +321,9 @@ run_pipeline_tests() {
         demos_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
     elif [[ $pipeline_type == *"model_perf_tg_device" ]]; then
         model_perf_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
-    elif [[ $pipeline_type == "ccl_all_gather_perf_tg_device" ]]; then
+    elif [[ $pipeline_type == "ccl_perf_tg_device" ]]; then
         ./tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t tg
+        ./tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh -t tg
     # TGG pipelines
     elif [[ $pipeline_type == "unit_tgg_device" ]]; then
         unit_tgg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
index 01df9cbf6f62..54b31b9119cd 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
@@ -365,7 +365,7 @@ def test_all_gather_on_tg(
 @pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
 @pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum])
 @pytest.mark.parametrize("device_params", [{"trace_region_size": 10281600}], indirect=True)
-def test_line_reduce_scatter_on_TG_rows_post_commit(
+def test_reduce_scatter_on_tg(
     mesh_device,
     num_devices,
     per_chip_output_shape,
diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py
index 5dc6a377e9f4..128907ce82cf 100644
--- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py
+++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_TG_nightly.py
@@ -49,8 +49,9 @@ def run_with_trace(
     mesh_device,
     all_gather_topology,
     input_tensor,
-    scatter_dim,
+    dim,
     num_links,
+    math_op,
     cluster_axis,
     output_mem_config,
     n_worker=None,
@@ -60,8 +61,8 @@
     # Compile Run
     logger.info("Compiling model")
     tt_out_tensor = ttnn.reduce_scatter(
-        ttnn_tensor,
-        scatter_dim=scatter_dim,
+        input_tensor,
+        dim=dim,
         cluster_axis=cluster_axis,
         mesh_device=mesh_device,
         math_op=math_op,
@@ -77,8 +78,8 @@
     trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
     for i in range(num_iter):
         tt_out_tensor = ttnn.reduce_scatter(
-            ttnn_tensor,
-            scatter_dim=scatter_dim,
+            input_tensor,
+            dim=dim,
             cluster_axis=cluster_axis,
             mesh_device=mesh_device,
             math_op=math_op,
@@ -198,22 +199,9 @@ def run_line_reduce_scatter_on_TG_with_mesh_tensor_along_rows(
     )
     ttnn_tensor = ttnn.to_device(ttnn_tensor, mesh_device)
-    # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor)
-    ttnn_tensor_out = ttnn.reduce_scatter(
-        ttnn_tensor,
-        dim=dim,
-        cluster_axis=cluster_axis,
-        mesh_device=mesh_device,
-        math_op=math_op,
-        num_links=num_links,
-        memory_config=output_mem_config,
-        topology=ttnn.Topology.Linear,
-    )
-    trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
-    # ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor)
-    for _ in range(num_iters):
-        ttnn_tensor_out = ttnn.reduce_scatter(
-            ttnn_tensor,
+    if trace_mode:
+        ttnn_tensor_out = run_with_trace(
+            input_tensor=ttnn_tensor,
             dim=dim,
             cluster_axis=cluster_axis,
             mesh_device=mesh_device,
             math_op=math_op,
             num_links=num_links,
             memory_config=output_mem_config,
             topology=ttnn.Topology.Linear,
         )
@@ -227,7 +215,7 @@
         for _ in range(num_iters):
             ttnn_tensor_out = ttnn.reduce_scatter(
                 ttnn_tensor,
-                scatter_dim=dim,
+                dim=dim,
                 cluster_axis=cluster_axis,
                 mesh_device=mesh_device,
                 math_op=math_op,
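
A minimal usage sketch, assuming a TG wormhole_b0 machine with this repository checked out: the renamed pipeline can be exercised locally with the same commands the patch wires into run_tests.sh and the workflow file above.

    # Drive the combined CCL perf pipeline through the test runner (new pipeline-type name)
    ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type ccl_perf_tg_device --dispatch-mode ""

    # Or invoke the two profiling scripts the pipeline now dispatches to, one after the other
    ./tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t tg
    ./tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh -t tg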