# Add all gather perf to pipeline for TG (#15001)
### Ticket
#14406 

Adds all-gather perf tests to the TG model-perf pipeline.

Example pipeline run: https://github.com/tenstorrent/tt-metal/actions/runs/12006139751

<img width="1062" alt="Screenshot 2024-11-13 at 6 01 40 PM"
src="https://github.com/user-attachments/assets/3271906c-ccce-407a-97b3-0f26f8ff987a">


### Checklist
- [ ] Post commit CI passes
- [ ] Blackhole Post commit (if applicable)
- [ ] Model regression CI testing passes (if applicable)
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] New/Existing tests provide coverage for changes
Aswinmcw authored Nov 26, 2024
1 parent 88829c5 commit a36fd88
Showing 8 changed files with 200 additions and 21 deletions.
1 change: 1 addition & 0 deletions .github/workflows/pipeline-select-galaxy.yaml
@@ -37,6 +37,7 @@ on:
type: boolean
default: false
tg-model-perf:
description: "TG model perf tests (requires tracy build)"
required: false
type: boolean
default: false
56 changes: 48 additions & 8 deletions .github/workflows/tg-model-perf-tests-impl.yaml
@@ -23,6 +23,13 @@ jobs:
runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"],
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type cnn_model_perf_tg_device --dispatch-mode ""'
},
{ name: "t3k CCL all_gather perf tests",
arch: wormhole_b0,
cmd: './tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type ccl_all_gather_perf_tg_device --dispatch-mode ""',
timeout: 75,
tracy: true,
runs-on: ["arch-wormhole_b0", "config-tg", "in-service", "bare-metal", "pipeline-perf"],
owner_id: ULMEPM2MA}, # Sean Nijjar
]
name: ${{ matrix.test-group.name }}
env:
@@ -41,9 +48,10 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
- name: Download profiler build artifact
uses: actions/download-artifact@v4
with:
name: TTMetal_build_${{ matrix.test-group.arch }}
name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
- name: Extract files
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
@@ -58,16 +66,48 @@ jobs:
id: check-perf-report
if: ${{ !cancelled() }}
run: |
ls -hal
export PERF_REPORT_FILENAME=Models_Perf_$(date +%Y_%m_%d).csv
ls -hal $PERF_REPORT_FILENAME
echo "perf_report_filename=$PERF_REPORT_FILENAME" >> "$GITHUB_OUTPUT"
- name: Upload perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' }}
TODAY=$(date +%Y_%m_%d)
PERF_REPORT_FILENAME_MODELS="Models_Perf_${TODAY}.csv"
PERF_REPORT_FILENAME_CCL_ALL_GATHER="CCL_all_gather_Perf_${TODAY}.csv"
PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER="CCL_reduce_scatter_Perf_${TODAY}.csv"
if [ "${{ matrix.test-group.tracy }}" == "true" ]; then
found_reports=false
if [ -f "$PERF_REPORT_FILENAME_CCL_ALL_GATHER" ]; then
echo "Found CCL AllGather Perf report: $PERF_REPORT_FILENAME_CCL_ALL_GATHER"
echo "perf_report_filename_all_gather=$PERF_REPORT_FILENAME_CCL_ALL_GATHER" >> "$GITHUB_OUTPUT"
found_reports=true
fi
if [ -f "$PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER" ]; then
echo "Found CCL ReduceScatter Perf report: $PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER"
echo "perf_report_filename_reduce_scatter=$PERF_REPORT_FILENAME_CCL_REDUCE_SCATTER" >> "$GITHUB_OUTPUT"
found_reports=true
fi
if [ "$found_reports" = false ]; then
echo "No CCL perf report found for today."
exit 1
fi
else
if [ -f "$PERF_REPORT_FILENAME_MODELS" ]; then
echo "Found Models Perf report: $PERF_REPORT_FILENAME_MODELS"
echo "perf_report_filename=$PERF_REPORT_FILENAME_MODELS" >> "$GITHUB_OUTPUT"
else
echo "No Models perf report found for today."
exit 1
fi
fi
- name: Upload Models perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' && !matrix.test-group.tracy}}
uses: actions/upload-artifact@v4
with:
name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.machine-type }}
path: "${{ steps.check-perf-report.outputs.perf_report_filename }}"
- name: Upload CCL perf report
if: ${{ !cancelled() && steps.check-perf-report.conclusion == 'success' && matrix.test-group.tracy}}
uses: actions/upload-artifact@v4
with:
name: perf-report-csv-${{ matrix.test-group.model-type }}-${{ matrix.test-group.arch }}-${{ matrix.test-group.model }}-bare-metal
path:
${{ steps.check-perf-report.outputs.perf_report_filename_all_gather }}
- name: Disable performance mode
if: always()
run: |
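Side note on the mechanism the rewritten check uses: steps publish values by appending `key=value` lines to the file named by `$GITHUB_OUTPUT`, and downstream steps gate on the step's conclusion and read the values back. A minimal illustrative sketch (names mirror the diff, payload hypothetical):

```yaml
- name: Check perf report
  id: check-perf-report
  run: echo "perf_report_filename_all_gather=CCL_all_gather_Perf_$(date +%Y_%m_%d).csv" >> "$GITHUB_OUTPUT"
- name: Upload CCL perf report
  if: ${{ steps.check-perf-report.conclusion == 'success' }}
  run: echo "Uploading ${{ steps.check-perf-report.outputs.perf_report_filename_all_gather }}"
```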
5 changes: 3 additions & 2 deletions .github/workflows/tg-model-perf-tests.yaml
@@ -6,12 +6,13 @@ on:
- cron: "0 */12 * * *" # This cron schedule runs the workflow every 12 hours

jobs:
build-artifact:
build-artifact-profiler:
uses: ./.github/workflows/build-artifact.yaml
with:
arch: '["wormhole_b0"]'
tracy: true
secrets: inherit
tg-model-perf-tests:
needs: build-artifact
needs: build-artifact-profiler
secrets: inherit
uses: ./.github/workflows/tg-model-perf-tests-impl.yaml
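The rename to `build-artifact-profiler` pairs with the artifact rename in the impl workflow above: building with `tracy: true` publishes the profiler build, and the perf jobs download it under the matching `_profiler` name. A sketch of that handshake (artifact path assumed from the extract step; not part of this diff):

```yaml
# Producer (build-artifact.yaml invoked with tracy: true):
- uses: actions/upload-artifact@v4
  with:
    name: TTMetal_build_wormhole_b0_profiler
    path: ttm_wormhole_b0.tar
# Consumer (tg-model-perf-tests-impl.yaml):
- uses: actions/download-artifact@v4
  with:
    name: TTMetal_build_wormhole_b0_profiler
```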
2 changes: 2 additions & 0 deletions tests/scripts/run_tests.sh
@@ -321,6 +321,8 @@ run_pipeline_tests() {
demos_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
elif [[ $pipeline_type == *"model_perf_tg_device" ]]; then
model_perf_tg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
elif [[ $pipeline_type == "ccl_all_gather_perf_tg_device" ]]; then
./tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh -t tg
# TGG pipelines
elif [[ $pipeline_type == "unit_tgg_device" ]]; then
unit_tgg_device "$tt_arch" "$pipeline_type" "$dispatch_mode"
6 changes: 4 additions & 2 deletions tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
@@ -180,10 +180,12 @@ def calculate_bandwidth(row):
group_df.rename(columns={"INPUT_0_LAYOUT": "Layout", "INPUT_0_DATATYPE": "Data Type"}, inplace=True)

group_df["Input Shape"] = group_df.apply(
lambda row: f"[{row['INPUT_0_W']}, {row['INPUT_0_Z']}, {row['INPUT_0_Y']}, {row['INPUT_0_X']}]", axis=1
lambda row: f"[{int(row['INPUT_0_W'])}, {int(row['INPUT_0_Z'])}, {int(row['INPUT_0_Y'])}, {int(row['INPUT_0_X'])}]",
axis=1,
)
group_df["Output Shape"] = group_df.apply(
lambda row: f"[{row['OUTPUT_0_W']}, {row['OUTPUT_0_Z']}, {row['OUTPUT_0_Y']}, {row['OUTPUT_0_X']}]", axis=1
lambda row: f"[{int(row['OUTPUT_0_W'])}, {int(row['OUTPUT_0_Z'])}, {int(row['OUTPUT_0_Y'])}, {int(row['OUTPUT_0_X'])}]",
axis=1,
)
group_df["Cycles Count"] = group_df["DEVICE FW END CYCLE"] - group_df["DEVICE FW START CYCLE"]
group_df[["Op BW [GB/s]", "Link BW [GB/s]"]] = group_df.apply(calculate_bandwidth, axis=1, result_type="expand")
tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
@@ -11,7 +11,7 @@ show_help() {
echo
echo "Options:"
echo " -d, --debug Enable debug mode to show real-time output."
echo " -t, --target Specify the target configuration (t3000 or n300). Default is n300."
echo " -t, --target Specify the target configuration (t3000 or n300 or tg). Default is n300."
echo " -h, --help Display this help message."
echo
echo "Example:"
@@ -42,8 +42,8 @@ while [ $# -gt 0 ]; do
shift 2

# Validate the target value
if [ "$TARGET" != "t3000" ] && [ "$TARGET" != "n300" ]; then
echo "Error: Invalid target configuration: $TARGET. Must be either 't3000' or 'n300'."
if [ "$TARGET" != "t3000" ] && [ "$TARGET" != "tg" ] && [ "$TARGET" != "n300" ]; then
echo "Error: Invalid target configuration: $TARGET. Must be 't3000' or 'n300' or 'tg'."
exit 1
fi
;;
66 changes: 66 additions & 0 deletions tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
@@ -12,6 +12,9 @@
from tests.ttnn.unit_tests.operations.ccl.test_reduce_scatter_post_commit import (
run_reduce_scatter_test,
)
from tests.ttnn.unit_tests.operations.ccl.test_all_gather_TG_post_commit import (
run_line_all_gather_on_TG_with_mesh_tensor_along_rows,
)


@skip_for_grayskull("Requires eth connected devices to run")
@@ -266,3 +269,66 @@ def test_reduce_scatter_on_n300(
enable_async=enable_async,
trace_mode=True,
)


@skip_for_grayskull("Requires eth connected devices to run")
@pytest.mark.parametrize(
"num_devices, num_links, per_chip_output_shape, dim, layout",
[
(4, 3, [4, 1, 32, 1280], 0, ttnn.TILE_LAYOUT),
(4, 3, [1, 1, 32, 16384 * 4], 3, ttnn.TILE_LAYOUT),
(4, 3, [1, 4, 32, 6656], 1, ttnn.TILE_LAYOUT),
],
)
@pytest.mark.parametrize(
"input_dtype",
[
ttnn.bfloat16,
ttnn.bfloat8_b,
],
)
@pytest.mark.parametrize(
"buffer_type",
[
ttnn.BufferType.DRAM,
ttnn.BufferType.L1,
],
)
@pytest.mark.parametrize("replication_factor", [8])
@pytest.mark.parametrize("num_iters", [20])
@pytest.mark.parametrize("enable_async", [True])
@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
@pytest.mark.parametrize("device_params", [{"trace_region_size": 532480}], indirect=True)
def test_all_gather_on_tg(
mesh_device,
num_devices,
per_chip_output_shape,
dim,
num_links,
input_dtype,
layout,
buffer_type,
use_program_cache,
function_level_defaults,
enable_async,
replication_factor,
num_iters,
):
run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
mesh_device,
num_devices,
per_chip_output_shape,
ttnn.TensorMemoryLayout.INTERLEAVED,
dim,
num_links,
input_dtype,
layout,
buffer_type,
use_program_cache,
function_level_defaults,
enable_async=enable_async,
num_iters=num_iters,
num_all_gather_instances=replication_factor,
cluster_axis=1,
trace_mode=True,
)
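The new test can also be targeted directly with pytest (hypothetical invocation, assuming a TG mesh is available and the repo's conftest fixtures provide the device setup; in CI it runs under the tracy profiling wrapper instead):

```bash
pytest tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py -k test_all_gather_on_tg
```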
tests/ttnn/unit_tests/operations/ccl/test_all_gather_TG_post_commit.py
@@ -45,6 +45,59 @@ def print_tile_corners_of_tensor(t):
print(f"{str_vals}")


def run_with_trace(
mesh_device,
all_gather_topology,
input_tensor,
dim,
num_links,
cluster_axis,
output_mem_config,
n_worker=None,
n_buffer=None,
num_iter=20,
):
# Compile Run
logger.info("Compiling model")
tt_out_tensor = ttnn.all_gather(
input_tensor,
dim=dim,
cluster_axis=cluster_axis,
mesh_device=mesh_device,
num_links=num_links,
memory_config=output_mem_config,
topology=all_gather_topology,
)
for d in mesh_device.get_devices():
ttnn.synchronize_device(d)

# Capture trace
logger.info("Capturing trace")
trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
for i in range(num_iter):
tt_out_tensor = ttnn.all_gather(
input_tensor,
dim=dim,
cluster_axis=cluster_axis,
mesh_device=mesh_device,
num_links=num_links,
memory_config=output_mem_config,
topology=all_gather_topology,
)
ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0)
for d in mesh_device.get_devices():
ttnn.synchronize_device(d)

# Run the op
logger.info("Starting Trace perf test...")
ttnn.execute_trace(mesh_device, trace_id, blocking=False)
ttnn.release_trace(mesh_device, trace_id)
for d in mesh_device.get_devices():
ttnn.synchronize_device(d)

return tt_out_tensor


def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
mesh_device,
num_devices_per_line,
@@ -63,6 +116,8 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
num_iters: int = 1,
cluster_axis: int = 0,
tile=(32, 32),
trace_mode=False,
debug=False,
):
if len(mesh_device.get_devices()) != 32:
pytest.skip("Not TG!")
@@ -120,16 +175,28 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
ttnn_tensor = ttnn.to_device(ttnn_tensor, mesh_device)

# ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor)
for _ in range(num_iters):
ttnn_tensor_out = ttnn.all_gather(
ttnn_tensor,
if trace_mode:
ttnn_tensor_out = run_with_trace(
input_tensor=ttnn_tensor,
dim=dim,
cluster_axis=cluster_axis,
mesh_device=mesh_device,
num_links=num_links,
memory_config=output_mem_config,
topology=ttnn.Topology.Linear,
output_mem_config=output_mem_config,
all_gather_topology=ttnn.Topology.Linear,
num_iter=num_iters,
)
else:
for _ in range(num_iters):
ttnn_tensor_out = ttnn.all_gather(
ttnn_tensor,
dim=dim,
cluster_axis=cluster_axis,
mesh_device=mesh_device,
num_links=num_links,
memory_config=output_mem_config,
topology=ttnn.Topology.Linear,
)

# ttnn.visualize_mesh_device(mesh_device, tensor=ttnn_tensor_out)
tt_output_tensor = ttnn.to_torch(
@@ -150,7 +217,7 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
if not eq and debug is True:
logger.error(f"found mismatches")
report_mismatches(tt_output_tensor, output_golden, 100)
print_tile_corners_of_tensor(output_tensor)
print_tile_corners_of_tensor(tt_output_tensor)
else:
eq, output = comp_pcc(tt_output_tensor, output_golden)
if not eq:
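A note on the trace path the new `trace_mode` flag exercises: the all-gather is compiled once eagerly, all `num_iter` invocations are then recorded into a single device trace, and the trace is replayed without per-iteration host dispatch, so the profiled cycles reflect device-side performance. Condensed sketch of the pattern (calls as in `run_with_trace` above; device and tensor setup elided):

```python
# Condensed from run_with_trace; assumes mesh_device, input_tensor, and the
# all-gather arguments are already set up as in the test body above.
trace_id = ttnn.begin_trace_capture(mesh_device, cq_id=0)
for _ in range(num_iter):
    tt_out_tensor = ttnn.all_gather(
        input_tensor,
        dim=dim,
        cluster_axis=cluster_axis,
        mesh_device=mesh_device,
        num_links=num_links,
        memory_config=output_mem_config,
        topology=all_gather_topology,
    )
ttnn.end_trace_capture(mesh_device, trace_id, cq_id=0)

ttnn.execute_trace(mesh_device, trace_id, blocking=False)  # replay all captured iterations
ttnn.release_trace(mesh_device, trace_id)
for d in mesh_device.get_devices():
    ttnn.synchronize_device(d)  # wait for the replay to finish
```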