
Commit

Revert "Add a padding-aware, interleaved, tiled transpose HC with a f…
Browse files Browse the repository at this point in the history
…used padding value parameter (#15224)"

This reverts commit da40fd0.
Aswinmcw committed Nov 26, 2024
1 parent f3108ee commit 26c46a5
Showing 22 changed files with 239 additions and 665 deletions.
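
For context, the change being reverted added a pad_value keyword to ttnn.transpose so that a tiled H/C transpose of a tensor with non-tile-aligned dimensions could fill its padding in the same fused op instead of needing a separate pad. The sketch below is reconstructed from the removed test_transpose_unpadded further down in this diff; it is only an illustration of the reverted API, and the device setup (ttnn.open_device / ttnn.close_device) is assumed boilerplate rather than part of the original test. It would only run against a build that still includes #15224.

import torch
import ttnn

# Hypothetical standalone version of the removed test_transpose_unpadded:
# transpose dims 1 and 2 of a tile-padded tensor and fill the padding with -inf.
device = ttnn.open_device(device_id=0)

torch_input = torch.randn([1, 17, 1, 1], dtype=torch.bfloat16)
torch_output = torch_input.transpose(1, 2)

tt_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
# pad_value is the parameter removed by this revert; without #15224 the call
# only accepts (input, dim0, dim1).
tt_output = ttnn.transpose(tt_input, 1, 2, pad_value=float("-inf"))
result = ttnn.to_torch(tt_output)
# Transpose only moves data, so the logical values should match exactly.
assert torch.equal(result, torch_output)

ttnn.close_device(device)
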
2 changes: 1 addition & 1 deletion models/demos/distilbert/tests/test_perf_distilbert.py
@@ -154,7 +154,7 @@ def test_distilbert_perf_device(batch_size, test, reset_seeds):
if is_grayskull():
expected_perf = 57.3
elif is_wormhole_b0():
expected_perf = 103.884
expected_perf = 90.2505

command = f"pytest tests/ttnn/integration_tests/distilbert/test_ttnn_distilbert.py::test_distilbert_for_question_answering[sequence_size=768-batch_size=8-model_name=distilbert-base-uncased-distilled-squad]"
cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
4 changes: 2 additions & 2 deletions models/demos/vgg/tests/test_perf_vgg.py
@@ -137,10 +137,10 @@ def test_perf_device_bare_metal_vgg(batch_size, model_name):
margin = 0.03

if model_name == "ttnn_vgg11":
expected_perf = 168 if is_grayskull() else 283.289
expected_perf = 132.2436 if is_grayskull() else 272.8989
command = f"pytest tests/ttnn/integration_tests/vgg/test_ttnn_vgg11.py"
else:
expected_perf = 144 if is_grayskull() else 201.3867
expected_perf = 116.1459 if is_grayskull() else 194.4063
command = f"pytest tests/ttnn/integration_tests/vgg/test_ttnn_vgg16.py"

cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
@@ -464,7 +464,5 @@ def test_rotary_embedding_llama_with_program_cache(
# When batch size is 1, transpose is a no-op
if batch == 1:
num_ops -= 1
elif batch % 32 == 0:
num_ops -= 1 # When batch size is a multiple of 32, no padding

assert device.num_program_cache_entries() == num_ops
@@ -132,9 +132,6 @@ def test_rotary_embedding_llama_fused_qk_with_program_cache(

cache_tensors.append(test_tensor)

if batch == 32 or batch == 16:
num_ops = 4
else:
num_ops = 5 # embedding + fused_qk_rope + transpose + pad + interleaved_to_sharded
num_ops = 5 # embedding + fused_qk_rope + transpose + pad + interleaved_to_sharded

assert device.num_program_cache_entries() == num_ops
182 changes: 8 additions & 174 deletions tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

@@ -9,7 +9,7 @@
import ttnn

from loguru import logger
from models.utility_functions import is_grayskull, is_blackhole, torch_random
from models.utility_functions import is_grayskull
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal
from models.utility_functions import skip_for_grayskull, skip_for_blackhole
from tests.ttnn.utils_for_testing import assert_with_pcc
@@ -25,7 +25,6 @@ def transpose(
input_dtype=ttnn.bfloat16,
expected_program_cache_size=None,
):
torch.manual_seed(2005)
output_shape = list(input_shape)
output_shape[dim0], output_shape[dim1] = input_shape[dim1], input_shape[dim0]

@@ -125,7 +124,9 @@ def test_transpose_hc_program_cache(dtype, device, use_program_cache):
H = 32
W = 32
input_shape = (N, C, H, W)
transpose(input_shape, device, dim0=1, dim1=-2, expected_program_cache_size=3, input_dtype=dtype)
# CACHE MISS since its single core
# Cache size 2 more because of pad op in single core impl + transpose
transpose(input_shape, device, dim0=1, dim1=-2, expected_program_cache_size=4, input_dtype=dtype)


@pytest.mark.parametrize(
@@ -154,8 +155,8 @@ def test_transpose_cn_program_cache(dtype, device, use_program_cache):

@pytest.mark.parametrize(
"dtype",
(ttnn.bfloat16, ttnn.float32, ttnn.bfloat8_b),
ids=["bfloat16", "float", "bfloat8_b"],
(ttnn.bfloat16, ttnn.float32),
ids=["bfloat16", "float"],
)
def test_transpose_wh_program_cache(dtype, device, use_program_cache):
if is_grayskull() and dtype == ttnn.float32:
@@ -306,7 +307,6 @@ def test_transpose_wh_sharded_program_cache(dtype, device, use_program_cache):
@pytest.mark.parametrize("h", [230])
@pytest.mark.parametrize("w", [256])
def test_tranpose_hw_rm_with_padding(device, n, c, h, w):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(2, 3)
activation_pyt_padded = ttnn.from_torch(
@@ -340,7 +340,6 @@ def test_tranpose_hw_rm_with_padding(device, n, c, h, w):
@pytest.mark.parametrize("h", [8])
@pytest.mark.parametrize("w", [256])
def test_tranpose_hw_rm_no_padding(device, n, c, h, w):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(2, 3)
activation_pyt_padded = ttnn.from_torch(
@@ -358,7 +357,6 @@ def test_tranpose_hw_rm_no_padding(device, n, c, h, w):


def run_tranpose_hw_rm_program_cache(device, n, c, h, w, use_program_cache):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(2, 3)
activation_pyt_padded = ttnn.from_torch(
@@ -403,7 +401,6 @@ def test_tranpose_hw_rm_with_program_cache(device, n, c, h, w, use_program_cache
@pytest.mark.parametrize("h", [16])
@pytest.mark.parametrize("w", [112])
def test_tranpose_hw_sharded_rm(device, n, c, h, w):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(2, 3)
tt_input_tensor = ttnn.from_torch(
@@ -439,7 +436,6 @@ def test_tranpose_hw_sharded_rm(device, n, c, h, w):


def run_tranpose_hw_sharded_rm_with_program_cache(device, n, c, h, w):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(2, 3)
tt_input_tensor = ttnn.from_torch(
@@ -495,7 +491,6 @@ def test_tranpose_hw_sharded_rm_with_program_cache(device, n, c, h, w, use_progr
@pytest.mark.parametrize("h", [128])
@pytest.mark.parametrize("w", [16])
def test_tranpose_hc_rm(device, n, c, h, w):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(1, 2)
activation_pyt_padded = ttnn.from_torch(
@@ -514,7 +509,6 @@ def test_tranpose_hc_rm(device, n, c, h, w):


def run_tranpose_hc_rm_with_program_cache(device, n, c, h, w, use_program_cache):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(1, 2)
activation_pyt_padded = ttnn.from_torch(
@@ -552,7 +546,6 @@ def test_tranpose_hc_rm_with_program_cache(device, n, c, h, w, use_program_cache


def run_tranpose_hc_sharded(device, n, c, h, w, grid_size):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(1, 2)
tt_input_tensor = ttnn.from_torch(
@@ -621,12 +614,9 @@ def test_tranpose_hc_sharded_with_program_cache(device, n, c, h, w, grid_size, u
((32, 32, 32, 32), (1, 2)),
((32, 32, 32, 32), (0, 3)),
((32, 32, 32, 32), (1, 3)),
((32, 32, 32, 32), (2, 3)),
((32, 32, 32, 32), (0, 1)),
],
)
def test_transpose_bfloat8_b(device, shape, swap_dims):
torch.manual_seed(2005)
input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = input.transpose(*swap_dims)

@@ -669,7 +659,6 @@ def test_transpose_hc(dtype, shape, device):
[ttnn.TILE_LAYOUT],
)
def test_transpose_2D(dtype, shape, layout, device):
torch.manual_seed(2005)
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Skipping float32 tests on Grayskull")
if layout == ttnn.ROW_MAJOR_LAYOUT and dtype == ttnn.bfloat16 and (shape[-1] % 2 or shape[-2] % 2):
@@ -709,7 +698,6 @@ def test_transpose_2D(dtype, shape, layout, device):
],
)
def test_transpose_3D(dtype, shape, layout, dims, device):
torch.manual_seed(2005)
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Skipping float32 tests on Grayskull")
if layout == ttnn.ROW_MAJOR_LAYOUT and dtype == ttnn.bfloat16 and (shape[-1] % 2 or shape[dims[-1]] % 2):
@@ -729,7 +717,6 @@ def test_transpose_3D(dtype, shape, layout, dims, device):
[[4, 3, 1280, 40], [1, 4096, 4096]],
)
def test_transpose_4d_wh_rm(shape, device):
torch.manual_seed(2005)
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(-1, -2)

@@ -744,7 +731,6 @@ def test_transpose_4d_wh_rm(shape, device):
[[4, 3, 1280, 40], [1, 1, 1200, 1280], [1, 1, 4096, 4096]],
)
def test_transpose_4d_wh_tile(shape, device):
torch.manual_seed(2005)
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(-1, -2)

@@ -764,8 +750,7 @@ def test_transpose_4d_wh_tile(shape, device):
)
@pytest.mark.parametrize("memory_config", [ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG])
def test_transpose_failures(config, memory_config, device):
pytest.skip("Failing pytorch 2.0 trace sweeps")
torch.manual_seed(2005)
pytest.skip("Failures to fix after #13217 and #13005 are in - 5D, HC PCC issue and unaligned RM tensor")
torch_input = torch.randn(config[0], dtype=torch.bfloat16)
torch_output = torch_input.transpose(config[1][0], config[1][1])

@@ -813,7 +798,6 @@ def test_transpose_failures(config, memory_config, device):
)
@pytest.mark.parametrize("memory_config", [ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG])
def test_transpose_unaligned(config, memory_config, device):
torch.manual_seed(2005)
# this will convert to tiled for now
torch_input = torch.randn(config[0], dtype=torch.bfloat16)
torch_output = torch_input.transpose(config[1][0], config[1][1])
@@ -854,160 +838,10 @@ def test_transpose_hc_padded_c(shape, device):
[ttnn.ROW_MAJOR_LAYOUT],
)
def test_transpose_5d(shape, dims, layout, device):
torch.manual_seed(2005)
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(dims[0], dims[1])

tt_input = ttnn.from_torch(torch_input, dtype=ttnn.DataType.BFLOAT16, layout=layout, device=device)
tt_output = ttnn.transpose(tt_input, dims[0], dims[1])
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)


@pytest.mark.parametrize(
"shape",
[
[1, 5, 10, 15],
[1, 1, 1, 2],
[1, 3, 2, 1],
[1, 17, 1, 1],
[1, 1, 16, 1],
[1, 1, 17, 1],
[1, 1, 1, 17],
[2, 1, 1, 1],
[2, 33, 33, 33],
],
)
@pytest.mark.parametrize(
"dims",
[
(1, 2),
(0, 2),
],
)
@pytest.mark.parametrize(
"layout",
[ttnn.TILE_LAYOUT],
)
@pytest.mark.parametrize(
"dtype",
[ttnn.float32, ttnn.bfloat16],
)
def test_transpose_issue_11650_10350(shape, dims, layout, dtype, device):
torch.manual_seed(2005)
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(dims[0], dims[1])

tt_input = ttnn.from_torch(torch_input, dtype=dtype, layout=layout, device=device)
tt_output = ttnn.transpose(tt_input, dims[0], dims[1])
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)


@pytest.mark.parametrize(
"shape",
[
[1, 17, 1, 1],
[1, 1, 16, 1],
[1, 1, 17, 1],
[1, 1, 1, 17],
[2, 1, 1, 1],
[2, 33, 33, 33],
],
)
@pytest.mark.parametrize(
"dims",
[
(1, 2),
(0, 2),
],
)
@pytest.mark.parametrize(
"layout",
[ttnn.TILE_LAYOUT],
)
@pytest.mark.parametrize(
"dtype",
[ttnn.float32, ttnn.bfloat16],
)
@pytest.mark.parametrize(
"pad_value",
[None, float("-inf")],
)
def test_transpose_unpadded(shape, dims, layout, dtype, pad_value, device):
torch.manual_seed(2005)
if pad_value is not None and is_blackhole():
pytest.skip("Blackhole reduce is needed for the full test to work")
elif dtype == ttnn.float32 and is_grayskull():
pytest.skip("Grayskull does not support float32")
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(dims[0], dims[1])

tt_input = ttnn.from_torch(torch_input, dtype=dtype, layout=layout, device=device)
tt_output = ttnn.transpose(tt_input, dims[0], dims[1], pad_value=pad_value)
if pad_value is not None:
a = ttnn.min(
tt_output
) # if min becomes padding aware, this will fail, so feel free to delete this test then @future op writer
assert ttnn.to_torch(a) == float("-inf")
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)


@pytest.mark.parametrize("b", [1])
@pytest.mark.parametrize("h", [18])
@pytest.mark.parametrize("w", [65])
@pytest.mark.parametrize("dim0", [1])
@pytest.mark.parametrize("dim1", [2])
def test_transpose_forge_llama(device, b, h, w, dim0, dim1):
torch.manual_seed(2005)

torch_input_tensor = torch_random((b, h, w), -0.1, 0.1, dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(dim0, dim1)

input_tensor = ttnn.to_device(ttnn.from_torch(torch_input_tensor), device, memory_config=ttnn.DRAM_MEMORY_CONFIG)
input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT)
output_tensor = ttnn.transpose(input_tensor, dim0, dim1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
output_tensor = ttnn.from_device(output_tensor)
output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT)
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(torch_output_tensor, output_tensor)


@pytest.mark.parametrize("b", [1])
@pytest.mark.parametrize("h", [2])
@pytest.mark.parametrize("w", [3])
@pytest.mark.parametrize("dim0", [-1])
@pytest.mark.parametrize("dim1", [-2])
def test_transpose_forge_basic(device, b, h, w, dim0, dim1):
torch.manual_seed(2005)
torch_input_tensor = torch_random((1, b, h, w), -0.1, 0.1, dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(dim0, dim1)
input_tensor = ttnn.to_device(ttnn.from_torch(torch_input_tensor), device, memory_config=ttnn.DRAM_MEMORY_CONFIG)
input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT)
output_tensor = ttnn.transpose(input_tensor, dim0, dim1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
output_tensor = ttnn.from_device(output_tensor)
output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT)
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(torch_output_tensor, output_tensor)


@pytest.mark.parametrize("b", [6])
@pytest.mark.parametrize("h", [33])
@pytest.mark.parametrize("w", [34])
@pytest.mark.parametrize("dim0", [1])
@pytest.mark.parametrize("dim1", [0])
def test_transpose_forge_hc(device, b, h, w, dim0, dim1):
torch.manual_seed(2005)
torch_input_tensor = torch_random((1, b, h, w), -0.1, 0.1, dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(dim0, dim1)
input_tensor = ttnn.to_device(ttnn.from_torch(torch_input_tensor), device, memory_config=ttnn.DRAM_MEMORY_CONFIG)
input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT)
output_tensor = ttnn.transpose(input_tensor, dim0, dim1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
output_tensor = ttnn.from_device(output_tensor)
output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT)
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(torch_output_tensor, output_tensor)

