
Commit

Revert "Add a padding-aware, interleaved, tiled transpose HC with a f…
Browse files Browse the repository at this point in the history
…used padding value parameter (#15224)"

This reverts commit da40fd0.
Aswinmcw committed Nov 26, 2024
1 parent f3108ee commit 26c46a5
Showing 22 changed files with 239 additions and 665 deletions.
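
For context, the change being reverted added a pad_value keyword to ttnn.transpose so that a tiled H/C transpose of a tensor with non-tile-aligned dimensions could fill its padding in the same fused op instead of needing a separate pad. The sketch below is reconstructed from the removed test_transpose_unpadded further down in this diff; it is only an illustration of the reverted API, and the device setup (ttnn.open_device / ttnn.close_device) is assumed boilerplate rather than part of the original test. It would only run against a build that still includes #15224.

import torch
import ttnn

# Hypothetical standalone version of the removed test_transpose_unpadded:
# transpose dims 1 and 2 of a tile-padded tensor and fill the padding with -inf.
device = ttnn.open_device(device_id=0)

torch_input = torch.randn([1, 17, 1, 1], dtype=torch.bfloat16)
torch_output = torch_input.transpose(1, 2)

tt_input = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16, layout=ttnn.TILE_LAYOUT, device=device)
# pad_value is the parameter removed by this revert; without #15224 the call
# only accepts (input, dim0, dim1).
tt_output = ttnn.transpose(tt_input, 1, 2, pad_value=float("-inf"))
result = ttnn.to_torch(tt_output)
# Transpose only moves data, so the logical values should match exactly.
assert torch.equal(result, torch_output)

ttnn.close_device(device)
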
2 changes: 1 addition & 1 deletion models/demos/distilbert/tests/test_perf_distilbert.py
@@ -154,7 +154,7 @@ def test_distilbert_perf_device(batch_size, test, reset_seeds):
if is_grayskull():
expected_perf = 57.3
elif is_wormhole_b0():
expected_perf = 103.884
expected_perf = 90.2505

command = f"pytest tests/ttnn/integration_tests/distilbert/test_ttnn_distilbert.py::test_distilbert_for_question_answering[sequence_size=768-batch_size=8-model_name=distilbert-base-uncased-distilled-squad]"
cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
4 changes: 2 additions & 2 deletions models/demos/vgg/tests/test_perf_vgg.py
@@ -137,10 +137,10 @@ def test_perf_device_bare_metal_vgg(batch_size, model_name):
margin = 0.03

if model_name == "ttnn_vgg11":
expected_perf = 168 if is_grayskull() else 283.289
expected_perf = 132.2436 if is_grayskull() else 272.8989
command = f"pytest tests/ttnn/integration_tests/vgg/test_ttnn_vgg11.py"
else:
expected_perf = 144 if is_grayskull() else 201.3867
expected_perf = 116.1459 if is_grayskull() else 194.4063
command = f"pytest tests/ttnn/integration_tests/vgg/test_ttnn_vgg16.py"

cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"]
@@ -464,7 +464,5 @@ def test_rotary_embedding_llama_with_program_cache(
# When batch size is 1, transpose is a no-op
if batch == 1:
num_ops -= 1
elif batch % 32 == 0:
num_ops -= 1 # When batch size is a multiple of 32, no padding

assert device.num_program_cache_entries() == num_ops
@@ -132,9 +132,6 @@ def test_rotary_embedding_llama_fused_qk_with_program_cache(

cache_tensors.append(test_tensor)

if batch == 32 or batch == 16:
num_ops = 4
else:
num_ops = 5 # embedding + fused_qk_rope + transpose + pad + interleaved_to_sharded
num_ops = 5 # embedding + fused_qk_rope + transpose + pad + interleaved_to_sharded

assert device.num_program_cache_entries() == num_ops
182 changes: 8 additions & 174 deletions tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

@@ -9,7 +9,7 @@
import ttnn

from loguru import logger
from models.utility_functions import is_grayskull, is_blackhole, torch_random
from models.utility_functions import is_grayskull
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal
from models.utility_functions import skip_for_grayskull, skip_for_blackhole
from tests.ttnn.utils_for_testing import assert_with_pcc
@@ -25,7 +25,6 @@ def transpose(
input_dtype=ttnn.bfloat16,
expected_program_cache_size=None,
):
torch.manual_seed(2005)
output_shape = list(input_shape)
output_shape[dim0], output_shape[dim1] = input_shape[dim1], input_shape[dim0]

@@ -125,7 +124,9 @@ def test_transpose_hc_program_cache(dtype, device, use_program_cache):
H = 32
W = 32
input_shape = (N, C, H, W)
transpose(input_shape, device, dim0=1, dim1=-2, expected_program_cache_size=3, input_dtype=dtype)
# CACHE MISS since its single core
# Cache size 2 more because of pad op in single core impl + transpose
transpose(input_shape, device, dim0=1, dim1=-2, expected_program_cache_size=4, input_dtype=dtype)


@pytest.mark.parametrize(
@@ -154,8 +155,8 @@ def test_transpose_cn_program_cache(dtype, device, use_program_cache):

@pytest.mark.parametrize(
"dtype",
(ttnn.bfloat16, ttnn.float32, ttnn.bfloat8_b),
ids=["bfloat16", "float", "bfloat8_b"],
(ttnn.bfloat16, ttnn.float32),
ids=["bfloat16", "float"],
)
def test_transpose_wh_program_cache(dtype, device, use_program_cache):
if is_grayskull() and dtype == ttnn.float32:
@@ -306,7 +307,6 @@ def test_transpose_wh_sharded_program_cache(dtype, device, use_program_cache):
@pytest.mark.parametrize("h", [230])
@pytest.mark.parametrize("w", [256])
def test_tranpose_hw_rm_with_padding(device, n, c, h, w):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(2, 3)
activation_pyt_padded = ttnn.from_torch(
@@ -340,7 +340,6 @@ def test_tranpose_hw_rm_with_padding(device, n, c, h, w):
@pytest.mark.parametrize("h", [8])
@pytest.mark.parametrize("w", [256])
def test_tranpose_hw_rm_no_padding(device, n, c, h, w):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(2, 3)
activation_pyt_padded = ttnn.from_torch(
@@ -358,7 +357,6 @@ def test_tranpose_hw_rm_no_padding(device, n, c, h, w):


def run_tranpose_hw_rm_program_cache(device, n, c, h, w, use_program_cache):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(2, 3)
activation_pyt_padded = ttnn.from_torch(
@@ -403,7 +401,6 @@ def test_tranpose_hw_rm_with_program_cache(device, n, c, h, w, use_program_cache
@pytest.mark.parametrize("h", [16])
@pytest.mark.parametrize("w", [112])
def test_tranpose_hw_sharded_rm(device, n, c, h, w):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(2, 3)
tt_input_tensor = ttnn.from_torch(
@@ -439,7 +436,6 @@ def test_tranpose_hw_sharded_rm(device, n, c, h, w):


def run_tranpose_hw_sharded_rm_with_program_cache(device, n, c, h, w):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(2, 3)
tt_input_tensor = ttnn.from_torch(
@@ -495,7 +491,6 @@ def test_tranpose_hw_sharded_rm_with_program_cache(device, n, c, h, w, use_progr
@pytest.mark.parametrize("h", [128])
@pytest.mark.parametrize("w", [16])
def test_tranpose_hc_rm(device, n, c, h, w):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(1, 2)
activation_pyt_padded = ttnn.from_torch(
@@ -514,7 +509,6 @@ def test_tranpose_hc_rm(device, n, c, h, w):


def run_tranpose_hc_rm_with_program_cache(device, n, c, h, w, use_program_cache):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(1, 2)
activation_pyt_padded = ttnn.from_torch(
@@ -552,7 +546,6 @@ def test_tranpose_hc_rm_with_program_cache(device, n, c, h, w, use_program_cache


def run_tranpose_hc_sharded(device, n, c, h, w, grid_size):
torch.manual_seed(2005)
torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(1, 2)
tt_input_tensor = ttnn.from_torch(
@@ -621,12 +614,9 @@ def test_tranpose_hc_sharded_with_program_cache(device, n, c, h, w, grid_size, u
((32, 32, 32, 32), (1, 2)),
((32, 32, 32, 32), (0, 3)),
((32, 32, 32, 32), (1, 3)),
((32, 32, 32, 32), (2, 3)),
((32, 32, 32, 32), (0, 1)),
],
)
def test_transpose_bfloat8_b(device, shape, swap_dims):
torch.manual_seed(2005)
input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = input.transpose(*swap_dims)

@@ -669,7 +659,6 @@ def test_transpose_hc(dtype, shape, device):
[ttnn.TILE_LAYOUT],
)
def test_transpose_2D(dtype, shape, layout, device):
torch.manual_seed(2005)
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Skipping float32 tests on Grayskull")
if layout == ttnn.ROW_MAJOR_LAYOUT and dtype == ttnn.bfloat16 and (shape[-1] % 2 or shape[-2] % 2):
@@ -709,7 +698,6 @@ def test_transpose_2D(dtype, shape, layout, device):
],
)
def test_transpose_3D(dtype, shape, layout, dims, device):
torch.manual_seed(2005)
if is_grayskull() and dtype == ttnn.float32:
pytest.skip("Skipping float32 tests on Grayskull")
if layout == ttnn.ROW_MAJOR_LAYOUT and dtype == ttnn.bfloat16 and (shape[-1] % 2 or shape[dims[-1]] % 2):
@@ -729,7 +717,6 @@ def test_transpose_3D(dtype, shape, layout, dims, device):
[[4, 3, 1280, 40], [1, 4096, 4096]],
)
def test_transpose_4d_wh_rm(shape, device):
torch.manual_seed(2005)
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(-1, -2)

@@ -744,7 +731,6 @@ def test_transpose_4d_wh_rm(shape, device):
[[4, 3, 1280, 40], [1, 1, 1200, 1280], [1, 1, 4096, 4096]],
)
def test_transpose_4d_wh_tile(shape, device):
torch.manual_seed(2005)
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(-1, -2)

@@ -764,8 +750,7 @@ def test_transpose_4d_wh_tile(shape, device):
)
@pytest.mark.parametrize("memory_config", [ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG])
def test_transpose_failures(config, memory_config, device):
pytest.skip("Failing pytorch 2.0 trace sweeps")
torch.manual_seed(2005)
pytest.skip("Failures to fix after #13217 and #13005 are in - 5D, HC PCC issue and unaligned RM tensor")
torch_input = torch.randn(config[0], dtype=torch.bfloat16)
torch_output = torch_input.transpose(config[1][0], config[1][1])

@@ -813,7 +798,6 @@ def test_transpose_failures(config, memory_config, device):
)
@pytest.mark.parametrize("memory_config", [ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG])
def test_transpose_unaligned(config, memory_config, device):
torch.manual_seed(2005)
# this will convert to tiled for now
torch_input = torch.randn(config[0], dtype=torch.bfloat16)
torch_output = torch_input.transpose(config[1][0], config[1][1])
@@ -854,160 +838,10 @@ def test_transpose_hc_padded_c(shape, device):
[ttnn.ROW_MAJOR_LAYOUT],
)
def test_transpose_5d(shape, dims, layout, device):
torch.manual_seed(2005)
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(dims[0], dims[1])

tt_input = ttnn.from_torch(torch_input, dtype=ttnn.DataType.BFLOAT16, layout=layout, device=device)
tt_output = ttnn.transpose(tt_input, dims[0], dims[1])
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)


@pytest.mark.parametrize(
"shape",
[
[1, 5, 10, 15],
[1, 1, 1, 2],
[1, 3, 2, 1],
[1, 17, 1, 1],
[1, 1, 16, 1],
[1, 1, 17, 1],
[1, 1, 1, 17],
[2, 1, 1, 1],
[2, 33, 33, 33],
],
)
@pytest.mark.parametrize(
"dims",
[
(1, 2),
(0, 2),
],
)
@pytest.mark.parametrize(
"layout",
[ttnn.TILE_LAYOUT],
)
@pytest.mark.parametrize(
"dtype",
[ttnn.float32, ttnn.bfloat16],
)
def test_transpose_issue_11650_10350(shape, dims, layout, dtype, device):
torch.manual_seed(2005)
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(dims[0], dims[1])

tt_input = ttnn.from_torch(torch_input, dtype=dtype, layout=layout, device=device)
tt_output = ttnn.transpose(tt_input, dims[0], dims[1])
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)


@pytest.mark.parametrize(
"shape",
[
[1, 17, 1, 1],
[1, 1, 16, 1],
[1, 1, 17, 1],
[1, 1, 1, 17],
[2, 1, 1, 1],
[2, 33, 33, 33],
],
)
@pytest.mark.parametrize(
"dims",
[
(1, 2),
(0, 2),
],
)
@pytest.mark.parametrize(
"layout",
[ttnn.TILE_LAYOUT],
)
@pytest.mark.parametrize(
"dtype",
[ttnn.float32, ttnn.bfloat16],
)
@pytest.mark.parametrize(
"pad_value",
[None, float("-inf")],
)
def test_transpose_unpadded(shape, dims, layout, dtype, pad_value, device):
torch.manual_seed(2005)
if pad_value is not None and is_blackhole():
pytest.skip("Blackhole reduce is needed for the full test to work")
elif dtype == ttnn.float32 and is_grayskull():
pytest.skip("Grayskull does not support float32")
torch_input = torch.randn(shape, dtype=torch.bfloat16)
torch_output = torch_input.transpose(dims[0], dims[1])

tt_input = ttnn.from_torch(torch_input, dtype=dtype, layout=layout, device=device)
tt_output = ttnn.transpose(tt_input, dims[0], dims[1], pad_value=pad_value)
if pad_value is not None:
a = ttnn.min(
tt_output
) # if min becomes padding aware, this will fail, so feel free to delete this test then @future op writer
assert ttnn.to_torch(a) == float("-inf")
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)


@pytest.mark.parametrize("b", [1])
@pytest.mark.parametrize("h", [18])
@pytest.mark.parametrize("w", [65])
@pytest.mark.parametrize("dim0", [1])
@pytest.mark.parametrize("dim1", [2])
def test_transpose_forge_llama(device, b, h, w, dim0, dim1):
torch.manual_seed(2005)

torch_input_tensor = torch_random((b, h, w), -0.1, 0.1, dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(dim0, dim1)

input_tensor = ttnn.to_device(ttnn.from_torch(torch_input_tensor), device, memory_config=ttnn.DRAM_MEMORY_CONFIG)
input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT)
output_tensor = ttnn.transpose(input_tensor, dim0, dim1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
output_tensor = ttnn.from_device(output_tensor)
output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT)
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(torch_output_tensor, output_tensor)


@pytest.mark.parametrize("b", [1])
@pytest.mark.parametrize("h", [2])
@pytest.mark.parametrize("w", [3])
@pytest.mark.parametrize("dim0", [-1])
@pytest.mark.parametrize("dim1", [-2])
def test_transpose_forge_basic(device, b, h, w, dim0, dim1):
torch.manual_seed(2005)
torch_input_tensor = torch_random((1, b, h, w), -0.1, 0.1, dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(dim0, dim1)
input_tensor = ttnn.to_device(ttnn.from_torch(torch_input_tensor), device, memory_config=ttnn.DRAM_MEMORY_CONFIG)
input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT)
output_tensor = ttnn.transpose(input_tensor, dim0, dim1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
output_tensor = ttnn.from_device(output_tensor)
output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT)
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(torch_output_tensor, output_tensor)


@pytest.mark.parametrize("b", [6])
@pytest.mark.parametrize("h", [33])
@pytest.mark.parametrize("w", [34])
@pytest.mark.parametrize("dim0", [1])
@pytest.mark.parametrize("dim1", [0])
def test_transpose_forge_hc(device, b, h, w, dim0, dim1):
torch.manual_seed(2005)
torch_input_tensor = torch_random((1, b, h, w), -0.1, 0.1, dtype=torch.bfloat16)
torch_output_tensor = torch_input_tensor.transpose(dim0, dim1)
input_tensor = ttnn.to_device(ttnn.from_torch(torch_input_tensor), device, memory_config=ttnn.DRAM_MEMORY_CONFIG)
input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT)
output_tensor = ttnn.transpose(input_tensor, dim0, dim1, memory_config=ttnn.DRAM_MEMORY_CONFIG)
output_tensor = ttnn.from_device(output_tensor)
output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT)
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(torch_output_tensor, output_tensor)

