diff --git a/models/demos/distilbert/tests/test_perf_distilbert.py b/models/demos/distilbert/tests/test_perf_distilbert.py index f3b0a6373fa..44eddd054c1 100644 --- a/models/demos/distilbert/tests/test_perf_distilbert.py +++ b/models/demos/distilbert/tests/test_perf_distilbert.py @@ -154,7 +154,7 @@ def test_distilbert_perf_device(batch_size, test, reset_seeds): if is_grayskull(): expected_perf = 57.3 elif is_wormhole_b0(): - expected_perf = 103.884 + expected_perf = 90.2505 command = f"pytest tests/ttnn/integration_tests/distilbert/test_ttnn_distilbert.py::test_distilbert_for_question_answering[sequence_size=768-batch_size=8-model_name=distilbert-base-uncased-distilled-squad]" cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"] diff --git a/models/demos/vgg/tests/test_perf_vgg.py b/models/demos/vgg/tests/test_perf_vgg.py index b6f2af0e230..f687e217cba 100644 --- a/models/demos/vgg/tests/test_perf_vgg.py +++ b/models/demos/vgg/tests/test_perf_vgg.py @@ -137,10 +137,10 @@ def test_perf_device_bare_metal_vgg(batch_size, model_name): margin = 0.03 if model_name == "ttnn_vgg11": - expected_perf = 168 if is_grayskull() else 283.289 + expected_perf = 132.2436 if is_grayskull() else 272.8989 command = f"pytest tests/ttnn/integration_tests/vgg/test_ttnn_vgg11.py" else: - expected_perf = 144 if is_grayskull() else 201.3867 + expected_perf = 116.1459 if is_grayskull() else 194.4063 command = f"pytest tests/ttnn/integration_tests/vgg/test_ttnn_vgg16.py" cols = ["DEVICE FW", "DEVICE KERNEL", "DEVICE BRISC KERNEL"] diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py index c9958604dad..617d72af3fb 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama.py @@ -464,7 +464,5 @@ def test_rotary_embedding_llama_with_program_cache( # When batch size is 1, transpose is a no-op if batch == 1: num_ops -= 1 - elif batch % 32 == 0: - num_ops -= 1 # When batch size is a multiple of 32, no padding assert device.num_program_cache_entries() == num_ops diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py index 579791f0eab..e7de947c9a6 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_rotary_embedding_llama_fused_qk.py @@ -132,9 +132,6 @@ def test_rotary_embedding_llama_fused_qk_with_program_cache( cache_tensors.append(test_tensor) - if batch == 32 or batch == 16: - num_ops = 4 - else: - num_ops = 5 # embedding + fused_qk_rope + transpose + pad + interleaved_to_sharded + num_ops = 5 # embedding + fused_qk_rope + transpose + pad + interleaved_to_sharded assert device.num_program_cache_entries() == num_ops diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py index 489b25ba5e9..8c3f7ce0128 100644 --- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py +++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
# SPDX-License-Identifier: Apache-2.0 @@ -9,7 +9,7 @@ import ttnn from loguru import logger -from models.utility_functions import is_grayskull, is_blackhole, torch_random +from models.utility_functions import is_grayskull from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_pcc, comp_equal from models.utility_functions import skip_for_grayskull, skip_for_blackhole from tests.ttnn.utils_for_testing import assert_with_pcc @@ -25,7 +25,6 @@ def transpose( input_dtype=ttnn.bfloat16, expected_program_cache_size=None, ): - torch.manual_seed(2005) output_shape = list(input_shape) output_shape[dim0], output_shape[dim1] = input_shape[dim1], input_shape[dim0] @@ -125,7 +124,9 @@ def test_transpose_hc_program_cache(dtype, device, use_program_cache): H = 32 W = 32 input_shape = (N, C, H, W) - transpose(input_shape, device, dim0=1, dim1=-2, expected_program_cache_size=3, input_dtype=dtype) + # CACHE MISS since it's single core + # Cache size is 2 more because of the pad op in the single-core impl + transpose + transpose(input_shape, device, dim0=1, dim1=-2, expected_program_cache_size=4, input_dtype=dtype) @@ -154,8 +155,8 @@ def test_transpose_cn_program_cache(dtype, device, use_program_cache): @pytest.mark.parametrize( "dtype", - (ttnn.bfloat16, ttnn.float32, ttnn.bfloat8_b), - ids=["bfloat16", "float", "bfloat8_b"], + (ttnn.bfloat16, ttnn.float32), + ids=["bfloat16", "float"], ) def test_transpose_wh_program_cache(dtype, device, use_program_cache): if is_grayskull() and dtype == ttnn.float32: pytest.skip("Skipping float32 tests on Grayskull") @@ -306,7 +307,6 @@ def test_transpose_wh_sharded_program_cache(dtype, device, use_program_cache): @pytest.mark.parametrize("h", [230]) @pytest.mark.parametrize("w", [256]) def test_tranpose_hw_rm_with_padding(device, n, c, h, w): - torch.manual_seed(2005) torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16) torch_output_tensor = torch_input_tensor.transpose(2, 3) activation_pyt_padded = ttnn.from_torch( @@ -340,7 +340,6 @@ def test_tranpose_hw_rm_with_padding(device, n, c, h, w): @pytest.mark.parametrize("h", [8]) @pytest.mark.parametrize("w", [256]) def test_tranpose_hw_rm_no_padding(device, n, c, h, w): - torch.manual_seed(2005) torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16) torch_output_tensor = torch_input_tensor.transpose(2, 3) activation_pyt_padded = ttnn.from_torch( @@ -358,7 +357,6 @@ def test_tranpose_hw_rm_no_padding(device, n, c, h, w): def run_tranpose_hw_rm_program_cache(device, n, c, h, w, use_program_cache): - torch.manual_seed(2005) torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16) torch_output_tensor = torch_input_tensor.transpose(2, 3) activation_pyt_padded = ttnn.from_torch( @@ -403,7 +401,6 @@ def test_tranpose_hw_rm_with_program_cache(device, n, c, h, w, use_program_cache @pytest.mark.parametrize("h", [16]) @pytest.mark.parametrize("w", [112]) def test_tranpose_hw_sharded_rm(device, n, c, h, w): - torch.manual_seed(2005) torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16) torch_output_tensor = torch_input_tensor.transpose(2, 3) tt_input_tensor = ttnn.from_torch( @@ -439,7 +436,6 @@ def test_tranpose_hw_sharded_rm(device, n, c, h, w): def run_tranpose_hw_sharded_rm_with_program_cache(device, n, c, h, w): - torch.manual_seed(2005) torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16) torch_output_tensor = torch_input_tensor.transpose(2, 3) tt_input_tensor = ttnn.from_torch( @@ -495,7 +491,6 @@ def test_tranpose_hw_sharded_rm_with_program_cache(device, n, c, h, w, 
use_progr @pytest.mark.parametrize("h", [128]) @pytest.mark.parametrize("w", [16]) def test_tranpose_hc_rm(device, n, c, h, w): - torch.manual_seed(2005) torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16) torch_output_tensor = torch_input_tensor.transpose(1, 2) activation_pyt_padded = ttnn.from_torch( @@ -514,7 +509,6 @@ def test_tranpose_hc_rm(device, n, c, h, w): def run_tranpose_hc_rm_with_program_cache(device, n, c, h, w, use_program_cache): - torch.manual_seed(2005) torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16) torch_output_tensor = torch_input_tensor.transpose(1, 2) activation_pyt_padded = ttnn.from_torch( @@ -552,7 +546,6 @@ def test_tranpose_hc_rm_with_program_cache(device, n, c, h, w, use_program_cache def run_tranpose_hc_sharded(device, n, c, h, w, grid_size): - torch.manual_seed(2005) torch_input_tensor = torch.rand((n, c, h, w), dtype=torch.bfloat16) torch_output_tensor = torch_input_tensor.transpose(1, 2) tt_input_tensor = ttnn.from_torch( @@ -621,12 +614,9 @@ def test_tranpose_hc_sharded_with_program_cache(device, n, c, h, w, grid_size, u ((32, 32, 32, 32), (1, 2)), ((32, 32, 32, 32), (0, 3)), ((32, 32, 32, 32), (1, 3)), - ((32, 32, 32, 32), (2, 3)), - ((32, 32, 32, 32), (0, 1)), ], ) def test_transpose_bfloat8_b(device, shape, swap_dims): - torch.manual_seed(2005) input = torch.randn(shape, dtype=torch.bfloat16) torch_output = input.transpose(*swap_dims) @@ -669,7 +659,6 @@ def test_transpose_hc(dtype, shape, device): [ttnn.TILE_LAYOUT], ) def test_transpose_2D(dtype, shape, layout, device): - torch.manual_seed(2005) if is_grayskull() and dtype == ttnn.float32: pytest.skip("Skipping float32 tests on Grayskull") if layout == ttnn.ROW_MAJOR_LAYOUT and dtype == ttnn.bfloat16 and (shape[-1] % 2 or shape[-2] % 2): @@ -709,7 +698,6 @@ def test_transpose_2D(dtype, shape, layout, device): ], ) def test_transpose_3D(dtype, shape, layout, dims, device): - torch.manual_seed(2005) if is_grayskull() and dtype == ttnn.float32: pytest.skip("Skipping float32 tests on Grayskull") if layout == ttnn.ROW_MAJOR_LAYOUT and dtype == ttnn.bfloat16 and (shape[-1] % 2 or shape[dims[-1]] % 2): @@ -729,7 +717,6 @@ def test_transpose_3D(dtype, shape, layout, dims, device): [[4, 3, 1280, 40], [1, 4096, 4096]], ) def test_transpose_4d_wh_rm(shape, device): - torch.manual_seed(2005) torch_input = torch.randn(shape, dtype=torch.bfloat16) torch_output = torch_input.transpose(-1, -2) @@ -744,7 +731,6 @@ def test_transpose_4d_wh_rm(shape, device): [[4, 3, 1280, 40], [1, 1, 1200, 1280], [1, 1, 4096, 4096]], ) def test_transpose_4d_wh_tile(shape, device): - torch.manual_seed(2005) torch_input = torch.randn(shape, dtype=torch.bfloat16) torch_output = torch_input.transpose(-1, -2) @@ -764,8 +750,7 @@ def test_transpose_4d_wh_tile(shape, device): ) @pytest.mark.parametrize("memory_config", [ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG]) def test_transpose_failures(config, memory_config, device): - pytest.skip("Failing pytorch 2.0 trace sweeps") - torch.manual_seed(2005) + pytest.skip("Failures to fix after #13217 and #13005 are in: 5D, HC PCC issue, and unaligned RM tensor") torch_input = torch.randn(config[0], dtype=torch.bfloat16) torch_output = torch_input.transpose(config[1][0], config[1][1]) @@ -813,7 +798,6 @@ def test_transpose_failures(config, memory_config, device): ) @pytest.mark.parametrize("memory_config", [ttnn.L1_MEMORY_CONFIG, ttnn.DRAM_MEMORY_CONFIG]) def test_transpose_unaligned(config, memory_config, device): - torch.manual_seed(2005) # this will convert to 
tiled for now torch_input = torch.randn(config[0], dtype=torch.bfloat16) torch_output = torch_input.transpose(config[1][0], config[1][1]) @@ -854,7 +838,6 @@ def test_transpose_hc_padded_c(shape, device): [ttnn.ROW_MAJOR_LAYOUT], ) def test_transpose_5d(shape, dims, layout, device): - torch.manual_seed(2005) torch_input = torch.randn(shape, dtype=torch.bfloat16) torch_output = torch_input.transpose(dims[0], dims[1]) @@ -862,152 +845,3 @@ def test_transpose_5d(shape, dims, layout, device): tt_output = ttnn.transpose(tt_input, dims[0], dims[1]) tt_output = ttnn.to_torch(tt_output) assert_with_pcc(torch_output, tt_output, 0.9999) - - -@pytest.mark.parametrize( - "shape", - [ - [1, 5, 10, 15], - [1, 1, 1, 2], - [1, 3, 2, 1], - [1, 17, 1, 1], - [1, 1, 16, 1], - [1, 1, 17, 1], - [1, 1, 1, 17], - [2, 1, 1, 1], - [2, 33, 33, 33], - ], -) -@pytest.mark.parametrize( - "dims", - [ - (1, 2), - (0, 2), - ], -) -@pytest.mark.parametrize( - "layout", - [ttnn.TILE_LAYOUT], -) -@pytest.mark.parametrize( - "dtype", - [ttnn.float32, ttnn.bfloat16], -) -def test_transpose_issue_11650_10350(shape, dims, layout, dtype, device): - torch.manual_seed(2005) - torch_input = torch.randn(shape, dtype=torch.bfloat16) - torch_output = torch_input.transpose(dims[0], dims[1]) - - tt_input = ttnn.from_torch(torch_input, dtype=dtype, layout=layout, device=device) - tt_output = ttnn.transpose(tt_input, dims[0], dims[1]) - tt_output = ttnn.to_torch(tt_output) - assert_with_pcc(torch_output, tt_output, 0.9999) - - -@pytest.mark.parametrize( - "shape", - [ - [1, 17, 1, 1], - [1, 1, 16, 1], - [1, 1, 17, 1], - [1, 1, 1, 17], - [2, 1, 1, 1], - [2, 33, 33, 33], - ], -) -@pytest.mark.parametrize( - "dims", - [ - (1, 2), - (0, 2), - ], -) -@pytest.mark.parametrize( - "layout", - [ttnn.TILE_LAYOUT], -) -@pytest.mark.parametrize( - "dtype", - [ttnn.float32, ttnn.bfloat16], -) -@pytest.mark.parametrize( - "pad_value", - [None, float("-inf")], -) -def test_transpose_unpadded(shape, dims, layout, dtype, pad_value, device): - torch.manual_seed(2005) - if pad_value is not None and is_blackhole(): - pytest.skip("Blackhole reduce is needed for the full test to work") - elif dtype == ttnn.float32 and is_grayskull(): - pytest.skip("Grayskull does not support float32") - torch_input = torch.randn(shape, dtype=torch.bfloat16) - torch_output = torch_input.transpose(dims[0], dims[1]) - - tt_input = ttnn.from_torch(torch_input, dtype=dtype, layout=layout, device=device) - tt_output = ttnn.transpose(tt_input, dims[0], dims[1], pad_value=pad_value) - if pad_value is not None: - a = ttnn.min( - tt_output - ) # if min becomes padding aware, this will fail, so feel free to delete this test then @future op writer - assert ttnn.to_torch(a) == float("-inf") - tt_output = ttnn.to_torch(tt_output) - assert_with_pcc(torch_output, tt_output, 0.9999) - - -@pytest.mark.parametrize("b", [1]) -@pytest.mark.parametrize("h", [18]) -@pytest.mark.parametrize("w", [65]) -@pytest.mark.parametrize("dim0", [1]) -@pytest.mark.parametrize("dim1", [2]) -def test_transpose_forge_llama(device, b, h, w, dim0, dim1): - torch.manual_seed(2005) - - torch_input_tensor = torch_random((b, h, w), -0.1, 0.1, dtype=torch.bfloat16) - torch_output_tensor = torch_input_tensor.transpose(dim0, dim1) - - input_tensor = ttnn.to_device(ttnn.from_torch(torch_input_tensor), device, memory_config=ttnn.DRAM_MEMORY_CONFIG) - input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT) - output_tensor = ttnn.transpose(input_tensor, dim0, dim1, memory_config=ttnn.DRAM_MEMORY_CONFIG) - 
output_tensor = ttnn.from_device(output_tensor) - output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) - output_tensor = ttnn.to_torch(output_tensor) - - assert_with_pcc(torch_output_tensor, output_tensor) - - -@pytest.mark.parametrize("b", [1]) -@pytest.mark.parametrize("h", [2]) -@pytest.mark.parametrize("w", [3]) -@pytest.mark.parametrize("dim0", [-1]) -@pytest.mark.parametrize("dim1", [-2]) -def test_transpose_forge_basic(device, b, h, w, dim0, dim1): - torch.manual_seed(2005) - torch_input_tensor = torch_random((1, b, h, w), -0.1, 0.1, dtype=torch.bfloat16) - torch_output_tensor = torch_input_tensor.transpose(dim0, dim1) - input_tensor = ttnn.to_device(ttnn.from_torch(torch_input_tensor), device, memory_config=ttnn.DRAM_MEMORY_CONFIG) - input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT) - output_tensor = ttnn.transpose(input_tensor, dim0, dim1, memory_config=ttnn.DRAM_MEMORY_CONFIG) - output_tensor = ttnn.from_device(output_tensor) - output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) - output_tensor = ttnn.to_torch(output_tensor) - - assert_with_pcc(torch_output_tensor, output_tensor) - - -@pytest.mark.parametrize("b", [6]) -@pytest.mark.parametrize("h", [33]) -@pytest.mark.parametrize("w", [34]) -@pytest.mark.parametrize("dim0", [1]) -@pytest.mark.parametrize("dim1", [0]) -def test_transpose_forge_hc(device, b, h, w, dim0, dim1): - torch.manual_seed(2005) - torch_input_tensor = torch_random((1, b, h, w), -0.1, 0.1, dtype=torch.bfloat16) - torch_output_tensor = torch_input_tensor.transpose(dim0, dim1) - input_tensor = ttnn.to_device(ttnn.from_torch(torch_input_tensor), device, memory_config=ttnn.DRAM_MEMORY_CONFIG) - input_tensor = ttnn.to_layout(input_tensor, layout=ttnn.TILE_LAYOUT) - output_tensor = ttnn.transpose(input_tensor, dim0, dim1, memory_config=ttnn.DRAM_MEMORY_CONFIG) - output_tensor = ttnn.from_device(output_tensor) - output_tensor = ttnn.to_layout(output_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) - output_tensor = ttnn.to_torch(output_tensor) - - assert_with_pcc(torch_output_tensor, output_tensor) diff --git a/tests/ttnn/unit_tests/operations/test_permute.py b/tests/ttnn/unit_tests/operations/test_permute.py index 40a57515f56..5c3c2e65156 100644 --- a/tests/ttnn/unit_tests/operations/test_permute.py +++ b/tests/ttnn/unit_tests/operations/test_permute.py @@ -9,13 +9,13 @@ import ttnn from tests.ttnn.utils_for_testing import assert_with_pcc -from models.utility_functions import is_blackhole + +torch.manual_seed(2005) @pytest.mark.parametrize("h", [32]) @pytest.mark.parametrize("w", [64]) def test_permute(device, h, w): - torch.manual_seed(2005) torch_input_tensor = torch.rand((1, 1, h, w), dtype=torch.bfloat16) torch_output_tensor = torch.permute(torch_input_tensor, (0, 1, 3, 2)) @@ -32,7 +32,6 @@ def test_permute(device, h, w): @pytest.mark.parametrize("h", [32]) @pytest.mark.parametrize("w", [64]) def test_transpose(device, h, w): - torch.manual_seed(2005) torch_input_tensor = torch.rand((1, 1, h, w), dtype=torch.bfloat16) torch_output_tensor = torch_input_tensor.transpose(2, 3) @@ -49,7 +48,6 @@ def test_transpose(device, h, w): @pytest.mark.parametrize("h", [32]) @pytest.mark.parametrize("w", [64]) def test_permute_on_4D_tensor_with_smaller_tuple_size(device, h, w): - torch.manual_seed(2005) torch_input_tensor = torch.rand((1, 1, h, w), dtype=torch.bfloat16) input_tensor = ttnn.from_torch(torch_input_tensor) input_tensor = ttnn.to_device(input_tensor, device) @@ -64,7 +62,6 @@ def 
test_permute_on_4D_tensor_with_smaller_tuple_size(device, h, w): "perm", [(0,), (0, 1), (1, 0), (0, 1, 2), (0, 2, 1), (1, 2, 0), (1, 0, 2), (2, 0, 1), (2, 1, 0)] ) def test_permute_on_less_than_4D(device, perm): - torch.manual_seed(2005) tuple_shape = tuple([32 * (value + 1) for value in perm]) torch_input_tensor = torch.rand(tuple_shape, dtype=torch.bfloat16) torch_output_tensor = torch.permute(torch_input_tensor, perm) @@ -84,7 +81,6 @@ def test_permute_on_less_than_4D(device, perm): @pytest.mark.parametrize("h", [1500]) @pytest.mark.parametrize("w", [64]) def test_permute_for_specific_case(device, b, s, h, w): - torch.manual_seed(2005) torch_input_tensor = torch.rand((b, s, h, w), dtype=torch.bfloat16) torch_output_tensor = torch.permute(torch_input_tensor, (0, 1, 3, 2)) input_tensor = ttnn.from_torch(torch_input_tensor) @@ -98,7 +94,6 @@ def test_permute_for_specific_case(device, b, s, h, w): def test_add_after_permute(device): - torch.manual_seed(2005) torch_a = torch.randn(2, 1280, 8, 8) torch_b = torch.randn(1, 1, 2, 1280) torch_b_permuted = torch.permute(torch_b, (2, 3, 0, 1)) @@ -115,7 +110,6 @@ def test_add_after_permute(device): @pytest.mark.parametrize("h", [32]) @pytest.mark.parametrize("w", [64]) def test_permute_negative_dim(device, h, w): - torch.manual_seed(2005) torch_input_tensor = torch.rand((1, 1, h, w), dtype=torch.bfloat16) torch_output_tensor = torch.permute(torch_input_tensor, (0, -3, -1, -2)) @@ -130,7 +124,6 @@ def test_permute_negative_dim(device, h, w): def test_permute_bfloat8(device): - torch.manual_seed(2005) input_a = torch.randn(1, 160, 32, 32) torch_output = torch.permute(input_a, (0, 2, 3, 1)) @@ -145,7 +138,6 @@ def test_permute_bfloat8(device): ) @pytest.mark.parametrize("perm", [(0, 3, 2, 1, 4), (3, 1, 2, 0, 4), (0, 3, 2, 1, 4), (1, 3, 2, 0, 4), (0, 3, 1, 2, 4)]) def test_permute_5d(shape, perm, device): - torch.manual_seed(2005) input_a = torch.randn(shape) torch_output = torch.permute(input_a, perm) @@ -154,20 +146,3 @@ def test_permute_5d(shape, perm, device): tt_output = ttnn.permute(tt_input, perm) tt_output = ttnn.to_torch(tt_output) assert_with_pcc(torch_output, tt_output, 0.9999) - - -@pytest.mark.parametrize("pad_value", [float("-inf"), None]) -def test_permute_pad_value(device, pad_value): - if pad_value is not None and is_blackhole(): - pytest.skip("Blackhole reduce is needed for the full test to work") - torch.manual_seed(2005) - input_a = torch.randn((2, 11, 33, 17), dtype=torch.bfloat16) - torch_output = torch.permute(input_a, (3, 2, 1, 0)) - - tt_input = ttnn.from_torch(input_a, device=device, layout=ttnn.TILE_LAYOUT, dtype=ttnn.bfloat16) - tt_output = ttnn.permute(tt_input, (3, 2, 1, 0), pad_value=pad_value) - if pad_value is not None: - a = ttnn.min(tt_output) - assert ttnn.to_torch(a) == float("-inf") - tt_output = ttnn.to_torch(tt_output) - assert_with_pcc(torch_output, tt_output, 0.9999) diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 6afcf05d46a..1a8fe892b54 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -86,7 +86,6 @@ set(ALL_TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/pad/pad.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/permute/permute.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/permute/permute_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/permute/device/permute_device_operation.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/permute/device/permute_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/repeat/device/repeat_op.cpp @@ -125,7 +124,6 @@ set(ALL_TTNN_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/transpose/transpose.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/transpose/transpose_pybind.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/untilize/device/untilize_op.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/untilize/device/untilize_program_factory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/untilize/untilize.cpp diff --git a/ttnn/cpp/ttnn/operations/data_movement/common/kernels/common.hpp b/ttnn/cpp/ttnn/operations/data_movement/common/kernels/common.hpp index 5ca4e03ec4a..bf7062ab92b 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/common/kernels/common.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/common/kernels/common.hpp @@ -9,7 +9,7 @@ namespace tt::data_movement::common { // this function is useful for converting bfloat16 values to float32 - FORCE_INLINE float bfloat16_to_float32(uint16_t bfloat16_data) { + float bfloat16_to_float32(uint16_t bfloat16_data) { uint32_t bits = static_cast(bfloat16_data) << 16; // Extract the sign bit @@ -44,24 +44,4 @@ namespace tt::data_movement::common { ieee_float.u = sign | exponent | mantissa; return ieee_float.f; } - - - FORCE_INLINE void fill_with_val(uint32_t begin_addr, uint32_t n, uint32_t val) { - auto* ptr = reinterpret_cast(begin_addr); - for (uint32_t i = 0; i < n; ++i) { - ptr[i] = val; - } - } - - // Utility functions - template - FORCE_INLINE constexpr uint32_t div_up() { - static_assert(b > 0, "divisor must be greater than 0"); - return static_cast((a + b - 1) / b); - } - - template - FORCE_INLINE constexpr uint32_t round_up() { - return b * div_up(); - } } diff --git a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp index b4e66c61433..ab0bd1da8f4 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/concat/concat.cpp @@ -161,7 +161,7 @@ MassagedConcat build_prepost_transpose_concat(uint8_t queue_id, const MemoryConf tensors.end(), std::back_inserter(itensors), [dim1, dim2](const ttnn::Tensor& input_tensor) -> ttnn::Tensor { - return ttnn::transpose(input_tensor, dim1, dim2); + return ttnn::transpose(input_tensor, dim1, dim2, std::nullopt); } ); auto norm_dim1 = tensors.front().get_shape().get_normalized_index(dim1); diff --git a/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp b/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp index 3650dc11396..435639b9190 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/permute/permute.cpp @@ -34,7 +34,7 @@ inline bool has_tile_padding(const Tensor& t) { return false; } -ttnn::Tensor permute_impl(const ttnn::Tensor &a, const SmallVector& dims, const MemoryConfig& output_mem_config, const std::optional& pad_value) { +ttnn::Tensor permute_impl(const ttnn::Tensor &a, const SmallVector& dims, const MemoryConfig& output_mem_config) { using ttnn::operations::experimental::auto_format::AutoFormat; Device * 
device; @@ -48,7 +48,6 @@ ttnn::Tensor permute_impl(const ttnn::Tensor &a, const SmallVector& di if (a.get_shape().rank() > 4) { auto input = a.get_layout() == Layout::TILE ? ttnn::to_layout(a, Layout::ROW_MAJOR, std::nullopt, std::nullopt, (Device*)nullptr) : a; - TT_FATAL(!(pad_value.has_value() && pad_value.value() != 0.0f), "Non-zero padding is not supported for permute on tensors with rank > 4."); input = ttnn::prim::permute(input, dims, output_mem_config, std::nullopt); return ttnn::to_layout(input, a.get_layout(), std::nullopt, std::nullopt, (Device*)nullptr); } @@ -56,31 +55,30 @@ ttnn::Tensor permute_impl(const ttnn::Tensor &a, const SmallVector& di TT_FATAL(dims.size() == 4, "Only 4D tensor are supported for permute."); uint32_t N = dims[0], C = dims[1], H = dims[2], W = dims[3]; + bool pad_n = H == 0 || W == 0; // the original N dim lands in a tiled H/W position, so it may pick up tile padding + bool pad_c = H == 1 || W == 1; // the original C dim lands in a tiled H/W position, so it may pick up tile padding // Convert tensor back to original auto input_shape = a.get_logical_shape(); + // create_output_tensor's shape cannot be used here, since the permute may introduce new tile padding + SmallVector output_shape = {input_shape[N], input_shape[C], input_shape[H], input_shape[W]}; + SmallVector padded_output_shape = output_shape; + + uint32_t input_rank = a.get_logical_shape().rank(); + if (a.layout() == Layout::TILE) { + padded_output_shape[input_rank - 1] = tt::round_up(padded_output_shape[input_rank - 1], tt::constants::TILE_WIDTH); + padded_output_shape[input_rank - 2] = tt::round_up(padded_output_shape[input_rank - 2], tt::constants::TILE_HEIGHT); + } + + ttnn::Shape final_shape = ttnn::Shape(output_shape, padded_output_shape); auto formatted_input_tensor = a; - // WH and CN should be supported without typecast - bool wh = N == 0 && C == 1 && H == 3 && W == 2; - bool cn = N == 1 && C == 0 && H == 2 && W == 3; - bool cnwh = N == 1 && C == 0 && H == 3 && W == 2; - bool bfloat8_supported = wh || cn || cnwh; - bool typecast = formatted_input_tensor.get_dtype() == DataType::BFLOAT8_B and !bfloat8_supported && !a.is_sharded(); + bool typecast = formatted_input_tensor.get_dtype() == DataType::BFLOAT8_B and formatted_input_tensor.get_layout() == Layout::TILE and (pad_n or pad_c) and !a.is_sharded(); formatted_input_tensor = typecast ? ttnn::typecast(formatted_input_tensor, DataType::BFLOAT16) : formatted_input_tensor; auto output = formatted_input_tensor; - auto transpose_wh = [&](const ttnn::Tensor& input) -> ttnn::Tensor { - return ttnn::transpose(input, -2, -1, output_mem_config, std::nullopt); - }; - - auto transpose_hc = [&](const ttnn::Tensor& input) -> ttnn::Tensor { - return ttnn::transpose(input, 1, -2, output_mem_config, pad_value); - }; - - auto transpose_cn = [&](const ttnn::Tensor& input) -> ttnn::Tensor { - return ttnn::transpose(input, 0, 1, output_mem_config, std::nullopt); - }; - + static auto transpose_wh = std::bind(ttnn::transpose, std::placeholders::_1, -2, -1, output_mem_config); + static auto transpose_hc = std::bind(ttnn::transpose, std::placeholders::_1, 1, -2, output_mem_config); + static auto transpose_cn = std::bind(ttnn::transpose, std::placeholders::_1, 0, 1, output_mem_config); if (N == 0 && C == 1 && H == 2 && W == 3) { output = formatted_input_tensor; } else if (N == 0 && C == 1 && H == 3 && W == 2) { @@ -132,14 +130,15 @@ ttnn::Tensor permute_impl(const ttnn::Tensor &a, const SmallVector& di } else { TT_ASSERT(false, "Illegal permute args"); } + output = ttnn::reshape(output, final_shape); output = typecast ? 
ttnn::typecast(output, DataType::BFLOAT8_B) : output; return output; } -ttnn::Tensor permute_launch(const ttnn::Tensor &a, tt::stl::Span dims, const MemoryConfig& output_mem_config, const std::optional& pad_value) { +ttnn::Tensor permute_launch(const ttnn::Tensor &a, tt::stl::Span dims, const MemoryConfig& output_mem_config) { std::vector output_tensors = {ttnn::Tensor(operation::get_workers_for_op_output({a}))}; operation::launch_with_autoformat( - [dims, output_mem_config, pad_value] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { + [dims, output_mem_config] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { auto& a = input_tensors.at(0); SmallVector normalized_dims(dims.size()); std::transform(dims.begin(), dims.end(), normalized_dims.begin(), [a](std::int64_t idx) {return a.get_legacy_shape().get_normalized_index(idx);}); @@ -148,7 +147,7 @@ ttnn::Tensor permute_launch(const ttnn::Tensor &a, tt::stl::Span if (normalized_dims == seq_dims) { return {ttnn::operations::experimental::auto_format::AutoFormat::move_tensor_to_mem_config(a, output_mem_config)}; } - return {permute_impl(a, normalized_dims, output_mem_config, pad_value)}; + return {permute_impl(a, normalized_dims, output_mem_config)}; }, {a}, output_tensors); return output_tensors.at(0); } @@ -156,10 +155,9 @@ ttnn::Tensor permute_launch(const ttnn::Tensor &a, tt::stl::Span Tensor composite_invoke( const ttnn::Tensor& input_tensor, tt::stl::Span dims, - const std::optional& memory_config, - const std::optional& pad_value) { + const std::optional& memory_config) { - auto output_tensor = permute_launch(input_tensor, dims, memory_config.value_or(input_tensor.memory_config()), pad_value); + auto output_tensor = permute_launch(input_tensor, dims, memory_config.value_or(input_tensor.memory_config())); return output_tensor; } @@ -170,11 +168,10 @@ ttnn::Tensor ExecutePermute::invoke( const ttnn::Tensor& input_tensor, tt::stl::Span dims, const std::optional& memory_config, - bool composite, - const std::optional& pad_value) { + bool composite) { if (composite) - return detail::composite_invoke(input_tensor, dims, memory_config, pad_value); + return detail::composite_invoke(input_tensor, dims, memory_config); const bool initial_input_tensor_on_device = detail::is_on_device(input_tensor); const auto input_layout = input_tensor.get_layout(); @@ -200,7 +197,7 @@ ttnn::Tensor ExecutePermute::invoke( auto iorder = dims.size() < 4 ? 
adjust_order(dims) : dims; // internals of permute_impl already adjust negative indices TT_FATAL(detail::is_on_device(itensor), "Error"); - auto output_tensor = detail::permute_launch(itensor, iorder, memory_config.value_or(input_tensor.memory_config()), pad_value); + auto output_tensor = detail::permute_launch(itensor, iorder, memory_config.value_or(input_tensor.memory_config())); output_tensor = ttnn::to_layout(output_tensor, input_layout, std::nullopt, std::nullopt, (Device*)nullptr); if (input_rank < 4) { @@ -228,13 +225,12 @@ ttnn::Tensor ExecutePermute::invoke( ttnn::Tensor ExecutePermute::invoke( const ttnn::Tensor& input_tensor, tt::stl::Span dims, - const std::optional& memory_config, - const std::optional& pad_value) { - return invoke(DefaultQueueId, input_tensor, dims, memory_config, true, pad_value); + const std::optional& memory_config) { + return invoke(DefaultQueueId, input_tensor, dims, memory_config); } -ttnn::Tensor ExecutePermute::invoke(const ttnn::Tensor& input_tensor, tt::stl::Span dims, const std::optional& pad_value) { - return invoke(input_tensor, dims, std::nullopt, pad_value); +ttnn::Tensor ExecutePermute::invoke(const ttnn::Tensor& input_tensor, tt::stl::Span dims) { + return invoke(input_tensor, dims, std::nullopt); } } // ttnn::operations::data_movement namespace diff --git a/ttnn/cpp/ttnn/operations/data_movement/permute/permute.hpp b/ttnn/cpp/ttnn/operations/data_movement/permute/permute.hpp index 6338a1c843f..04f5231956b 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/permute.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/permute/permute.hpp @@ -15,16 +15,14 @@ struct ExecutePermute { const ttnn::Tensor& input_tensor, tt::stl::Span dims, const std::optional& memory_config, - bool composite = true, - const std::optional& pad_value = 0.0f); + bool composite = true); static ttnn::Tensor invoke( const ttnn::Tensor& input_tensor, tt::stl::Span dims, - const std::optional& memory_config, - const std::optional& pad_value = 0.0f); + const std::optional& memory_config); - static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, tt::stl::Span dims, const std::optional& pad_value = 0.0f); + static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, tt::stl::Span dims); }; } // namespace operations::data_movement diff --git a/ttnn/cpp/ttnn/operations/data_movement/permute/permute_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/permute/permute_pybind.cpp deleted file mode 100644 index ed165483757..00000000000 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/permute_pybind.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include "permute_pybind.hpp" - -namespace ttnn::operations::data_movement::detail { -namespace py = pybind11; - -void bind_permute(py::module& module) { - auto doc = - R"doc(permute(input_tensor: ttnn.Tensor, dims: List[int], memory_config: Optional[MemoryConfig] = std::nullopt, queue_id: int = 0) -> ttnn.Tensor - - Permutes the dimensions of the input tensor according to the specified permutation. - - Args: - input_tensor (ttnn.Tensor): the input tensor. - dim (number): tthe permutation of the dimensions of the input tensor. - - Keyword Args: - memory_config (ttnn.MemoryConfig, optional): Memory configuration for the operation. Defaults to `None`. - queue_id (int, optional): command queue id. Defaults to `0`. - pad_value (float, optional): padding value for when tiles are broken in a transpose. Defaults to `0.0`. 
If set to None, it will be random garbage values. - - Returns: - List of ttnn.Tensor: the output tensor. - - Example: - - >>> tensor = ttnn.to_device(ttnn.from_torch(torch.zeros((1, 1, 64, 32), dtype=torch.bfloat16)), device) - >>> output = ttnn.permute(tensor, (0, 1, 3, 2)) - >>> print(output.shape) - [1, 1, 32, 64])doc"; - - using OperationType = decltype(ttnn::permute); - ttnn::bind_registered_operation( - module, - ttnn::permute, - doc, - ttnn::pybind_overload_t{ - [] (const OperationType& self, - const ttnn::Tensor& input_tensor, - const ttnn::SmallVector &dims, - const std::optional& memory_config, - uint8_t queue_id, - const std::optional& pad_value) { - return self(queue_id, input_tensor, dims, memory_config, false, pad_value); - }, - py::arg("input_tensor").noconvert(), - py::arg("dims"), - py::kw_only(), - py::arg("memory_config") = std::nullopt, - py::arg("queue_id") = 0, - py::arg("pad_value") = 0.0f, - }); -} - -} // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/permute/permute_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/permute/permute_pybind.hpp index ea7d2c449fd..4f2b82a14ea 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/permute/permute_pybind.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/permute/permute_pybind.hpp @@ -14,6 +14,48 @@ namespace ttnn::operations::data_movement::detail { namespace py = pybind11; -void bind_permute(py::module& module); +void bind_permute(py::module& module) { + auto doc = + R"doc(permute(input_tensor: ttnn.Tensor, dims: List[int], memory_config: Optional[MemoryConfig] = std::nullopt, queue_id: int = 0) -> ttnn.Tensor + + Permutes the dimensions of the input tensor according to the specified permutation. + + Args: + input_tensor (ttnn.Tensor): the input tensor. + dims (List[int]): the permutation of the dimensions of the input tensor. + + Keyword Args: + memory_config (ttnn.MemoryConfig, optional): Memory configuration for the operation. Defaults to `None`. + queue_id (int, optional): command queue id. Defaults to `0`. + + Returns: + List of ttnn.Tensor: the output tensor. 
+ + Example: + + >>> tensor = ttnn.to_device(ttnn.from_torch(torch.zeros((1, 1, 64, 32), dtype=torch.bfloat16)), device) + >>> output = ttnn.permute(tensor, (0, 1, 3, 2)) + >>> print(output.shape) + [1, 1, 32, 64])doc"; + + using OperationType = decltype(ttnn::permute); + ttnn::bind_registered_operation( + module, + ttnn::permute, + doc, + ttnn::pybind_overload_t{ + [] (const OperationType& self, + const ttnn::Tensor& input_tensor, + const ttnn::SmallVector &dims, + const std::optional& memory_config, + uint8_t queue_id) { + return self(queue_id, input_tensor, dims, memory_config, false); + }, + py::arg("input_tensor").noconvert(), + py::arg("dims"), + py::kw_only(), + py::arg("memory_config") = std::nullopt, + py::arg("queue_id") = 0}); +} } // namespace ttnn::operations::data_movement::detail diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp index 49766f75387..6dad529d179 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_op.cpp @@ -18,8 +18,7 @@ void Transpose::validate(const std::vector &input_tensors) const { const auto& input_tensor = input_tensors.at(0); TT_FATAL(input_tensor.storage_type() == StorageType::DEVICE, "Operands to transpose need to be on device!"); TT_FATAL(input_tensor.buffer() != nullptr , "Operands to transpose need to be allocated in buffers on device!"); - TT_FATAL(!(this->dim != TransposeOpDim::HC && this->pad_value.has_value() && this->pad_value != 0.0f), "Non-zero padding is not supported for any transpose other than HC."); - const auto shape = input_tensor.get_padded_shape(); + const auto shape = input_tensor.get_legacy_shape(); bool row_major = input_tensor.get_layout() == Layout::ROW_MAJOR; uint32_t W = shape[3], H = shape[2], C = shape[1], N = shape[0]; uint32_t HW = H*W; @@ -57,15 +56,14 @@ void Transpose::validate(const std::vector &input_tensors) const { if (row_major) { auto BUFFER_ALIGNMENT = input_tensor.buffer()->buffer_type() == tt::tt_metal::BufferType::DRAM ? 
DRAM_ALIGNMENT : L1_ALIGNMENT; TT_FATAL((W * input_tensor.element_size()) % BUFFER_ALIGNMENT == 0, "Buffer is not aligned for this implementation row_size_bytes {} buffer_alignment {}", W * input_tensor.element_size(), BUFFER_ALIGNMENT); + } else { + TT_FATAL(C % TILE_HEIGHT == 0, "Error"); } TT_FATAL( input_tensor.get_dtype() == DataType::BFLOAT16 || input_tensor.get_dtype() == DataType::FLOAT32, "Error"); TT_FATAL( !(input_tensor.is_sharded() && input_tensor.get_layout() == Layout::TILE), "HC transpose does not support sharded+tilized inputs"); - TT_FATAL( - !(input_tensor.is_sharded() && pad_value.has_value() && pad_value.value() != 0.0f), - "Sharded HC transpose does not support non-zero padding"); } else if (this->dim == TransposeOpDim::CW) { TT_FATAL(C % TILE_WIDTH == 0, "Error"); TT_FATAL(input_tensor.get_dtype() == DataType::BFLOAT16 || input_tensor.get_dtype() == DataType::FLOAT32, "Error"); @@ -94,21 +92,9 @@ std::vector Transpose::compute_output_specs(const std::vector< std::swap(output_padded_shape[0], output_padded_shape[1]); break; case TransposeOpDim::HC: - if (input_tensor.is_sharded() || input_tensor.get_layout() != Layout::TILE) { - std::swap(output_shape[1], output_shape[2]); - std::swap(output_padded_shape[1], output_padded_shape[2]); - break; - } else { - uint32_t C = output_shape[1]; - uint32_t C_p = tt::round_up(C, input_tensor.get_tensor_spec().tile().get_height()); - uint32_t H = output_shape[2]; - output_shape[1] = H; - output_shape[2] = C; - output_padded_shape[1] = H; - output_padded_shape[2] = C_p; - break; - } - + std::swap(output_shape[1], output_shape[2]); + std::swap(output_padded_shape[1], output_padded_shape[2]); + break; case TransposeOpDim::WH: std::swap(output_shape[2], output_shape[3]); std::swap(output_padded_shape[2], output_padded_shape[3]); @@ -166,7 +152,7 @@ operation::ProgramWithCallbacks Transpose::create_program(const std::vector pad_value; void validate(const std::vector &input_tensors) const; std::vector compute_output_specs(const std::vector &input_tensors) const; diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp index eb254bfbe15..0db7569ebfa 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp @@ -449,222 +449,9 @@ void override_runtime_args_mc_hc_rm( } } -template -void override_runtime_args_mc_hc_tiled_interleaved( - const Program& program, - tt::tt_metal::KernelHandle reader_kernel_id, - tt::tt_metal::KernelHandle writer_kernel_id, - const Tensor &input_tensor, - Tensor &output_tensor - ){ - auto input_buffer = input_tensor.buffer(); - auto output_buffer = output_tensor.buffer(); - - auto tile_shape = input_tensor.get_tensor_spec().tile().get_tile_shape(); - auto tile_hw = (tile_shape[0] * tile_shape[1]); - uint32_t num_tensor_tiles = input_tensor.volume() / tile_hw; - uint32_t num_output_tiles = output_tensor.volume() / tile_hw; - uint32_t W = input_tensor.get_logical_shape()[3], H = input_tensor.get_logical_shape()[2], C = input_tensor.get_logical_shape()[1], N = input_tensor.get_logical_shape()[0]; - bool needs_padding = C % tile_shape[0] != 0; - uint32_t padded_num_tensor_tiles = num_output_tiles / (output_tensor.get_padded_shape()[2] / tile_shape[0]); // only last row of Ct should have padding - - auto& cached_reader_args = GetRuntimeArgs(program, 
reader_kernel_id); - auto& cached_writer_args = GetRuntimeArgs(program, writer_kernel_id); - - auto compute_with_storage_grid_size = input_tensor.device()->compute_with_storage_grid_size(); - auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, num_tensor_tiles); - auto [padded_num_cores, padded_all_cores, padded_core_group_1, padded_core_group_2, padded_num_tiles_per_core_group_1, padded_num_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(compute_with_storage_grid_size, padded_num_tensor_tiles); - - - - all_cores = num_cores > padded_num_cores ? all_cores : padded_all_cores; - auto cores = corerange_to_cores(all_cores, std::nullopt); - - uint32_t start_idx = 0; - uint32_t padded_start_idx = 0; - for(const auto &core : cores) { - uint32_t num_tiles_per_core; - uint32_t padded_tiles_per_core; - - if (core_group_1.contains(core)) { - num_tiles_per_core = num_tiles_per_core_group_1; - } else if (core_group_2.contains(core)) { - num_tiles_per_core = num_tiles_per_core_group_2; - } else { - //no-op - num_tiles_per_core = 0; - } - - if (padded_core_group_1.contains(core)) { - padded_tiles_per_core = padded_num_tiles_per_core_group_1; - } else if (padded_core_group_2.contains(core)) { - padded_tiles_per_core = padded_num_tiles_per_core_group_2; - } else { - //no-op - padded_tiles_per_core = 0; - } - - uint32_t end_idx = start_idx + num_tiles_per_core; - uint32_t padded_end_idx = padded_start_idx + padded_tiles_per_core; - if constexpr (IS_CREATING) { - tt::tt_metal::SetRuntimeArgs( - program, - reader_kernel_id, - core, - { - input_buffer->address(), - num_tiles_per_core, - start_idx, - } - ); - - tt::tt_metal::SetRuntimeArgs( - program, - writer_kernel_id, - core, - { - output_buffer->address(), - start_idx, - end_idx, - padded_start_idx, - padded_end_idx - } - ); - } - else { - auto& reader_args = cached_reader_args.at(core.x).at(core.y); - auto& writer_args = cached_writer_args.at(core.x).at(core.y); - - reader_args[0] = input_buffer->address(); - writer_args[0] = output_buffer->address(); - - } - start_idx = end_idx; - padded_start_idx = padded_end_idx; - } -} - -operation::ProgramWithCallbacks transpose_hc_multi_core_tiled_interleaved(const Tensor &a, Tensor &output, const std::optional& pad_value) { - - TT_ASSERT(a.storage_type() == StorageType::DEVICE, "Operand to transpose_hc needs to be on device!"); - TT_ASSERT(a.buffer() != nullptr, "Operand to transpose_hc needs to be allocated in a buffer on device!"); - - tt::tt_metal::Program program = tt::tt_metal::Program(); - auto tile = a.get_tensor_spec().tile(); - auto tile_shape = tile.get_tile_shape(); - auto face_shape = tile.get_face_shape(); - uint32_t num_tensor_tiles = a.volume() / (tile_shape[0] * tile_shape[1]); - uint32_t num_output_tiles = output.volume() / (tile_shape[0] * tile_shape[1]); - uint32_t W = a.get_logical_shape()[3], H = a.get_logical_shape()[2], C = a.get_logical_shape()[1], N = a.get_logical_shape()[0]; - bool needs_padding = (C % tile_shape[1] != 0) && pad_value.has_value(); - uint32_t padded_num_tensor_tiles = num_output_tiles / (output.get_padded_shape()[2] / tile_shape[0]); // only last row of Ct should have padding - - tt::DataFormat cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - uint32_t single_tile_size = tt::tt_metal::detail::TileSize(cb_data_format); - - auto compute_with_storage_grid_size = a.device()->compute_with_storage_grid_size(); 
- uint32_t num_cores_x = compute_with_storage_grid_size.x; - uint32_t num_cores_y = compute_with_storage_grid_size.y; - uint32_t num_cores_total = num_cores_x * num_cores_y; - CoreRange total_cores({0, 0}, {num_cores_x-1, num_cores_y-1}); - - uint32_t src0_cb_index = tt::CBIndex::c_0; - uint32_t padding_cb_index = tt::CBIndex::c_1; - - tt::tt_metal::CircularBufferConfig cb_src0_config = - tt::tt_metal::CircularBufferConfig(2 * single_tile_size, {{src0_cb_index, cb_data_format}}) - .set_page_size(src0_cb_index, single_tile_size); - auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); - - if (needs_padding) { - tt::tt_metal::CircularBufferConfig cb_src1_config = - tt::tt_metal::CircularBufferConfig(face_shape[1] * a.element_size(), {{padding_cb_index, cb_data_format}}) - .set_page_size(padding_cb_index, face_shape[1] * a.element_size()); - auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, total_cores, cb_src1_config); - } - - // create reader kernel with compile time and runtime args - tt::tt_metal::Buffer *src_buffer = a.buffer(); - bool src_is_dram = src_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0; - - uint32_t element_size = a.element_size(); - uint32_t padding_val_packed = 0; - uint32_t num_writes = 0; - if (pad_value.has_value()) { - if (C % tile_shape[1] != 0) { - uint32_t num_packed_values = sizeof(uint32_t) / element_size; - num_writes = face_shape[1]/num_packed_values; - if (a.get_dtype() == DataType::BFLOAT16) { - padding_val_packed = pack_two_bfloat16_into_uint32({bfloat16(pad_value.value()), bfloat16(pad_value.value())}); - } else if (num_packed_values == 2) { - padding_val_packed = static_cast(pad_value.value()) | (static_cast(pad_value.value()) << 16); - } else { - padding_val_packed = std::bit_cast(pad_value.value());; - } - } - } - std::vector reader_compile_time_args = {(uint32_t)src_is_dram, num_writes, padding_val_packed, (uint32_t) needs_padding}; - - tt::tt_metal::KernelHandle unary_reader_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/reader_unary_transpose_hc_interleaved_tiled_padding_aware.cpp", - total_cores, - tt::tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); - - // create writer kernel with compile time and runtime args - - tt::tt_metal::Buffer *dst_buffer = output.buffer(); - bool dst_is_dram = dst_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 
1 : 0; - std::vector writer_compile_time_args = - {(std::uint32_t)dst_is_dram, a.element_size(), tt::CBIndex::c_0, C, H, W, tile_shape[0], tile_shape[1], face_shape[0], face_shape[1], (uint32_t) needs_padding}; - - tt::tt_metal::KernelHandle unary_writer_kernel_id = tt::tt_metal::CreateKernel( - program, - "ttnn/cpp/ttnn/operations/data_movement/transpose/device/kernels/dataflow/writer_unary_transpose_hc_interleaved_tiled_padding_aware.cpp", - total_cores, - tt::tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - - override_runtime_args_mc_hc_tiled_interleaved( - program, - unary_reader_kernel_id, - unary_writer_kernel_id, - a, output - ); - - auto override_runtime_args_callback = [ - unary_reader_kernel_id, - unary_writer_kernel_id, - compute_with_storage_grid_size - ] - ( - const void* operation, - const Program& program, - const std::vector& input_tensors, - const std::vector>&, - const std::vector& output_tensors - ) { - auto src_tensor = input_tensors.at(0); - auto dst_tensor = output_tensors.at(0); - - override_runtime_args_mc_hc_tiled_interleaved( - program, - unary_reader_kernel_id, - unary_writer_kernel_id, - src_tensor, dst_tensor - ); - - }; - - return {.program=std::move(program), .override_runtime_arguments_callback=override_runtime_args_callback}; - -} - -operation::ProgramWithCallbacks transpose_hc_multi_core(const Tensor &a, Tensor &output, const std::optional &pad_value) { +operation::ProgramWithCallbacks transpose_hc_multi_core(const Tensor &a, Tensor &output) { const auto shape = a.get_legacy_shape(); - if (a.get_layout() == Layout::TILE && !a.is_sharded()) { - return transpose_hc_multi_core_tiled_interleaved(a, output, pad_value); - } uint32_t sub_tile_line_bytes = 16 * a.element_size(); uint32_t num_tensor_tiles = a.volume() / TILE_HW; diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.hpp index 62326fc85db..f8df6b78229 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.hpp @@ -8,8 +8,7 @@ namespace ttnn::operations::data_movement::detail { operation::ProgramWithCallbacks transpose_wh_multi_core(const Tensor &a, Tensor &output); operation::ProgramWithCallbacks transpose_wh_multi_core_sharded(const Tensor &a, Tensor &output); operation::ProgramWithCallbacks transpose_wh_multi_core_sharded_rm(const Tensor &a, Tensor &output); -operation::ProgramWithCallbacks transpose_hc_multi_core(const Tensor &a, Tensor &output, const std::optional& pad_value); -operation::ProgramWithCallbacks transpose_hc_multi_core_tiled_interleaved(const Tensor &a, Tensor &output, const std::optional& pad_value); +operation::ProgramWithCallbacks transpose_hc_multi_core(const Tensor &a, Tensor &output); operation::ProgramWithCallbacks transpose_hc_multi_core_sharded(const Tensor &a, Tensor &output); operation::ProgramWithCallbacks transpose_cn_multi_core(const Tensor &a, Tensor &output); diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp index 59be520f57d..a96f0131ac7 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.cpp @@ -52,8 +52,10 @@ inline bool rm_enough_available_space(const Tensor& input_tensor_a) { return max_l1_space > 
estimated_size_of_cbs; } -inline Tensor transpose_(const Tensor &a, TransposeOpDim transpose_dim, const MemoryConfig& output_mem_config, const std::optional& pad_value) { +inline Tensor transpose_(const Tensor &a, TransposeOpDim transpose_dim, const MemoryConfig& output_mem_config) { + bool pad_c = false; bool tiled_only = false; + bool pad_n = false; constexpr uint32_t FACE_WIDTH = tt::constants::FACE_WIDTH; // this is a highly restrictive constraint on the RM transpose_wh kernel, and with all the other bugs/limitations we should rewrite it // use device->get_allocator_alignment when the it reflects the alignment of the buffer and doesn't just default to DRAM auto BUFFER_ALIGNMENT = a.buffer()->buffer_type() == tt::tt_metal::BufferType::DRAM ? DRAM_ALIGNMENT : L1_ALIGNMENT; @@ -65,14 +67,15 @@ inline Tensor transpose_(const Tensor &a, TransposeOpDim transpose_dim, const Me if ((!tiled_only) && ((W * a.element_size()) % BUFFER_ALIGNMENT != 0)) { // tiled_only = true; } + pad_c = tiled_only && a.get_padded_shape()[1] % tt::constants::TILE_HEIGHT != 0; break; // bubble dim around to make it possible as these implementations don't have a kernel case TransposeOpDim::NH: - return ttnn::permute((const ttnn::Tensor)a, ttnn::SmallVector({2, 1, 0, 3}), output_mem_config, pad_value); + return ttnn::permute((const ttnn::Tensor)a, ttnn::SmallVector({2, 1, 0, 3}), output_mem_config); case TransposeOpDim::NW: - return ttnn::permute((const ttnn::Tensor)a, ttnn::SmallVector({3, 1, 2, 0}), output_mem_config, pad_value); + return ttnn::permute((const ttnn::Tensor)a, ttnn::SmallVector({3, 1, 2, 0}), output_mem_config); case TransposeOpDim::CW: - return ttnn::permute((const ttnn::Tensor)a, ttnn::SmallVector({0, 3, 2, 1}), output_mem_config, pad_value); + return ttnn::permute((const ttnn::Tensor)a, ttnn::SmallVector({0, 3, 2, 1}), output_mem_config); case TransposeOpDim::CN: tiled_only = true; // CN only has a tiled implementation at the moment break; @@ -89,20 +92,60 @@ inline Tensor transpose_(const Tensor &a, TransposeOpDim transpose_dim, const Me default: break; } + if (a.get_layout() == Layout::ROW_MAJOR) { - // the assorted cases where only tiled works right now (HC with stick width constraint, WH with stick width constraint, CN). + // the assorted cases where only tiled works right now (HC with stick width constraint, WH with stick width constraint, CN) if (tiled_only) { // convert to tiled Tensor b = ttnn::to_layout(a, Layout::TILE, std::nullopt, std::nullopt, (Device *)nullptr); - // run the transpose. 
- b = operation::run(Transpose{transpose_dim, output_mem_config, pad_value}, {b}).at(0); + // pad c if needed + if (pad_c) { + auto padded_shape = b.get_padded_shape(); + auto shape = b.get_logical_shape(); + uint32_t C_rounded = tt::round_up(padded_shape[1], tt::constants::TILE_HEIGHT); + b = ttnn::pad(b, std::array({padded_shape[0], C_rounded, padded_shape[2], padded_shape[3]}), + std::array({0, 0, 0, 0}), 0); + b = ttnn::reshape(b, ttnn::Shape({shape[0], shape[1], shape[2], shape[3]}, {padded_shape[0], C_rounded, padded_shape[2], padded_shape[3]})); + } + // run the transpose + b = operation::run(Transpose{transpose_dim, output_mem_config}, {b}).at(0); + auto logical_shape = b.get_logical_shape(); + auto padded_shape = b.get_padded_shape(); // back to original layout b = ttnn::to_layout(b, a.get_layout(), std::nullopt, std::nullopt, (Device *)nullptr); + // slice back to original shape + if (logical_shape != padded_shape) { + std::array begins = {0, 0, 0, 0}; + std::array ends = {logical_shape[0], logical_shape[1], logical_shape[2], logical_shape[3]}; + std::array step = {1, 1, 1, 1}; + b = ttnn::slice(b, begins, ends, step); + } return b; } - return operation::run(Transpose{transpose_dim, output_mem_config, pad_value}, {a}).at(0); + return operation::run(Transpose{transpose_dim, output_mem_config}, {a}).at(0); } else { - return operation::run(Transpose{transpose_dim, output_mem_config, pad_value}, {a}).at(0); + if (TransposeOpDim::HC == transpose_dim) { + Tensor b = a; + if (pad_c) { + auto padded_shape = a.get_padded_shape(); + auto shape = a.get_logical_shape(); + uint32_t C_rounded = tt::round_up(padded_shape[1], tt::constants::TILE_HEIGHT); + b = ttnn::pad(a, std::array({padded_shape[0], C_rounded, padded_shape[2], padded_shape[3]}), + std::array({0, 0, 0, 0}), 0); + b = ttnn::reshape(b, ttnn::Shape({shape[0], shape[1], shape[2], shape[3]}, {padded_shape[0], C_rounded, padded_shape[2], padded_shape[3]})); + } + + b = operation::run(Transpose{transpose_dim, output_mem_config}, {b}).at(0); + + if (b.get_logical_shape()[1] != b.get_padded_shape()[1] || b.get_logical_shape()[0] != b.get_padded_shape()[0]) { + std::array begins = {0, 0, 0, 0}; + std::array ends = {b.get_logical_shape()[0], b.get_logical_shape()[1], b.get_padded_shape()[2], b.get_padded_shape()[3]}; + std::array step = {1, 1, 1, 1}; + return ttnn::slice(b, begins, ends, step); + } + return b; + } + return operation::run(Transpose{transpose_dim, output_mem_config}, {a}).at(0); } } @@ -110,15 +153,14 @@ ttnn::Tensor transpose_nd( const ttnn::Tensor& input_tensor, const uint32_t dim1, const uint32_t dim2, - const std::optional& memory_config_arg, - const std::optional& pad_value) { + const std::optional& memory_config_arg) { std::vector permutation; permutation.reserve(input_tensor.get_shape().rank()); for (uint32_t i = 0; i < input_tensor.get_shape().rank(); ++i) { permutation.push_back(i); } std::swap(permutation[dim1], permutation[dim2]); - return ttnn::permute(input_tensor, permutation, memory_config_arg, pad_value); + return ttnn::permute(input_tensor, permutation, memory_config_arg); } } //detail namespace @@ -128,8 +170,7 @@ ttnn::Tensor ExecuteTranspose::invoke( const ttnn::Tensor& input_tensor, const int64_t& dim1, const int64_t& dim2, - const std::optional& memory_config_arg, - const std::optional& pad_value) { + const std::optional& memory_config_arg) { uint32_t normalized_dim1 = input_tensor.get_shape().get_normalized_index(dim1); uint32_t normalized_dim2 = input_tensor.get_shape().get_normalized_index(dim2); @@ 
@@ -142,18 +183,36 @@ ttnn::Tensor ExecuteTranspose::invoke(
         normalized_dim1 += rank_diff;
         normalized_dim2 += rank_diff;
     } else if (initial_rank > 4) {
-        return detail::transpose_nd(input_tensor, normalized_dim1, normalized_dim2, memory_config_arg, pad_value);
+        return detail::transpose_nd(input_tensor, normalized_dim1, normalized_dim2, memory_config_arg);
     }

-    bool wh = (normalized_dim1 == 2 && normalized_dim2 == 3) || (normalized_dim2 == 2 && normalized_dim1 == 3);
-    bool cn = (normalized_dim1 == 0 && normalized_dim2 == 1) || (normalized_dim2 == 0 && normalized_dim1 == 1);
-    bool bfloat8_supported = cn || wh;
-    bool typecast = input_unsqueezed.get_dtype() == DataType::BFLOAT8_B and !bfloat8_supported and !input_unsqueezed.is_sharded();
+    bool wh = (normalized_dim2 == 2 && normalized_dim1 == 0) || (normalized_dim2 == 0 && normalized_dim1 == 2);
+    bool typecast = input_unsqueezed.get_dtype() == DataType::BFLOAT8_B and input_unsqueezed.get_layout() == Layout::TILE and !wh and !input_unsqueezed.is_sharded();
     Tensor input_typecasted = typecast ? ttnn::typecast(input_unsqueezed, DataType::BFLOAT16) : input_unsqueezed;

+    auto input_shape = input_typecasted.get_logical_shape();
+
+    // create_output_tensor's shape is not usable here, since the transpose may introduce new padding
+    SmallVector<uint32_t> output_shape;
+    output_shape.reserve(input_shape.rank());
+    for (int i = 0; i < input_shape.rank(); ++i) {
+        output_shape.push_back(input_shape[i]);
+    }
+    SmallVector<uint32_t> padded_output_shape = output_shape;
+
+    std::swap(output_shape[normalized_dim1], output_shape[normalized_dim2]);
+    std::swap(padded_output_shape[normalized_dim1], padded_output_shape[normalized_dim2]);
+
+    uint32_t input_rank = input_typecasted.get_logical_shape().rank();
+    if (input_typecasted.layout() == Layout::TILE) {
+        padded_output_shape[input_rank - 1] = tt::round_up(padded_output_shape[input_rank - 1], tt::constants::TILE_WIDTH);
+        padded_output_shape[input_rank - 2] = tt::round_up(padded_output_shape[input_rank - 2], tt::constants::TILE_HEIGHT);
+    }
+
     std::vector<Tensor> output_tensors = {Tensor(operation::get_workers_for_op_output({input_typecasted}))};
+
     operation::launch_with_autoformat(
-        [normalized_dim1, normalized_dim2, memory_config_arg, pad_value] (const std::vector<Tensor>& input_tensors, const std::vector<std::optional<const Tensor>>& optional_input_tensors, const std::vector<std::optional<Tensor>>& optional_output_tensors) mutable -> std::vector<Tensor> {
+        [normalized_dim1, normalized_dim2, memory_config_arg] (const std::vector<Tensor>& input_tensors, const std::vector<std::optional<const Tensor>>& optional_input_tensors, const std::vector<std::optional<Tensor>>& optional_output_tensors) mutable -> std::vector<Tensor> {
             auto& a = input_tensors.at(0);
             auto memory_config = memory_config_arg.value_or(a.memory_config());

@@ -188,10 +247,10 @@ ttnn::Tensor ExecuteTranspose::invoke(
         } else {
             TT_ASSERT(false, "Unsupported transpose dims");
         }
-        return {detail::transpose_(a, transpose_dim, memory_config, pad_value)};
+        return {detail::transpose_(a, transpose_dim, memory_config)};
     }, {input_typecasted}, output_tensors);

-    auto output = output_tensors.at(0);
+    auto output = ttnn::reshape(output_tensors.at(0), ttnn::Shape(output_shape, padded_output_shape));
     output = initial_rank < 4u ? ttnn::squeeze_from_4D(output, initial_rank) : output;
     return typecast ? ttnn::typecast(output, DataType::BFLOAT8_B) : output;
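Since create_output_tensor's shape cannot be reused, the hunk above recomputes both the logical and padded output shapes by hand: swap the two dims, then round the last two dims up to tile boundaries when the input is tile-laid-out. A self-contained sketch of just that shape bookkeeping (plain C++; TILE_HEIGHT/TILE_WIDTH mirror tt::constants, everything else is illustrative):

#include <array>
#include <cstdint>
#include <cstdio>
#include <utility>

constexpr uint32_t TILE_HEIGHT = 32;
constexpr uint32_t TILE_WIDTH = 32;
constexpr uint32_t round_up(uint32_t v, uint32_t m) { return ((v + m - 1) / m) * m; }

int main() {
    std::array<uint32_t, 4> shape = {1, 71, 32, 64};  // N, C, H, W (logical)
    auto padded = shape;
    const uint32_t dim1 = 1, dim2 = 2;  // an HC transpose

    std::swap(shape[dim1], shape[dim2]);
    std::swap(padded[dim1], padded[dim2]);

    // TILE layout keeps the last two dims tile-aligned.
    padded[3] = round_up(padded[3], TILE_WIDTH);
    padded[2] = round_up(padded[2], TILE_HEIGHT);

    std::printf("logical: %u %u %u %u\n", shape[0], shape[1], shape[2], shape[3]);    // 1 32 71 64
    std::printf("padded:  %u %u %u %u\n", padded[0], padded[1], padded[2], padded[3]); // 1 32 96 64
    return 0;
}

The logical/padded pair computed this way is exactly what the ttnn::reshape at the end of the hunk stitches back onto the op's output.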
@@ -201,13 +260,12 @@ ttnn::Tensor ExecuteTranspose::invoke(
     const ttnn::Tensor& input_tensor,
     const int64_t& dim1,
     const int64_t& dim2,
-    const std::optional<MemoryConfig>& memory_config,
-    const std::optional<float>& pad_value) {
-    return invoke(DefaultQueueId, input_tensor, dim1, dim2, memory_config, pad_value);
+    const std::optional<MemoryConfig>& memory_config) {
+    return invoke(DefaultQueueId, input_tensor, dim1, dim2, memory_config);
 }

-ttnn::Tensor ExecuteTranspose::invoke(const ttnn::Tensor& input_tensor, const int64_t& dim1, const int64_t& dim2, const std::optional<float>& pad_value) {
-    return invoke(DefaultQueueId, input_tensor, dim1, dim2, std::nullopt, pad_value);
+ttnn::Tensor ExecuteTranspose::invoke(const ttnn::Tensor& input_tensor, const int64_t& dim1, const int64_t& dim2) {
+    return invoke(DefaultQueueId, input_tensor, dim1, dim2, std::nullopt);
 }

 } // ttnn::operations::data_movement namespace
diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.hpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.hpp
index 942fe94bdeb..521f9884301 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.hpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose.hpp
@@ -15,17 +15,15 @@ struct ExecuteTranspose {
         const ttnn::Tensor& input_tensor,
         const int64_t& dim1,
         const int64_t& dim2,
-        const std::optional<MemoryConfig>& memory_config_arg,
-        const std::optional<float>& pad_value = 0.0f);
+        const std::optional<MemoryConfig>& memory_config_arg);

     static ttnn::Tensor invoke(
         const ttnn::Tensor& input_tensor,
         const int64_t& dim1,
         const int64_t& dim2,
-        const std::optional<MemoryConfig>& memory_config,
-        const std::optional<float>& pad_value = 0.0f);
+        const std::optional<MemoryConfig>& memory_config);

-    static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, const int64_t& dim1, const int64_t& dim2, const std::optional<float>& pad_value = 0.0f);
+    static ttnn::Tensor invoke(const ttnn::Tensor& input_tensor, const int64_t& dim1, const int64_t& dim2);
 };

 } // namespace operations::data_movement
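With pad_value gone, the header above leaves three invoke overloads that all funnel into the queue-id-taking entry point with defaults filled in. A compilable sketch of that forwarding pattern, with stub types standing in for ttnn::Tensor and MemoryConfig (none of this is ttnn API):

#include <cstdint>
#include <optional>

struct MemoryConfigStub {};  // stand-in for ttnn::MemoryConfig

struct ExecuteTransposeSketch {
    // Fully explicit entry point: queue id, tensor, dims, memory config.
    static int invoke(uint8_t queue_id, int tensor, int64_t dim1, int64_t dim2,
                      const std::optional<MemoryConfigStub>& memory_config) {
        (void)queue_id; (void)memory_config;
        return tensor + static_cast<int>(dim1 + dim2);  // real code dispatches the device op
    }
    // Convenience overload: default queue.
    static int invoke(int tensor, int64_t dim1, int64_t dim2,
                      const std::optional<MemoryConfigStub>& memory_config) {
        return invoke(/*DefaultQueueId=*/0, tensor, dim1, dim2, memory_config);
    }
    // Bare minimum: default queue and no memory config override.
    static int invoke(int tensor, int64_t dim1, int64_t dim2) {
        return invoke(/*DefaultQueueId=*/0, tensor, dim1, dim2, std::nullopt);
    }
};

int main() { return ExecuteTransposeSketch::invoke(0, 1, 2) == 3 ? 0 : 1; }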
diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose_pybind.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose_pybind.cpp
deleted file mode 100644
index b2aa2d78874..00000000000
--- a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose_pybind.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
-//
-// SPDX-License-Identifier: Apache-2.0
-
-#include "transpose_pybind.hpp"
-
-namespace ttnn::operations::data_movement::detail {
-namespace py = pybind11;
-
-void bind_transpose(py::module& module) {
-    auto doc =
-        R"doc(
-        transpose(input_tensor: ttnn.Tensor, dim1: int, dim2: int, *, Optional[ttnn.MemoryConfig] = None) -> ttnn.Tensor
-
-        Returns a tensor that is transposed along dims dim1 and dim2
-
-        Equivalent pytorch code:
-
-        .. code-block:: python
-
-            output_tensor = torch.transpose(input_tensor, 0, 1)
-
-        Args:
-            * :attr:`input_tensor`: Input Tensor.
-            * :attr:`dim1`: First dim of transpose.
-            * :attr:`dim2`: Second dim of transpose.
-            * :attr:`pad_value` (Optional[float]): padding value for when tiles are broken in a transpose. Defaults to `0.0`. If set to None, it will be random garbage values.
-
-        Keyword Args:
-            * :attr:`memory_config`: Memory Config of the output tensor
-            * :attr:`queue_id` (Optional[uint8]): command queue id
-        )doc";
-
-    using OperationType = decltype(ttnn::transpose);
-    ttnn::bind_registered_operation(
-        module,
-        ttnn::transpose,
-        doc,
-        ttnn::pybind_overload_t{
-            [] (const OperationType& self,
-                const ttnn::Tensor& input_tensor,
-                const int64_t & dim1,
-                const int64_t & dim2,
-                const std::optional<MemoryConfig>& memory_config,
-                uint8_t queue_id,
-                const std::optional<float>& pad_value) {
-                return self(queue_id, input_tensor, dim1, dim2, memory_config, pad_value);
-            },
-            py::arg("input_tensor"),
-            py::arg("dim1"),
-            py::arg("dim2"),
-            py::kw_only(),
-            py::arg("memory_config") = std::nullopt,
-            py::arg("queue_id") = 0,
-            py::arg("pad_value") = 0.0f,
-        }
-    );
-}
-} // namespace ttnn::operations::data_movement::detail
diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose_pybind.hpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose_pybind.hpp
index 41607f5708b..f4abb13dcff 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose_pybind.hpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/transpose_pybind.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
 //
 // SPDX-License-Identifier: Apache-2.0

@@ -14,5 +14,50 @@ namespace ttnn::operations::data_movement::detail {
 namespace py = pybind11;

-void bind_transpose(py::module& module);
+void bind_transpose(py::module& module) {
+    auto doc =
+        R"doc(
+        transpose(input_tensor: ttnn.Tensor, dim1: int, dim2: int, *, memory_config: Optional[ttnn.MemoryConfig] = None) -> ttnn.Tensor
+
+        Returns a tensor that is transposed along dims dim1 and dim2
+
+        Equivalent pytorch code:
+
+        .. code-block:: python
+
+            output_tensor = torch.transpose(input_tensor, 0, 1)
+
+        Args:
+            * :attr:`input_tensor`: Input Tensor.
+            * :attr:`dim1`: First dim of transpose.
+            * :attr:`dim2`: Second dim of transpose.
+
+        Keyword Args:
+            * :attr:`memory_config`: Memory Config of the output tensor
+            * :attr:`queue_id` (Optional[uint8]): command queue id
+        )doc";
+
+    using OperationType = decltype(ttnn::transpose);
+    ttnn::bind_registered_operation(
+        module,
+        ttnn::transpose,
+        doc,
+        ttnn::pybind_overload_t{
+            [] (const OperationType& self,
+                const ttnn::Tensor& input_tensor,
+                const int64_t & dim1,
+                const int64_t & dim2,
+                const std::optional<MemoryConfig>& memory_config,
+                uint8_t queue_id) {
+                return self(queue_id, input_tensor, dim1, dim2, memory_config);
+            },
+            py::arg("input_tensor"),
+            py::arg("dim1"),
+            py::arg("dim2"),
+            py::kw_only(),
+            py::arg("memory_config") = std::nullopt,
+            py::arg("queue_id") = 0,
+        }
+    );
+}
 } // namespace ttnn::operations::data_movement::detail
diff --git a/ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/reader_unary_interleaved_start_id.cpp b/ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/reader_unary_interleaved_start_id.cpp
index 60b2bad12bf..c945c502f31 100644
--- a/ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/reader_unary_interleaved_start_id.cpp
+++ b/ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/reader_unary_interleaved_start_id.cpp
@@ -5,6 +5,8 @@
 #include <stdint.h>
 #include "dataflow_api.h"

+//#include "debug/dprint.h"
+
 void kernel_main() {
     uint32_t src_addr = get_arg_val<uint32_t>(0);
     uint32_t num_tiles = get_arg_val<uint32_t>(1);
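The commented-out dprint include above is the usual hook for kernel-side debugging. For reference, a sketch of what enabling it typically looks like; this is device kernel code that only builds in the tt-metal kernel environment, and the DPRINT line is an assumed-typical usage, not part of this change:

// Device-side sketch only (not host-runnable). Assumes the standard tt-metal
// DPRINT facility; the rest of the reader body is abbreviated.
#include <stdint.h>
#include "dataflow_api.h"
#include "debug/dprint.h"

void kernel_main() {
    uint32_t src_addr = get_arg_val<uint32_t>(0);
    uint32_t num_tiles = get_arg_val<uint32_t>(1);
    // Emits to the DPRINT server when it is enabled on this core
    // (e.g. via TT_METAL_DPRINT_CORES on the host side).
    DPRINT << "reader: src_addr=" << src_addr << " num_tiles=" << num_tiles << ENDL();
    // ... remainder of the reader loop unchanged ...
}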