Skip to content

Commit

Permalink
#0: Remove alignment requirements for Row Major tensors (#15245)
Browse files Browse the repository at this point in the history
### Ticket

### Problem description
Currently it's required that a RowMajor tensor's size be aligned to 4
bytes, because the data is transferred to the device as a vector of
`uint32_t`. This creates limitations in TTNN and complicates the
handling of Row Major tensors.
This PR is a prerequisite for
#15028
Thanks to @abhullar-tt for helping with lifting those restrictions.

### What's changed
Modify Metal API allowing to send an arbitrary vector to device
Modify PageConfig removing alignment restrictions
Remove conversions to/from `vector<uint32_t>` for sending tensor to/from
device
Remove asserts on buffer alignment for row major
Enable previously disabled tensor tests

### Checklist
- [x] [Post commit CI
passes](https://github.com/tenstorrent/tt-metal/actions/runs/11944863653)
- [x] [Blackhole Post
commit](https://github.com/tenstorrent/tt-metal/actions/runs/11929076543)
- [x] [Model regression CI testing
passes](https://github.com/tenstorrent/tt-metal/actions/runs/11929113643)
- [x] [Device performance regression CI testing
passes](https://github.com/tenstorrent/tt-metal/actions/runs/11929096855)
- [x] New/Existing tests provide coverage for changes
  • Loading branch information
sminakov-tt authored Nov 21, 2024
1 parent 75868aa commit b057e09
Show file tree
Hide file tree
Showing 23 changed files with 174 additions and 305 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
EnqueueReadBuffer
==================

.. doxygenfunction:: tt::tt_metal::v0::EnqueueReadBuffer(CommandQueue& cq, std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer> > buffer, std::vector<uint32_t>& dst, bool blocking, tt::stl::Span<const SubDeviceId> sub_device_ids)
.. doxygenfunction:: tt::tt_metal::v0::EnqueueReadBuffer(CommandQueue &cq, Buffer &buffer, std::vector<DType> &dst, bool blocking, tt::stl::Span<const SubDeviceId> sub_device_ids = {})
.. doxygenfunction:: tt::tt_metal::v0::EnqueueReadBuffer(CommandQueue& cq, std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer> > buffer, void * dst, bool blocking, tt::stl::Span<const SubDeviceId> sub_device_ids)
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
EnqueueWriteBuffer
==================

.. doxygenfunction:: tt::tt_metal::v0::EnqueueWriteBuffer(CommandQueue& cq, std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer> > buffer, std::vector<uint32_t>& src, bool blocking, tt::stl::Span<const SubDeviceId> sub_device_ids)
.. doxygenfunction:: tt::tt_metal::v0::EnqueueWriteBuffer(CommandQueue& cq, std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer> > buffer, std::vector<DType>&, bool blocking, tt::stl::Span<const SubDeviceId> sub_device_ids)
.. doxygenfunction:: tt::tt_metal::v0::EnqueueWriteBuffer(CommandQueue& cq, std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer> > buffer, HostDataType src, bool blocking, tt::stl::Span<const SubDeviceId> sub_device_ids)
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ bool eth_direct_ring_gather_sender_receiver_kernels(
llrt::write_hex_vec_to_core(
sender_device->id(),
sender_device->ethernet_core_from_logical_core(eth_sender_core),
{INVALID},
std::vector{INVALID},
sem_l1_byte_address);

////////////////////////////////////////////////////////////////////////////
Expand All @@ -260,7 +260,7 @@ bool eth_direct_ring_gather_sender_receiver_kernels(
llrt::write_hex_vec_to_core(
receiver_device->id(),
receiver_device->ethernet_core_from_logical_core(eth_receiver_core),
{INVALID},
std::vector{INVALID},
sem_l1_byte_address);
auto eth_receiver_kernel = tt_metal::CreateKernel(
receiver_program,
Expand Down Expand Up @@ -390,10 +390,10 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels(
(uint32_t)cfg.page_size_bytes,
(uint32_t)sem_l1_byte_address});
llrt::write_hex_vec_to_core(
device->id(), device->ethernet_core_from_logical_core(eth_sender_core), {INVALID}, sem_l1_byte_address);
device->id(), device->ethernet_core_from_logical_core(eth_sender_core), std::vector{INVALID}, sem_l1_byte_address);

llrt::write_hex_vec_to_core(
device->id(), device->ethernet_core_from_logical_core(eth_receiver_core), {INVALID}, sem_l1_byte_address);
device->id(), device->ethernet_core_from_logical_core(eth_receiver_core), std::vector{INVALID}, sem_l1_byte_address);

auto eth_receiver_kernel = tt_metal::CreateKernel(
program,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ bool stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_wrap(
bool pass = true;
vector<uint32_t> dst;
uint32_t idx = start;
for (auto buffer : bufs) {
for (const auto& buffer : bufs) {
EnqueueReadBuffer(cq, buffer, dst, true);
pass &= dst == unique_vectors[idx % unique_vectors.size()];
idx++;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ bool eth_direct_ring_gather_sender_receiver_kernels(
llrt::write_hex_vec_to_core(
sender_device->id(),
sender_device->ethernet_core_from_logical_core(eth_sender_core),
{INVALID},
std::vector{INVALID},
sem_l1_byte_address);

////////////////////////////////////////////////////////////////////////////
Expand All @@ -259,7 +259,7 @@ bool eth_direct_ring_gather_sender_receiver_kernels(
llrt::write_hex_vec_to_core(
receiver_device->id(),
receiver_device->ethernet_core_from_logical_core(eth_receiver_core),
{INVALID},
std::vector{INVALID},
sem_l1_byte_address);
auto eth_receiver_kernel = tt_metal::CreateKernel(
receiver_program,
Expand Down Expand Up @@ -394,10 +394,10 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels(
(uint32_t)cfg.page_size_bytes,
(uint32_t)sem_l1_byte_address});
llrt::write_hex_vec_to_core(
device->id(), device->ethernet_core_from_logical_core(eth_sender_core), {INVALID}, sem_l1_byte_address);
device->id(), device->ethernet_core_from_logical_core(eth_sender_core), std::vector{INVALID}, sem_l1_byte_address);

llrt::write_hex_vec_to_core(
device->id(), device->ethernet_core_from_logical_core(eth_receiver_core), {INVALID}, sem_l1_byte_address);
device->id(), device->ethernet_core_from_logical_core(eth_receiver_core), std::vector{INVALID}, sem_l1_byte_address);

auto eth_receiver_kernel = tt_metal::CreateKernel(
program,
Expand Down
6 changes: 3 additions & 3 deletions tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,9 @@ INSTANTIATE_TEST_SUITE_P(
EmptyTensorTest,
::testing::Combine(
::testing::Values(
//ttnn::Shape({}),
//ttnn::Shape({0}),
//ttnn::Shape({1}),
ttnn::Shape({}),
ttnn::Shape({0}),
ttnn::Shape({1}),
ttnn::Shape({1, 2}),
ttnn::Shape({1, 2, 3}),
ttnn::Shape({1, 2, 3, 4}),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -822,10 +822,11 @@ INSTANTIATE_TEST_SUITE_P(
},
/////////////////////////////////////////////////////////////////////////////////////////////////////
// EXAMPLE 2: ROW_MAJOR tensor with different representation for width sharded / interleaved
// - In this example, (shard) width alignment is 4 because UINT8 = 1 bytes and we pack with uint32_t
// - In this example, (shard) width alignment is 1 because it's row major
/////////////////////////////////////////////////////////////////////////////////////////////////////
// Example 2a: Logical shard shape + alignment after
// - Along width: 5 / 1 is 5 shards; 5 * 4 = 20
// Example 2a: Logical shard shape that is already aligned
// NOTE: ShardMode::PHYSICAL is equivalent in this case
// - Along width: 5 / 1 is 5 shards; 5 * 1 = 5
CreateShardedTensorWithAlignmentParams{
CreateShardedTensorWithAlignmentInputs{
.shape = SimpleShape{1, 2, 10, 5},
Expand All @@ -844,15 +845,15 @@ INSTANTIATE_TEST_SUITE_P(
}
},
CreateShardedTensorWithAlignmentExpected{
.physical_size = Size{20, 20}
.physical_size = Size{20, 5}
}
},
// Example 2b: Logical shard shape that is already aligned
// NOTE: ShardMode::PHYSICAL is equivalent in this case
// - Along width: 5 / 4 is 2 shards; 2 * 4 = 8
// - Along width: 8 / 4 is 2 shards; 2 * 4 = 8
CreateShardedTensorWithAlignmentParams{
CreateShardedTensorWithAlignmentInputs{
.shape = SimpleShape{1, 2, 10, 5},
.shape = SimpleShape{1, 2, 10, 8},
.data_type = DataType::UINT8,
.page_config = PageConfig(Layout::ROW_MAJOR),
.memory_config =
Expand All @@ -872,7 +873,7 @@ INSTANTIATE_TEST_SUITE_P(
}
},
// Example 2c: For interleaved, we treat entire height/width as "logical shard shape" for calculations
// 20 "shards" with 5 aligned to 4 for uint32_t alignment
// 20 "shards" with 5 aligned to 1
CreateShardedTensorWithAlignmentParams{
CreateShardedTensorWithAlignmentInputs{
.shape = SimpleShape{1, 2, 10, 5},
Expand All @@ -886,7 +887,7 @@ INSTANTIATE_TEST_SUITE_P(
}
},
CreateShardedTensorWithAlignmentExpected{
.physical_size = Size{20, 8}
.physical_size = Size{20, 5}
}
},
////////////////////////////////////////////////////////////////////
Expand Down
22 changes: 11 additions & 11 deletions tests/ttnn/unit_tests/gtests/tensor/test_tensor_layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,9 @@ INSTANTIATE_TEST_SUITE_P(
.layout = Layout::ROW_MAJOR
},
Expected{
.physical_size = {6*5*4, 4},
.alignment = Alignment({2}),
.strides = Strides({5*4*4, 4*4, 4, 1})
.physical_size = {6*5*4, 3},
.alignment = Alignment({1}),
.strides = Strides({5*4*3, 4*3, 3, 1})
}
},

Expand All @@ -108,7 +108,7 @@ INSTANTIATE_TEST_SUITE_P(
},
Expected{
.physical_size = {6*5*4, 8},
.alignment = Alignment({2}),
.alignment = Alignment({1}),
.strides = Strides({5*4*8, 4*8, 8, 1})
}
},
Expand All @@ -135,9 +135,9 @@ INSTANTIATE_TEST_SUITE_P(
.layout = Layout::ROW_MAJOR
},
Expected{
.physical_size = {1, 2},
.alignment = Alignment({2}),
.strides = Strides({2, 2, 2, 1})
.physical_size = {1, 1},
.alignment = Alignment({1}),
.strides = Strides({1, 1, 1, 1})
}
},

Expand All @@ -163,8 +163,8 @@ INSTANTIATE_TEST_SUITE_P(
.layout = Layout::ROW_MAJOR
},
Expected{
.physical_size = {1, 2},
.alignment = Alignment({2}),
.physical_size = {1, 1},
.alignment = Alignment({1}),
.strides = Strides({}),
.tensor_creation_works = false
}
Expand Down Expand Up @@ -208,8 +208,8 @@ INSTANTIATE_TEST_SUITE_P(
.layout = Layout::ROW_MAJOR
},
Expected{
.physical_size = {1, 2},
.alignment = Alignment({2}),
.physical_size = {1, 1},
.alignment = Alignment({1}),
.strides = Strides({1}),
.tensor_creation_works = false
}
Expand Down
54 changes: 43 additions & 11 deletions tt_metal/detail/tt_metal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "tt_metal/hostdevcommon/common_values.hpp"
#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/impl/dispatch/dispatch_core_manager.hpp"
#include "tt_metal/impl/buffers/buffer.hpp"

namespace tt::tt_metal {
inline namespace v0 {
Expand Down Expand Up @@ -44,10 +45,29 @@ inline namespace v0 {
* | Argument | Description | Data type | Valid range | Required |
* |-------------|-------------------------------------------------|-------------------------|--------------------------------------------------|----------|
* | buffer | Buffer to send data to | Buffer & | | Yes |
* | host_buffer | Buffer on host to copy data from | std::vector<uint32_t> & | Host buffer size must match buffer | Yes |
 * | host_buffer | Buffer on host to copy data from                | Span<const uint8_t>     | Host buffer size must match buffer               | Yes      |
*/
void WriteToBuffer(Buffer &buffer, const std::vector<uint32_t> &host_buffer);
void WriteToBuffer( std::shared_ptr<Buffer> buffer, const std::vector<uint32_t> &host_buffer);
void WriteToBuffer(Buffer &buffer, tt::stl::Span<const uint8_t> host_buffer);
/**
* Copies data from a host buffer into the specified buffer
*
* Return value: void
*
* | Argument | Description | Data type | Valid range | Required |
* |-------------|-------------------------------------------------|-------------------------|--------------------------------------------------|----------|
* | buffer | Buffer to send data to | Buffer & | | Yes |
* | host_buffer | Buffer on host to copy data from | std::vector<DType> & | Host buffer size must match buffer | Yes |
*/
template<typename DType>
void WriteToBuffer(Buffer &buffer, const std::vector<DType>& host_buffer) {
    // Reinterpret the typed host vector as a raw byte view and delegate to the
    // byte-span overload, which performs the actual device transfer.
    const auto* raw_bytes = reinterpret_cast<const uint8_t*>(host_buffer.data());
    const size_t byte_count = host_buffer.size() * sizeof(DType);
    WriteToBuffer(buffer, tt::stl::Span<const uint8_t>(raw_bytes, byte_count));
}
template<typename DType>
void WriteToBuffer(std::shared_ptr<Buffer> buffer, const std::vector<DType>& host_buffer) {
    // Convenience overload for shared_ptr-held buffers: dereference and delegate
    // to the reference overload above.
    WriteToBuffer(*buffer, host_buffer);
}

void ReadFromBuffer(Buffer &buffer, uint8_t* host_buffer, bool shard_order = false);
/**
* Copies data from a buffer into a host buffer
*
Expand All @@ -56,12 +76,22 @@ inline namespace v0 {
* | Argument | Description | Data type | Valid range | Required |
* |-------------|-------------------------------------------------|-------------------------|--------------------------------------------------|----------|
* | buffer | Buffer to read data from | Buffer & | | Yes |
* | host_buffer | Buffer on host to copy data into | std::vector<uint32_t> & | | Yes |
* | host_buffer | Buffer on host to copy data into | std::vector<DType> & | | Yes |
* | shard_order | For a sharded buffer we can read in shard order | bool | | No |
*/
void ReadFromBuffer(Buffer &buffer, std::vector<uint32_t> &host_buffer, bool shard_order = false);
void ReadFromBuffer(std::shared_ptr<Buffer> buffer, std::vector<uint32_t> &host_buffer, bool shard_order = false);

template<typename DType>
void ReadFromBuffer(Buffer &buffer, std::vector<DType> &host_buffer, bool shard_order = false) {
    // Resize the host vector to hold the entire buffer, then read raw bytes into it
    // via the uint8_t* overload. buffer.size() is the total size in bytes.
    const auto buffer_size = buffer.size();
    TT_FATAL(buffer_size % sizeof(DType) == 0, "Buffer size is not divisible by dtype size");
    // Reuse the cached buffer_size rather than calling buffer.size() a second time.
    host_buffer.resize(buffer_size / sizeof(DType));
    ReadFromBuffer(buffer, reinterpret_cast<uint8_t*>(host_buffer.data()), shard_order);
}
template<typename DType>
void ReadFromBuffer(std::shared_ptr<Buffer> buffer, std::vector<DType> &host_buffer, bool shard_order = false) {
    // Convenience overload for shared_ptr-held buffers: dereference and delegate.
    ReadFromBuffer(*buffer, host_buffer, shard_order);
}

void ReadShard(Buffer &buffer, uint8_t* host_buffer, const uint32_t & core_id);
/**
* Copies data from a buffer into a host buffer
*
Expand All @@ -70,12 +100,14 @@ inline namespace v0 {
* | Argument | Description | Data type | Valid range | Required |
* |-------------|-------------------------------------------------|-------------------------|--------------------------------------------------|----------|
* | buffer | Buffer to read data from | Buffer & | | Yes |
* | host_buffer | Buffer on host to copy data into | std::vector<uint32_t> & | | Yes |
* | host_buffer | Buffer on host to copy data into | std::vector<DType> & | | Yes |
* | core_id | ID of core | const uint32_t & | | Yes |
*/
void ReadShard(Buffer &buffer, std::vector<uint32_t> &host_buffer, const uint32_t & core_id);


template<typename DType>
void ReadShard(Buffer &buffer, std::vector<DType> &host_buffer, const uint32_t & core_id) {
    // page_size() * shard_spec().size() is presumably the shard size in BYTES
    // (bytes per page * pages per shard) — TODO(review): confirm against Buffer.
    // Resize by element count, not byte count: the previous resize made the
    // vector<DType> sizeof(DType)x too large for any DType wider than one byte.
    // Mirrors the divisibility check in the ReadFromBuffer template.
    const auto shard_size_bytes = buffer.page_size() * buffer.shard_spec().size();
    TT_FATAL(shard_size_bytes % sizeof(DType) == 0, "Shard size is not divisible by dtype size");
    host_buffer.resize(shard_size_bytes / sizeof(DType));
    ReadShard(buffer, reinterpret_cast<uint8_t*>(host_buffer.data()), core_id);
}

// Launches all kernels on cores specified with kernels in the program.
// All kernels on a given Tensix core must be launched.
Expand Down
Loading

0 comments on commit b057e09

Please sign in to comment.