Skip to content

Commit

Permalink
#0: Remove alignment requirements for Row Major tensors (#15245)
Browse files Browse the repository at this point in the history
### Ticket

### Problem description
Currently it's required that a RowMajor tensor's size be aligned to 4
bytes, because the data is transferred to the device as a vector of
`uint32_t`. This creates limitations in TTNN and complicates the
handling of Row Major tensors.
This PR is a prerequisite for
#15028
Thanks to @abhullar-tt for helping with lifting those restrictions.

### What's changed
Modify Metal API allowing to send an arbitrary vector to device
Modify PageConfig removing alignment restrictions
Remove conversions to/from `vector<uint32_t>` for sending tensor to/from
device
Remove asserts on buffer alignment for row major
Enable previously disabled tensor tests

### Checklist
- [x] [Post commit CI
passes](https://github.com/tenstorrent/tt-metal/actions/runs/11944863653)
- [x] [Blackhole Post
commit](https://github.com/tenstorrent/tt-metal/actions/runs/11929076543)
- [x] [Model regression CI testing
passes](https://github.com/tenstorrent/tt-metal/actions/runs/11929113643)
- [x] [Device performance regression CI testing
passes](https://github.com/tenstorrent/tt-metal/actions/runs/11929096855)
- [x] New/Existing tests provide coverage for changes
  • Loading branch information
sminakov-tt authored Nov 21, 2024
1 parent 75868aa commit b057e09
Show file tree
Hide file tree
Showing 23 changed files with 174 additions and 305 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
EnqueueReadBuffer
==================

.. doxygenfunction:: tt::tt_metal::v0::EnqueueReadBuffer(CommandQueue& cq, std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer> > buffer, std::vector<uint32_t>& dst, bool blocking, tt::stl::Span<const SubDeviceId> sub_device_ids)
.. doxygenfunction:: tt::tt_metal::v0::EnqueueReadBuffer(CommandQueue &cq, Buffer &buffer, std::vector<DType> &dst, bool blocking, tt::stl::Span<const SubDeviceId> sub_device_ids = {})
.. doxygenfunction:: tt::tt_metal::v0::EnqueueReadBuffer(CommandQueue& cq, std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer> > buffer, void * dst, bool blocking, tt::stl::Span<const SubDeviceId> sub_device_ids)
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
EnqueueWriteBuffer
==================

.. doxygenfunction:: tt::tt_metal::v0::EnqueueWriteBuffer(CommandQueue& cq, std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer> > buffer, std::vector<uint32_t>& src, bool blocking, tt::stl::Span<const SubDeviceId> sub_device_ids)
.. doxygenfunction:: tt::tt_metal::v0::EnqueueWriteBuffer(CommandQueue& cq, std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer> > buffer, std::vector<DType>&, bool blocking, tt::stl::Span<const SubDeviceId> sub_device_ids)
.. doxygenfunction:: tt::tt_metal::v0::EnqueueWriteBuffer(CommandQueue& cq, std::variant<std::reference_wrapper<Buffer>, std::shared_ptr<Buffer> > buffer, HostDataType src, bool blocking, tt::stl::Span<const SubDeviceId> sub_device_ids)
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ bool eth_direct_ring_gather_sender_receiver_kernels(
llrt::write_hex_vec_to_core(
sender_device->id(),
sender_device->ethernet_core_from_logical_core(eth_sender_core),
{INVALID},
std::vector{INVALID},
sem_l1_byte_address);

////////////////////////////////////////////////////////////////////////////
Expand All @@ -260,7 +260,7 @@ bool eth_direct_ring_gather_sender_receiver_kernels(
llrt::write_hex_vec_to_core(
receiver_device->id(),
receiver_device->ethernet_core_from_logical_core(eth_receiver_core),
{INVALID},
std::vector{INVALID},
sem_l1_byte_address);
auto eth_receiver_kernel = tt_metal::CreateKernel(
receiver_program,
Expand Down Expand Up @@ -390,10 +390,10 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels(
(uint32_t)cfg.page_size_bytes,
(uint32_t)sem_l1_byte_address});
llrt::write_hex_vec_to_core(
device->id(), device->ethernet_core_from_logical_core(eth_sender_core), {INVALID}, sem_l1_byte_address);
device->id(), device->ethernet_core_from_logical_core(eth_sender_core), std::vector{INVALID}, sem_l1_byte_address);

llrt::write_hex_vec_to_core(
device->id(), device->ethernet_core_from_logical_core(eth_receiver_core), {INVALID}, sem_l1_byte_address);
device->id(), device->ethernet_core_from_logical_core(eth_receiver_core), std::vector{INVALID}, sem_l1_byte_address);

auto eth_receiver_kernel = tt_metal::CreateKernel(
program,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ bool stress_test_EnqueueWriteBuffer_and_EnqueueReadBuffer_wrap(
bool pass = true;
vector<uint32_t> dst;
uint32_t idx = start;
for (auto buffer : bufs) {
for (const auto& buffer : bufs) {
EnqueueReadBuffer(cq, buffer, dst, true);
pass &= dst == unique_vectors[idx % unique_vectors.size()];
idx++;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ bool eth_direct_ring_gather_sender_receiver_kernels(
llrt::write_hex_vec_to_core(
sender_device->id(),
sender_device->ethernet_core_from_logical_core(eth_sender_core),
{INVALID},
std::vector{INVALID},
sem_l1_byte_address);

////////////////////////////////////////////////////////////////////////////
Expand All @@ -259,7 +259,7 @@ bool eth_direct_ring_gather_sender_receiver_kernels(
llrt::write_hex_vec_to_core(
receiver_device->id(),
receiver_device->ethernet_core_from_logical_core(eth_receiver_core),
{INVALID},
std::vector{INVALID},
sem_l1_byte_address);
auto eth_receiver_kernel = tt_metal::CreateKernel(
receiver_program,
Expand Down Expand Up @@ -394,10 +394,10 @@ bool eth_interleaved_ring_gather_sender_receiver_kernels(
(uint32_t)cfg.page_size_bytes,
(uint32_t)sem_l1_byte_address});
llrt::write_hex_vec_to_core(
device->id(), device->ethernet_core_from_logical_core(eth_sender_core), {INVALID}, sem_l1_byte_address);
device->id(), device->ethernet_core_from_logical_core(eth_sender_core), std::vector{INVALID}, sem_l1_byte_address);

llrt::write_hex_vec_to_core(
device->id(), device->ethernet_core_from_logical_core(eth_receiver_core), {INVALID}, sem_l1_byte_address);
device->id(), device->ethernet_core_from_logical_core(eth_receiver_core), std::vector{INVALID}, sem_l1_byte_address);

auto eth_receiver_kernel = tt_metal::CreateKernel(
program,
Expand Down
6 changes: 3 additions & 3 deletions tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,9 +121,9 @@ INSTANTIATE_TEST_SUITE_P(
EmptyTensorTest,
::testing::Combine(
::testing::Values(
//ttnn::Shape({}),
//ttnn::Shape({0}),
//ttnn::Shape({1}),
ttnn::Shape({}),
ttnn::Shape({0}),
ttnn::Shape({1}),
ttnn::Shape({1, 2}),
ttnn::Shape({1, 2, 3}),
ttnn::Shape({1, 2, 3, 4}),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -822,10 +822,11 @@ INSTANTIATE_TEST_SUITE_P(
},
/////////////////////////////////////////////////////////////////////////////////////////////////////
// EXAMPLE 2: ROW_MAJOR tensor with different representation for width sharded / interleaved
// - In this example, (shard) width alignment is 4 because UINT8 = 1 bytes and we pack with uint32_t
// - In this example, (shard) width alignment is 1 because it's row major
/////////////////////////////////////////////////////////////////////////////////////////////////////
// Example 2a: Logical shard shape + alignment after
// - Along width: 5 / 1 is 5 shards; 5 * 4 = 20
// Example 2a: Logical shard shape that is already aligned
// NOTE: ShardMode::PHYSICAL is equivalent in this case
// - Along width: 5 / 1 is 5 shards; 5 * 1 = 5
CreateShardedTensorWithAlignmentParams{
CreateShardedTensorWithAlignmentInputs{
.shape = SimpleShape{1, 2, 10, 5},
Expand All @@ -844,15 +845,15 @@ INSTANTIATE_TEST_SUITE_P(
}
},
CreateShardedTensorWithAlignmentExpected{
.physical_size = Size{20, 20}
.physical_size = Size{20, 5}
}
},
// Example 2b: Logical shard shape that is already aligned
// NOTE: ShardMode::PHYSICAL is equivalent in this case
// - Along width: 5 / 4 is 2 shards; 2 * 4 = 8
// - Along width: 8 / 4 is 2 shards; 2 * 4 = 8
CreateShardedTensorWithAlignmentParams{
CreateShardedTensorWithAlignmentInputs{
.shape = SimpleShape{1, 2, 10, 5},
.shape = SimpleShape{1, 2, 10, 8},
.data_type = DataType::UINT8,
.page_config = PageConfig(Layout::ROW_MAJOR),
.memory_config =
Expand All @@ -872,7 +873,7 @@ INSTANTIATE_TEST_SUITE_P(
}
},
// Example 2c: For interleaved, we treat entire height/width as "logical shard shape" for calculations
// 20 "shards" with 5 aligned to 4 for uint32_t alignment
// 20 "shards" with 5 aligned to 1
CreateShardedTensorWithAlignmentParams{
CreateShardedTensorWithAlignmentInputs{
.shape = SimpleShape{1, 2, 10, 5},
Expand All @@ -886,7 +887,7 @@ INSTANTIATE_TEST_SUITE_P(
}
},
CreateShardedTensorWithAlignmentExpected{
.physical_size = Size{20, 8}
.physical_size = Size{20, 5}
}
},
////////////////////////////////////////////////////////////////////
Expand Down
22 changes: 11 additions & 11 deletions tests/ttnn/unit_tests/gtests/tensor/test_tensor_layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,9 @@ INSTANTIATE_TEST_SUITE_P(
.layout = Layout::ROW_MAJOR
},
Expected{
.physical_size = {6*5*4, 4},
.alignment = Alignment({2}),
.strides = Strides({5*4*4, 4*4, 4, 1})
.physical_size = {6*5*4, 3},
.alignment = Alignment({1}),
.strides = Strides({5*4*3, 4*3, 3, 1})
}
},

Expand All @@ -108,7 +108,7 @@ INSTANTIATE_TEST_SUITE_P(
},
Expected{
.physical_size = {6*5*4, 8},
.alignment = Alignment({2}),
.alignment = Alignment({1}),
.strides = Strides({5*4*8, 4*8, 8, 1})
}
},
Expand All @@ -135,9 +135,9 @@ INSTANTIATE_TEST_SUITE_P(
.layout = Layout::ROW_MAJOR
},
Expected{
.physical_size = {1, 2},
.alignment = Alignment({2}),
.strides = Strides({2, 2, 2, 1})
.physical_size = {1, 1},
.alignment = Alignment({1}),
.strides = Strides({1, 1, 1, 1})
}
},

Expand All @@ -163,8 +163,8 @@ INSTANTIATE_TEST_SUITE_P(
.layout = Layout::ROW_MAJOR
},
Expected{
.physical_size = {1, 2},
.alignment = Alignment({2}),
.physical_size = {1, 1},
.alignment = Alignment({1}),
.strides = Strides({}),
.tensor_creation_works = false
}
Expand Down Expand Up @@ -208,8 +208,8 @@ INSTANTIATE_TEST_SUITE_P(
.layout = Layout::ROW_MAJOR
},
Expected{
.physical_size = {1, 2},
.alignment = Alignment({2}),
.physical_size = {1, 1},
.alignment = Alignment({1}),
.strides = Strides({1}),
.tensor_creation_works = false
}
Expand Down
54 changes: 43 additions & 11 deletions tt_metal/detail/tt_metal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "tt_metal/hostdevcommon/common_values.hpp"
#include "tt_metal/common/core_coord.hpp"
#include "tt_metal/impl/dispatch/dispatch_core_manager.hpp"
#include "tt_metal/impl/buffers/buffer.hpp"

namespace tt::tt_metal {
inline namespace v0 {
Expand Down Expand Up @@ -44,10 +45,29 @@ inline namespace v0 {
* | Argument | Description | Data type | Valid range | Required |
* |-------------|-------------------------------------------------|-------------------------|--------------------------------------------------|----------|
* | buffer | Buffer to send data to | Buffer & | | Yes |
* | host_buffer | Buffer on host to copy data from | std::vector<uint32_t> & | Host buffer size must match buffer | Yes |
 * | host_buffer | Buffer on host to copy data from                | Span<const uint8_t>     | Host buffer size must match buffer               | Yes      |
*/
void WriteToBuffer(Buffer &buffer, const std::vector<uint32_t> &host_buffer);
void WriteToBuffer( std::shared_ptr<Buffer> buffer, const std::vector<uint32_t> &host_buffer);
void WriteToBuffer(Buffer &buffer, tt::stl::Span<const uint8_t> host_buffer);
/**
* Copies data from a host buffer into the specified buffer
*
* Return value: void
*
* | Argument | Description | Data type | Valid range | Required |
* |-------------|-------------------------------------------------|-------------------------|--------------------------------------------------|----------|
* | buffer | Buffer to send data to | Buffer & | | Yes |
* | host_buffer | Buffer on host to copy data from | std::vector<DType> & | Host buffer size must match buffer | Yes |
*/
template<typename DType>
void WriteToBuffer(Buffer &buffer, const std::vector<DType>& host_buffer) {
    // Reinterpret the typed host vector as a raw byte view and delegate to the
    // byte-span overload, which performs the actual device transfer.
    const auto* raw_bytes = reinterpret_cast<const uint8_t*>(host_buffer.data());
    const size_t byte_count = host_buffer.size() * sizeof(DType);
    WriteToBuffer(buffer, tt::stl::Span<const uint8_t>(raw_bytes, byte_count));
}
template<typename DType>
void WriteToBuffer(std::shared_ptr<Buffer> buffer, const std::vector<DType>& host_buffer) {
    // Convenience overload for shared_ptr-held buffers: dereference and delegate
    // to the reference overload above.
    WriteToBuffer(*buffer, host_buffer);
}

void ReadFromBuffer(Buffer &buffer, uint8_t* host_buffer, bool shard_order = false);
/**
* Copies data from a buffer into a host buffer
*
Expand All @@ -56,12 +76,22 @@ inline namespace v0 {
* | Argument | Description | Data type | Valid range | Required |
* |-------------|-------------------------------------------------|-------------------------|--------------------------------------------------|----------|
* | buffer | Buffer to read data from | Buffer & | | Yes |
* | host_buffer | Buffer on host to copy data into | std::vector<uint32_t> & | | Yes |
* | host_buffer | Buffer on host to copy data into | std::vector<DType> & | | Yes |
* | shard_order | For a sharded buffer we can read in shard order | bool | | No |
*/
void ReadFromBuffer(Buffer &buffer, std::vector<uint32_t> &host_buffer, bool shard_order = false);
void ReadFromBuffer(std::shared_ptr<Buffer> buffer, std::vector<uint32_t> &host_buffer, bool shard_order = false);

template<typename DType>
void ReadFromBuffer(Buffer &buffer, std::vector<DType> &host_buffer, bool shard_order = false) {
    // Resize the host vector to hold the entire buffer, then read raw bytes into it
    // via the uint8_t* overload. buffer.size() is the total size in bytes.
    const auto buffer_size = buffer.size();
    TT_FATAL(buffer_size % sizeof(DType) == 0, "Buffer size is not divisible by dtype size");
    // Reuse the cached buffer_size rather than calling buffer.size() a second time.
    host_buffer.resize(buffer_size / sizeof(DType));
    ReadFromBuffer(buffer, reinterpret_cast<uint8_t*>(host_buffer.data()), shard_order);
}
template<typename DType>
void ReadFromBuffer(std::shared_ptr<Buffer> buffer, std::vector<DType> &host_buffer, bool shard_order = false) {
    // Convenience overload for shared_ptr-held buffers: dereference and delegate.
    ReadFromBuffer(*buffer, host_buffer, shard_order);
}

void ReadShard(Buffer &buffer, uint8_t* host_buffer, const uint32_t & core_id);
/**
* Copies data from a buffer into a host buffer
*
Expand All @@ -70,12 +100,14 @@ inline namespace v0 {
* | Argument | Description | Data type | Valid range | Required |
* |-------------|-------------------------------------------------|-------------------------|--------------------------------------------------|----------|
* | buffer | Buffer to read data from | Buffer & | | Yes |
* | host_buffer | Buffer on host to copy data into | std::vector<uint32_t> & | | Yes |
* | host_buffer | Buffer on host to copy data into | std::vector<DType> & | | Yes |
* | core_id | ID of core | const uint32_t & | | Yes |
*/
void ReadShard(Buffer &buffer, std::vector<uint32_t> &host_buffer, const uint32_t & core_id);


template<typename DType>
void ReadShard(Buffer &buffer, std::vector<DType> &host_buffer, const uint32_t & core_id) {
    // page_size() * shard_spec().size() is presumably the shard size in BYTES
    // (bytes per page * pages per shard) — TODO(review): confirm against Buffer.
    // Resize by element count, not byte count: the previous resize made the
    // vector<DType> sizeof(DType)x too large for any DType wider than one byte.
    // Mirrors the divisibility check in the ReadFromBuffer template.
    const auto shard_size_bytes = buffer.page_size() * buffer.shard_spec().size();
    TT_FATAL(shard_size_bytes % sizeof(DType) == 0, "Shard size is not divisible by dtype size");
    host_buffer.resize(shard_size_bytes / sizeof(DType));
    ReadShard(buffer, reinterpret_cast<uint8_t*>(host_buffer.data()), core_id);
}

// Launches all kernels on cores specified with kernels in the program.
// All kernels on a given Tensix core must be launched.
Expand Down
Loading

0 comments on commit b057e09

Please sign in to comment.