#0: Use TensorLayout in Tensor (#15028)
### Ticket

### Problem description
We're migrating the TTNN infrastructure to use the new `TensorLayout` everywhere, including `Tensor`.

### What's changed
- Introduced `TensorSpec`, which represents a `logical_shape` plus a `tensor_layout`.
- Used `TensorSpec` in `Tensor` instead of the separate `(shape, dtype, layout, tile)` fields.
- Refactored the `Tensor` constructors.
- Used `set_tensor_spec` instead of a collection of individual setters, removing the need to manually set `metadata_populated = true`.

A sketch of the resulting usage is shown below.
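The sketch strings the new calls together, using only APIs that appear in the diffs in this commit; the include path and the `example` wrapper function are illustrative assumptions, not part of the change.

```cpp
// Illustrative only: the include path and this wrapper are assumptions;
// the TensorSpec/TensorLayout calls are taken from the diffs below.
#include "ttnn/tensor/tensor.hpp"

using namespace tt::tt_metal;

void example(Device* device, const ttnn::SimpleShape& shape, const MemoryConfig& mem_cfg) {
    // A TensorSpec bundles the logical shape with its TensorLayout
    // (dtype + page config + memory config).
    TensorSpec tensor_spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg));

    // Size queries move from TensorLayout, which needed the shape passed in,
    // onto TensorSpec, which already carries it.
    size_t packed_bytes = tensor_spec.compute_packed_buffer_size_bytes();
    (void)packed_bytes;

    // Allocation and tensor creation take the spec instead of separate
    // (shape, layout) arguments.
    auto buffer = tensor_impl::allocate_buffer_on_device(device, tensor_spec);
    (void)buffer;
    auto tensor = create_device_tensor(tensor_spec, device);

    // Copying all metadata between tensors is a single call, with no manual
    // metadata_populated bookkeeping.
    Tensor readback_tensor(1);
    readback_tensor.set_tensor_spec(tensor.get_tensor_spec());
}
```

Bundling the shape with its layout means the size check, the allocation, and the tensor metadata can no longer disagree about which shape and layout they refer to.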

### Checklist
- [x] [Post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/11960140767)
- [ ] Blackhole Post commit (if applicable)
- [x] [Model regression CI testing passes](https://github.com/tenstorrent/tt-metal/actions/runs/11956334891)
- [x] [Device performance regression CI testing passes](https://github.com/tenstorrent/tt-metal/actions/runs/11956337771)
- [ ] New/Existing tests provide coverage for changes
sminakov-tt authored Nov 21, 2024
1 parent d8308d7 commit 43ebbcb
Showing 56 changed files with 641 additions and 523 deletions.
13 changes: 4 additions & 9 deletions tests/tt_eager/tensors/test_async_tensor_apis.cpp
@@ -42,7 +42,7 @@ TEST_F(CommonFixture, TestTensorOwnershipSanity) {
// Ensure that tensor data is copied and owned as expected
Device* device = this->devices_[0];
Tensor host_tensor = ttnn::numpy::arange<float>(0, 32 * 32 * 4, 1);
-Tensor readback_tensor({}, 1);
+Tensor readback_tensor(1);

auto func = [device, host_tensor, readback_tensor]() mutable {
// Ensure that both the lambda and global scope have ownership to this tensor
@@ -67,9 +67,7 @@ TEST_F(CommonFixture, TestTensorOwnershipSanity) {
auto device_tensor = reshaped_tensor.to(Layout::TILE).to(device);
auto thread_local_tensor = device_tensor.cpu().to(Layout::ROW_MAJOR);
readback_tensor.set_storage(thread_local_tensor.get_storage());
-readback_tensor.set_shape(thread_local_tensor.get_shape());
-readback_tensor.set_dtype(thread_local_tensor.get_dtype());
-readback_tensor.set_layout(thread_local_tensor.get_layout());
+readback_tensor.set_tensor_spec(thread_local_tensor.get_tensor_spec());
readback_tensor.tensor_attributes->metadata_populated = true;
readback_tensor.tensor_attributes->num_workers_completed++;
// Ensure that the readback buffer is owned inside and outside the lambda
@@ -240,8 +238,7 @@ TEST_F(CommonFixture, TestTensorAsyncDataMovement) {
uint32_t tensor_start = 0;
uint32_t num_tiles = 128;
uint32_t tensor_stop = TILE_HEIGHT * TILE_WIDTH * num_tiles;
-Tensor readback_tensor({}, 1);
-;
+Tensor readback_tensor(1);
std::thread worker;

{
@@ -278,9 +275,7 @@ TEST_F(CommonFixture, TestTensorAsyncDataMovement) {
auto thread_local_tensor = device_tensor.cpu().to(Layout::ROW_MAJOR);
log_info(LogTest, "Worker populating empty host readback_tensor");
readback_tensor.set_storage(thread_local_tensor.get_storage());
-readback_tensor.set_shape(thread_local_tensor.get_shape());
-readback_tensor.set_dtype(thread_local_tensor.get_dtype());
-readback_tensor.set_layout(thread_local_tensor.get_layout());
+readback_tensor.set_tensor_spec(thread_local_tensor.get_tensor_spec());
readback_tensor.tensor_attributes->metadata_populated = true;
readback_tensor.tensor_attributes->num_workers_completed++;
// Ensure that this buffer is currently owned by both the thread_local and read_back tensors
@@ -27,7 +27,7 @@ void test_tensor_on_device(const ttnn::SimpleShape& input_shape, const TensorLay
host_data[i] = i % random_prime_number;
}

-auto tensor = tt::tt_metal::create_device_tensor(input_shape, layout, device);
+auto tensor = tt::tt_metal::create_device_tensor(TensorSpec(input_shape, layout), device);
ttnn::queue_synchronize(device->command_queue(io_cq));

ttnn::write_buffer(io_cq, tensor, {host_data});
6 changes: 3 additions & 3 deletions tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp
@@ -37,9 +37,9 @@ void run_create_tensor_test(tt::tt_metal::Device* device, ttnn::SimpleShape inpu
host_data[i] = 1;
}

-tt::tt_metal::TensorLayout tensor_layout(dtype, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(input_buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(input_shape));
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, input_shape, tensor_layout);
+TensorSpec tensor_spec(input_shape, TensorLayout(dtype, PageConfig(Layout::TILE), mem_cfg));
+ASSERT_EQ(input_buf_size_datums * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);

auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};

@@ -38,7 +38,7 @@ class CreateTensorWithLayoutTest : public ttnn::TTNNFixtureWithDevice, public ::
TEST_P(CreateTensorWithLayoutTest, Tile) {
CreateTensorParams params = GetParam();

-auto tensor = tt::tt_metal::create_device_tensor(params.inputs.shape, params.inputs.layout, device_);
+auto tensor = tt::tt_metal::create_device_tensor(TensorSpec(params.inputs.shape, params.inputs.layout), device_);
EXPECT_EQ(tensor.get_padded_shape(), params.expected.padded_shape);
EXPECT_EQ(tensor.get_logical_shape(), params.inputs.shape);
}
13 changes: 7 additions & 6 deletions tests/ttnn/unit_tests/gtests/test_async_runtime.cpp
@@ -58,8 +58,8 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) {
tt_metal::TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
ASSERT_EQ(input_buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(input_shape.padded_shape()));
ASSERT_EQ(output_buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(np_out.get_padded_shape()));
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, input_shape.padded_shape(), tensor_layout);
-auto output_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, np_out.get_padded_shape(), tensor_layout);
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(input_shape.padded_shape(), tensor_layout));
+auto output_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(np_out.get_padded_shape(), tensor_layout));
auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
auto output_storage = tt::tt_metal::DeviceStorage{output_buffer};
Tensor input_tensor = Tensor(input_storage, input_shape, DataType::BFLOAT16, Layout::TILE);
@@ -124,7 +124,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) {
auto workload_event = std::make_shared<Event>();
TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(shape));
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(shape, tensor_layout));
auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);
ttnn::write_buffer(io_cq, input_tensor, {host_data}); // Write using cq 1
@@ -134,10 +134,10 @@

// Run operation on cq 0
Tensor output_tensor = ttnn::sqrt(workload_dispatch_cq, input_tensor);
-auto dummy_buffer_0 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+auto dummy_buffer_0 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(shape, tensor_layout));
output_tensor = ttnn::neg(workload_dispatch_cq, output_tensor);
// Allocate this buffer to stress test async allocation across op execution and explicit allocation
-auto dummy_buffer_1 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+auto dummy_buffer_1 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(shape, tensor_layout));
// Record cq 0 prog execution
ttnn::record_event(device->command_queue(workload_dispatch_cq), workload_event);
// Wait until cq 0 prog execution is done
@@ -169,9 +169,10 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeBufferDestructor) {
// This will asynchronously allocate the buffer, wait for the allocation to complete (address to be assigned to the buffer), destroy the buffer (which will asynchronously
// deallocate the buffer) in a loop
TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
+TensorSpec tensor_spec(shape, tensor_layout);
for (int loop = 0; loop < 100000; loop++) {
{
-auto input_buffer_dummy = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+auto input_buffer_dummy = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);
device->synchronize();
}
}
12 changes: 6 additions & 6 deletions tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp
@@ -156,9 +156,9 @@ TEST(GalaxyTests, TestAllGatherDeadlock) {
log_info(LogTest, "Running iteration {}", i);
}
for (auto& dev : devs) {
-tt::tt_metal::TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(shape));
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(dev, shape, tensor_layout);
+TensorSpec tensor_spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg));
+ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(dev, tensor_spec);
auto input_storage = DeviceStorage{input_buffer};
Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);
// Push inputs.
@@ -253,10 +253,10 @@ TEST(GalaxyTests, TestReduceScatterDeadlock) {
log_info(LogTest, "Running iteration {}", i);
}

-TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(shape));
+TensorSpec tensor_spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg));
+ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
for (auto& dev : ring_devices) {
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(dev, shape, tensor_layout);
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(dev, tensor_spec);
auto input_storage = DeviceStorage{input_buffer};
Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);
// Push inputs.
16 changes: 8 additions & 8 deletions tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp
@@ -60,9 +60,9 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ1) {
for (int j = 0; j < buf_size_datums; j++) {
host_data[j] = bfloat16(static_cast<float>(i + dev_idx));
}
-tt_metal::TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(shape));
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+TensorSpec tensor_spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg));
+ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);
auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);

@@ -102,8 +102,8 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ0) {
auto host_data = std::shared_ptr<bfloat16 []>(new bfloat16[buf_size_datums]);
auto readback_data = std::shared_ptr<bfloat16 []>(new bfloat16[buf_size_datums]);

-TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(shape));
+TensorSpec tensor_spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg));
+ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
for (int outer_loop = 0; outer_loop < 5; outer_loop++) {
log_info(LogTest, "Running outer loop {}", outer_loop);
for (int i = 0; i < 30; i++) {
@@ -115,7 +115,7 @@
for (int j = 0; j < buf_size_datums; j++) {
host_data[j] = bfloat16(static_cast<float>(i + dev_idx));
}
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);
auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);

@@ -167,8 +167,8 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceWithCQ1Only) {
host_data[j] = bfloat16(static_cast<float>(i + dev_idx));
}

-TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+TensorSpec tensor_spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg));
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);
auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);

12 changes: 7 additions & 5 deletions tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp
@@ -54,8 +54,9 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestMultiProducerLockBasedQueue) {
}
// Allocate and write buffer
tt_metal::TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(tensor_buf_size * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(tensor_shape));
-auto t0_input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_shape, tensor_layout);
+tt_metal::TensorSpec tensor_spec(tensor_shape, tensor_layout);
+ASSERT_EQ(tensor_buf_size * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
+auto t0_input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);
auto t0_input_storage = tt::tt_metal::DeviceStorage{t0_input_buffer};
Tensor t0_input_tensor = Tensor(t0_input_storage, tensor_shape, DataType::BFLOAT16, Layout::TILE);
ttnn::write_buffer(t0_io_cq, t0_input_tensor, {t0_host_data});
@@ -71,12 +72,13 @@

std::thread t1([&]() {
TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(tensor_buf_size * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(tensor_shape));
+TensorSpec tensor_spec(tensor_shape, tensor_layout);
+ASSERT_EQ(tensor_buf_size * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
for (int j = 0; j < 100; j++) {
for (int i = 0; i < tensor_buf_size; i++) {
t1_host_data[i] = bfloat16(static_cast<float>(4 + j));
}
-auto t1_input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_shape, tensor_layout);
+auto t1_input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);
auto t1_input_storage = tt::tt_metal::DeviceStorage{t1_input_buffer};
Tensor t1_input_tensor = Tensor(t1_input_storage, tensor_shape, DataType::BFLOAT16, Layout::TILE);

@@ -124,7 +126,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestMultiAppThreadSync) {
ttnn::SimpleShape tensor_shape{1, 1, 1024, 1024};
auto host_data = std::shared_ptr<bfloat16 []>(new bfloat16[tensor_buf_size]);
TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-auto allocated_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_shape, tensor_layout);
+auto allocated_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(tensor_shape, tensor_layout));
auto allocated_storage = tt::tt_metal::DeviceStorage{allocated_buffer};
auto allocated_tensor = Tensor(allocated_storage, tensor_shape, DataType::BFLOAT16, Layout::TILE);
auto readback_data = std::shared_ptr<bfloat16 []>(new bfloat16[tensor_buf_size]);
9 changes: 7 additions & 2 deletions tests/ttnn/unit_tests/operations/test_creation.py
@@ -259,16 +259,21 @@ def test_zeros(device, input_shape):
[
[32, 32],
[5, 96, 64],
+[1, 50257],
],
)
@pytest.mark.parametrize(
"fill_value",
[-5.25, 0, 1.0],
)
-def test_full(device, input_shape, fill_value):
+@pytest.mark.parametrize(
+"layout",
+[ttnn.Layout.ROW_MAJOR, ttnn.Layout.TILE],
+)
+def test_full(device, input_shape, fill_value, layout):
torch_tensor = torch.full(input_shape, dtype=torch.bfloat16, fill_value=fill_value)

-tensor = ttnn.full(input_shape, device=device, fill_value=fill_value)
+tensor = ttnn.full(input_shape, device=device, fill_value=fill_value, layout=layout)
assert ttnn.is_tensor_storage_on_device(tensor)
tensor = ttnn.to_torch(tensor)
