#0: Use TensorLayout in Tensor (#15028)
### Ticket

### Problem description
We're migrating the TTNN infrastructure to use the new `TensorLayout` everywhere, including `Tensor`.

### What's changed
- Introduced `TensorSpec`, which represents a `logical_shape` plus a `tensor_layout`.
- Used `TensorSpec` in `Tensor` instead of the separate `(shape, dtype, layout, tile)` fields.
- Refactored the `Tensor` constructors.
- Used `set_tensor_spec` instead of a collection of individual setters, removing the need to manually set `metadata_populated = true`.

A sketch of the resulting usage is shown below.
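The sketch strings the new calls together, using only APIs that appear in the diffs in this commit; the include path and the `example` wrapper function are illustrative assumptions, not part of the change.

```cpp
// Illustrative only: the include path and this wrapper are assumptions;
// the TensorSpec/TensorLayout calls are taken from the diffs below.
#include "ttnn/tensor/tensor.hpp"

using namespace tt::tt_metal;

void example(Device* device, const ttnn::SimpleShape& shape, const MemoryConfig& mem_cfg) {
    // A TensorSpec bundles the logical shape with its TensorLayout
    // (dtype + page config + memory config).
    TensorSpec tensor_spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg));

    // Size queries move from TensorLayout, which needed the shape passed in,
    // onto TensorSpec, which already carries it.
    size_t packed_bytes = tensor_spec.compute_packed_buffer_size_bytes();
    (void)packed_bytes;

    // Allocation and tensor creation take the spec instead of separate
    // (shape, layout) arguments.
    auto buffer = tensor_impl::allocate_buffer_on_device(device, tensor_spec);
    (void)buffer;
    auto tensor = create_device_tensor(tensor_spec, device);

    // Copying all metadata between tensors is a single call, with no manual
    // metadata_populated bookkeeping.
    Tensor readback_tensor(1);
    readback_tensor.set_tensor_spec(tensor.get_tensor_spec());
}
```

Bundling the shape with its layout means the size check, the allocation, and the tensor metadata can no longer disagree about which shape and layout they refer to.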

### Checklist
- [x] [Post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/11960140767)
- [ ] Blackhole Post commit (if applicable)
- [x] [Model regression CI testing passes](https://github.com/tenstorrent/tt-metal/actions/runs/11956334891)
- [x] [Device performance regression CI testing passes](https://github.com/tenstorrent/tt-metal/actions/runs/11956337771)
- [ ] New/Existing tests provide coverage for changes
sminakov-tt authored Nov 21, 2024
1 parent d8308d7 commit 43ebbcb
Showing 56 changed files with 641 additions and 523 deletions.
13 changes: 4 additions & 9 deletions tests/tt_eager/tensors/test_async_tensor_apis.cpp
@@ -42,7 +42,7 @@ TEST_F(CommonFixture, TestTensorOwnershipSanity) {
// Ensure that tensor data is copied and owned as expected
Device* device = this->devices_[0];
Tensor host_tensor = ttnn::numpy::arange<float>(0, 32 * 32 * 4, 1);
-Tensor readback_tensor({}, 1);
+Tensor readback_tensor(1);

auto func = [device, host_tensor, readback_tensor]() mutable {
// Ensure that both the lambda and global scope have ownership to this tensor
@@ -67,9 +67,7 @@ TEST_F(CommonFixture, TestTensorOwnershipSanity) {
auto device_tensor = reshaped_tensor.to(Layout::TILE).to(device);
auto thread_local_tensor = device_tensor.cpu().to(Layout::ROW_MAJOR);
readback_tensor.set_storage(thread_local_tensor.get_storage());
-readback_tensor.set_shape(thread_local_tensor.get_shape());
-readback_tensor.set_dtype(thread_local_tensor.get_dtype());
-readback_tensor.set_layout(thread_local_tensor.get_layout());
+readback_tensor.set_tensor_spec(thread_local_tensor.get_tensor_spec());
readback_tensor.tensor_attributes->metadata_populated = true;
readback_tensor.tensor_attributes->num_workers_completed++;
// Ensure that the readback buffer is owned inside and outside the lambda
@@ -240,8 +238,7 @@ TEST_F(CommonFixture, TestTensorAsyncDataMovement) {
uint32_t tensor_start = 0;
uint32_t num_tiles = 128;
uint32_t tensor_stop = TILE_HEIGHT * TILE_WIDTH * num_tiles;
-Tensor readback_tensor({}, 1);
-;
+Tensor readback_tensor(1);
std::thread worker;

{
@@ -278,9 +275,7 @@ TEST_F(CommonFixture, TestTensorAsyncDataMovement) {
auto thread_local_tensor = device_tensor.cpu().to(Layout::ROW_MAJOR);
log_info(LogTest, "Worker populating empty host readback_tensor");
readback_tensor.set_storage(thread_local_tensor.get_storage());
-readback_tensor.set_shape(thread_local_tensor.get_shape());
-readback_tensor.set_dtype(thread_local_tensor.get_dtype());
-readback_tensor.set_layout(thread_local_tensor.get_layout());
+readback_tensor.set_tensor_spec(thread_local_tensor.get_tensor_spec());
readback_tensor.tensor_attributes->metadata_populated = true;
readback_tensor.tensor_attributes->num_workers_completed++;
// Ensure that this buffer is currently owned by both the thread_local and read_back tensors
@@ -27,7 +27,7 @@ void test_tensor_on_device(const ttnn::SimpleShape& input_shape, const TensorLay
host_data[i] = i % random_prime_number;
}

-auto tensor = tt::tt_metal::create_device_tensor(input_shape, layout, device);
+auto tensor = tt::tt_metal::create_device_tensor(TensorSpec(input_shape, layout), device);
ttnn::queue_synchronize(device->command_queue(io_cq));

ttnn::write_buffer(io_cq, tensor, {host_data});
6 changes: 3 additions & 3 deletions tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp
@@ -37,9 +37,9 @@ void run_create_tensor_test(tt::tt_metal::Device* device, ttnn::SimpleShape inpu
host_data[i] = 1;
}

-tt::tt_metal::TensorLayout tensor_layout(dtype, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(input_buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(input_shape));
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, input_shape, tensor_layout);
+TensorSpec tensor_spec(input_shape, TensorLayout(dtype, PageConfig(Layout::TILE), mem_cfg));
+ASSERT_EQ(input_buf_size_datums * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);

auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};

@@ -38,7 +38,7 @@ class CreateTensorWithLayoutTest : public ttnn::TTNNFixtureWithDevice, public ::
TEST_P(CreateTensorWithLayoutTest, Tile) {
CreateTensorParams params = GetParam();

-auto tensor = tt::tt_metal::create_device_tensor(params.inputs.shape, params.inputs.layout, device_);
+auto tensor = tt::tt_metal::create_device_tensor(TensorSpec(params.inputs.shape, params.inputs.layout), device_);
EXPECT_EQ(tensor.get_padded_shape(), params.expected.padded_shape);
EXPECT_EQ(tensor.get_logical_shape(), params.inputs.shape);
}
13 changes: 7 additions & 6 deletions tests/ttnn/unit_tests/gtests/test_async_runtime.cpp
@@ -58,8 +58,8 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncPreallocatedOutputs) {
tt_metal::TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
ASSERT_EQ(input_buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(input_shape.padded_shape()));
ASSERT_EQ(output_buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(np_out.get_padded_shape()));
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, input_shape.padded_shape(), tensor_layout);
-auto output_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, np_out.get_padded_shape(), tensor_layout);
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(input_shape.padded_shape(), tensor_layout));
+auto output_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(np_out.get_padded_shape(), tensor_layout));
auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
auto output_storage = tt::tt_metal::DeviceStorage{output_buffer};
Tensor input_tensor = Tensor(input_storage, input_shape, DataType::BFLOAT16, Layout::TILE);
@@ -124,7 +124,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeAllocatedBuffers) {
auto workload_event = std::make_shared<Event>();
TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(shape));
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(shape, tensor_layout));
auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);
ttnn::write_buffer(io_cq, input_tensor, {host_data}); // Write using cq 1
@@ -134,10 +134,10 @@

// Run operation on cq 0
Tensor output_tensor = ttnn::sqrt(workload_dispatch_cq, input_tensor);
-auto dummy_buffer_0 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+auto dummy_buffer_0 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(shape, tensor_layout));
output_tensor = ttnn::neg(workload_dispatch_cq, output_tensor);
// Allocate this buffer to stress test async allocation across op execution and explicit allocation
-auto dummy_buffer_1 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+auto dummy_buffer_1 = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(shape, tensor_layout));
// Record cq 0 prog execution
ttnn::record_event(device->command_queue(workload_dispatch_cq), workload_event);
// Wait until cq 0 prog execution is done
@@ -169,9 +169,10 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestAsyncRuntimeBufferDestructor) {
// This will asynchronously allocate the buffer, wait for the allocation to complete (address to be assigned to the buffer), destroy the buffer (which will asynchronously
// deallocate the buffer) in a loop
TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
+TensorSpec tensor_spec(shape, tensor_layout);
for (int loop = 0; loop < 100000; loop++) {
{
-auto input_buffer_dummy = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+auto input_buffer_dummy = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);
device->synchronize();
}
}
12 changes: 6 additions & 6 deletions tests/ttnn/unit_tests/gtests/test_ccl_on_galaxy.cpp
@@ -156,9 +156,9 @@ TEST(GalaxyTests, TestAllGatherDeadlock) {
log_info(LogTest, "Running iteration {}", i);
}
for (auto& dev : devs) {
-tt::tt_metal::TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(shape));
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(dev, shape, tensor_layout);
+TensorSpec tensor_spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg));
+ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(dev, tensor_spec);
auto input_storage = DeviceStorage{input_buffer};
Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);
// Push inputs.
@@ -253,10 +253,10 @@ TEST(GalaxyTests, TestReduceScatterDeadlock) {
log_info(LogTest, "Running iteration {}", i);
}

-TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(shape));
+TensorSpec tensor_spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg));
+ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
for (auto& dev : ring_devices) {
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(dev, shape, tensor_layout);
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(dev, tensor_spec);
auto input_storage = DeviceStorage{input_buffer};
Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);
// Push inputs.
16 changes: 8 additions & 8 deletions tests/ttnn/unit_tests/gtests/test_multi_cq_multi_dev.cpp
@@ -60,9 +60,9 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ1) {
for (int j = 0; j < buf_size_datums; j++) {
host_data[j] = bfloat16(static_cast<float>(i + dev_idx));
}
-tt_metal::TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(shape));
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+TensorSpec tensor_spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg));
+ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);
auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);

@@ -102,8 +102,8 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceProgramsOnCQ0) {
auto host_data = std::shared_ptr<bfloat16 []>(new bfloat16[buf_size_datums]);
auto readback_data = std::shared_ptr<bfloat16 []>(new bfloat16[buf_size_datums]);

-TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(shape));
+TensorSpec tensor_spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg));
+ASSERT_EQ(buf_size_datums * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
for (int outer_loop = 0; outer_loop < 5; outer_loop++) {
log_info(LogTest, "Running outer loop {}", outer_loop);
for (int i = 0; i < 30; i++) {
@@ -115,7 +115,7 @@
for (int j = 0; j < buf_size_datums; j++) {
host_data[j] = bfloat16(static_cast<float>(i + dev_idx));
}
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);
auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);

@@ -167,8 +167,8 @@ TEST_F(MultiCommandQueueT3KFixture, Test2CQMultiDeviceWithCQ1Only) {
host_data[j] = bfloat16(static_cast<float>(i + dev_idx));
}

-TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, shape, tensor_layout);
+TensorSpec tensor_spec(shape, TensorLayout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg));
+auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);
auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};
Tensor input_tensor = Tensor(input_storage, shape, DataType::BFLOAT16, Layout::TILE);

12 changes: 7 additions & 5 deletions tests/ttnn/unit_tests/gtests/test_multiprod_queue.cpp
@@ -54,8 +54,9 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestMultiProducerLockBasedQueue) {
}
// Allocate and write buffer
tt_metal::TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(tensor_buf_size * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(tensor_shape));
-auto t0_input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_shape, tensor_layout);
+tt_metal::TensorSpec tensor_spec(tensor_shape, tensor_layout);
+ASSERT_EQ(tensor_buf_size * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
+auto t0_input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);
auto t0_input_storage = tt::tt_metal::DeviceStorage{t0_input_buffer};
Tensor t0_input_tensor = Tensor(t0_input_storage, tensor_shape, DataType::BFLOAT16, Layout::TILE);
ttnn::write_buffer(t0_io_cq, t0_input_tensor, {t0_host_data});
@@ -71,12 +72,13 @@

std::thread t1([&]() {
TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-ASSERT_EQ(tensor_buf_size * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(tensor_shape));
+TensorSpec tensor_spec(tensor_shape, tensor_layout);
+ASSERT_EQ(tensor_buf_size * datum_size_bytes, tensor_spec.compute_packed_buffer_size_bytes());
for (int j = 0; j < 100; j++) {
for (int i = 0; i < tensor_buf_size; i++) {
t1_host_data[i] = bfloat16(static_cast<float>(4 + j));
}
-auto t1_input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_shape, tensor_layout);
+auto t1_input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_spec);
auto t1_input_storage = tt::tt_metal::DeviceStorage{t1_input_buffer};
Tensor t1_input_tensor = Tensor(t1_input_storage, tensor_shape, DataType::BFLOAT16, Layout::TILE);

@@ -124,7 +126,7 @@ TEST_F(MultiCommandQueueSingleDeviceFixture, TestMultiAppThreadSync) {
ttnn::SimpleShape tensor_shape{1, 1, 1024, 1024};
auto host_data = std::shared_ptr<bfloat16 []>(new bfloat16[tensor_buf_size]);
TensorLayout tensor_layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
-auto allocated_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, tensor_shape, tensor_layout);
+auto allocated_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, TensorSpec(tensor_shape, tensor_layout));
auto allocated_storage = tt::tt_metal::DeviceStorage{allocated_buffer};
auto allocated_tensor = Tensor(allocated_storage, tensor_shape, DataType::BFLOAT16, Layout::TILE);
auto readback_data = std::shared_ptr<bfloat16 []>(new bfloat16[tensor_buf_size]);
9 changes: 7 additions & 2 deletions tests/ttnn/unit_tests/operations/test_creation.py
@@ -259,16 +259,21 @@ def test_zeros(device, input_shape):
[
[32, 32],
[5, 96, 64],
+[1, 50257],
],
)
@pytest.mark.parametrize(
"fill_value",
[-5.25, 0, 1.0],
)
-def test_full(device, input_shape, fill_value):
+@pytest.mark.parametrize(
+"layout",
+[ttnn.Layout.ROW_MAJOR, ttnn.Layout.TILE],
+)
+def test_full(device, input_shape, fill_value, layout):
torch_tensor = torch.full(input_shape, dtype=torch.bfloat16, fill_value=fill_value)

-tensor = ttnn.full(input_shape, device=device, fill_value=fill_value)
+tensor = ttnn.full(input_shape, device=device, fill_value=fill_value, layout=layout)
assert ttnn.is_tensor_storage_on_device(tensor)
tensor = ttnn.to_torch(tensor)
