Introduce TensorLayout in Tensor implementation (#13607)
ayerofieiev-tt authored Oct 31, 2024
1 parent 948fafb commit f68c436
Showing 49 changed files with 1,756 additions and 570 deletions.
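
The migration below centers on the new tt::tt_metal::TensorLayout class. A minimal sketch of the pattern the changed call sites adopt, reconstructed only from lines in this diff (the header path comes from the new test files; the dtype, memory config fields, and variable names are illustrative assumptions):

// Sketch only: reconstructed from call sites in this commit, not the full API.
#include "ttnn/tensor/layout/tensor_layout.hpp"

using namespace tt::tt_metal;

// Assumed memory config; only INTERLEAVED is visible in this diff.
MemoryConfig mem_cfg{.memory_layout = TensorMemoryLayout::INTERLEAVED};

// A TensorLayout bundles dtype, page layout, and memory config in one object.
TensorLayout layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);

// It then derives sizes and padded shapes that callers used to compute by hand:
ttnn::SimpleShape shape{1, 1, 32, 32};
const auto buffer_size_bytes = layout.compute_packed_buffer_size_bytes(shape);
const auto padded_shape = layout.compute_padded_shape(shape);

// Device tensors are created directly from shape + layout (see the new test utils):
// auto tensor = create_device_tensor(shape, layout, device);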
CMakeLists.txt (2 changes: 1 addition & 1 deletion)
@@ -215,7 +215,7 @@ target_compile_options(
)

# add additional compile warning flags depending on the compiler
- ADJUST_COMPILER_WARNINGS()
+ ADJUST_METAL_COMPILER_WARNINGS()

add_library(compiler_flags INTERFACE)
target_link_libraries(
cmake/compilers.cmake (5 changes: 4 additions & 1 deletion)
@@ -38,8 +38,9 @@ function(CHECK_COMPILERS)
endif()
endfunction()

- function(ADJUST_COMPILER_WARNINGS)
+ function(ADJUST_METAL_COMPILER_WARNINGS)
if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+ message(STATUS "Adjusting compiler warnings for Clang")
target_compile_options(
compiler_warnings
INTERFACE
@@ -56,10 +57,12 @@ function(ADJUST_COMPILER_WARNINGS)
-Wno-deprecated-declarations
)
else() # GCC-12 or higher
+ message(STATUS "Adjusting compiler warnings for GCC")
target_compile_options(
compiler_warnings
INTERFACE
-Wno-deprecated
+ -Wno-deprecated-declarations
-Wno-attributes
-Wno-stringop-overread
-Wno-stringop-overflow
tests/tt_metal/tt_metal/test_bcast.cpp (10 changes: 5 additions & 5 deletions)
@@ -164,7 +164,7 @@ int main(int argc, char **argv) {
ref_bcast_values[j] = bfloat16(bcast_1value+(j%7)).to_uint16();
// convert the reference broadcast tensor to tiled format
tiled_bcast_values = convert_layout<uint16_t>(
- ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
+ ref_bcast_values, ref_bcast_shape, tests::utils::TensorLayoutType::LIN_ROW_MAJOR, tests::utils::TensorLayoutType::TILED_NFACES);
TT_FATAL(tiled_bcast_values[0] == bcast_1value16, "Error");
// restore ref values and shape to 1
ref_bcast_shape[3] = 1;
@@ -184,7 +184,7 @@ int main(int argc, char **argv) {
// add something not too large but different between tiles
ref_bcast_values[j] = bfloat16(bcast_1value+(j%7)).to_uint16();
tiled_bcast_values = convert_layout<uint16_t>(
- ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
+ ref_bcast_values, ref_bcast_shape, tests::utils::TensorLayoutType::LIN_ROW_MAJOR, tests::utils::TensorLayoutType::TILED_NFACES);
num_bcast_tiles = NC*Wt;
// restore values and shape to W
} else if (bcast_dim == BcastDim::W) {
@@ -195,7 +195,7 @@ int main(int argc, char **argv) {
// add something not too large but different between tiles
ref_bcast_values[j] = bfloat16(bcast_1value+(j%7)).to_uint16();
tiled_bcast_values = convert_layout<uint16_t>(
- ref_bcast_values, ref_bcast_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
+ ref_bcast_values, ref_bcast_shape, tests::utils::TensorLayoutType::LIN_ROW_MAJOR, tests::utils::TensorLayoutType::TILED_NFACES);
num_bcast_tiles = NC*Ht;
}

@@ -314,15 +314,15 @@ int main(int argc, char **argv) {
// recover a linear view of input vector for consumption by gold_ function
auto u16_src0_vec = u16_from_u32_vector(src0_vec);
vector<uint16_t> src_linear = convert_layout<uint16_t>(
- u16_src0_vec, shape, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
+ u16_src0_vec, shape, tests::utils::TensorLayoutType::TILED_NFACES, tests::utils::TensorLayoutType::LIN_ROW_MAJOR);
vector<uint16_t> gold_added = gold_bcast_op(
src_linear, shape, ref_bcast_values, bcast_dim, bcast_op); // result is uint16_t untilized

// Tilize from row major and convert to pairs (uint32_t)
vector<uint32_t> shapeR{shape[0], shape[1], shape[2], shape[3]};
auto gold_4f_u32 = u32_from_u16_vector(
convert_layout<uint16_t>(
- gold_added, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES));
+ gold_added, shapeR, tests::utils::TensorLayoutType::LIN_ROW_MAJOR, tests::utils::TensorLayoutType::TILED_NFACES));

pass &= packed_uint32_t_vector_comparison(result_vec, gold_4f_u32, comparison_function, &argfail);
if (!pass)
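Every test change in this commit is the same mechanical substitution: the test-utility enum previously named TensorLayout is renamed and moved into the tests::utils namespace as TensorLayoutType, freeing the TensorLayout identifier for the new class. A plausible sketch of the relocated enum; only the two enumerator names are confirmed by this diff, the rest is an assumption:

namespace tests::utils {
// Assumed declaration; enumerator names taken from the call sites above.
enum class TensorLayoutType {
    LIN_ROW_MAJOR,  // flat row-major element order
    TILED_NFACES,   // 32x32 tiles, subdivided into faces
};
}  // namespace tests::utils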
tests/tt_metal/tt_metal/test_bfp4_conversion.cpp (6 changes: 3 additions & 3 deletions)
@@ -32,7 +32,7 @@ int main(int argc, char **argv) {
}

std::vector<uint32_t> shape_vec = {1, num_tiles, 32, 32};
- std::vector<float> tiled_fp32_vec = convert_layout(fp32_vec, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
+ std::vector<float> tiled_fp32_vec = convert_layout(fp32_vec, shape_vec, tests::utils::TensorLayoutType::LIN_ROW_MAJOR, tests::utils::TensorLayoutType::TILED_NFACES);

std::vector<uint32_t> packed_bfp4b_tile_vec_rm_in = pack_fp32_vec_as_bfp4_tiles(fp32_vec, /*row_major_input=*/true, /*is_exp_a=*/false);
std::vector<float> unpacked_bfp4b_tile_vec_rm_out = unpack_bfp4_tiles_into_float_vec(packed_bfp4b_tile_vec_rm_in, /*row_major_output*/true, /*is_exp_a=*/false);
@@ -44,8 +44,8 @@ int main(int argc, char **argv) {
// ////////////////////////////////////////////////////////////////////////////
// // Validation
// ////////////////////////////////////////////////////////////////////////////
- std::vector<float> tiled_to_rm_fp32_vec = convert_layout(unpacked_bfp4b_tile_vec_tile_out, shape_vec, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
- std::vector<float> rm_to_tiled_fp32_vec = convert_layout(unpacked_bfp4b_tile_vec_rm_out, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
+ std::vector<float> tiled_to_rm_fp32_vec = convert_layout(unpacked_bfp4b_tile_vec_tile_out, shape_vec, tests::utils::TensorLayoutType::TILED_NFACES, tests::utils::TensorLayoutType::LIN_ROW_MAJOR);
+ std::vector<float> rm_to_tiled_fp32_vec = convert_layout(unpacked_bfp4b_tile_vec_rm_out, shape_vec, tests::utils::TensorLayoutType::LIN_ROW_MAJOR, tests::utils::TensorLayoutType::TILED_NFACES);

// Ensure that passing in row_major_input=true and row_major_output=true are inverses of row_major_input=false and row_major_output=false yield the same result
pass &= (packed_bfp4b_tile_vec_rm_in == packed_bfp4b_tile_vec_tile_in);
tests/tt_metal/tt_metal/test_bfp8_conversion.cpp (6 changes: 3 additions & 3 deletions)
@@ -32,7 +32,7 @@ int main(int argc, char **argv) {
}

std::vector<uint32_t> shape_vec = {1, 1, 32, 32};
- std::vector<float> tiled_fp32_vec = convert_layout(fp32_vec, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
+ std::vector<float> tiled_fp32_vec = convert_layout(fp32_vec, shape_vec, tests::utils::TensorLayoutType::LIN_ROW_MAJOR, tests::utils::TensorLayoutType::TILED_NFACES);

std::vector<uint32_t> packed_bfp8b_tile_vec_rm_in = pack_fp32_vec_as_bfp8_tiles(fp32_vec, /*row_major_input=*/true, /*is_exp_a=*/false);
std::vector<float> unpacked_bfp8b_tile_vec_rm_out = unpack_bfp8_tiles_into_float_vec(packed_bfp8b_tile_vec_rm_in, /*row_major_output*/true, /*is_exp_a=*/false);
@@ -44,8 +44,8 @@ int main(int argc, char **argv) {
// ////////////////////////////////////////////////////////////////////////////
// // Validation
// ////////////////////////////////////////////////////////////////////////////
- std::vector<float> tiled_to_rm_fp32_vec = convert_layout(unpacked_bfp8b_tile_vec_tile_out, shape_vec, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
- std::vector<float> rm_to_tiled_fp32_vec = convert_layout(unpacked_bfp8b_tile_vec_rm_out, shape_vec, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
+ std::vector<float> tiled_to_rm_fp32_vec = convert_layout(unpacked_bfp8b_tile_vec_tile_out, shape_vec, tests::utils::TensorLayoutType::TILED_NFACES, tests::utils::TensorLayoutType::LIN_ROW_MAJOR);
+ std::vector<float> rm_to_tiled_fp32_vec = convert_layout(unpacked_bfp8b_tile_vec_rm_out, shape_vec, tests::utils::TensorLayoutType::LIN_ROW_MAJOR, tests::utils::TensorLayoutType::TILED_NFACES);

// Ensure that passing in row_major_input=true and row_major_output=true are inverses of row_major_input=false and row_major_output=false yield the same result
pass &= (packed_bfp8b_tile_vec_rm_in == packed_bfp8b_tile_vec_tile_in);
tests/tt_metal/tt_metal/test_bmm.cpp (6 changes: 3 additions & 3 deletions)
@@ -165,13 +165,13 @@ int main(int argc, char **argv) {
vector<uint32_t> shapeC = {1, B, Mt*32, Nt*32};
auto u16_src0_vec = u16_from_u32_vector(src0_vec);
auto u16_src1_vec = u16_from_u32_vector(src1_vec);
- vector<uint16_t> src0_linear = convert_layout<uint16_t>(u16_src0_vec, shapeA, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
- vector<uint16_t> src1_linear = convert_layout<uint16_t>(u16_src1_vec, shapeB, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
+ vector<uint16_t> src0_linear = convert_layout<uint16_t>(u16_src0_vec, shapeA, tests::utils::TensorLayoutType::TILED_NFACES, tests::utils::TensorLayoutType::LIN_ROW_MAJOR);
+ vector<uint16_t> src1_linear = convert_layout<uint16_t>(u16_src1_vec, shapeB, tests::utils::TensorLayoutType::TILED_NFACES, tests::utils::TensorLayoutType::LIN_ROW_MAJOR);
vector<uint16_t> ref_bmm = gold_bmm(shapeA, src0_linear, shapeB, src1_linear);

// Tilize gold from row major and convert to pairs (uint32_t)
auto gold_4f_u32 = u32_from_u16_vector( convert_layout<uint16_t>(
- ref_bmm, shapeC, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES));
+ ref_bmm, shapeC, tests::utils::TensorLayoutType::LIN_ROW_MAJOR, tests::utils::TensorLayoutType::TILED_NFACES));

pass &= packed_uint32_t_vector_comparison(result_vec, gold_4f_u32, comparison_function, &argfail);
if (!pass)
tests/tt_metal/tt_metal/test_transpose_hc.cpp (4 changes: 2 additions & 2 deletions)
@@ -185,12 +185,12 @@ int main(int argc, char **argv) {
};

// recover a linear view of input vector for consumption by gold_ function
- vector<uint16_t> src_linear = convert_layout<uint16_t>(src_4f_16, shape, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
+ vector<uint16_t> src_linear = convert_layout<uint16_t>(src_4f_16, shape, tests::utils::TensorLayoutType::TILED_NFACES, tests::utils::TensorLayoutType::LIN_ROW_MAJOR);
vector<uint16_t> gold_reduced = gold_transpose_hc(src_linear, shape); // result is uint16_t untilized

// Tilize from row major and convert to pairs (uint32_t)
vector<uint32_t> shapeR{shape[0], shape[2], shape[1], shape[3]};
- auto gold_16_4f = convert_layout<uint16_t>(gold_reduced, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES);
+ auto gold_16_4f = convert_layout<uint16_t>(gold_reduced, shapeR, tests::utils::TensorLayoutType::LIN_ROW_MAJOR, tests::utils::TensorLayoutType::TILED_NFACES);
auto gold_4f_u32 = u32_from_u16_vector(gold_16_4f);
auto u16_result = u16_from_u32_vector(result_vec);

tests/tt_metal/tt_metal/unit_tests/compute/test_reduce.cpp (4 changes: 2 additions & 2 deletions)
@@ -360,11 +360,11 @@ void run_single_core_reduce_program(tt_metal::Device* device, const ReduceConfig
}
}
// recover a linear view of input vector for consumption by gold_ function
- std::vector<uint16_t> src_linear = convert_layout<uint16_t>(u16_src0_vec, test_config.shape, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
+ std::vector<uint16_t> src_linear = convert_layout<uint16_t>(u16_src0_vec, test_config.shape, tests::utils::TensorLayoutType::TILED_NFACES, tests::utils::TensorLayoutType::LIN_ROW_MAJOR);
std::vector<uint16_t> gold_reduced = test_config.golden_function(src_linear, test_config.shape, scaler, uint8_t(test_config.reduce_type), true); // result is uint16_t untilized

// Tilize from row major and convert to pairs (uint32_t)
- auto gold_4f_u32 = u32_from_u16_vector(convert_layout<uint16_t>(gold_reduced, test_config.result_shape, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES));
+ auto gold_4f_u32 = u32_from_u16_vector(convert_layout<uint16_t>(gold_reduced, test_config.result_shape, tests::utils::TensorLayoutType::LIN_ROW_MAJOR, tests::utils::TensorLayoutType::TILED_NFACES));

bool pass = packed_uint32_t_vector_comparison(result_vec, gold_4f_u32, comparison_function, &argfail);
if (!pass)
tests/tt_metal/tt_metal/unit_tests/compute/test_transpose.cpp (4 changes: 2 additions & 2 deletions)
@@ -53,13 +53,13 @@ void validate_transpose_wh(const std::vector<uint32_t> &src_vec, const std::vect

// recover a linear view of input vector for consumption by gold_ function
auto u16_src0_vec = u16_from_u32_vector(src_vec);
- vector<uint16_t> src_linear = convert_layout<uint16_t>(u16_src0_vec, shape, TensorLayout::TILED_NFACES, TensorLayout::LIN_ROW_MAJOR);
+ vector<uint16_t> src_linear = convert_layout<uint16_t>(u16_src0_vec, shape, tests::utils::TensorLayoutType::TILED_NFACES, tests::utils::TensorLayoutType::LIN_ROW_MAJOR);
vector<uint16_t> gold_reduced = gold_transpose_wh(src_linear, shape); // result is uint16_t untilized

// Tilize from row major and convert to pairs (uint32_t)
TT_FATAL(shape.size() == 4, "Error");
vector<uint32_t> shapeR{shape[0], shape[1], shape[3], shape[2]};
- auto gold_4f_u32 = u32_from_u16_vector(convert_layout<uint16_t>(gold_reduced, shapeR, TensorLayout::LIN_ROW_MAJOR, TensorLayout::TILED_NFACES));
+ auto gold_4f_u32 = u32_from_u16_vector(convert_layout<uint16_t>(gold_reduced, shapeR, tests::utils::TensorLayoutType::LIN_ROW_MAJOR, tests::utils::TensorLayoutType::TILED_NFACES));

bool pass = packed_uint32_t_vector_comparison(result_vec, gold_4f_u32, comparison_function, &argfail);
if (not pass) {
tests/ttnn/unit_tests/gtests/CMakeLists.txt (10 changes: 9 additions & 1 deletion)
@@ -7,8 +7,16 @@ set(TTNN_UNIT_TESTS_SRC
${CMAKE_CURRENT_SOURCE_DIR}/test_reflect.cpp
${CMAKE_CURRENT_SOURCE_DIR}/test_to_and_from_json.cpp
)

set(TTNN_CCL_UNIT_TESTS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/ccl/test_erisc_data_mover_with_workers.cpp)
- set(TTNN_TENSOR_UNIT_TESTS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_create_tensor.cpp)

+ set(TTNN_TENSOR_UNIT_TESTS_SRC
+     ${CMAKE_CURRENT_SOURCE_DIR}/tensor/common_tensor_test_utils.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_create_tensor.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_tensor_layout.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_create_tensor_with_layout.cpp
+     ${CMAKE_CURRENT_SOURCE_DIR}/tensor/test_shape_base.cpp
+ )

add_executable(unit_tests_ttnn ${TTNN_UNIT_TESTS_SRC})
TT_ENABLE_UNITY_BUILD(unit_tests_ttnn)
tests/ttnn/unit_tests/gtests/tensor/common_tensor_test_utils.cpp (55 changes: 55 additions & 0 deletions)
@@ -0,0 +1,55 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#include "common_tensor_test_utils.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/async_runtime.hpp"

#include "gtest/gtest.h"

namespace test_utils {

void test_tensor_on_device(const ttnn::SimpleShape& input_shape, const TensorLayout& layout, tt::tt_metal::Device* device) {
using namespace tt::tt_metal;

const uint32_t io_cq = 0;

const auto input_buf_size_bytes = layout.compute_packed_buffer_size_bytes(input_shape);
const auto host_buffer_datum_size_bytes = sizeof(uint32_t);
const auto input_buf_size = input_buf_size_bytes / host_buffer_datum_size_bytes;

auto host_data = std::make_shared<uint32_t[]>(input_buf_size);
auto readback_data = std::make_shared<uint32_t[]>(input_buf_size);

const auto random_prime_number = 4051;
for (int i = 0; i < input_buf_size; i++) {
host_data[i] = i % random_prime_number;
}

auto tensor = tt::tt_metal::create_device_tensor(input_shape, layout, device);
ttnn::queue_synchronize(device->command_queue(io_cq));

ttnn::write_buffer(io_cq, tensor, {host_data});
ttnn::queue_synchronize(device->command_queue(io_cq));

ttnn::read_buffer(io_cq, tensor, {readback_data});
ttnn::queue_synchronize(device->command_queue(io_cq));

for (int i = 0; i < input_buf_size; i++) {
EXPECT_EQ(host_data[i], readback_data[i]);
}

EXPECT_EQ(tensor.get_padded_shape(), layout.compute_padded_shape(input_shape));
tensor.deallocate();
}

void test_tensor_on_device(const ttnn::SimpleShape& input_shape, const tt::tt_metal::TensorLayout& layout) {
tt::tt_metal::Device* device = tt::tt_metal::CreateDevice(0);

test_tensor_on_device(input_shape, layout, device);

tt::tt_metal::CloseDevice(device);
}

} // namespace test_utils
tests/ttnn/unit_tests/gtests/tensor/common_tensor_test_utils.hpp (13 changes: 13 additions & 0 deletions)
@@ -0,0 +1,13 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "ttnn/tensor/layout/tensor_layout.hpp"
#include "ttnn/tensor/tensor.hpp"

namespace test_utils {
void test_tensor_on_device(const ttnn::SimpleShape& input_shape, const tt::tt_metal::TensorLayout& layout, tt::tt_metal::Device* device);
void test_tensor_on_device(const ttnn::SimpleShape& input_shape, const tt::tt_metal::TensorLayout& layout);
}
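
A hypothetical caller of the new helper, to show how the pieces fit together (the test name, shape, dtype, and layout arguments are illustrative; none of this appears in the commit):

// Illustrative usage sketch, assuming the single-argument overload creates
// and closes its own device (as the .cpp above implements).
#include "common_tensor_test_utils.hpp"
#include "gtest/gtest.h"

TEST(TensorLayoutRoundTrip, TiledBfloat16) {
    using namespace tt::tt_metal;
    MemoryConfig mem_cfg{.memory_layout = TensorMemoryLayout::INTERLEAVED};
    TensorLayout layout(DataType::BFLOAT16, PageConfig(Layout::TILE), mem_cfg);
    // Writes an i % 4051 pattern, reads it back, and checks the padded shape.
    test_utils::test_tensor_on_device(ttnn::SimpleShape{1, 2, 32, 64}, layout);
}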
tests/ttnn/unit_tests/gtests/tensor/test_create_tensor.cpp (7 changes: 6 additions & 1 deletion)
@@ -13,6 +13,8 @@

#include "ttnn_test_fixtures.hpp"

+ namespace {

void run_create_tensor_test(tt::tt_metal::Device* device, ttnn::SimpleShape input_shape) {
MemoryConfig mem_cfg = MemoryConfig{
.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED,
@@ -32,7 +34,9 @@ void run_create_tensor_test(tt_metal::Device* device, ttnn::SimpleShape inpu
host_data[i] = 1;
}

- auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(input_buf_size_datums * datum_size_bytes, device, input_shape, dtype, Layout::TILE, mem_cfg);
+ tt::tt_metal::TensorLayout tensor_layout(dtype, PageConfig(Layout::TILE), mem_cfg);
+ ASSERT_EQ(input_buf_size_datums * datum_size_bytes, tensor_layout.compute_packed_buffer_size_bytes(input_shape));
+ auto input_buffer = tt::tt_metal::tensor_impl::allocate_buffer_on_device(device, input_shape, tensor_layout);

auto input_storage = tt::tt_metal::DeviceStorage{input_buffer};

@@ -52,6 +56,7 @@ void run_create_tensor_test(tt_metal::Device* device, ttnn::SimpleShape inpu
struct CreateTensorParams {
ttnn::SimpleShape shape;
};
+ }

class CreateTensorTest : public ttnn::TTNNFixtureWithDevice, public ::testing::WithParamInterface<CreateTensorParams> {};

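The allocate_buffer_on_device change above is the heart of this file's update: the explicit byte size, dtype, page layout, and memory config collapse into the single TensorLayout parameter, and the buffer size is derived from the layout instead of being passed in. Roughly (the old parameter names are assumptions reconstructed from the removed call):

// Before: callers computed the size and passed layout details piecemeal.
//   allocate_buffer_on_device(size_bytes, device, shape, dtype, Layout::TILE, mem_cfg);
// After: the layout object owns those details and derives the size itself.
//   allocate_buffer_on_device(device, shape, tensor_layout);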
(Diff truncated: the remaining 36 of 49 changed files are not rendered.)