From ebca9fa5de24c7883a4552226f95d401e3cf2212 Mon Sep 17 00:00:00 2001
From: Austin Ho
Date: Thu, 28 Nov 2024 18:45:10 +0000
Subject: [PATCH 1/4] #0: Disable clang-format precommit check once again due
 to errors

---
 .pre-commit-config.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e3f9978be82..3581cb2b2e9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -23,10 +23,10 @@ repos:
   rev: v1.35.1
   hooks:
   - id: yamllint
-- repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v19.1.4
-  hooks:
-  - id: clang-format
-    entry: git-clang-format
-    types_or: [c++, c]
-    args: ["--style=file"]
+# - repo: https://github.com/pre-commit/mirrors-clang-format
+#   rev: v19.1.4
+#   hooks:
+#   - id: clang-format
+#     entry: git-clang-format
+#     types_or: [c++, c]
+#     args: ["--style=file"]

From 5af64271f243af41da88457cdd4a659553a6af72 Mon Sep 17 00:00:00 2001
From: Austin Ho <109362939+tt-aho@users.noreply.github.com>
Date: Thu, 28 Nov 2024 15:43:44 -0500
Subject: [PATCH 2/4] #15337: Fix incorrectly sized cb in remote cb
 microbenchmark (#15506)

Force-merged by Raymond to unblock P0 from MLIR team
https://github.com/tenstorrent/tt-metal/issues/15510

### Ticket
https://github.com/tenstorrent/tt-metal/issues/15510

### Problem description
The output CB was incorrectly sized and exceeded the actual L1 buffer size on
a bank in the remote CB test.
Validation was also a problem when a user/op plans to update both the dynamic
buffer and the total CB size: updating only one of them individually would
compare against the old value of the other and could fail the assertion.

### What's changed
Fix the size calculation in the test.
Add a new API that updates both the global buffer and the total size at the
same time, so that validation can be performed against both new values.
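For illustration, here is a minimal usage sketch of the new API (not part of the change itself). The device/buffer setup and all names below are hypothetical placeholders; only `set_globally_allocated_address_and_total_size` and `UpdateDynamicCircularBufferAddressAndTotalSize` come from this PR.

```cpp
// Sketch only: assumes a standard tt-metal host program. cb_index, tile_size,
// the buffer configs, and the core choice are illustrative, not from the test.
#include <memory>

#include "tt_metal/host_api.hpp"

using namespace tt;
using namespace tt::tt_metal;

int main() {
    Device* device = CreateDevice(/*device_id=*/0);
    Program program = CreateProgram();
    CoreRange cores(CoreCoord{0, 0}, CoreCoord{0, 0});

    constexpr uint32_t cb_index = 0;
    constexpr uint32_t tile_size = 2048;  // one Float16_b tile

    std::shared_ptr<Buffer> old_buffer = CreateBuffer(InterleavedBufferConfig{
        .device = device, .size = 8 * tile_size, .page_size = 8 * tile_size, .buffer_type = BufferType::L1});

    // The new config setter validates the backing L1 buffer and the CB total
    // size together when the CB is created on a dynamically allocated buffer.
    CircularBufferConfig cb_config =
        CircularBufferConfig(8 * tile_size, {{cb_index, tt::DataFormat::Float16_b}})
            .set_page_size(cb_index, tile_size)
            .set_globally_allocated_address_and_total_size(*old_buffer, 8 * tile_size);
    CBHandle cb = CreateCircularBuffer(program, cores, cb_config);

    // Re-target the CB onto a larger buffer. Updating the address and the total
    // size through separate calls would check each new value against the stale
    // value of the other; the combined update validates both new values at once.
    std::shared_ptr<Buffer> new_buffer = CreateBuffer(InterleavedBufferConfig{
        .device = device, .size = 16 * tile_size, .page_size = 16 * tile_size, .buffer_type = BufferType::L1});
    UpdateDynamicCircularBufferAddressAndTotalSize(program, cb, *new_buffer, 16 * tile_size);

    CloseDevice(device);
    return 0;
}
```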
### Checklist - [x] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] New/Existing tests provide coverage for changes --- tests/scripts/test_moreh_microbenchmark.py | 2 + .../test_dram_read_remote_cb.cpp | 396 ++++++++++-------- .../test_remote_cb_sync_matmul.cpp | 352 ++++++++++------ tt_metal/host_api.hpp | 16 + .../impl/buffers/circular_buffer_types.cpp | 19 +- .../impl/buffers/circular_buffer_types.hpp | 39 +- tt_metal/tt_metal.cpp | 6 + .../multi_core_hw/bcast_op_multi_core_hw.cpp | 8 +- .../device/transpose_program_factory.cpp | 8 +- ...t_and_width_multi_core_program_factory.cpp | 8 +- ...core_sharded_optimized_program_factory.cpp | 2 +- ...lement_wise_multi_core_program_factory.cpp | 12 +- .../group_attn_matmul_program_factory.cpp | 12 +- 13 files changed, 533 insertions(+), 347 deletions(-) diff --git a/tests/scripts/test_moreh_microbenchmark.py b/tests/scripts/test_moreh_microbenchmark.py index 90bf957f162..7e9f33f6377 100755 --- a/tests/scripts/test_moreh_microbenchmark.py +++ b/tests/scripts/test_moreh_microbenchmark.py @@ -1005,6 +1005,8 @@ def test_dram_read_remote_cb_sync( elif test == "Matmul": if arch == "wormhole_b0": bw_bound = 18.0 + if use_sub_devices: + pytest.xfail("Tests using sub-devices is not correctly set up for BW measurements") assert bw_bound <= throughput diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp index 3f77a2d204f..0c451cc8851 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp @@ -57,10 +57,8 @@ using std::chrono::microseconds; // --bypass-check (set to bypass checking performance criteria fulfillment) //////////////////////////////////////////////////////////////////////////////// - - template -std::vector slice_vec(std::vector const &v, int m, int n) { +std::vector slice_vec(std::vector const& v, int m, int n) { auto first = v.cbegin() + m; auto last = v.cbegin() + n + 1; @@ -68,7 +66,8 @@ std::vector slice_vec(std::vector const &v, int m, int n) { return vec; } -void get_max_page_size_and_num_pages(uint32_t num_tiles, uint32_t num_datums_per_tile, uint32_t& page_size, uint32_t& num_pages) { +void get_max_page_size_and_num_pages( + uint32_t num_tiles, uint32_t num_datums_per_tile, uint32_t& page_size, uint32_t& num_pages) { uint64_t total_size = static_cast(num_tiles) * num_datums_per_tile; page_size = (8192 / num_datums_per_tile) * num_datums_per_tile; @@ -78,12 +77,17 @@ void get_max_page_size_and_num_pages(uint32_t num_tiles, uint32_t num_datums_per num_pages = total_size / page_size; } -std::tuple, tt_metal::KernelHandle, uint32_t, std::vector>> create_programs( - tt_metal::Device *device, - const CoreRangeSet &dram_reader_core, - const CoreRangeSet &l1_receiver_cores, - const uint32_t &single_tile_size, - const tt::DataFormat &tile_format, +std::tuple< + std::vector, + tt_metal::KernelHandle, + uint32_t, + std::vector>> +create_programs( + tt_metal::Device* device, + const CoreRangeSet& dram_reader_core, + const CoreRangeSet& l1_receiver_cores, + const uint32_t& single_tile_size, + const tt::DataFormat& tile_format, uint32_t k, uint32_t n, uint32_t num_blocks, @@ 
-93,9 +97,7 @@ std::tuple, tt_metal::KernelHandle, uint32_t, std uint32_t cb_padding, const std::shared_ptr& input_buffer, const std::shared_ptr& output_buffer, - bool use_sub_devices - ) { - + bool use_sub_devices) { log_info("created program"); std::vector programs; programs.push_back(tt_metal::Program()); @@ -151,10 +153,15 @@ std::tuple, tt_metal::KernelHandle, uint32_t, std next_layer_single_tile_size = 2048; } uint32_t next_layer_reader_page_size, next_layer_reader_num_pages; - get_max_page_size_and_num_pages(next_layer_block_num_tiles, next_layer_single_tile_size, next_layer_reader_page_size, next_layer_reader_num_pages); + get_max_page_size_and_num_pages( + next_layer_block_num_tiles, + next_layer_single_tile_size, + next_layer_reader_page_size, + next_layer_reader_num_pages); uint32_t next_layer_writer_page_size, next_layer_writer_num_pages; - get_max_page_size_and_num_pages(block_w / num_receivers, next_layer_single_tile_size, next_layer_writer_page_size, next_layer_writer_num_pages); + get_max_page_size_and_num_pages( + block_w / num_receivers, next_layer_single_tile_size, next_layer_writer_page_size, next_layer_writer_num_pages); // L1 receiver CB uint32_t receiver_cb_index = 0; @@ -163,7 +170,8 @@ std::tuple, tt_metal::KernelHandle, uint32_t, std uint32_t receiver_cb_addr = output_buffer->address(); tt_metal::CircularBufferConfig receiver_cb_config = tt_metal::CircularBufferConfig(receiver_cb_size, {{receiver_cb_index, tile_format}}) - .set_page_size(receiver_cb_index, receiver_page_size).set_globally_allocated_address(*output_buffer); + .set_page_size(receiver_cb_index, receiver_page_size) + .set_globally_allocated_address(*output_buffer); auto receiver_cb = tt_metal::CreateCircularBuffer(receiver_program, l1_receiver_cores, receiver_cb_config); log_info("reader_cb_size: {}", reader_cb_size); @@ -176,25 +184,24 @@ std::tuple, tt_metal::KernelHandle, uint32_t, std // Global semaphores use an actual address instead of an index if (use_sub_devices) { global_sems.reserve(num_receivers * 2); - for (uint32_t i=0; i < num_receivers; ++i) { + for (uint32_t i = 0; i < num_receivers; ++i) { global_sems.push_back(tt_metal::CreateGlobalSemaphore(device, all_cores, INVALID)); pages_acked_semaphore_ids[i] = global_sems.back()->address(); global_sems.push_back(tt_metal::CreateGlobalSemaphore(device, all_cores, INVALID)); - pages_sent_semaphore_ids[i] = global_sems.back()->address(); + pages_sent_semaphore_ids[i] = global_sems.back()->address(); } } else { - for (uint32_t i=0; i < num_receivers; ++i) { + for (uint32_t i = 0; i < num_receivers; ++i) { pages_acked_semaphore_ids[i] = tt_metal::CreateSemaphore(sender_program, all_cores, INVALID); pages_sent_semaphore_ids[i] = tt_metal::CreateSemaphore(sender_program, all_cores, INVALID); } } std::vector reader_compile_time_args = { - (std::uint32_t) input_buffer->address(), - (std::uint32_t) start_tile_id, - (std::uint32_t) tt_metal::NOC::RISCV_0_default, - (std::uint32_t) num_mixed_df_layers - }; + (std::uint32_t)input_buffer->address(), + (std::uint32_t)start_tile_id, + (std::uint32_t)tt_metal::NOC::RISCV_0_default, + (std::uint32_t)num_mixed_df_layers}; auto reader_kernel = tt_metal::CreateKernel( sender_program, @@ -207,13 +214,12 @@ std::tuple, tt_metal::KernelHandle, uint32_t, std .compile_args = reader_compile_time_args}); std::vector writer_compile_time_args = { - (std::uint32_t) tt_metal::NOC::RISCV_0_default, - (std::uint32_t) receiver_cb_addr, - (std::uint32_t) receiver_cb_size, - (std::uint32_t) num_receivers, - (std::uint32_t) 
num_mixed_df_layers, - (std::uint32_t) use_sub_devices - }; + (std::uint32_t)tt_metal::NOC::RISCV_0_default, + (std::uint32_t)receiver_cb_addr, + (std::uint32_t)receiver_cb_size, + (std::uint32_t)num_receivers, + (std::uint32_t)num_mixed_df_layers, + (std::uint32_t)use_sub_devices}; auto writer_kernel = tt_metal::CreateKernel( sender_program, @@ -226,11 +232,10 @@ std::tuple, tt_metal::KernelHandle, uint32_t, std .compile_args = writer_compile_time_args}); std::vector receiver_compile_time_args = { - (std::uint32_t) reader_cb_addr, - (std::uint32_t) receiver_cb_size, - (std::uint32_t) num_mixed_df_layers, - (std::uint32_t) use_sub_devices - }; + (std::uint32_t)reader_cb_addr, + (std::uint32_t)receiver_cb_size, + (std::uint32_t)num_mixed_df_layers, + (std::uint32_t)use_sub_devices}; auto receiver_kernel = tt_metal::CreateKernel( receiver_program, @@ -247,21 +252,18 @@ std::tuple, tt_metal::KernelHandle, uint32_t, std auto dram_reader_core_coord_physical = device->worker_core_from_logical_core(dram_reader_core_coord); uint32_t bank_id = 0; uint32_t vc = bank_id & 0x1; - std::vector reader_rt_args = { - (std::uint32_t) bank_id, - (std::uint32_t) vc - }; + std::vector reader_rt_args = {(std::uint32_t)bank_id, (std::uint32_t)vc}; for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - reader_rt_args.push_back(i%2 == 0 ? reader_page_size : next_layer_reader_page_size); + reader_rt_args.push_back(i % 2 == 0 ? reader_page_size : next_layer_reader_page_size); } for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - reader_rt_args.push_back(i%2 == 0 ? reader_num_pages : next_layer_reader_num_pages); + reader_rt_args.push_back(i % 2 == 0 ? reader_num_pages : next_layer_reader_num_pages); } for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - reader_rt_args.push_back(i%2 == 0 ? num_blocks : next_layer_num_blocks); + reader_rt_args.push_back(i % 2 == 0 ? num_blocks : next_layer_num_blocks); } for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - reader_rt_args.push_back(i%2 == 0 ? block_num_tiles : next_layer_block_num_tiles); + reader_rt_args.push_back(i % 2 == 0 ? block_num_tiles : next_layer_block_num_tiles); } tt_metal::SetRuntimeArgs(sender_program, reader_kernel, dram_reader_core_coord, reader_rt_args); @@ -271,60 +273,59 @@ std::tuple, tt_metal::KernelHandle, uint32_t, std l1_receiver_core_coords.push_back(l1_receiver_core_coord); } std::vector writer_rt_args; - for (uint32_t i=0; i < num_receivers; ++i) { + for (uint32_t i = 0; i < num_receivers; ++i) { auto l1_receiver_core_coord_physical = device->worker_core_from_logical_core(l1_receiver_core_coords[i]); writer_rt_args.push_back(l1_receiver_core_coord_physical.x); } - for (uint32_t i=0; i < num_receivers; ++i) { + for (uint32_t i = 0; i < num_receivers; ++i) { auto l1_receiver_core_coord_physical = device->worker_core_from_logical_core(l1_receiver_core_coords[i]); writer_rt_args.push_back(l1_receiver_core_coord_physical.y); } - for (uint32_t i=0; i < num_receivers; ++i) { + for (uint32_t i = 0; i < num_receivers; ++i) { writer_rt_args.push_back(pages_acked_semaphore_ids[i]); } - for (uint32_t i=0; i < num_receivers; ++i) { + for (uint32_t i = 0; i < num_receivers; ++i) { writer_rt_args.push_back(pages_sent_semaphore_ids[i]); } for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - writer_rt_args.push_back(i%2 == 0 ? writer_page_size : next_layer_writer_page_size); + writer_rt_args.push_back(i % 2 == 0 ? 
writer_page_size : next_layer_writer_page_size); } for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - writer_rt_args.push_back(i%2 == 0 ? writer_num_pages : next_layer_writer_num_pages); + writer_rt_args.push_back(i % 2 == 0 ? writer_num_pages : next_layer_writer_num_pages); } for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - writer_rt_args.push_back(i%2 == 0 ? num_blocks : next_layer_num_blocks); + writer_rt_args.push_back(i % 2 == 0 ? num_blocks : next_layer_num_blocks); } for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - writer_rt_args.push_back(i%2 == 0 ? block_num_tiles : next_layer_block_num_tiles); + writer_rt_args.push_back(i % 2 == 0 ? block_num_tiles : next_layer_block_num_tiles); } for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - writer_rt_args.push_back(i%2 == 0 ? single_tile_size : next_layer_single_tile_size); + writer_rt_args.push_back(i % 2 == 0 ? single_tile_size : next_layer_single_tile_size); } for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - writer_rt_args.push_back(i%2 == 0 ? num_tile_rows_write : next_layer_num_tile_rows_write); + writer_rt_args.push_back(i % 2 == 0 ? num_tile_rows_write : next_layer_num_tile_rows_write); } tt_metal::SetRuntimeArgs(sender_program, writer_kernel, dram_reader_core_coord, writer_rt_args); // reciever rt - for (uint32_t i=0; i < num_receivers; ++i) { + for (uint32_t i = 0; i < num_receivers; ++i) { std::vector receiver_rt_args = { - (std::uint32_t) vc & 0x3, - (std::uint32_t) dram_reader_core_coord_physical.x, - (std::uint32_t) dram_reader_core_coord_physical.y - }; - vc ++; + (std::uint32_t)vc & 0x3, + (std::uint32_t)dram_reader_core_coord_physical.x, + (std::uint32_t)dram_reader_core_coord_physical.y}; + vc++; receiver_rt_args.push_back(pages_acked_semaphore_ids[i]); receiver_rt_args.push_back(pages_sent_semaphore_ids[i]); for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - receiver_rt_args.push_back(i%2 == 0 ? single_tile_size : next_layer_single_tile_size); + receiver_rt_args.push_back(i % 2 == 0 ? single_tile_size : next_layer_single_tile_size); } for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - receiver_rt_args.push_back(i%2 == 0 ? num_blocks : next_layer_num_blocks); + receiver_rt_args.push_back(i % 2 == 0 ? num_blocks : next_layer_num_blocks); } for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - receiver_rt_args.push_back(i%2 == 0 ? receiver_block_num_tile : next_layer_receiver_block_num_tile); + receiver_rt_args.push_back(i % 2 == 0 ? 
receiver_block_num_tile : next_layer_receiver_block_num_tile); } log_info("l1_receiver_core_coords: {}", l1_receiver_core_coords[i]); @@ -335,9 +336,7 @@ std::tuple, tt_metal::KernelHandle, uint32_t, std return {std::move(programs), reader_kernel, reader_cb_addr, std::move(global_sems)}; } -float to_float(bfloat16 bfloat16_num) { - return bfloat16_num.to_float(); -} +float to_float(bfloat16 bfloat16_num) { return bfloat16_num.to_float(); } float pcc(const std::vector& x, const std::vector& y) { if (x.size() != y.size()) { @@ -372,15 +371,14 @@ float pcc(const std::vector& x, const std::vector& y) { bool validation_bfp8_b( const tt::deprecated::Tensor& input_tensor, - const tt::DataFormat &data_format, + const tt::DataFormat& data_format, uint32_t num_blocks, uint32_t cb_num_blocks, uint32_t kt, uint32_t nt, - const std::shared_ptr& out_buffer -) { + const std::shared_ptr& out_buffer) { bool pass = true; - std::vector golden_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0); // Initialize with zeros + std::vector golden_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0); // Initialize with zeros std::vector result_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0); auto num_datums_per_cb = kt * nt * 32 * 32 / num_blocks * cb_num_blocks; @@ -388,7 +386,7 @@ bool validation_bfp8_b( std::vector result; tt::tt_metal::detail::ReadFromBuffer(out_buffer, result); auto result_bfp8 = unpack_bfp8_tiles_into_float_vec(result, true, false); - result_untilized = tt::test_utils::untilize(result_bfp8, kt*32 / num_blocks * cb_num_blocks, nt*32); + result_untilized = tt::test_utils::untilize(result_bfp8, kt * 32 / num_blocks * cb_num_blocks, nt * 32); const auto& values = input_tensor.get_values(); @@ -402,7 +400,7 @@ bool validation_bfp8_b( } } - for (int i=0; i& input_tensor, - const tt::DataFormat &data_format, + const tt::DataFormat& data_format, uint32_t num_blocks, uint32_t cb_num_blocks, uint32_t kt, uint32_t nt, - const std::shared_ptr& out_buffer -) { + const std::shared_ptr& out_buffer) { bool pass = true; - std::vector golden_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0); // Initialize with zeros + std::vector golden_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0); // Initialize with zeros std::vector result_vec(kt * nt * 32 * 32 / num_blocks * cb_num_blocks, 0); auto num_datums_per_cb = kt * nt * 32 * 32 / num_blocks * cb_num_blocks; @@ -432,7 +428,7 @@ bool validation_fp16( tt::tt_metal::detail::ReadFromBuffer(out_buffer, result); auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result); auto result_flat_layout = convert_to_flat_layout(result_bfp16); - auto result_untilized = tt::test_utils::untilize(result_flat_layout, kt*32 / num_blocks * cb_num_blocks, nt*32); + auto result_untilized = tt::test_utils::untilize(result_flat_layout, kt * 32 / num_blocks * cb_num_blocks, nt * 32); const auto& values = input_tensor.get_values(); @@ -446,7 +442,7 @@ bool validation_fp16( } } - for (int i=0; i(result_untilized[i])); } @@ -460,15 +456,14 @@ bool validation_fp16( bool validation_mixed_df( const tt::deprecated::Tensor& input_tensor_fp16, const tt::deprecated::Tensor& input_tensor_fp8, - const tt::DataFormat &data_format, + const tt::DataFormat& data_format, uint32_t num_blocks, uint32_t cb_num_blocks, uint32_t kt, uint32_t nt, const std::shared_ptr& out_buffer, uint32_t num_mixed_df_layers, - uint32_t num_receivers -) { + uint32_t num_receivers) { bool pass = true; std::vector result; @@ -477,11 +472,11 @@ bool validation_mixed_df( auto result_bfp16 = 
unpack_uint32_vec_into_bfloat16_vec(result); auto result_untilized_fp16 = convert_to_flat_layout(result_bfp16); - std::vector golden_vec(kt*32 / num_blocks * cb_num_blocks * nt*32); - std::vector result_vec_fp16(kt*32 / num_blocks * cb_num_blocks * nt*32); + std::vector golden_vec(kt * 32 / num_blocks * cb_num_blocks * nt * 32); + std::vector result_vec_fp16(kt * 32 / num_blocks * cb_num_blocks * nt * 32); // compare with the result tilized with tilized - auto values_fp16 = tt::test_utils::tilize(input_tensor_fp16.get_values(), kt*32, nt*32); + auto values_fp16 = tt::test_utils::tilize(input_tensor_fp16.get_values(), kt * 32, nt * 32); uint32_t block_h = kt / num_blocks; uint32_t block_w = nt; @@ -489,49 +484,52 @@ bool validation_mixed_df( auto num_datums_per_cb = kt * nt * 32 * 32 / num_blocks * cb_num_blocks / num_receivers; int start_index = 0; - int fifo_size = kt*32 / num_blocks * cb_num_blocks * nt*32 * 2 / num_receivers; + int fifo_size = kt * 32 / num_blocks * cb_num_blocks * nt * 32 * 2 / num_receivers; int fifo_size_page_aligned, page_size, num_pages, layer_transfer_size, fifo_wr_ptr = 0; for (int l = 0; l < num_mixed_df_layers; ++l) { - if (l % 2 == 0) { // fp16 + if (l % 2 == 0) { // fp16 page_size = 2048; } else { page_size = 1088; } layer_transfer_size = page_size * kt * nt / num_receivers; - uint32_t block_size = block_num_tiles * tt::constants::TILE_HW * datum_size(tt::DataFormat::Float16_b); // fp16 + uint32_t block_size = block_num_tiles * tt::constants::TILE_HW * datum_size(tt::DataFormat::Float16_b); // fp16 uint32_t num_blocks = fifo_size / block_size; uint32_t cb_size_block_aligned = num_blocks * block_size; bool fifo_wr_ptr_exceed_fifo_limit = fifo_wr_ptr > cb_size_block_aligned; uint32_t num_blocks_till_fifo_limit = (cb_size_block_aligned - fifo_wr_ptr) / block_size; // start pointer addr of current layer - fifo_wr_ptr = fifo_wr_ptr_exceed_fifo_limit ? 0 : cb_size_block_aligned - num_blocks_till_fifo_limit * block_size; + fifo_wr_ptr = + fifo_wr_ptr_exceed_fifo_limit ? 0 : cb_size_block_aligned - num_blocks_till_fifo_limit * block_size; // start index to read, fifo_wr_ptr / 2 because fp16 format start_index = fifo_wr_ptr == cb_size_block_aligned ? 
0 : fifo_wr_ptr / 2; // end pointer addr of current layer fifo_wr_ptr = (fifo_wr_ptr + layer_transfer_size) % cb_size_block_aligned; } - std::vector > values_fp16_split(num_receivers, std::vector(values_fp16.size() / num_receivers)); + std::vector> values_fp16_split( + num_receivers, std::vector(values_fp16.size() / num_receivers)); int index = 0; for (int k = 0; k < kt; ++k) { for (int n = 0; n < num_receivers; ++n) { for (int i = 0; i < nt * 32 * 32 / num_receivers; ++i) { values_fp16_split[n][i + k * nt * 32 * 32 / num_receivers] = to_float(values_fp16[index]); - index ++; + index++; } } } - std::vector > golden_vec_split(num_receivers, std::vector(golden_vec.size() / num_receivers)); + std::vector> golden_vec_split( + num_receivers, std::vector(golden_vec.size() / num_receivers)); for (int n = 0; n < num_receivers; ++n) { index = start_index; for (int i = 0; i < kt * nt * 32 * 32 / num_receivers; ++i) { golden_vec_split[n][index] = values_fp16_split[n][i]; - index ++; + index++; if (index == num_datums_per_cb) { index = 0; @@ -544,12 +542,12 @@ bool validation_mixed_df( for (int n = 0; n < num_receivers; ++n) { for (int i = 0; i < nt * 32 * 32 / num_receivers; ++i) { golden_vec[index] = golden_vec_split[n][i + k * nt * 32 * 32 / num_receivers]; - index ++; + index++; } } } - for (int i=0; i(result_untilized_fp16[i])); } @@ -585,9 +583,7 @@ std::shared_ptr create_and_transfer_data_sharded_cb( BufferType buffer_type, tt::DataFormat data_format, CoreRangeSet cores, - uint32_t num_receivers -) { - + uint32_t num_receivers) { uint32_t size_bytes; uint32_t page_size_bytes; if (data_format == tt::DataFormat::Bfp8_b) { @@ -599,24 +595,24 @@ std::shared_ptr create_and_transfer_data_sharded_cb( } ShardSpecBuffer shard_spec = ShardSpecBuffer( - cores, - {ht * tt::constants::TILE_HEIGHT, wt * tt::constants::TILE_WIDTH / num_receivers}, - ShardOrientation::ROW_MAJOR, - false, - {tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH}, - {ht, wt}); + cores, + {ht * tt::constants::TILE_HEIGHT, wt * tt::constants::TILE_WIDTH / num_receivers}, + ShardOrientation::ROW_MAJOR, + false, + {tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH}, + {ht, wt}); log_info("cores: {}", cores); log_info("size_bytes: {}", size_bytes); log_info("page_size_bytes: {}", page_size_bytes); auto input_buffer = CreateBuffer(tt::tt_metal::ShardedBufferConfig{ - .device = device, - .size = size_bytes, - .page_size = page_size_bytes, - .buffer_type = buffer_type, - .buffer_layout = TensorMemoryLayout::WIDTH_SHARDED, - .shard_parameters = shard_spec}); + .device = device, + .size = size_bytes, + .page_size = page_size_bytes, + .buffer_type = buffer_type, + .buffer_layout = TensorMemoryLayout::WIDTH_SHARDED, + .shard_parameters = shard_spec}); tt::tt_metal::detail::WriteToBuffer(input_buffer, input_vec); log_info("created sharded tensor"); @@ -624,7 +620,7 @@ std::shared_ptr create_and_transfer_data_sharded_cb( return input_buffer; } -int main(int argc, char **argv) { +int main(int argc, char** argv) { if (getenv("TT_METAL_SLOW_DISPATCH_MODE") != nullptr) { log_error("Test not supported w/ slow dispatch, exiting"); } @@ -648,10 +644,9 @@ int main(int argc, char **argv) { //////////////////////////////////////////////////////////////////////////// std::vector input_args(argv, argv + argc); try { - std::tie(k, input_args) = - test_args::get_command_option_uint64_and_remaining_args(input_args, "--k", 8192); + std::tie(k, input_args) = test_args::get_command_option_uint64_and_remaining_args(input_args, "--k", 8192); std::tie(n, 
input_args) = - test_args::get_command_option_uint64_and_remaining_args(input_args, "--n", 12*128); + test_args::get_command_option_uint64_and_remaining_args(input_args, "--n", 12 * 128); std::tie(num_blocks, input_args) = test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-blocks", 8); std::tie(cb_num_blocks, input_args) = @@ -671,28 +666,34 @@ int main(int argc, char **argv) { std::tie(use_sub_devices, input_args) = test_args::has_command_option_and_remaining_args(input_args, "--use-sub-devices"); - test_args::validate_remaining_args(input_args); - } catch (const std::exception &e) { + } catch (const std::exception& e) { log_error(tt::LogTest, "Command line arguments found exception", e.what()); TT_ASSERT(false); } log_info("num_mixed_df_layers: {} ", num_mixed_df_layers); log_info("num_receivers: {} ", num_receivers); - - TT_FATAL(num_mixed_df_layers % 2 == 1, "currently only support odd number of layers testing, due to issue with validatoin"); + // TODO: Re-enable usage of cb_padding once Global CBs are supported + log_warning( + "Setting cb padding to 0B for now due to how buffers are constructed. Can be non-zero once Global CBs are " + "supported."); + cb_padding = 0; + + TT_FATAL( + num_mixed_df_layers % 2 == 1, + "currently only support odd number of layers testing, due to issue with validatoin"); if (num_mixed_df_layers > 1) { TT_FATAL(df == 1, "must start with bfloat16 format for mix_df test"); } if (use_device_profiler) { - #if !defined(TRACY_ENABLE) +#if !defined(TRACY_ENABLE) log_error( LogTest, "Metal library and test code should be build with " "profiler option using ./scripts/build_scripts/build_with_profiler_opt.sh"); - #endif +#endif auto device_profiler = getenv("TT_METAL_DEVICE_PROFILER"); TT_FATAL( device_profiler, @@ -730,7 +731,7 @@ int main(int argc, char **argv) { // Device Setup //////////////////////////////////////////////////////////////////////////// int device_id = 0; - tt_metal::Device *device = tt_metal::CreateDevice(device_id); + tt_metal::Device* device = tt_metal::CreateDevice(device_id); CoreCoord dram_bank_coord = CoreCoord{0, 0}; CoreCoord dram_reader_core_coord = CoreCoord{0, 0}; @@ -752,56 +753,133 @@ int main(int argc, char **argv) { //////////////////////////////////////////////////////////////////////////// // Input Setup //////////////////////////////////////////////////////////////////////////// - std::vector > input_buffers(num_mixed_df_layers); + std::vector> input_buffers(num_mixed_df_layers); std::shared_ptr output_buffer; auto input_shape = SHAPE{1, 1, k, n}; - tt::deprecated::Tensor tensor_fp16 = tt::deprecated::initialize_tensor(input_shape, tt::deprecated::Initialize::INCREMENT, 0, 100, std::chrono::system_clock::now().time_since_epoch().count()); - tt::deprecated::Tensor tensor_fp8 = tt::deprecated::initialize_tensor(input_shape, tt::deprecated::Initialize::INCREMENT, 0, 100, std::chrono::system_clock::now().time_since_epoch().count()); + tt::deprecated::Tensor tensor_fp16 = tt::deprecated::initialize_tensor( + input_shape, + tt::deprecated::Initialize::INCREMENT, + 0, + 100, + std::chrono::system_clock::now().time_since_epoch().count()); + tt::deprecated::Tensor tensor_fp8 = tt::deprecated::initialize_tensor( + input_shape, + tt::deprecated::Initialize::INCREMENT, + 0, + 100, + std::chrono::system_clock::now().time_since_epoch().count()); if (tile_format == tt::DataFormat::Bfp8_b) { for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - if (i%2 == 0) { // even layers + if (i % 2 == 0) { // even layers auto 
input_vec_tilized = tt::test_utils::tilize(tensor_fp8.get_values(), k, n); - std::vector packed_input_vec_tile_layout = pack_fp32_vec_as_bfp8_tiles(input_vec_tilized, true, false); - input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Bfp8_b, dram_reader_core, num_banks); - } else { // odd layers + std::vector packed_input_vec_tile_layout = + pack_fp32_vec_as_bfp8_tiles(input_vec_tilized, true, false); + input_buffers[i] = create_and_transfer_data_sharded_cb( + device, + packed_input_vec_tile_layout, + kt, + nt, + tt_metal::BufferType::DRAM, + tt::DataFormat::Bfp8_b, + dram_reader_core, + num_banks); + } else { // odd layers auto input_vec_tilized = tt::test_utils::tilize(tensor_fp16.get_values(), k, n); auto input_vec_tile_layout = convert_to_tile_layout(input_vec_tilized); - vector packed_input_vec_tile_layout = pack_bfloat16_vec_into_uint32_vec(input_vec_tile_layout); - input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Float16_b, dram_reader_core, num_banks); + vector packed_input_vec_tile_layout = + pack_bfloat16_vec_into_uint32_vec(input_vec_tile_layout); + input_buffers[i] = create_and_transfer_data_sharded_cb( + device, + packed_input_vec_tile_layout, + kt, + nt, + tt_metal::BufferType::DRAM, + tt::DataFormat::Float16_b, + dram_reader_core, + num_banks); } } // output vector outputs = create_constant_vector_of_bfp8(output_size, 0, true); - output_buffer = create_and_transfer_data_sharded_cb(device, outputs, kt / num_blocks * cb_num_blocks, nt, tt_metal::BufferType::L1, tt::DataFormat::Bfp8_b, l1_receiver_core, num_receivers); + output_buffer = create_and_transfer_data_sharded_cb( + device, + outputs, + kt / num_blocks * cb_num_blocks, + nt, + tt_metal::BufferType::L1, + tt::DataFormat::Bfp8_b, + l1_receiver_core, + num_receivers); } else { for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { - if (i%2 == 0) { // even layers + if (i % 2 == 0) { // even layers auto input_vec_tilized = tt::test_utils::tilize(tensor_fp16.get_values(), k, n); auto input_vec_tile_layout = convert_to_tile_layout(input_vec_tilized); - vector packed_input_vec_tile_layout = pack_bfloat16_vec_into_uint32_vec(input_vec_tile_layout); - input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Float16_b, dram_reader_core, num_banks); + vector packed_input_vec_tile_layout = + pack_bfloat16_vec_into_uint32_vec(input_vec_tile_layout); + input_buffers[i] = create_and_transfer_data_sharded_cb( + device, + packed_input_vec_tile_layout, + kt, + nt, + tt_metal::BufferType::DRAM, + tt::DataFormat::Float16_b, + dram_reader_core, + num_banks); } else { auto input_vec_tilized = tt::test_utils::tilize(tensor_fp8.get_values(), k, n); - std::vector packed_input_vec_tile_layout = pack_fp32_vec_as_bfp8_tiles(input_vec_tilized, true, false); - input_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Bfp8_b, dram_reader_core, num_banks); + std::vector packed_input_vec_tile_layout = + pack_fp32_vec_as_bfp8_tiles(input_vec_tilized, true, false); + input_buffers[i] = create_and_transfer_data_sharded_cb( + device, + packed_input_vec_tile_layout, + kt, + nt, + tt_metal::BufferType::DRAM, + tt::DataFormat::Bfp8_b, + dram_reader_core, + num_banks); } } // output vector outputs = 
create_constant_vector_of_bfloat16(output_size, 0); - output_buffer = create_and_transfer_data_sharded_cb(device, outputs, kt / num_blocks * cb_num_blocks, nt, tt_metal::BufferType::L1, tt::DataFormat::Float16_b, l1_receiver_core, num_receivers); + output_buffer = create_and_transfer_data_sharded_cb( + device, + outputs, + kt / num_blocks * cb_num_blocks, + nt, + tt_metal::BufferType::L1, + tt::DataFormat::Float16_b, + l1_receiver_core, + num_receivers); } - for (uint32_t i=0; i < num_mixed_df_layers; ++i) { + for (uint32_t i = 0; i < num_mixed_df_layers; ++i) { log_info("input_buffers addr: {}", input_buffers[i]->address()); } //////////////////////////////////////////////////////////////////////////// // Application Setup //////////////////////////////////////////////////////////////////////////// - auto [programs, kernel, output_cb_addr, global_sems] = create_programs(device, dram_reader_core, l1_receiver_core, single_tile_size, tile_format, k, n, num_blocks, cb_num_blocks, num_receivers, num_mixed_df_layers, cb_padding, input_buffers[0], output_buffer, use_sub_devices); + auto [programs, kernel, output_cb_addr, global_sems] = create_programs( + device, + dram_reader_core, + l1_receiver_core, + single_tile_size, + tile_format, + k, + n, + num_blocks, + cb_num_blocks, + num_receivers, + num_mixed_df_layers, + cb_padding, + input_buffers[0], + output_buffer, + use_sub_devices); //////////////////////////////////////////////////////////////////////////// // Execution Application @@ -826,40 +904,26 @@ int main(int argc, char **argv) { //////////////////////////////////////////////////////////////////////////// if (num_mixed_df_layers == 1) { if (tile_format == tt::DataFormat::Bfp8_b) { - pass = validation_bfp8_b( - tensor_fp8, - tile_format, - num_blocks, - cb_num_blocks, - kt, - nt, - output_buffer); + pass = validation_bfp8_b(tensor_fp8, tile_format, num_blocks, cb_num_blocks, kt, nt, output_buffer); } else { - pass = validation_fp16( - tensor_fp16, - tile_format, - num_blocks, - cb_num_blocks, - kt, - nt, - output_buffer); + pass = validation_fp16(tensor_fp16, tile_format, num_blocks, cb_num_blocks, kt, nt, output_buffer); } } else { pass = validation_mixed_df( - tensor_fp16, - tensor_fp8, - tile_format, - num_blocks, - cb_num_blocks, - kt, - nt, - output_buffer, - num_mixed_df_layers, - num_receivers); + tensor_fp16, + tensor_fp8, + tile_format, + num_blocks, + cb_num_blocks, + kt, + nt, + output_buffer, + num_mixed_df_layers, + num_receivers); } pass &= tt_metal::CloseDevice(device); - } catch (const std::exception &e) { + } catch (const std::exception& e) { pass = false; log_error(LogTest, "{}", e.what()); log_error(LogTest, "System error message: {}", std::strerror(errno)); diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp index a871d849dd4..ccbc335504f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp @@ -49,10 +49,8 @@ using std::chrono::microseconds; // --bypass-check (set to bypass checking performance criteria fulfillment) //////////////////////////////////////////////////////////////////////////////// - - template -std::vector slice_vec(std::vector const &v, int m, int n) { +std::vector 
slice_vec(std::vector const& v, int m, int n) { auto first = v.cbegin() + m; auto last = v.cbegin() + n + 1; @@ -60,7 +58,8 @@ std::vector slice_vec(std::vector const &v, int m, int n) { return vec; } -void get_max_page_size_and_num_pages(uint32_t num_tiles, uint32_t num_datums_per_tile, uint32_t& page_size, uint32_t& num_pages) { +void get_max_page_size_and_num_pages( + uint32_t num_tiles, uint32_t num_datums_per_tile, uint32_t& page_size, uint32_t& num_pages) { uint64_t total_size = static_cast(num_tiles) * num_datums_per_tile; page_size = (8192 / num_datums_per_tile) * num_datums_per_tile; @@ -70,7 +69,8 @@ void get_max_page_size_and_num_pages(uint32_t num_tiles, uint32_t num_datums_per num_pages = total_size / page_size; } -std::tuple get_out_subblock_params(uint32_t per_core_Mt, uint32_t per_core_Nt, uint32_t choice = 0) { +std::tuple get_out_subblock_params( + uint32_t per_core_Mt, uint32_t per_core_Nt, uint32_t choice = 0) { constexpr std::array, 20> SUBBLOCK_HW_CHOICES = {{ {4, 2}, {2, 4}, {8, 1}, {1, 8}, {7, 1}, {1, 7}, {3, 2}, {2, 3}, {6, 1}, {1, 6}, {5, 1}, {1, 5}, {2, 2}, {4, 1}, {1, 4}, {3, 1}, {1, 3}, {2, 1}, {1, 2}, {1, 1}, @@ -87,18 +87,17 @@ std::tuple get_out_subblock_params(uint32_t per_core_Mt, uin index++; } } - } return {1, 1}; } std::tuple, std::vector>> create_programs( - tt_metal::Device *device, - const CoreRangeSet &dram_reader_core, - const CoreRangeSet &l1_receiver_cores, - const uint32_t &single_tile_size, - const tt::DataFormat &tile_format, + tt_metal::Device* device, + const CoreRangeSet& dram_reader_core, + const CoreRangeSet& l1_receiver_cores, + const uint32_t& single_tile_size, + const tt::DataFormat& tile_format, uint32_t m, uint32_t k, uint32_t n, @@ -111,9 +110,7 @@ std::tuple, std::vector& in1_buffer, const std::shared_ptr& in1_l1_buffer, const std::shared_ptr& output_buffer, - bool use_sub_devices - ) { - + bool use_sub_devices) { log_info("created program"); std::vector programs; @@ -147,7 +144,8 @@ std::tuple, std::vector, std::vectoraddress(); tt_metal::CircularBufferConfig in0_reader_cb_config = tt_metal::CircularBufferConfig(in0_reader_cb_size, {{in0_reader_cb_index, tile_format}}) - .set_page_size(in0_reader_cb_index, single_tile_size).set_globally_allocated_address(*in0_buffer); + .set_page_size(in0_reader_cb_index, single_tile_size) + .set_globally_allocated_address(*in0_buffer); auto in0_reader_cb = tt_metal::CreateCircularBuffer(receiver_program, l1_receiver_cores, in0_reader_cb_config); // in1 receiver CB @@ -172,16 +171,18 @@ std::tuple, std::vectoraddress(); tt_metal::CircularBufferConfig in1_receiver_cb_config = tt_metal::CircularBufferConfig(in1_receiver_cb_size, {{in1_receiver_cb_index, tile_format}}) - .set_page_size(in1_receiver_cb_index, single_tile_size).set_globally_allocated_address(*in1_l1_buffer); + .set_page_size(in1_receiver_cb_index, single_tile_size) + .set_globally_allocated_address(*in1_l1_buffer); auto in1_receiver_cb = tt_metal::CreateCircularBuffer(receiver_program, l1_receiver_cores, in1_receiver_cb_config); // output CB uint32_t output_cb_index = 16; - uint32_t output_cb_size = in0_block_h * in1_block_w * single_tile_size; + uint32_t output_cb_size = in0_block_h * in1_block_w * single_tile_size / num_receivers; uint32_t output_cb_addr = output_buffer->address(); tt_metal::CircularBufferConfig output_cb_config = tt_metal::CircularBufferConfig(output_cb_size, {{output_cb_index, tile_format}}) - .set_page_size(output_cb_index, single_tile_size).set_globally_allocated_address(*output_buffer); + 
.set_page_size(output_cb_index, single_tile_size) + .set_globally_allocated_address(*output_buffer); auto output_cb = tt_metal::CreateCircularBuffer(receiver_program, l1_receiver_cores, output_cb_config); // sync CB @@ -202,14 +203,14 @@ std::tuple, std::vectoraddress(); global_sems.push_back(tt_metal::CreateGlobalSemaphore(device, all_cores, INVALID)); - pages_sent_semaphore_ids[i] = global_sems.back()->address(); + pages_sent_semaphore_ids[i] = global_sems.back()->address(); } } else { - for (uint32_t i=0; i < num_receivers; ++i) { + for (uint32_t i = 0; i < num_receivers; ++i) { pages_acked_semaphore_ids[i] = tt_metal::CreateSemaphore(sender_program, all_cores, INVALID); pages_sent_semaphore_ids[i] = tt_metal::CreateSemaphore(sender_program, all_cores, INVALID); } @@ -217,11 +218,10 @@ std::tuple, std::vector in1_reader_compile_time_args = { - (std::uint32_t) in1_buffer->address(), - (std::uint32_t) start_tile_id, - (std::uint32_t) tt_metal::NOC::RISCV_0_default, - (std::uint32_t) num_layers - }; + (std::uint32_t)in1_buffer->address(), + (std::uint32_t)start_tile_id, + (std::uint32_t)tt_metal::NOC::RISCV_0_default, + (std::uint32_t)num_layers}; auto in1_reader_kernel = tt_metal::CreateKernel( sender_program, @@ -235,13 +235,12 @@ std::tuple, std::vector in1_writer_compile_time_args = { - (std::uint32_t) tt_metal::NOC::RISCV_0_default, - (std::uint32_t) in1_receiver_cb_addr, - (std::uint32_t) in1_receiver_cb_size, - (std::uint32_t) num_receivers, - (std::uint32_t) num_layers, - (std::uint32_t) use_sub_devices - }; + (std::uint32_t)tt_metal::NOC::RISCV_0_default, + (std::uint32_t)in1_receiver_cb_addr, + (std::uint32_t)in1_receiver_cb_size, + (std::uint32_t)num_receivers, + (std::uint32_t)num_layers, + (std::uint32_t)use_sub_devices}; auto in1_writer_kernel = tt_metal::CreateKernel( sender_program, @@ -254,9 +253,7 @@ std::tuple, std::vector in0_reader_compile_time_args = { - (std::uint32_t) num_layers - }; + vector in0_reader_compile_time_args = {(std::uint32_t)num_layers}; auto in0_reader_kernel = tt_metal::CreateKernel( receiver_program, @@ -269,11 +266,10 @@ std::tuple, std::vector in1_receiver_compile_time_args = { - (std::uint32_t) in1_receiver_cb_addr, - (std::uint32_t) in1_receiver_cb_size + cb_padding, - (std::uint32_t) num_layers, - (std::uint32_t) use_sub_devices - }; + (std::uint32_t)in1_receiver_cb_addr, + (std::uint32_t)in1_receiver_cb_size + cb_padding, + (std::uint32_t)num_layers, + (std::uint32_t)use_sub_devices}; auto in1_receiver_kernel = tt_metal::CreateKernel( receiver_program, @@ -289,20 +285,20 @@ std::tuple, std::vector compute_kernel_compile_time_args = { - in0_block_w, // in0_block_w + in0_block_w, // in0_block_w in0_block_num_tiles, in1_block_num_tiles / num_receivers, // in1_block_num_tiles - in1_per_core_w, // in1_per_core_w - num_blocks, // num_blocks - mt, // out_subblock_h - out_block_w, // out_block_w - out_block_num_tiles, // out_block_num_tiles - num_layers - }; + in1_per_core_w, // in1_per_core_w + num_blocks, // num_blocks + mt, // out_subblock_h + out_block_w, // out_block_w + out_block_num_tiles, // out_block_num_tiles + num_layers}; auto compute_kernel = tt_metal::CreateKernel( receiver_program, - "tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/kernels/bmm_large_block_zm_fused_bias_activation_copy.cpp", + "tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/kernels/" + "bmm_large_block_zm_fused_bias_activation_copy.cpp", l1_receiver_cores, tt_metal::ComputeConfig{ .math_fidelity = 
tile_format == tt::DataFormat::Float16_b ? MathFidelity::HiFi2 : MathFidelity::LoFi, @@ -316,10 +312,7 @@ std::tuple, std::vectorworker_core_from_logical_core(dram_reader_core_coord); uint32_t bank_id = 0; uint32_t vc = bank_id & 0x1; - std::vector reader_rt_args = { - (std::uint32_t) bank_id, - (std::uint32_t) vc - }; + std::vector reader_rt_args = {(std::uint32_t)bank_id, (std::uint32_t)vc}; for (uint32_t i = 0; i < num_layers; ++i) { reader_rt_args.push_back(in1_reader_page_size); } @@ -340,18 +333,18 @@ std::tuple, std::vector writer_rt_args; - for (uint32_t i=0; i < num_receivers; ++i) { + for (uint32_t i = 0; i < num_receivers; ++i) { auto l1_receiver_core_coord_physical = device->worker_core_from_logical_core(l1_receiver_core_coords[i]); writer_rt_args.push_back(l1_receiver_core_coord_physical.x); } - for (uint32_t i=0; i < num_receivers; ++i) { + for (uint32_t i = 0; i < num_receivers; ++i) { auto l1_receiver_core_coord_physical = device->worker_core_from_logical_core(l1_receiver_core_coords[i]); writer_rt_args.push_back(l1_receiver_core_coord_physical.y); } - for (uint32_t i=0; i < num_receivers; ++i) { + for (uint32_t i = 0; i < num_receivers; ++i) { writer_rt_args.push_back(pages_acked_semaphore_ids[i]); } - for (uint32_t i=0; i < num_receivers; ++i) { + for (uint32_t i = 0; i < num_receivers; ++i) { writer_rt_args.push_back(pages_sent_semaphore_ids[i]); } for (uint32_t i = 0; i < num_layers; ++i) { @@ -375,13 +368,12 @@ std::tuple, std::vector receiver_rt_args = { - (std::uint32_t) vc & 0x3, - (std::uint32_t) dram_reader_core_coord_physical.x, - (std::uint32_t) dram_reader_core_coord_physical.y - }; - vc ++; + (std::uint32_t)vc & 0x3, + (std::uint32_t)dram_reader_core_coord_physical.x, + (std::uint32_t)dram_reader_core_coord_physical.y}; + vc++; receiver_rt_args.push_back(pages_acked_semaphore_ids[i]); receiver_rt_args.push_back(pages_sent_semaphore_ids[i]); @@ -402,7 +394,7 @@ std::tuple, std::vector in0_reader_rt_args; for (uint32_t i = 0; i < num_layers; ++i) { in0_reader_rt_args.push_back(num_blocks); @@ -419,9 +411,7 @@ std::tuple, std::vector& x, const std::vector& y) { if (x.size() != y.size()) { @@ -457,24 +447,23 @@ float pcc(const std::vector& x, const std::vector& y) { bool validation_bfp8_b( const tt::deprecated::Tensor& in0_tensor, const tt::deprecated::Tensor& in1_tensor, - const tt::DataFormat &data_format, + const tt::DataFormat& data_format, uint32_t num_blocks, uint32_t cb_num_blocks, uint32_t mt, uint32_t kt, uint32_t nt, const std::shared_ptr& out_buffer, - uint32_t num_receivers -) { + uint32_t num_receivers) { bool pass = true; - std::vector golden_vec(mt * nt * 32 * 32, 0); // Initialize with zeros + std::vector golden_vec(mt * nt * 32 * 32, 0); // Initialize with zeros std::vector result_vec(mt * nt * 32 * 32, 0); std::vector result_untilized; std::vector result; tt::tt_metal::detail::ReadFromBuffer(out_buffer, result); auto result_bfp8 = unpack_bfp8_tiles_into_float_vec(result, true, false); - result_untilized = tt::test_utils::untilize(result_bfp8, mt*32, nt*32); + result_untilized = tt::test_utils::untilize(result_bfp8, mt * 32, nt * 32); const auto& in0_values = in0_tensor.get_values(); const auto& in1_values = in1_tensor.get_values(); @@ -485,14 +474,15 @@ bool validation_bfp8_b( for (size_t j = 0; j < per_core_n; ++j) { float sum = 0; for (size_t k = 0; k < kt * 32; ++k) { - sum += to_float(in0_values[n * kt * 32 + i * num_receivers * kt * 32 + k]) * to_float(in1_values[n * per_core_n + k * nt * 32 + j]); + sum += to_float(in0_values[n * kt * 32 + 
i * num_receivers * kt * 32 + k]) * + to_float(in1_values[n * per_core_n + k * nt * 32 + j]); } golden_vec[i * nt * 32 + n * per_core_n + j] = sum; } } } - for (int i=0; i& in0_tensor, const tt::deprecated::Tensor& in1_tensor, - const tt::DataFormat &data_format, + const tt::DataFormat& data_format, uint32_t num_blocks, uint32_t cb_num_blocks, uint32_t mt, uint32_t kt, uint32_t nt, const std::shared_ptr& out_buffer, - uint32_t num_receivers -) { + uint32_t num_receivers) { bool pass = true; - std::vector golden_vec(mt * nt * 32 * 32, 0); // Initialize with zeros + std::vector golden_vec(mt * nt * 32 * 32, 0); // Initialize with zeros std::vector result_vec(mt * nt * 32 * 32, 0); std::vector result; tt::tt_metal::detail::ReadFromBuffer(out_buffer, result); auto result_bfp16 = unpack_uint32_vec_into_bfloat16_vec(result); auto result_flat_layout = convert_to_flat_layout(result_bfp16); - auto result_untilized = tt::test_utils::untilize(result_flat_layout, mt*32, nt*32); + auto result_untilized = tt::test_utils::untilize(result_flat_layout, mt * 32, nt * 32); const auto& in0_values = in0_tensor.get_values(); const auto& in1_values = in1_tensor.get_values(); @@ -536,14 +524,15 @@ bool validation_fp16( for (size_t j = 0; j < per_core_n; ++j) { float sum = 0; for (size_t k = 0; k < kt * 32; ++k) { - sum += to_float(in0_values[n * kt * 32 + i * num_receivers * kt * 32 + k]) * to_float(in1_values[n * per_core_n + k * nt * 32 + j]); + sum += to_float(in0_values[n * kt * 32 + i * num_receivers * kt * 32 + k]) * + to_float(in1_values[n * per_core_n + k * nt * 32 + j]); } golden_vec[i * nt * 32 + n * per_core_n + j] = sum; } } } - for (int i=0; i(result_untilized[i])); } @@ -563,9 +552,7 @@ std::shared_ptr create_and_transfer_data_sharded_cb( BufferType buffer_type, tt::DataFormat data_format, CoreRangeSet cores, - uint32_t num_receivers -) { - + uint32_t num_receivers) { uint32_t size_bytes; uint32_t page_size_bytes; if (data_format == tt::DataFormat::Bfp8_b) { @@ -577,12 +564,12 @@ std::shared_ptr create_and_transfer_data_sharded_cb( } ShardSpecBuffer shard_spec = ShardSpecBuffer( - cores, - {ht * tt::constants::TILE_HEIGHT, wt * tt::constants::TILE_WIDTH / num_receivers}, - ShardOrientation::ROW_MAJOR, - false, - {tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH}, - {ht, wt}); + cores, + {ht * tt::constants::TILE_HEIGHT, wt * tt::constants::TILE_WIDTH / num_receivers}, + ShardOrientation::ROW_MAJOR, + false, + {tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH}, + {ht, wt}); log_info("cores: {}", cores); log_info("size_bytes: {}", size_bytes); @@ -590,12 +577,12 @@ std::shared_ptr create_and_transfer_data_sharded_cb( log_info("num_receivers: {}", num_receivers); auto input_buffer = CreateBuffer(tt::tt_metal::ShardedBufferConfig{ - .device = device, - .size = size_bytes, - .page_size = page_size_bytes, - .buffer_type = buffer_type, - .buffer_layout = TensorMemoryLayout::WIDTH_SHARDED, - .shard_parameters = shard_spec}); + .device = device, + .size = size_bytes, + .page_size = page_size_bytes, + .buffer_type = buffer_type, + .buffer_layout = TensorMemoryLayout::WIDTH_SHARDED, + .shard_parameters = shard_spec}); tt::tt_metal::detail::WriteToBuffer(input_buffer, input_vec); log_info("created sharded tensor"); @@ -603,8 +590,7 @@ std::shared_ptr create_and_transfer_data_sharded_cb( return input_buffer; } - -int main(int argc, char **argv) { +int main(int argc, char** argv) { if (getenv("TT_METAL_SLOW_DISPATCH_MODE") != nullptr) { log_error("Test not supported w/ slow dispatch, exiting"); } @@ 
-628,12 +614,9 @@ int main(int argc, char **argv) { //////////////////////////////////////////////////////////////////////////// std::vector input_args(argv, argv + argc); try { - std::tie(m, input_args) = - test_args::get_command_option_uint64_and_remaining_args(input_args, "--m", 32); - std::tie(k, input_args) = - test_args::get_command_option_uint64_and_remaining_args(input_args, "--k", 8192); - std::tie(n, input_args) = - test_args::get_command_option_uint64_and_remaining_args(input_args, "--n", 128); + std::tie(m, input_args) = test_args::get_command_option_uint64_and_remaining_args(input_args, "--m", 32); + std::tie(k, input_args) = test_args::get_command_option_uint64_and_remaining_args(input_args, "--k", 8192); + std::tie(n, input_args) = test_args::get_command_option_uint64_and_remaining_args(input_args, "--n", 128); std::tie(num_blocks, input_args) = test_args::get_command_option_uint64_and_remaining_args(input_args, "--num-blocks", 8); std::tie(cb_num_blocks, input_args) = @@ -653,9 +636,8 @@ int main(int argc, char **argv) { std::tie(use_sub_devices, input_args) = test_args::has_command_option_and_remaining_args(input_args, "--use-sub-devices"); - test_args::validate_remaining_args(input_args); - } catch (const std::exception &e) { + } catch (const std::exception& e) { log_error(tt::LogTest, "Command line arguments found exception", e.what()); TT_ASSERT(false); } @@ -666,12 +648,12 @@ int main(int argc, char **argv) { TT_FATAL(cb_num_blocks >= num_blocks, "Global CB must contain more (or equal) blocks than a single layer"); if (use_device_profiler) { - #if !defined(TRACY_ENABLE) +#if !defined(TRACY_ENABLE) log_error( LogTest, "Metal library and test code should be build with " "profiler option using ./scripts/build_scripts/build_with_profiler_opt.sh"); - #endif +#endif auto device_profiler = getenv("TT_METAL_DEVICE_PROFILER"); TT_FATAL( device_profiler, @@ -712,7 +694,7 @@ int main(int argc, char **argv) { // Device Setup //////////////////////////////////////////////////////////////////////////// int device_id = 0; - tt_metal::Device *device = tt_metal::CreateDevice(device_id); + tt_metal::Device* device = tt_metal::CreateDevice(device_id); CoreCoord dram_bank_coord = CoreCoord{0, 0}; CoreCoord dram_reader_core_coord = CoreCoord{0, 0}; @@ -736,68 +718,174 @@ int main(int argc, char **argv) { // Input Setup //////////////////////////////////////////////////////////////////////////// std::shared_ptr in0_buffer; - std::vector > in1_buffers(num_layers); + std::vector> in1_buffers(num_layers); std::shared_ptr in1_l1_buffer; std::shared_ptr output_buffer; SHAPE in0_shape = SHAPE{1, 1, m, k * num_receivers}; - tt::deprecated::Tensor in0_tensor_fp16 = tt::deprecated::initialize_tensor(in0_shape, tt::deprecated::Initialize::RANDOM, -1, 1, std::chrono::system_clock::now().time_since_epoch().count()); - tt::deprecated::Tensor in0_tensor_fp8 = tt::deprecated::initialize_tensor(in0_shape, tt::deprecated::Initialize::RANDOM, -1, 1, std::chrono::system_clock::now().time_since_epoch().count()); + tt::deprecated::Tensor in0_tensor_fp16 = tt::deprecated::initialize_tensor( + in0_shape, + tt::deprecated::Initialize::RANDOM, + -1, + 1, + std::chrono::system_clock::now().time_since_epoch().count()); + tt::deprecated::Tensor in0_tensor_fp8 = tt::deprecated::initialize_tensor( + in0_shape, + tt::deprecated::Initialize::RANDOM, + -1, + 1, + std::chrono::system_clock::now().time_since_epoch().count()); auto in1_shape = SHAPE{1, 1, k, n}; - tt::deprecated::Tensor in1_tensor_fp16 = 
tt::deprecated::initialize_tensor(in1_shape, tt::deprecated::Initialize::RANDOM, -1, 1, std::chrono::system_clock::now().time_since_epoch().count()); - tt::deprecated::Tensor in1_tensor_fp8 = tt::deprecated::initialize_tensor(in1_shape, tt::deprecated::Initialize::RANDOM, -1, 1, std::chrono::system_clock::now().time_since_epoch().count()); + tt::deprecated::Tensor in1_tensor_fp16 = tt::deprecated::initialize_tensor( + in1_shape, + tt::deprecated::Initialize::RANDOM, + -1, + 1, + std::chrono::system_clock::now().time_since_epoch().count()); + tt::deprecated::Tensor in1_tensor_fp8 = tt::deprecated::initialize_tensor( + in1_shape, + tt::deprecated::Initialize::RANDOM, + -1, + 1, + std::chrono::system_clock::now().time_since_epoch().count()); if (tile_format == tt::DataFormat::Bfp8_b) { // in1 DRAM for (uint32_t i = 0; i < num_layers; ++i) { auto input_vec_tilized = tt::test_utils::tilize(in1_tensor_fp8.get_values(), k, n); - std::vector packed_input_vec_tile_layout = pack_fp32_vec_as_bfp8_tiles(input_vec_tilized, true, false); - in1_buffers[i] = create_and_transfer_data_sharded_cb(device, packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Bfp8_b, dram_reader_core, num_banks); + std::vector packed_input_vec_tile_layout = + pack_fp32_vec_as_bfp8_tiles(input_vec_tilized, true, false); + in1_buffers[i] = create_and_transfer_data_sharded_cb( + device, + packed_input_vec_tile_layout, + kt, + nt, + tt_metal::BufferType::DRAM, + tt::DataFormat::Bfp8_b, + dram_reader_core, + num_banks); } // in0 auto activations_tilized = tt::test_utils::tilize(in0_tensor_fp8.get_values(), m, k * num_receivers); std::vector activations = pack_fp32_vec_as_bfp8_tiles(activations_tilized, true, false); - in0_buffer = create_and_transfer_data_sharded_cb(device, activations, mt, kt * num_receivers, tt_metal::BufferType::L1, tt::DataFormat::Bfp8_b, l1_receiver_core, num_receivers); + in0_buffer = create_and_transfer_data_sharded_cb( + device, + activations, + mt, + kt * num_receivers, + tt_metal::BufferType::L1, + tt::DataFormat::Bfp8_b, + l1_receiver_core, + num_receivers); // in1 L1 CB - vector in1_buffer = create_constant_vector_of_bfp8(kt * nt * single_tile_size + extra_cb_size, 0, false); - in1_l1_buffer = create_and_transfer_data_sharded_cb(device, in1_buffer, kt + extra_kt, nt, tt_metal::BufferType::L1, tt::DataFormat::Bfp8_b, l1_receiver_core, num_receivers); + vector in1_buffer = + create_constant_vector_of_bfp8(kt * nt * single_tile_size + extra_cb_size, 0, false); + in1_l1_buffer = create_and_transfer_data_sharded_cb( + device, + in1_buffer, + kt + extra_kt, + nt, + tt_metal::BufferType::L1, + tt::DataFormat::Bfp8_b, + l1_receiver_core, + num_receivers); // output vector outputs = create_constant_vector_of_bfp8(mt * nt * single_tile_size, 0, false); - output_buffer = create_and_transfer_data_sharded_cb(device, outputs, mt, nt, tt_metal::BufferType::L1, tt::DataFormat::Bfp8_b, l1_receiver_core, num_receivers); + output_buffer = create_and_transfer_data_sharded_cb( + device, + outputs, + mt, + nt, + tt_metal::BufferType::L1, + tt::DataFormat::Bfp8_b, + l1_receiver_core, + num_receivers); } else { // in1 for (uint32_t i = 0; i < num_layers; ++i) { auto input_vec_tilized = tt::test_utils::tilize(in1_tensor_fp16.get_values(), k, n); auto input_vec_tile_layout = convert_to_tile_layout(input_vec_tilized); - vector packed_input_vec_tile_layout = pack_bfloat16_vec_into_uint32_vec(input_vec_tile_layout); - in1_buffers[i] = create_and_transfer_data_sharded_cb(device, 
packed_input_vec_tile_layout, kt, nt, tt_metal::BufferType::DRAM, tt::DataFormat::Float16_b, dram_reader_core, num_banks); + vector packed_input_vec_tile_layout = + pack_bfloat16_vec_into_uint32_vec(input_vec_tile_layout); + in1_buffers[i] = create_and_transfer_data_sharded_cb( + device, + packed_input_vec_tile_layout, + kt, + nt, + tt_metal::BufferType::DRAM, + tt::DataFormat::Float16_b, + dram_reader_core, + num_banks); } // in0 auto activations_tilized = tt::test_utils::tilize(in0_tensor_fp16.get_values(), m, k * num_receivers); auto activations_tile_layout = convert_to_tile_layout(activations_tilized); vector activations = pack_bfloat16_vec_into_uint32_vec(activations_tile_layout); - in0_buffer = create_and_transfer_data_sharded_cb(device, activations, mt, kt * num_receivers, tt_metal::BufferType::L1, tt::DataFormat::Float16_b, l1_receiver_core, num_receivers); + in0_buffer = create_and_transfer_data_sharded_cb( + device, + activations, + mt, + kt * num_receivers, + tt_metal::BufferType::L1, + tt::DataFormat::Float16_b, + l1_receiver_core, + num_receivers); // in1 L1 CB - vector in1_buffer = create_constant_vector_of_bfloat16(kt * nt * single_tile_size + extra_cb_size, 0); - in1_l1_buffer = create_and_transfer_data_sharded_cb(device, in1_buffer, kt + extra_kt, nt, tt_metal::BufferType::L1, tt::DataFormat::Float16_b, l1_receiver_core, num_receivers); + vector in1_buffer = + create_constant_vector_of_bfloat16(kt * nt * single_tile_size + extra_cb_size, 0); + in1_l1_buffer = create_and_transfer_data_sharded_cb( + device, + in1_buffer, + kt + extra_kt, + nt, + tt_metal::BufferType::L1, + tt::DataFormat::Float16_b, + l1_receiver_core, + num_receivers); // output vector outputs = create_constant_vector_of_bfloat16(mt * nt * single_tile_size, 0); - output_buffer = create_and_transfer_data_sharded_cb(device, outputs, mt, nt, tt_metal::BufferType::L1, tt::DataFormat::Float16_b, l1_receiver_core, num_receivers); + output_buffer = create_and_transfer_data_sharded_cb( + device, + outputs, + mt, + nt, + tt_metal::BufferType::L1, + tt::DataFormat::Float16_b, + l1_receiver_core, + num_receivers); } - for (uint32_t i=0; i < num_layers; ++i) { + for (uint32_t i = 0; i < num_layers; ++i) { log_info("in1_buffers addr: {}", in1_buffers[i]->address()); } //////////////////////////////////////////////////////////////////////////// // Application Setup //////////////////////////////////////////////////////////////////////////// - auto [programs, global_sems] = create_programs(device, dram_reader_core, l1_receiver_core, single_tile_size, tile_format, m, k, n, num_blocks, cb_num_blocks, num_receivers, num_layers, cb_padding, in0_buffer, in1_buffers[0], in1_l1_buffer, output_buffer, use_sub_devices); + auto [programs, global_sems] = create_programs( + device, + dram_reader_core, + l1_receiver_core, + single_tile_size, + tile_format, + m, + k, + n, + num_blocks, + cb_num_blocks, + num_receivers, + num_layers, + cb_padding, + in0_buffer, + in1_buffers[0], + in1_l1_buffer, + output_buffer, + use_sub_devices); //////////////////////////////////////////////////////////////////////////// // Execution Application @@ -847,7 +935,7 @@ int main(int argc, char **argv) { } pass &= tt_metal::CloseDevice(device); - } catch (const std::exception &e) { + } catch (const std::exception& e) { pass = false; log_error(LogTest, "{}", e.what()); log_error(LogTest, "System error message: {}", std::strerror(errno)); diff --git a/tt_metal/host_api.hpp b/tt_metal/host_api.hpp index 3cbc3cedb07..67dedb5df5d 100644 --- 
a/tt_metal/host_api.hpp +++ b/tt_metal/host_api.hpp @@ -213,6 +213,7 @@ const CircularBufferConfig& GetCircularBufferConfig(Program& program, CBHandle c // clang-format off /** * Update the total size of the circular buffer at the given circular buffer handle. Updating a program-local circular buffer requires all circular buffers in the program to be reallocated. + * To update both the address and the total size of a dynamic circular buffer, use `UpdateDynamicCircularBufferAddressAndTotalSize`. * * Return value: void * @@ -244,6 +245,7 @@ void UpdateCircularBufferPageSize(Program& program, CBHandle cb_handle, uint8_t // clang-format off /** * Update the address of a dynamic circular buffer. Dynamic circular buffers share the same address space as L1 buffers. + * To update both the address and the total size of a dynamic circular buffer, use `UpdateDynamicCircularBufferAddressAndTotalSize`. * * Return value: void * @@ -257,6 +259,20 @@ void UpdateCircularBufferPageSize(Program& program, CBHandle cb_handle, uint8_t void UpdateDynamicCircularBufferAddress(Program& program, CBHandle cb_handle, const Buffer& buffer); // clang-format off +/** + * Update the address and total size of a dynamic circular buffer. Dynamic circular buffers share the same address space as L1 buffers. + * + * Return value: void + * + * | Argument | Description | Type | Valid Range | Required | + * |------------|------------------------------------------------------------------------------------------|------------------------------|-------------|----------| + * | program | The program containing the circular buffer | Program & | | Yes | + * | cb_handle | ID of the circular buffer, returned by `CreateCircularBuffers` | CBHandle (uintptr_t) | | Yes | + * | buffer | Dynamically allocated L1 buffer that shares address space of circular buffer `cb_handle` | const Buffer & | L1 buffer | Yes | + * | total_size | New size of the circular buffer in bytes | uint32_t | | Yes | + */ +void UpdateDynamicCircularBufferAddressAndTotalSize(Program& program, CBHandle cb_handle, const Buffer& buffer, uint32_t total_size); + /** * Initializes semaphore on all cores within core range (inclusive). Each core can have up to eight 4B semaphores aligned to L1_ALIGNMENT. * diff --git a/tt_metal/impl/buffers/circular_buffer_types.cpp b/tt_metal/impl/buffers/circular_buffer_types.cpp index 10778c9c0a5..471e048d353 100644 --- a/tt_metal/impl/buffers/circular_buffer_types.cpp +++ b/tt_metal/impl/buffers/circular_buffer_types.cpp @@ -86,6 +86,11 @@ CircularBufferConfig& CircularBufferConfig::set_total_size(uint32_t total_size) } CircularBufferConfig& CircularBufferConfig::set_globally_allocated_address(const Buffer& buffer) { + return this->set_globally_allocated_address_and_total_size(buffer, this->total_size_); +} + +CircularBufferConfig& CircularBufferConfig::set_globally_allocated_address_and_total_size( + const Buffer& buffer, uint32_t total_size) { if (not buffer.is_l1()) { TT_THROW("Only L1 buffers can have an associated circular buffer!"); } @@ -94,28 +99,32 @@ CircularBufferConfig& CircularBufferConfig::set_globally_allocated_address(const this->max_size_ = buffer.aligned_size_per_bank(); this->buffer_size_ = buffer.aligned_size(); this->shadow_global_buffer = &buffer; - if (this->total_size_ > this->max_size_) { + if (total_size > this->max_size_) { TT_ASSERT( false, "Cannot set to globally allocated buffer. 
Circular buffer size {} B exceeds allocated L1 buffer bank " "size of {} B", - this->total_size_, + total_size, this->max_size_); #ifndef DEBUG log_warning( "Circular buffer size {} B exceeds allocated L1 buffer bank size of {} B. This may allow this circular " "buffer to write outside the allocated buffer space.", - this->total_size_, + total_size, this->max_size_); - if (this->total_size_ > this->buffer_size_) { + if (total_size > this->buffer_size_) { TT_THROW( "Cannot set to globally allocated buffer. Circular buffer size {} B exceeds allocated L1 buffer " "size of {} B", - this->total_size_, + total_size, this->buffer_size_); } #endif } + if (total_size == 0) { + TT_THROW("Total size for circular buffer must be non-zero!"); + } + this->total_size_ = total_size; return *this; } diff --git a/tt_metal/impl/buffers/circular_buffer_types.hpp b/tt_metal/impl/buffers/circular_buffer_types.hpp index d3e15cc6040..cf4b6a43d1e 100644 --- a/tt_metal/impl/buffers/circular_buffer_types.hpp +++ b/tt_metal/impl/buffers/circular_buffer_types.hpp @@ -23,52 +23,53 @@ inline namespace v0 { using CBHandle = uintptr_t; - class CircularBufferConfig { - public: +public: // Static circular buffer spec - CircularBufferConfig(uint32_t total_size, const std::map &data_format_spec); + CircularBufferConfig(uint32_t total_size, const std::map& data_format_spec); // User is expected to use the builder here. CircularBufferConfig(uint32_t total_size); // Dynamic circular buffer spec CircularBufferConfig( - uint32_t total_size, const std::map &data_format_spec, const Buffer &buffer); + uint32_t total_size, const std::map& data_format_spec, const Buffer& buffer); CircularBufferConfig& set_page_size(uint8_t buffer_index, uint32_t page_size); CircularBufferConfig& set_total_size(uint32_t total_size); - CircularBufferConfig& set_globally_allocated_address(const Buffer &buffer); + CircularBufferConfig& set_globally_allocated_address(const Buffer& buffer); + + CircularBufferConfig& set_globally_allocated_address_and_total_size(const Buffer& buffer, uint32_t total_size); CircularBufferConfig& set_tile_dims(uint8_t buffer_index, const Tile& tile); - const std::array, NUM_CIRCULAR_BUFFERS> &tiles() const; + const std::array, NUM_CIRCULAR_BUFFERS>& tiles() const; uint32_t total_size() const; std::optional globally_allocated_address() const; - const std::array, NUM_CIRCULAR_BUFFERS> &data_formats() const; + const std::array, NUM_CIRCULAR_BUFFERS>& data_formats() const; - const std::array, NUM_CIRCULAR_BUFFERS> &page_sizes() const; + const std::array, NUM_CIRCULAR_BUFFERS>& page_sizes() const; const Buffer* shadow_global_buffer{nullptr}; class Builder { - public: - Builder(CircularBufferConfig &parent, uint8_t buffer_index); + public: + Builder(CircularBufferConfig& parent, uint8_t buffer_index); - const Builder &set_data_format(tt::DataFormat data_format) const; + const Builder& set_data_format(tt::DataFormat data_format) const; - const Builder &add_size(uint32_t size) const; + const Builder& add_size(uint32_t size) const; - const Builder &set_page_size(uint32_t page_size) const; + const Builder& set_page_size(uint32_t page_size) const; - const Builder &set_tile_dims(const Tile &tile) const; + const Builder& set_tile_dims(const Tile& tile) const; - private: - CircularBufferConfig &parent_; + private: + CircularBufferConfig& parent_; uint8_t buffer_index_; }; @@ -77,9 +78,9 @@ class CircularBufferConfig { friend bool operator==(const CircularBufferConfig& lhs, const CircularBufferConfig& rhs); friend bool operator!=(const 
CircularBufferConfig& lhs, const CircularBufferConfig& rhs); - - private: - void set_config(const std::map &data_format_spec); +private: + void set_config(const std::map& data_format_spec); + void validate_total_size(uint32_t total_size); uint32_t total_size_ = 0; std::optional globally_allocated_address_ = std::nullopt; diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 2d0f3fad1f4..aca0db3e2c1 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -1084,6 +1084,12 @@ void UpdateDynamicCircularBufferAddress(Program &program, CBHandle cb_handle, co circular_buffer->assign_global_address(); } +void UpdateDynamicCircularBufferAddressAndTotalSize(Program& program, CBHandle cb_handle, const Buffer& buffer, uint32_t total_size) { + auto circular_buffer = detail::GetCircularBuffer(program, cb_handle); + circular_buffer->config().set_globally_allocated_address_and_total_size(buffer, total_size); + circular_buffer->assign_global_address(); +} + uint32_t CreateSemaphore( Program &program, const std::variant &core_spec, diff --git a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.cpp b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.cpp index 5feb3d5a8c3..76f8bc9e18c 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/bcast/device/multi_core_hw/bcast_op_multi_core_hw.cpp @@ -314,13 +314,13 @@ operation::ProgramWithCallbacks bcast_multi_core_hw( } if (src0_sharded) { - UpdateDynamicCircularBufferAddress(program, cb_src0, *src_buffer_a); - UpdateCircularBufferTotalSize(program, cb_src0, num_tiles_per_core_group_1 * src0_single_tile_size); + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_src0, *src_buffer_a, num_tiles_per_core_group_1 * src0_single_tile_size); } if (out_sharded) { - UpdateDynamicCircularBufferAddress(program, cb_output, *dst_buffer); - UpdateCircularBufferTotalSize(program, cb_output, num_tiles_per_core_group_1 * dst_single_tile_size); + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_output, *dst_buffer, num_tiles_per_core_group_1 * dst_single_tile_size); } }; diff --git a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp index 6d458f52c13..080a23f84ff 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/transpose/device/transpose_program_factory.cpp @@ -1899,13 +1899,13 @@ operation::ProgramWithCallbacks transpose_wh_multi_core_sharded(const Tensor& a, uint32_t num_tiles_per_shard = shard_spec.numel() / TILE_HW; if (src0_sharded) { - UpdateDynamicCircularBufferAddress(program, cb_src0, *src_buffer); - UpdateCircularBufferTotalSize(program, cb_src0, num_tiles_per_shard * src0_single_tile_size); + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_src0, *src_buffer, num_tiles_per_shard * src0_single_tile_size); } if (out_sharded) { - UpdateDynamicCircularBufferAddress(program, cb_output, *dst_buffer); - UpdateCircularBufferTotalSize(program, cb_output, num_tiles_per_shard * dst_single_tile_size); + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_output, *dst_buffer, num_tiles_per_shard * dst_single_tile_size); } uint32_t Wt = shard_spec.shape[1] / TILE_WIDTH; diff --git 
a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp index 50f2edc9e61..d54c9bdef6f 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_and_width_multi_core_program_factory.cpp @@ -384,13 +384,13 @@ void BinaryDeviceOperation::BroadcastHeightAndWidthMultiCore::override_runtime_a } if (src0_sharded) { - UpdateDynamicCircularBufferAddress(program, cb_src0, *src_buffer_a); - UpdateCircularBufferTotalSize(program, cb_src0, num_tiles_per_core_group_1 * src0_single_tile_size); + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_src0, *src_buffer_a, num_tiles_per_core_group_1 * src0_single_tile_size); } if (out_sharded) { - UpdateDynamicCircularBufferAddress(program, cb_output, *dst_buffer); - UpdateCircularBufferTotalSize(program, cb_output, num_tiles_per_core_group_1 * dst_single_tile_size); + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_output, *dst_buffer, num_tiles_per_core_group_1 * dst_single_tile_size); } } diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp index 889d591aaba..5403ae82ed2 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/broadcast_height_multi_core_sharded_optimized_program_factory.cpp @@ -91,7 +91,7 @@ BinaryDeviceOperation::BroadcastHeightMultiCoreShardedOptimized::create( TT_FATAL(input_tile_size == output_tile_size, "Input and output tile size should be same"); uint32_t shard_size_in_bytes = shard_spec.numel() * a.element_size(); - uint32_t num_tile_per_core = (shard_size_in_bytes + input_tile_size - 1) / TILE_HW; // ceil value + uint32_t num_tile_per_core = (shard_size_in_bytes + input_tile_size - 1) / input_tile_size; // ceil value TT_FATAL(input_tile_size <= shard_size_in_bytes, "Input tile size should be less than shard size"); uint32_t Wt, Ht; diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp index 610d94a6b43..ceef285e717 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/element_wise_multi_core_program_factory.cpp @@ -273,16 +273,16 @@ inline __attribute__((always_inline)) void set_eltwise_binary_runtime_args( } if (src0_sharded) { - UpdateDynamicCircularBufferAddress(program, cb_src0, *src_buffer_a); - UpdateCircularBufferTotalSize(program, cb_src0, num_tiles_per_core_group_1 * src0_single_tile_size); + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_src0, *src_buffer_a, num_tiles_per_core_group_1 * src0_single_tile_size); } if (src1_sharded) { - UpdateDynamicCircularBufferAddress(program, cb_src1, *src_buffer_b); - UpdateCircularBufferTotalSize(program, cb_src1, num_tiles_per_core_group_1 * src1_single_tile_size); + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_src1, *src_buffer_b, num_tiles_per_core_group_1 * src1_single_tile_size); } if 
(out_sharded) { - UpdateDynamicCircularBufferAddress(program, cb_output, *dst_buffer); - UpdateCircularBufferTotalSize(program, cb_output, num_tiles_per_core_group_1 * dst_single_tile_size); + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_output, *dst_buffer, num_tiles_per_core_group_1 * dst_single_tile_size); } } BinaryDeviceOperation::ElementWiseMultiCore::cached_program_t BinaryDeviceOperation::ElementWiseMultiCore::create( diff --git a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp index 9125aec9d3e..95395712799 100644 --- a/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/matmul/group_attn_matmul/device/group_attn_matmul_program_factory.cpp @@ -572,8 +572,8 @@ operation::ProgramWithCallbacks multi_core_group_attn_matmul( if (in0_is_sharded) { uint32_t cb0_num_input_tiles = a.shard_spec().value().numel() / TILE_HW; // Should be full MtKt and C should be 1 - UpdateDynamicCircularBufferAddress(program, cb_src0, *src0_buffer); - UpdateCircularBufferTotalSize(program, cb_src0, cb0_num_input_tiles * in0_single_tile_size); + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_src0, *src0_buffer, cb0_num_input_tiles * in0_single_tile_size); } else { uint32_t cb0_num_input_tiles = in0_block_w; // TODO: Generalize; double buffer and add blocking along inner dim if we have Mt > 1 @@ -586,8 +586,8 @@ operation::ProgramWithCallbacks multi_core_group_attn_matmul( if (in1_is_sharded) { uint32_t cb2_num_input_tiles = b.shard_spec().value().numel() / TILE_HW; // Should be full CKtNt and batch must be 32 - UpdateDynamicCircularBufferAddress(program, cb_src2, *src1_buffer); - UpdateCircularBufferTotalSize(program, cb_src2, cb2_num_input_tiles * in1_single_tile_size); + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_src2, *src1_buffer, cb2_num_input_tiles * in1_single_tile_size); } UpdateCircularBufferTotalSize(program, cb_interm1, MtNt * interm_single_tile_size); @@ -595,8 +595,8 @@ operation::ProgramWithCallbacks multi_core_group_attn_matmul( if (output_is_sharded) { uint32_t num_output_tiles = output.shard_spec().value().numel() / TILE_HW; // Should be full MtNt and C should be 1 - UpdateDynamicCircularBufferAddress(program, cb_output, *dst_buffer); - UpdateCircularBufferTotalSize(program, cb_output, num_output_tiles * output_single_tile_size); + UpdateDynamicCircularBufferAddressAndTotalSize( + program, cb_output, *dst_buffer, num_output_tiles * output_single_tile_size); } else { uint32_t num_output_tiles = MtNt; // TODO: Should be MtNt if Mt > 1? Or, produce one Nt at a time and double buffer? From 9db3748a1823576f8e2167e080a79a63e05c01a2 Mon Sep 17 00:00:00 2001 From: Bryan Wilder Field Lozano Date: Fri, 29 Nov 2024 07:24:25 +0530 Subject: [PATCH 3/4] [skip ci] Update CONTRIBUTING.md with pre-commit info (#15537) --- CONTRIBUTING.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7b9fc84dca8..946c69f3720 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -429,6 +429,44 @@ cat generated/watcher/watcher.log # See k_ids field for each core in the last d - In the future, this tool will be expanded to show more debug information available from the host side. 
## Contribution standards +This project has adopted C++ formatting and style as defined in `.clang-format`. +There are additional requirements such as license headers. + +## Pre-commit Hook Integration for Formatting and Linting + +As part of maintaining consistent code formatting across the project, we have integrated the [pre-commit](https://pre-commit.com/) framework into our workflow. The pre-commit hooks automatically check and format code before commits are made, ensuring that we adhere to the project's coding standards. + +### What is Pre-commit? + +Pre-commit is a framework for managing and maintaining multi-language pre-commit hooks. It helps catch common issues early by running a set of hooks before code is committed, automating tasks like: + +- Formatting code (e.g., fixing trailing whitespace, enforcing end-of-file newlines) +- Running linters (e.g., `clang-format`, `black`, `flake8`) +- Checking for merge conflicts and other common issues + +For more details on pre-commit, visit the [official documentation](https://pre-commit.com/). + +### How to Set Up Pre-commit Locally + +To set up pre-commit on your local machine, follow these steps: + +1. **Install Pre-commit**: + Ensure you have Python installed, then run: + ```bash + pip install pre-commit + ``` + *Note:* pre-commit is already installed if you are using the project's Python virtual environment. +2. **Install the Git Hook Scripts**: + In your local repository, run the following command to install the pre-commit hooks: + ```bash + pre-commit install + ``` + This command configures your local Git to run the defined hooks automatically before each commit. +3. **Run Pre-commit Hooks Manually**: + You can also run the hooks manually against all files at any time with: + ```bash + pre-commit run --all-files + ``` ### File structure and formats From 427a7c925a14c9790b9bd1c8f2028d135a97d75f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Choi=20HyungSuk=28=EC=B5=9C=ED=98=95=EC%84%9D=29?= Date: Fri, 29 Nov 2024 14:22:17 +0900 Subject: [PATCH 4/4] #15542: Add dtype to `to_torch` ### Ticket https://github.com/tenstorrent/tt-metal/issues/15542 ### Problem description ```Python import pytest import torch import ttnn torch.set_printoptions(threshold=1000000, linewidth=100000000, sci_mode=False) @pytest.mark.parametrize("ttnn_dtype, torch_dtype", [ [ttnn.DataType.FLOAT32, torch.float32], [ttnn.DataType.BFLOAT16, torch.bfloat16], [ttnn.DataType.BFLOAT8_B, torch.bfloat16], ],) def test_support_1d(device, ttnn_dtype, torch_dtype): torch_x = torch.rand(20, dtype=torch_dtype) ttnn_x_bf16 = ttnn.from_torch(torch_x, dtype=ttnn.DataType.BFLOAT16, layout=ttnn.TILE_LAYOUT) ttnn_x = ttnn.from_torch(torch_x, dtype=ttnn_dtype, layout=ttnn.TILE_LAYOUT) torch_y = ttnn.to_torch(ttnn_x) print("ttnn_dtype", ttnn_dtype) print("torch_dtype", torch_dtype) print("torch_x\n", torch_x) print("ttnn_x_bf16\n", ttnn_x_bf16) print("ttnn_x\n", ttnn_x) print("torch_y\n", torch_y, type(torch_y)) ``` In the unit test, the `to_torch` function is used to convert `ttnn` tensors to `torch` tensors. However, the dtype conversion in `to_torch` is as follows (ttnn -> torch): BFLOAT16 -> bfloat16, FLOAT32 -> float32, BFLOAT8_B -> float32. Since PyTorch does not have a `bfloat8_b` type, converting `BFLOAT8_B` to `float32` is not an issue in itself. However, while `from_torch` accepts a `dtype` parameter, `to_torch` does not, so callers need an additional conversion via `.to(dtype=torch_dtype)`, which is cumbersome. ### What's changed Add `dtype` to `to_torch`. Add a test. 
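For illustration, a minimal before/after sketch of the new parameter (not part of this PR's test suite; it mirrors the repro above, with tensors staying on host):

```Python
import torch
import ttnn

torch_x = torch.rand(20, dtype=torch.bfloat16)
ttnn_x = ttnn.from_torch(torch_x, dtype=ttnn.DataType.BFLOAT8_B, layout=ttnn.TILE_LAYOUT)

# Before this change: BFLOAT8_B comes back as float32, so a manual cast is needed.
torch_y_old = ttnn.to_torch(ttnn_x).to(dtype=torch.bfloat16)

# After this change: the cast happens inside to_torch.
torch_y_new = ttnn.to_torch(ttnn_x, dtype=torch.bfloat16)
assert torch_y_new.dtype == torch.bfloat16
```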
### Checklist - [ ] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [x] New/Existing tests provide coverage for changes --- tests/ttnn/unit_tests/test_to_dtype.py | 3 +-- ttnn/ttnn/operations/core.py | 9 ++++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/ttnn/unit_tests/test_to_dtype.py b/tests/ttnn/unit_tests/test_to_dtype.py index 112359c4389..9a55e72f470 100644 --- a/tests/ttnn/unit_tests/test_to_dtype.py +++ b/tests/ttnn/unit_tests/test_to_dtype.py @@ -34,6 +34,5 @@ def test_to_dtype(height, width, from_dtype, to_dtype): assert output_tensor.layout == ttnn.ROW_MAJOR_LAYOUT assert tuple(output_tensor.shape) == (height, width) - output_tensor = ttnn.to_torch(output_tensor).to(torch_input_tensor.dtype) - + output_tensor = ttnn.to_torch(output_tensor, dtype=torch_input_tensor.dtype) assert_with_pcc(torch_input_tensor, output_tensor) diff --git a/ttnn/ttnn/operations/core.py b/ttnn/ttnn/operations/core.py index 5daf7cd394d..3eeda3a90b6 100644 --- a/ttnn/ttnn/operations/core.py +++ b/ttnn/ttnn/operations/core.py @@ -257,6 +257,7 @@ def __torch_function__(cls, func, types, func_args=(), func_kwargs=None): @ttnn.register_python_operation(name="ttnn.to_torch", golden_function=_golden_function) def to_torch( tensor: ttnn.Tensor, + dtype: Optional[torch.dtype] = None, *, torch_rank: Optional[int] = None, mesh_composer: Optional[ttnn.MeshToTensor] = None, @@ -269,6 +270,7 @@ def to_torch( Args: tensor (ttnn.Tensor): the input tensor. + dtype (torch.dtype, optional): the desired `torch` data type of the returned tensor. Defaults to `None`. Keyword Args: torch_rank (int, optional): Desired rank of the `torch.Tensor`. Defaults to `None`. @@ -314,7 +316,12 @@ def to_torch( raise RuntimeError("ttnn: Unable to squeeze to desired rank!") tensor = tensor.squeeze(0) - return TorchTensor(tensor) + torch_tensor = TorchTensor(tensor) + + if dtype is not None: + torch_tensor = torch_tensor.to(dtype=dtype) + + return torch_tensor def _golden_function(tensor, *args, **kwargs):