Revert "#14512: Add DRAM pre-fetcher benchmark (#14528)" (#14749)
This reverts commit 6a9a66f.
johanna-rock-tt authored Nov 5, 2024
1 parent 797cf21 commit 56fbe4f
Showing 3 changed files with 93 additions and 223 deletions.
72 changes: 10 additions & 62 deletions tests/scripts/test_moreh_microbenchmark.py
@@ -684,7 +684,6 @@ def test_matmul_single_core_sharded(
     [
         ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 0, 12, 0),
         ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 1, 12, 0),
-        ("wormhole_b0", 1000, np.array([2048, 3840]), 1, 4, 1, 12, 0),  # Padded FF1 shapes for llama 70b on TG
     ],
 )
 def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
@@ -723,78 +722,29 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form
 
 
 @pytest.mark.parametrize(
-    "arch, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id, bw_target",
+    "arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id",
     [
-        ("grayskull", np.array([32768 * 2, 8 * 128]), 1, 64, 2, 8, 0, None),
-        ("wormhole_b0", np.array([32768 * 2, 12 * 128]), 1, 64, 2, 12, 0, None),
-        ("blackhole", np.array([32768 * 8, 8 * 128]), 1, 256, 2, 8, 0, None),
-        # FF1/FF3 shapes for TG llama 70b
-        (
-            "wormhole_b0",
-            np.array([2048, 3840]),
-            1,
-            16,
-            0,
-            12,
-            0,
-            240,
-        ),  # 244 GB/s
-        # FF2 shapes for TG llama 70b
-        (
-            "wormhole_b0",
-            np.array([3584, 2304]),
-            1,
-            28,
-            1,
-            12,
-            0,
-            250,
-        ),  # 255 GB/s
-        # Dense Out shapes for TG llama 70b
-        (
-            "wormhole_b0",
-            np.array([1024, 2304]),
-            1,
-            8,
-            1,
-            12,
-            0,
-            220,
-        ),  # 226 GB/s
-        # QKV shapes for TG llama 70b
-        (
-            "wormhole_b0",
-            np.array([2048, 1536]),
-            1,
-            16,
-            1,
-            12,
-            0,
-            225,
-        ),  # 232 GB/s
+        ("grayskull", 1202, np.array([32768 * 2, 8 * 128]), 1, 64, 1, 8, 0),
+        ("wormhole_b0", 1000, np.array([32768 * 2, 12 * 128]), 1, 64, 1, 12, 0),
+        ("blackhole", 800, np.array([32768 * 8, 8 * 128]), 1, 256, 1, 8, 0),
     ],
 )
-def test_dram_read_l1_write_core(
-    arch, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id, bw_target
-):
-    dev_freq = get_device_freq()
+def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
     data = []
     cycle_list = []
     time_list = []
     throughput_list = []
     for _ in range(num_tests):
         k = int(test_vector[0])
         n = int(test_vector[1])
-        if data_format == 0:  # BFP4
-            input_size = k * n * (512 + 64) // 1024
-        elif data_format == 1:  # BFP8
+        if data_format == 0:
            input_size = k * n * 1088 // 1024
-        elif data_format == 2:  # BFLOAT16
+        elif data_format == 1:
            input_size = k * n * 2048 // 1024
         run_dram_read_l1_write_cmd(k, n, nblock, data_format, num_banks, bank_start_id)
         cycle = profile_results_kernel_duration()
-        time = cycle / dev_freq / 1000.0 / 1000.0
-        throughput = input_size / cycle * dev_freq / 1000.0
+        time = cycle / freq / 1000.0 / 1000.0
+        throughput = input_size / cycle * freq / 1000.0
         cycle_list.append(cycle)
         time_list.append(time)
         throughput_list.append(throughput)
@@ -806,15 +756,13 @@ def test_dram_read_l1_write_core(
         logger.info("DRAM read throughput: " + str(throughput))
         data.append([throughput])
     # check within range
     dev_freq = get_device_freq()
     if arch == "grayskull":
         bw_bound = 100.0
     elif arch == "wormhole_b0":
         bw_bound = 260.0
     elif arch == "blackhole":
         bw_bound = 340.0
-    if bw_target is not None:
-        bw_bound = bw_target
     bw_bound = bw_bound * dev_freq / 1000.0  # Adjust for device frequency; target is based on max device frequency
     assert bw_bound <= throughput
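The restored test's sizing and bandwidth arithmetic can be checked in isolation. Below is a minimal sketch, assuming the test's conventions that input_size is in bytes and freq is in MHz (so bytes/cycle * MHz / 1000 gives GB/s); the 1088/1024 ratio corresponds to 17 bytes per 16 values, i.e. BFP8 with a shared exponent byte. The helper names and the example cycle count are illustrative, not part of the repository:

    # Standalone check of the restored input-size and bandwidth math.
    # Assumptions (not stated in the diff): input_size is in bytes, freq in MHz.

    def input_size_bytes(k: int, n: int, data_format: int) -> int:
        if data_format == 0:  # BFP8: 1088 bytes per 1024 values (17 bytes per 16 values)
            return k * n * 1088 // 1024
        if data_format == 1:  # BFLOAT16: 2 bytes per value
            return k * n * 2048 // 1024
        raise ValueError(f"unsupported data_format {data_format}")

    def throughput_gb_per_s(input_size: int, cycle: int, freq: int) -> float:
        # bytes/cycle * MHz = MB/s; divide by 1000 for GB/s
        return input_size / cycle * freq / 1000.0

    # Hypothetical numbers for the wormhole_b0 vector at 1000 MHz:
    size = input_size_bytes(32768 * 2, 12 * 128, data_format=0)
    print(throughput_gb_per_s(size, cycle=420_000, freq=1000))  # ~255 GB/s

The final bw_bound * dev_freq / 1000.0 step rescales the per-arch target, which is defined at the nominal 1000 MHz, down to whatever frequency the device actually reports.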


@@ -11,55 +11,40 @@
 
 void kernel_main() {
     constexpr uint32_t num_blocks = get_compile_time_arg_val(0);
-    constexpr uint32_t num_pages_w_per_receiver = get_compile_time_arg_val(1);
-    constexpr uint32_t num_tiles_h = get_compile_time_arg_val(2);
-    constexpr uint32_t block_num_tiles = get_compile_time_arg_val(3);
-    constexpr uint32_t page_size = get_compile_time_arg_val(4);
-    constexpr uint32_t noc = get_compile_time_arg_val(5);
+    constexpr uint32_t num_pages = get_compile_time_arg_val(1);
+    constexpr uint32_t block_num_tiles = get_compile_time_arg_val(2);
+    constexpr uint32_t page_size = get_compile_time_arg_val(3);
+    constexpr uint32_t noc = get_compile_time_arg_val(4);
 
     const uint32_t vc = get_arg_val<uint32_t>(0);
-    // First L1 writer core coordinates
-    const uint32_t noc_x1 = get_arg_val<uint32_t>(1);
-    const uint32_t noc_y1 = get_arg_val<uint32_t>(2);
-    // Second L1 writer core coordinates
-    const uint32_t noc_x2 = get_arg_val<uint32_t>(3);
-    const uint32_t noc_y2 = get_arg_val<uint32_t>(4);
+    const uint32_t noc_x = get_arg_val<uint32_t>(1);
+    const uint32_t noc_y = get_arg_val<uint32_t>(2);
 
     constexpr uint32_t cb_id = 0;
 
     uint32_t l1_write_addr = get_write_ptr(cb_id);
-    const uint64_t l1_noc_write_addr1 = get_noc_addr(noc_x1, noc_y1, l1_write_addr, noc);
-    const uint64_t l1_noc_write_addr2 = get_noc_addr(noc_x2, noc_y2, l1_write_addr, noc);
+    const uint64_t l1_noc_write_addr = get_noc_addr(noc_x, noc_y, l1_write_addr, noc);
 
-    for (uint32_t block = 0; block < num_blocks; ++block) { // Iterate over blocks
+    noc_async_write_one_packet_set_state(l1_noc_write_addr, page_size, noc, vc);
+
+    for (uint32_t block = 0; block < num_blocks; ++block) {
+
+        auto remote_l1_write_addr = l1_noc_write_addr;
 
         cb_wait_front(cb_id, block_num_tiles);
+        auto l1_read_addr = get_read_ptr(cb_id);
 
-        for (uint32_t core_id = 0; core_id < 2; ++core_id) { // Iterate over two neighboring cores
-            uint64_t l1_noc_write_addr_for_receiver_core = 0;
-            uint32_t l1_read_addr = get_read_ptr(cb_id);
-            if (core_id == 0) {
-                l1_noc_write_addr_for_receiver_core = l1_noc_write_addr1; // Set write pointer to start of cb for first core
-            } else {
-                l1_noc_write_addr_for_receiver_core = l1_noc_write_addr2; // Set write pointer to start of cb for second core
-                l1_read_addr += page_size * num_pages_w_per_receiver; // Stride read pointer to start of second core
-            }
-
-            noc_async_write_one_packet_set_state(l1_noc_write_addr_for_receiver_core, page_size, noc, vc); // Set state to write a page to noc/vc
-
-            for (uint32_t h = 0; h < num_tiles_h; ++h) { // Iterate over page rows per receiver core
-                for (uint32_t w = 0; w < num_pages_w_per_receiver; ++w) { // Iterate over page columns per receiver core
-                    noc_async_write_one_packet_with_state(l1_read_addr, l1_noc_write_addr_for_receiver_core, noc);
-                    l1_read_addr += page_size;
-                    l1_noc_write_addr_for_receiver_core += page_size;
-                }
-                l1_read_addr += page_size * num_pages_w_per_receiver; // Stride read pointer over other core's data
-            }
+        for (uint32_t h = 0; h < num_pages; ++h) {
+            noc_async_write_one_packet_with_state(l1_read_addr, remote_l1_write_addr, noc);
+            l1_read_addr += page_size;
+            remote_l1_write_addr += page_size;
+        }
 
         noc_async_write_barrier(noc);
 
         cb_pop_front(cb_id, block_num_tiles);
     }
 }
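After the revert, the kernel streams each block to a single receiver core: the one-packet write state is configured once, and per block the read pointer and the remote write pointer simply advance by page_size for num_pages pages. A toy Python model of that address striding follows; it tracks addresses only (circular-buffer flow control and NoC semantics are elided), and all names in it are illustrative:

    # Toy model of the reverted kernel's per-block pointer striding.
    # Assumes the CB read pointer maps to the same window every block.

    def write_sequence(read_base: int, write_base: int,
                       num_blocks: int, num_pages: int, page_size: int):
        """Yield (src, dst) pairs in the order the kernel issues packet writes."""
        for _ in range(num_blocks):
            src = read_base    # auto l1_read_addr = get_read_ptr(cb_id);
            dst = write_base   # auto remote_l1_write_addr = l1_noc_write_addr;
            for _ in range(num_pages):
                yield src, dst
                src += page_size   # l1_read_addr += page_size;
                dst += page_size   # remote_l1_write_addr += page_size;

    for src, dst in write_sequence(0x10000, 0x20000, num_blocks=2, num_pages=4, page_size=2048):
        print(hex(src), "->", hex(dst))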
