Revert "#14512: Add DRAM pre-fetcher benchmark (#14528)" (#14749)
This reverts commit 6a9a66f.
johanna-rock-tt authored Nov 5, 2024
1 parent 797cf21 commit 56fbe4f
Showing 3 changed files with 93 additions and 223 deletions.
72 changes: 10 additions & 62 deletions tests/scripts/test_moreh_microbenchmark.py
@@ -684,7 +684,6 @@ def test_matmul_single_core_sharded(
     [
         ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 0, 12, 0),
         ("wormhole_b0", 1000, np.array([32768, 12 * 128]), 1, 8, 1, 12, 0),
-        ("wormhole_b0", 1000, np.array([2048, 3840]), 1, 4, 1, 12, 0),  # Padded FF1 shapes for llama 70b on TG
     ],
 )
 def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
@@ -723,78 +722,29 @@ def test_dram_read_12_core(arch, freq, test_vector, num_tests, nblock, data_form
 
 
 @pytest.mark.parametrize(
-    "arch, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id, bw_target",
+    "arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id",
     [
-        ("grayskull", np.array([32768 * 2, 8 * 128]), 1, 64, 2, 8, 0, None),
-        ("wormhole_b0", np.array([32768 * 2, 12 * 128]), 1, 64, 2, 12, 0, None),
-        ("blackhole", np.array([32768 * 8, 8 * 128]), 1, 256, 2, 8, 0, None),
-        # FF1/FF3 shapes for TG llama 70b
-        (
-            "wormhole_b0",
-            np.array([2048, 3840]),
-            1,
-            16,
-            0,
-            12,
-            0,
-            240,
-        ),  # 244 GB/s
-        # FF2 shapes for TG llama 70b
-        (
-            "wormhole_b0",
-            np.array([3584, 2304]),
-            1,
-            28,
-            1,
-            12,
-            0,
-            250,
-        ),  # 255 GB/s
-        # Dense Out shapes for TG llama 70b
-        (
-            "wormhole_b0",
-            np.array([1024, 2304]),
-            1,
-            8,
-            1,
-            12,
-            0,
-            220,
-        ),  # 226 GB/s
-        # QKV shapes for TG llama 70b
-        (
-            "wormhole_b0",
-            np.array([2048, 1536]),
-            1,
-            16,
-            1,
-            12,
-            0,
-            225,
-        ),  # 232 GB/s
+        ("grayskull", 1202, np.array([32768 * 2, 8 * 128]), 1, 64, 1, 8, 0),
+        ("wormhole_b0", 1000, np.array([32768 * 2, 12 * 128]), 1, 64, 1, 12, 0),
+        ("blackhole", 800, np.array([32768 * 8, 8 * 128]), 1, 256, 1, 8, 0),
     ],
 )
-def test_dram_read_l1_write_core(
-    arch, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id, bw_target
-):
-    dev_freq = get_device_freq()
+def test_dram_read_l1_write_core(arch, freq, test_vector, num_tests, nblock, data_format, num_banks, bank_start_id):
     data = []
     cycle_list = []
     time_list = []
     throughput_list = []
     for _ in range(num_tests):
         k = int(test_vector[0])
         n = int(test_vector[1])
-        if data_format == 0:  # BFP4
-            input_size = k * n * (512 + 64) // 1024
-        elif data_format == 1:  # BFP8
+        if data_format == 0:
            input_size = k * n * 1088 // 1024
-        elif data_format == 2:  # BFLOAT16
+        elif data_format == 1:
            input_size = k * n * 2048 // 1024
         run_dram_read_l1_write_cmd(k, n, nblock, data_format, num_banks, bank_start_id)
         cycle = profile_results_kernel_duration()
-        time = cycle / dev_freq / 1000.0 / 1000.0
-        throughput = input_size / cycle * dev_freq / 1000.0
+        time = cycle / freq / 1000.0 / 1000.0
+        throughput = input_size / cycle * freq / 1000.0
         cycle_list.append(cycle)
         time_list.append(time)
         throughput_list.append(throughput)
@@ -806,15 +756,13 @@ def test_dram_read_l1_write_core(
         logger.info("DRAM read throughput: " + str(throughput))
         data.append([throughput])
     # check within range
     dev_freq = get_device_freq()
     if arch == "grayskull":
         bw_bound = 100.0
     elif arch == "wormhole_b0":
         bw_bound = 260.0
     elif arch == "blackhole":
         bw_bound = 340.0
-    if bw_target is not None:
-        bw_bound = bw_target
     bw_bound = bw_bound * dev_freq / 1000.0  # Adjust for device frequency; target is based on max device frequency
     assert bw_bound <= throughput
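The restored test's sizing and bandwidth arithmetic can be checked in isolation. Below is a minimal sketch, assuming the test's conventions that input_size is in bytes and freq is in MHz (so bytes/cycle * MHz / 1000 gives GB/s); the 1088/1024 ratio corresponds to 17 bytes per 16 values, i.e. BFP8 with a shared exponent byte. The helper names and the example cycle count are illustrative, not part of the repository:

    # Standalone check of the restored input-size and bandwidth math.
    # Assumptions (not stated in the diff): input_size is in bytes, freq in MHz.

    def input_size_bytes(k: int, n: int, data_format: int) -> int:
        if data_format == 0:  # BFP8: 1088 bytes per 1024 values (17 bytes per 16 values)
            return k * n * 1088 // 1024
        if data_format == 1:  # BFLOAT16: 2 bytes per value
            return k * n * 2048 // 1024
        raise ValueError(f"unsupported data_format {data_format}")

    def throughput_gb_per_s(input_size: int, cycle: int, freq: int) -> float:
        # bytes/cycle * MHz = MB/s; divide by 1000 for GB/s
        return input_size / cycle * freq / 1000.0

    # Hypothetical numbers for the wormhole_b0 vector at 1000 MHz:
    size = input_size_bytes(32768 * 2, 12 * 128, data_format=0)
    print(throughput_gb_per_s(size, cycle=420_000, freq=1000))  # ~255 GB/s

The final bw_bound * dev_freq / 1000.0 step rescales the per-arch target, which is defined at the nominal 1000 MHz, down to whatever frequency the device actually reports.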


@@ -11,55 +11,40 @@
 
 void kernel_main() {
     constexpr uint32_t num_blocks = get_compile_time_arg_val(0);
-    constexpr uint32_t num_pages_w_per_receiver = get_compile_time_arg_val(1);
-    constexpr uint32_t num_tiles_h = get_compile_time_arg_val(2);
-    constexpr uint32_t block_num_tiles = get_compile_time_arg_val(3);
-    constexpr uint32_t page_size = get_compile_time_arg_val(4);
-    constexpr uint32_t noc = get_compile_time_arg_val(5);
+    constexpr uint32_t num_pages = get_compile_time_arg_val(1);
+    constexpr uint32_t block_num_tiles = get_compile_time_arg_val(2);
+    constexpr uint32_t page_size = get_compile_time_arg_val(3);
+    constexpr uint32_t noc = get_compile_time_arg_val(4);
 
     const uint32_t vc = get_arg_val<uint32_t>(0);
-    // First L1 writer core coordinates
-    const uint32_t noc_x1 = get_arg_val<uint32_t>(1);
-    const uint32_t noc_y1 = get_arg_val<uint32_t>(2);
-    // Second L1 writer core coordinates
-    const uint32_t noc_x2 = get_arg_val<uint32_t>(3);
-    const uint32_t noc_y2 = get_arg_val<uint32_t>(4);
+    const uint32_t noc_x = get_arg_val<uint32_t>(1);
+    const uint32_t noc_y = get_arg_val<uint32_t>(2);
 
     constexpr uint32_t cb_id = 0;
 
     uint32_t l1_write_addr = get_write_ptr(cb_id);
-    const uint64_t l1_noc_write_addr1 = get_noc_addr(noc_x1, noc_y1, l1_write_addr, noc);
-    const uint64_t l1_noc_write_addr2 = get_noc_addr(noc_x2, noc_y2, l1_write_addr, noc);
+    const uint64_t l1_noc_write_addr = get_noc_addr(noc_x, noc_y, l1_write_addr, noc);
 
-    for (uint32_t block = 0; block < num_blocks; ++block) { // Iterate over blocks
+    noc_async_write_one_packet_set_state(l1_noc_write_addr, page_size, noc, vc);
+
+    for (uint32_t block = 0; block < num_blocks; ++block) {
+
+        auto remote_l1_write_addr = l1_noc_write_addr;
 
         cb_wait_front(cb_id, block_num_tiles);
+        auto l1_read_addr = get_read_ptr(cb_id);
 
-        for (uint32_t core_id = 0; core_id < 2; ++core_id) { // Iterate over two neighboring cores
-            uint64_t l1_noc_write_addr_for_receiver_core = 0;
-            uint32_t l1_read_addr = get_read_ptr(cb_id);
-            if (core_id == 0) {
-                l1_noc_write_addr_for_receiver_core = l1_noc_write_addr1; // Set write pointer to start of cb for first core
-            } else {
-                l1_noc_write_addr_for_receiver_core = l1_noc_write_addr2; // Set write pointer to start of cb for second core
-                l1_read_addr += page_size * num_pages_w_per_receiver; // Stride read pointer to start of second core
-            }
-
-            noc_async_write_one_packet_set_state(l1_noc_write_addr_for_receiver_core, page_size, noc, vc); // Set state to write a page to noc/vc
-
-            for (uint32_t h = 0; h < num_tiles_h; ++h) { // Iterate over page rows per receiver core
-                for (uint32_t w = 0; w < num_pages_w_per_receiver; ++w) { // Iterate over page columns per receiver core
-                    noc_async_write_one_packet_with_state(l1_read_addr, l1_noc_write_addr_for_receiver_core, noc);
-                    l1_read_addr += page_size;
-                    l1_noc_write_addr_for_receiver_core += page_size;
-                }
-                l1_read_addr += page_size * num_pages_w_per_receiver; // Stride read pointer over other core's data
-            }
+        for (uint32_t h = 0; h < num_pages; ++h) {
+            noc_async_write_one_packet_with_state(l1_read_addr, remote_l1_write_addr, noc);
+            l1_read_addr += page_size;
+            remote_l1_write_addr += page_size;
+        }
 
         noc_async_write_barrier(noc);
 
         cb_pop_front(cb_id, block_num_tiles);
     }
 }
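After the revert, the kernel streams each block to a single receiver core: the one-packet write state is configured once, and per block the read pointer and the remote write pointer simply advance by page_size for num_pages pages. A toy Python model of that address striding follows; it tracks addresses only (circular-buffer flow control and NoC semantics are elided), and all names in it are illustrative:

    # Toy model of the reverted kernel's per-block pointer striding.
    # Assumes the CB read pointer maps to the same window every block.

    def write_sequence(read_base: int, write_base: int,
                       num_blocks: int, num_pages: int, page_size: int):
        """Yield (src, dst) pairs in the order the kernel issues packet writes."""
        for _ in range(num_blocks):
            src = read_base    # auto l1_read_addr = get_read_ptr(cb_id);
            dst = write_base   # auto remote_l1_write_addr = l1_noc_write_addr;
            for _ in range(num_pages):
                yield src, dst
                src += page_size   # l1_read_addr += page_size;
                dst += page_size   # remote_l1_write_addr += page_size;

    for src, dst in write_sequence(0x10000, 0x20000, num_blocks=2, num_pages=4, page_size=2048):
        print(hex(src), "->", hex(dst))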
