Skip to content

Commit

Permalink
#8482: Trace on remote chips
Browse files Browse the repository at this point in the history
  - Changes from @pgkeller:

     Fix host completion write ptr in test_prefetcher

     Was clobbering cmds.
     This enables more prefetcher test configs w/ exec_buf

     Rework split prefetch exec_buf sync mechanism

     prefetch_h now stalls itself at the start of exec_buf by taking away its
     downstream page credits.  After sending the exec_buf, it unstalls the
     prefetch_q so that it can continue to do prefetch work.  dispatch restores
     prefetch_h credits at exec_buf_end.

     Increase test_prefetcher exec_buf test coverage

 - cq_prefetch changes:
   - Modify detection of exec_buf command. Prefetcher no longer looks at the cmd_id,
     since exec_buf command is not the first command in the Fetch Q. Instead rely
     on stall flag.
   - prefetch_h should not offset exec_buf command by preamble size, since it belongs to
     the same entry as a previously offset command (due to barrier_and_stall)
   - Account for wrap in barrier and stall

 - Add T3K trace unit and stress tests.
  • Loading branch information
pgkeller authored and tt-asaigal committed Jun 15, 2024
1 parent d5b046b commit d44016b
Show file tree
Hide file tree
Showing 12 changed files with 295 additions and 210 deletions.
26 changes: 17 additions & 9 deletions tests/scripts/run_cpp_fd2_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,28 @@ run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 3 -i 5 -x" # PCIE Test
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 4 -i 5 -x" # Paged DRAM Read Test
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 5 -i 5 -x" # Paged DRAM Write + Read Test
#run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 6 -i 5 -x" # Host Test not supported w/ exec_buf
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 6 -i 5 -x" # Host Test

run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 1 -i 5 -x -spre" # Smoke Test
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 1 -i 5 -x -spre -sdis" # Smoke Test
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 2 -i 5 -x -spre -sdis" # Random Test
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 6 -i 5 -x -spre -sdis" # Host Test

if [[ $ARCH_NAME == "wormhole_b0" ]]; then
# packetized path used only on multi-chip WH
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 0 -i 5 -spre -sdis -packetized_en" # TrueSmoke Test with packetized path
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 1 -i 5 -spre -sdis -packetized_en" # Smoke Test with packetized path
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 2 -i 5 -spre -sdis -packetized_en" # Random Test with packetized path
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 6 -i 5 -spre -sdis -packetized_en" # Host Test with packetized path
fi
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 1 -i 1000 -x -rb -spre -sdis" # Smoke Test
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 2 -i 1000 -x -rb -spre -sdis" # Random Test

run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 0 -i 5 -spre -sdis -packetized_en" # TrueSmoke Test with packetized path
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 1 -i 5 -spre -sdis -packetized_en" # Smoke Test with packetized path
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 2 -i 5 -spre -sdis -packetized_en" # Random Test with packetized path
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 6 -i 5 -spre -sdis -packetized_en" # Host Test with packetized path
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 1 -i 1000 -rb -spre -sdis -packetized_en" # Smoke Test
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 2 -i 1000 -rb -spre -sdis -packetized_en" # Random Test

run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 0 -i 5 -x -spre -sdis -packetized_en" # TrueSmoke Test with packetized path+exec
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 1 -i 5 -x -spre -sdis -packetized_en" # Smoke Test with packetized path+exec
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 2 -i 5 -x -spre -sdis -packetized_en" # Random Test with packetized path+exec
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 6 -i 5 -x -spre -sdis -packetized_en" # Host Test with packetized path+exec
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 1 -i 1000 -x -rb -spre -sdis -packetized_en" # Smoke Test
run_test "./build/test/tt_metal/perf_microbenchmark/dispatch/test_prefetcher -t 2 -i 1000 -x -rb -spre -sdis -packetized_en" # Random Test


# Testcase: Paged Write Cmd to DRAM. 256 pages, 224b size.
Expand Down
17 changes: 17 additions & 0 deletions tests/scripts/t3000/run_t3000_frequent_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,20 @@ run_t3000_tteager_tests() {
echo "LOG_METAL: run_t3000_tteager_tests $duration seconds to complete"
}

run_t3000_trace_stress_tests() {
start_time=$(date +%s)

echo "LOG_METAL: Running run_t3000_trace_stress_tests"

NUM_TRACE_LOOPS=30 pytest tests/ttnn/unit_tests/test_multi_device_trace.py

# Record the end time
end_time=$(date +%s)
duration=$((end_time - start_time))
echo "LOG_METAL: run_t3000_trace_stress_tests $duration seconds to complete"
}


run_t3000_falcon40b_tests() {
# Record the start time
start_time=$(date +%s)
Expand All @@ -110,6 +124,9 @@ run_t3000_tests() {
# Run tteager tests
#run_t3000_tteager_tests

# Run trace tests
run_t3000_trace_stress_tests

# Run llama2-70b experimental tests
run_t3000_llama2_70b_experimental_tests

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,10 @@ int main(int argc, char **argv) {
0, // my_downstream_cb_sem_id
0, // downstream_cb_sem_id
0, // split_dispatch_page_preamble_size
false,// split_prefetcher
0, // prefetch noc_xy
0, // prefetch_local_downstream_sem_addr
0, // prefetch_downstream_buffer_pages
true, // is_dram_variant
true, // is_host_variant
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ constexpr uint32_t DRAM_DATA_ALIGNMENT = 32;

constexpr uint32_t PCIE_TRANSFER_SIZE_DEFAULT = 4096;

constexpr uint32_t dev_hugepage_base_g = 128; // HOST_CQ uses some at the start address
constexpr uint32_t dev_hugepage_base_g = 2 * (CQ_START * sizeof(uint32_t)); // HOST_CQ uses some at the start address

constexpr uint32_t host_data_dirty_pattern = 0xbaadf00d;

Expand Down Expand Up @@ -104,7 +104,6 @@ bool perf_test_g = false;
uint32_t max_xfer_size_bytes_g = dispatch_buffer_page_size_g;
uint32_t min_xfer_size_bytes_g = 4;
uint32_t l1_buf_base_g;
uint32_t prefetch_h_exec_buf_sem_addr_g;
uint32_t test_device_id_g = 0;

void init(int argc, char **argv) {
Expand Down Expand Up @@ -350,6 +349,7 @@ void add_prefetcher_cmd(vector<uint32_t>& cmds,

case CQ_PREFETCH_CMD_RELAY_INLINE:
case CQ_PREFETCH_CMD_RELAY_INLINE_NOFLUSH:
case CQ_PREFETCH_CMD_EXEC_BUF_END:
cmd.relay_inline.length = payload_length_bytes;
cmd.relay_inline.stride = round_cmd_size_up(payload_length_bytes + sizeof(CQPrefetchCmd));
break;
Expand Down Expand Up @@ -821,50 +821,32 @@ void gen_rnd_test(Device *device,

void gen_prefetcher_exec_buf_cmd_and_write_to_dram(Device *device,
vector<uint32_t>& prefetch_cmds,
vector<uint32_t> buf_cmds,
vector<uint32_t> exec_buf_cmds,
vector<uint32_t>& cmd_sizes) {

vector<uint32_t> empty_payload; // don't give me grief, it is just a test

CQPrefetchCmd cmd;
// Add the semaphore release for prefetch_h
CQDispatchCmd dcmd;
memset(&dcmd, 0, sizeof(CQDispatchCmd));

if (split_prefetcher_g) {
// Add the semaphore release for prefetch_h

CQDispatchCmd dcmd;
memset(&dcmd, 0, sizeof(CQDispatchCmd));

dcmd.base.cmd_id = CQ_DISPATCH_CMD_WRITE_LINEAR_H;
dcmd.write_linear.noc_xy_addr = NOC_XY_ENCODING(phys_prefetch_core_g.x, phys_prefetch_core_g.y);
dcmd.write_linear.addr = prefetch_h_exec_buf_sem_addr_g;
dcmd.write_linear.length = 16;

vector<uint32_t> dispatch_cmds;
vector<uint32_t> empty_sizes;
add_bare_dispatcher_cmd(dispatch_cmds, dcmd);
dispatch_cmds.push_back(1);
dispatch_cmds.push_back(0);
dispatch_cmds.push_back(0);
dispatch_cmds.push_back(0);

// Put the new commands at the front of the set of commands
// This tests that back-pressure stall in prefetch_d works
vector<uint32_t> tmp_cmds;
add_prefetcher_cmd(tmp_cmds, empty_sizes, CQ_PREFETCH_CMD_RELAY_INLINE, dispatch_cmds);
auto iter = buf_cmds.begin();
buf_cmds.insert(iter, tmp_cmds.begin(), tmp_cmds.end());
}
// cmddat_q in prefetch_d is re-used for exec_buf
// prefetch_h stalls at start of exec_buf by removing its downstream credits
// This command releases prefetch_h from the stall by restoring credits
dcmd.base.cmd_id = CQ_DISPATCH_CMD_EXEC_BUF_END;

vector<uint32_t> dispatch_cmds;
vector<uint32_t> empty_sizes; // unused for the exec_buf but call below needs it
add_bare_dispatcher_cmd(dispatch_cmds, dcmd);

// Add an end to the list of cmds to run from the buf
cmd.base.cmd_id = CQ_PREFETCH_CMD_EXEC_BUF_END;
add_bare_prefetcher_cmd(buf_cmds, cmd);
add_prefetcher_cmd(exec_buf_cmds, empty_sizes, CQ_PREFETCH_CMD_EXEC_BUF_END, dispatch_cmds);

// writes cmds to dram
num_dram_banks_g = device->num_banks(BufferType::DRAM);;

uint32_t page_size = 1 << exec_buf_log_page_size_g;

uint32_t length = buf_cmds.size() * sizeof(uint32_t);
uint32_t length = exec_buf_cmds.size() * sizeof(uint32_t);
length +=
(page_size - (length & (page_size - 1))) &
(page_size - 1); // rounded up to full pages
Expand All @@ -877,13 +859,14 @@ void gen_prefetcher_exec_buf_cmd_and_write_to_dram(Device *device,
auto dram_channel = device->dram_channel_from_bank_id(bank_id);
auto bank_core = device->dram_core_from_dram_channel(dram_channel);

tt::Cluster::instance().write_core(static_cast<const void*>(&buf_cmds[index / sizeof(uint32_t)]),
tt::Cluster::instance().write_core(static_cast<const void*>(&exec_buf_cmds[index / sizeof(uint32_t)]),
page_size, tt_cxy_pair(device->id(), bank_core), DRAM_EXEC_BUF_DEFAULT_BASE_ADDR + offset + (page_id / num_dram_banks_g) * page_size);

index += page_size;
}
tt::Cluster::instance().dram_barrier(device->id());

CQPrefetchCmd cmd;
cmd.base.cmd_id = CQ_PREFETCH_CMD_EXEC_BUF;
cmd.exec_buf.pad1 = 0;
cmd.exec_buf.pad2 = 0;
Expand Down Expand Up @@ -1246,9 +1229,9 @@ void write_prefetcher_cmds(uint32_t iterations,
if (!is_control_only && use_dram_exec_buf_g) {
// Write cmds to DRAM, generate a new command to execute those commands
cmd_sizes.resize(0);
vector<uint32_t> buf_cmds = prefetch_cmds;
vector<uint32_t> exec_buf_cmds = prefetch_cmds;
prefetch_cmds.resize(0);
gen_prefetcher_exec_buf_cmd_and_write_to_dram(device, prefetch_cmds, buf_cmds, cmd_sizes);
gen_prefetcher_exec_buf_cmd_and_write_to_dram(device, prefetch_cmds, exec_buf_cmds, cmd_sizes);
}

if (initialize_device_g) {
Expand Down Expand Up @@ -1421,20 +1404,15 @@ void configure_for_single_chip(Device *device,
tt_metal::CreateSemaphore(program, {prefetch_relay_demux_core}, 0); // unused
}
constexpr uint32_t prefetch_downstream_cb_sem = 1;
if (split_prefetcher_g) {
tt_metal::CreateSemaphore(program, {prefetch_core}, prefetch_d_buffer_pages);
if (packetized_path_en_g) {
// for the unpacketize stage, we use rptr/wptr for flow control, and poll semaphore
// value only to update the rptr:
tt_metal::CreateSemaphore(program, {prefetch_relay_demux_core}, 0);
}
} else {
tt_metal::CreateSemaphore(program, {prefetch_core}, dispatch_buffer_pages);
uint32_t prefetch_downstream_buffer_pages = split_prefetcher_g ? prefetch_d_buffer_pages : dispatch_buffer_pages;
tt_metal::CreateSemaphore(program, {prefetch_core}, prefetch_downstream_buffer_pages);
tt_metal::CreateSemaphore(program, {prefetch_core}, prefetch_d_buffer_pages);
if (packetized_path_en_g) {
// for the unpacketize stage, we use rptr/wptr for flow control, and poll semaphore
// value only to update the rptr:
tt_metal::CreateSemaphore(program, {prefetch_relay_demux_core}, 0);
}

uint32_t prefetch_h_exec_buf_sem = 2;
prefetch_h_exec_buf_sem_addr_g = tt_metal::CreateSemaphore(program, {prefetch_core}, 0);

constexpr uint32_t prefetch_d_upstream_cb_sem = 1;
constexpr uint32_t prefetch_d_downstream_cb_sem = 2;
if (packetized_path_en_g) {
Expand Down Expand Up @@ -1480,7 +1458,6 @@ void configure_for_single_chip(Device *device,
prefetch_downstream_cb_sem, // prefetch_d only
dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE,
dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only
prefetch_h_exec_buf_sem,
};

if (split_prefetcher_g) {
Expand Down Expand Up @@ -1698,14 +1675,18 @@ void configure_for_single_chip(Device *device,
split_prefetcher_g ? prefetch_d_downstream_cb_sem : prefetch_downstream_cb_sem, // overridden below for dispatch_h
dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS,
prefetch_sync_sem,
dev_hugepage_base_g,
0, // true base of hugepage
dev_hugepage_completion_buffer_base,
DEFAULT_HUGEPAGE_COMPLETION_BUFFER_SIZE,
dispatch_buffer_base,
(1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_buffer_pages,
0, // unused on hd, filled in below for h and d
0, // unused on hd, filled in below for h and d
0, // unused unless tunneler is between h and d
split_prefetcher_g,
NOC_XY_ENCODING(phys_prefetch_core_g.x, phys_prefetch_core_g.y),
prefetch_downstream_cb_sem,
prefetch_downstream_buffer_pages,
};

CoreCoord phys_upstream_from_dispatch_core = split_prefetcher_g ? phys_prefetch_d_core : phys_prefetch_core_g;
Expand Down Expand Up @@ -2007,20 +1988,15 @@ void configure_for_multi_chip(Device *device,
tt_metal::CreateSemaphore(program_r, {prefetch_relay_demux_core}, 0); // unused
}
constexpr uint32_t prefetch_downstream_cb_sem = 1;
if (split_prefetcher_g) {
tt_metal::CreateSemaphore(program, {prefetch_core}, prefetch_d_buffer_pages);
if (packetized_path_en_g) {
// for the unpacketize stage, we use rptr/wptr for flow control, and poll semaphore
// value only to update the rptr:
tt_metal::CreateSemaphore(program_r, {prefetch_relay_demux_core}, 0);
}
} else {
tt_metal::CreateSemaphore(program, {prefetch_core}, dispatch_buffer_pages);
uint32_t prefetch_downstream_buffer_pages = split_prefetcher_g ? prefetch_d_buffer_pages : dispatch_buffer_pages;
tt_metal::CreateSemaphore(program, {prefetch_core}, prefetch_downstream_buffer_pages);
tt_metal::CreateSemaphore(program, {prefetch_core}, prefetch_d_buffer_pages);
if (packetized_path_en_g) {
// for the unpacketize stage, we use rptr/wptr for flow control, and poll semaphore
// value only to update the rptr:
tt_metal::CreateSemaphore(program_r, {prefetch_relay_demux_core}, 0);
}

uint32_t prefetch_h_exec_buf_sem = 2;
prefetch_h_exec_buf_sem_addr_g = tt_metal::CreateSemaphore(program, {prefetch_core}, 0);

constexpr uint32_t prefetch_d_upstream_cb_sem = 1;
constexpr uint32_t prefetch_d_downstream_cb_sem = 2;
if (packetized_path_en_g) {
Expand Down Expand Up @@ -2066,7 +2042,6 @@ void configure_for_multi_chip(Device *device,
prefetch_downstream_cb_sem, // prefetch_d only
dispatch_constants::PREFETCH_D_BUFFER_LOG_PAGE_SIZE,
dispatch_constants::PREFETCH_D_BUFFER_BLOCKS, // prefetch_d only
prefetch_h_exec_buf_sem,
};

if (split_prefetcher_g) {
Expand Down Expand Up @@ -2369,14 +2344,18 @@ void configure_for_multi_chip(Device *device,
split_prefetcher_g ? prefetch_d_downstream_cb_sem : prefetch_downstream_cb_sem,
dispatch_constants::DISPATCH_BUFFER_SIZE_BLOCKS,
prefetch_sync_sem,
dev_hugepage_base_g,
0, // true base of hugepage
dev_hugepage_completion_buffer_base,
DEFAULT_HUGEPAGE_COMPLETION_BUFFER_SIZE,
dispatch_buffer_base,
(1 << dispatch_constants::DISPATCH_BUFFER_LOG_PAGE_SIZE) * dispatch_buffer_pages,
0, // unused on hd, filled in below for h and d
0, // unused on hd, filled in below for h and d
0, // unused unless tunneler is between h and d
split_prefetcher_g,
NOC_XY_ENCODING(phys_prefetch_core_g.x, phys_prefetch_core_g.y),
prefetch_downstream_cb_sem,
prefetch_downstream_buffer_pages,
};

CoreCoord phys_upstream_from_dispatch_core = split_prefetcher_g ? phys_prefetch_d_core : phys_prefetch_core_g;
Expand Down
Loading

0 comments on commit d44016b

Please sign in to comment.