Merge branch 'main' into mistral-wh
mtairum authored Mar 23, 2024
2 parents 5a5978b + 9f84253 commit ce5f8f8
Showing 10 changed files with 100 additions and 50 deletions.
(changed file; path not shown)
@@ -14,7 +14,7 @@ jobs:
fail-fast: false
matrix:
arch: [grayskull, wormhole_b0]
frequent-type: [models, api]
frequent-type: [api]
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.arch }}
2 changes: 1 addition & 1 deletion .github/workflows/full-regressions-and-models.yaml
@@ -14,7 +14,7 @@ jobs:
fail-fast: false
matrix:
arch: [grayskull, wormhole_b0]
frequent-type: [models, api]
frequent-type: [api]
env:
TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
ARCH_NAME: ${{ matrix.arch }}
2 changes: 1 addition & 1 deletion README.md
@@ -9,7 +9,7 @@

<h3>

[TT-Metalium Documentation](https://tenstorrent-metal.github.io/tt-metal/latest/tt-metalium) | [TT-NN Documentation](https://tenstorrent-metal.github.io/tt-metal/latest/ttnn) | [Demo models](./models/demos/) | [Discord](https://discord.gg/tvhGzHQwaj) | [Tenstorrent website](https://tenstorrent.com) | [Buy an E150 Grayskull card](https://tenstorrent.com/cards/)
[TT-Metalium API Reference](https://tenstorrent-metal.github.io/tt-metal/latest/tt-metalium) | [TT-NN API Reference](https://tenstorrent-metal.github.io/tt-metal/latest/ttnn) | [Demo models](./models/demos/) | [Discord](https://discord.gg/tvhGzHQwaj) | [Tenstorrent website](https://tenstorrent.com) | [Buy an E150 Grayskull card](https://tenstorrent.com/cards/)

</h3>

12 changes: 12 additions & 0 deletions tests/scripts/run_tests.sh
@@ -102,6 +102,18 @@ run_frequent_api_pipeline_tests() {

source build/python_env/bin/activate
export PYTHONPATH=$TT_METAL_HOME

if [[ $dispatch_mode == "slow" ]]; then
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests_frequent
echo "Running Python API unit tests in SD for frequent..."
./tests/scripts/run_python_api_unit_tests.sh
else
if [[ $tt_arch == "wormhole_b0" ]]; then
pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly
else
echo "API tests are not available for fast dispatch because they're already covered in post-commit"
fi
fi
}

# Run frequent multi device pipeline tests - these are the t3000 + 4xn300 tests
(next changed file; path not shown)
@@ -17,7 +17,8 @@ void kernel_main() {
constexpr uint32_t eth_receiver_noc_y = get_compile_time_arg_val(6);
constexpr uint32_t eth_receiver_l1_semaphore_addr = get_compile_time_arg_val(7);
constexpr uint32_t receiver_read_sem_addr = get_compile_time_arg_val(8);
constexpr uint32_t ID = get_compile_time_arg_val(9);
constexpr uint32_t half_cb_n_pages = get_compile_time_arg_val(9);
static_assert (half_cb_n_pages > rem_num_pages, "half_cb_n_pages must be greater than rem_num_pages");

const uint32_t eth_receiver_l1_base_addr = get_arg_val<uint32_t>(0);

@@ -52,6 +52,9 @@ void kernel_main() {
noc_semaphore_set(receiver_read_semaphore_addr_ptr, 0);
fetch_chunk(cb_id_in0, rem_num_pages, page_size, eth_receiver_l1_base_noc_addr);
noc_semaphore_inc(eth_receiver_l1_semaphore_noc_addr, 1);
ASSERT(num_pages == 0 || num_pages > rem_num_pages);
ASSERT(half_cb_n_pages > rem_num_pages);
push_filler_pages_to_cb(cb_id_in0, half_cb_n_pages - rem_num_pages);
transfers_completed++;
}
}
(next changed file; path not shown)
@@ -37,6 +37,8 @@ void kernel_main() {
// Same per worker receiver writer
constexpr uint32_t sem_addr = get_compile_time_arg_val(20);
constexpr bool is_clockwise_direction = get_compile_time_arg_val(21) == 1;
constexpr uint32_t half_cb_n_pages = get_compile_time_arg_val(22);
static_assert(half_cb_n_pages > rem_num_pages, "half_cb_n_pages must be greater than or equal to rem_num_pages");

constexpr uint32_t cb_id_in0 = tt::CB::c_in0;
#ifdef RM_INTERLEAVED
@@ -76,6 +78,9 @@ void kernel_main() {
// DPRINT << "rws WRITE PARTIAL CHUNK " << i << "\n";
write_chunk(output_page_idx, col_idx, row_idx, cb_id_in0, d, num_cols, num_rows, col_offset, row_offset, rem_num_pages, page_size);
noc_semaphore_inc(worker_send_reader_semaphore_noc_addr, 1);
ASSERT(num_pages == 0 || num_pages > rem_num_pages);
ASSERT(half_cb_n_pages > rem_num_pages);
pop_filler_pages_from_cb(cb_id_in0, half_cb_n_pages - rem_num_pages);
}

if (is_clockwise_direction) {
(next changed file; path not shown)
@@ -34,7 +34,8 @@ void kernel_main() {
constexpr uint32_t input_start_ring_idx = get_compile_time_arg_val(21);
constexpr uint32_t sem_addr = get_compile_time_arg_val(22);
constexpr bool is_clockwise_direction = get_compile_time_arg_val(23) == 1;
constexpr uint32_t ID = get_compile_time_arg_val(24);
constexpr uint32_t half_cb_n_pages = get_compile_time_arg_val(24);
static_assert(half_cb_n_pages > rem_num_pages, "half_cb_n_pages must be greater than or equal to rem_num_pages");

constexpr uint32_t cb_id_in0 = tt::CB::c_in0;

@@ -74,6 +75,9 @@ void kernel_main() {
}
if constexpr(rem_num_pages > 0) {
read_chunk_from_input_tensor(input_page_idx, cb_id_in0, s, rem_num_pages, page_size);
ASSERT(num_pages == 0 || num_pages > rem_num_pages);
ASSERT(half_cb_n_pages > rem_num_pages);
push_filler_pages_to_cb(cb_id_in0, half_cb_n_pages - rem_num_pages);
}

uint32_t sem_idx = 1;
@@ -133,6 +137,9 @@ void kernel_main() {
noc_semaphore_wait_min(sender_semaphore_addr_ptr, sem_idx);
sem_idx++;
read_chunk_from_output_tensor(output_page_idx, col_idx, row_idx, cb_id_in0, d, num_cols, num_rows, col_offset, row_offset, rem_num_pages, page_size);
ASSERT(num_pages == 0 || num_pages > rem_num_pages);
ASSERT(half_cb_n_pages > rem_num_pages);
push_filler_pages_to_cb(cb_id_in0, half_cb_n_pages - rem_num_pages);
}
}

(next changed file; path not shown)
@@ -32,6 +32,8 @@ void kernel_main() {
constexpr uint32_t writer_send_sem_addr = get_compile_time_arg_val(17);
constexpr uint32_t eth_sender_noc_x = get_compile_time_arg_val(18);
constexpr uint32_t eth_sender_noc_y = get_compile_time_arg_val(19);
constexpr uint32_t half_cb_n_pages = get_compile_time_arg_val(20);
static_assert(half_cb_n_pages > rem_num_pages, "half_cb_n_pages must be greater than or equal to rem_num_pages");

constexpr uint32_t cb_id_in0 = tt::CB::c_in0;
#ifdef RM_INTERLEAVED
@@ -65,16 +67,17 @@ void kernel_main() {
noc_semaphore_wait(writer_send_semaphore_addr_ptr, 1);
noc_semaphore_set(writer_send_semaphore_addr_ptr, 0);
// TODO: Might be better to split this?
write_and_send_chunk(output_page_idx, col_idx, row_idx, cb_id_in0, d, num_cols, num_rows, col_offset, row_offset, num_pages, page_size, eth_l1_sender_base_noc_addr);
noc_semaphore_inc(eth_l1_sender_semaphore_addr, 1);
write_and_send_chunk(output_page_idx, col_idx, row_idx, cb_id_in0, d, num_cols, num_rows, col_offset, row_offset, num_pages, page_size, eth_l1_sender_base_noc_addr, eth_l1_sender_semaphore_addr);
}
}

if constexpr(rem_num_pages > 0) {
noc_semaphore_wait(writer_send_semaphore_addr_ptr, 1);
noc_semaphore_set(writer_send_semaphore_addr_ptr, 0);
write_and_send_chunk(output_page_idx, col_idx, row_idx, cb_id_in0, d, num_cols, num_rows, col_offset, row_offset, rem_num_pages, page_size, eth_l1_sender_base_noc_addr);
noc_semaphore_inc(eth_l1_sender_semaphore_addr, 1);
write_and_send_chunk(output_page_idx, col_idx, row_idx, cb_id_in0, d, num_cols, num_rows, col_offset, row_offset, rem_num_pages, page_size, eth_l1_sender_base_noc_addr, eth_l1_sender_semaphore_addr);
ASSERT(num_pages == 0 || num_pages > rem_num_pages);
ASSERT(half_cb_n_pages > rem_num_pages);
pop_filler_pages_from_cb(cb_id_in0, half_cb_n_pages - rem_num_pages);
}

// num_transfers = num_devices - 1
@@ -92,6 +95,9 @@ void kernel_main() {
noc_semaphore_set(writer_send_semaphore_addr_ptr, 0);
send_chunk(cb_id_in0, rem_num_pages, page_size, eth_l1_sender_base_noc_addr);
noc_semaphore_inc(eth_l1_sender_semaphore_addr, 1);
ASSERT(num_pages == 0 || num_pages > rem_num_pages);
ASSERT(half_cb_n_pages > rem_num_pages);
pop_filler_pages_from_cb(cb_id_in0, half_cb_n_pages - rem_num_pages);
}
}

(next changed file; path not shown)
@@ -158,16 +158,25 @@ struct ShardAddrGen final {
bool completed_core_wrap;
};

FORCE_INLINE void push_filler_pages_to_cb(const uint32_t& cb_id, uint32_t num_pages) {
ASSERT(num_pages < cb_interface[cb_id].fifo_num_pages);
cb_reserve_back(cb_id, num_pages);
cb_push_back(cb_id, num_pages);
}
FORCE_INLINE void pop_filler_pages_from_cb(const uint32_t& cb_id, uint32_t num_pages) {
ASSERT(num_pages < cb_interface[cb_id].fifo_num_pages);
cb_wait_front(cb_id, num_pages);
cb_pop_front(cb_id, num_pages);
}


FORCE_INLINE void fetch_chunk(
const uint32_t& cb_id, const uint32_t& num_pages, const uint32_t& page_size, uint64_t remote_l1_read_addr) {
for (uint32_t i = 0; i < num_pages; ++i) {
cb_reserve_back(cb_id, 1);
uint32_t l1_write_addr = get_write_ptr(cb_id);
noc_async_read(remote_l1_read_addr, l1_write_addr, page_size);
remote_l1_read_addr += page_size;
noc_async_read_barrier();
cb_push_back(cb_id, 1);
}
cb_reserve_back(cb_id, num_pages);
uint32_t l1_write_addr = get_write_ptr(cb_id);
noc_async_read(remote_l1_read_addr, l1_write_addr, page_size * num_pages);
noc_async_read_barrier();
cb_push_back(cb_id, num_pages);
}
FORCE_INLINE void fetch_chunk_sharded(
const uint32_t& cb_id, const uint32_t& num_pages, const uint32_t& page_size, uint64_t remote_l1_read_addr) {
@@ -181,14 +190,11 @@ FORCE_INLINE void fetch_chunk_sharded(

FORCE_INLINE void send_chunk(
const uint32_t& cb_id, const uint32_t& num_pages, const uint32_t& page_size, uint64_t remote_l1_write_addr) {
for (uint32_t i = 0; i < num_pages; ++i) {
cb_wait_front(cb_id, 1);
uint32_t l1_read_addr = get_read_ptr(cb_id);
noc_async_write(l1_read_addr, remote_l1_write_addr, page_size);
remote_l1_write_addr += page_size;
noc_async_write_barrier();
cb_pop_front(cb_id, 1);
}
cb_wait_front(cb_id, num_pages);
uint32_t l1_read_addr = get_read_ptr(cb_id);
noc_async_write(l1_read_addr, remote_l1_write_addr, page_size * num_pages);
noc_async_write_barrier();
cb_pop_front(cb_id, num_pages);
}
FORCE_INLINE void send_chunk_sharded(
const uint32_t& cb_id, const uint32_t& num_pages, const uint32_t& page_size, uint64_t remote_l1_write_addr) {
@@ -213,12 +219,13 @@ FORCE_INLINE void write_and_send_chunk_sharded(
cb_pop_front(cb_id, num_pages);
}
template <typename AddrGen>
FORCE_INLINE void write_and_send_chunk(uint32_t& output_page_idx, uint32_t& col_idx, uint32_t& row_idx, const uint32_t& cb_id, const AddrGen& d, const uint32_t num_cols, const uint32_t num_rows, const uint32_t& col_offset, const uint32_t& row_offset, const uint32_t& num_pages, const uint32_t& page_size, uint64_t remote_l1_write_addr) {
FORCE_INLINE void write_and_send_chunk(uint32_t& output_page_idx, uint32_t& col_idx, uint32_t& row_idx, const uint32_t& cb_id, const AddrGen& d, const uint32_t num_cols, const uint32_t num_rows, const uint32_t& col_offset, const uint32_t& row_offset, const uint32_t& num_pages, const uint32_t& page_size, uint64_t remote_l1_write_addr, uint64_t eth_l1_sender_semaphore_addr) {
cb_wait_front(cb_id, num_pages);
uint32_t l1_read_addr = get_read_ptr(cb_id);
noc_async_write(l1_read_addr, remote_l1_write_addr, page_size * num_pages);
noc_semaphore_inc(eth_l1_sender_semaphore_addr, 1);
// TODO: do eth semaphore inc here
for (uint32_t i = 0; i < num_pages; ++i) {
cb_wait_front(cb_id, 1);
uint32_t l1_read_addr = get_read_ptr(cb_id);
noc_async_write(l1_read_addr, remote_l1_write_addr, page_size);
remote_l1_write_addr += page_size;
#ifdef RM_INTERLEAVED
uint64_t dst_noc_addr = get_noc_addr(output_page_idx, d);
noc_async_write(l1_read_addr, dst_noc_addr, page_size);
@@ -242,9 +249,10 @@ FORCE_INLINE void write_and_send_chunk(uint32_t& output_page_idx, uint32_t& col_
}
}
#endif
noc_async_write_barrier();
cb_pop_front(cb_id, 1);
l1_read_addr += page_size;
}
noc_async_write_barrier();
cb_pop_front(cb_id, num_pages);
}

template <ShardType T>
@@ -264,9 +272,9 @@ FORCE_INLINE void write_chunk_sharded(const uint32_t& cb_id, ShardAddrGen<T>& ad
}
template <typename AddrGen>
FORCE_INLINE void write_chunk(uint32_t& output_page_idx, uint32_t& col_idx, uint32_t& row_idx, const uint32_t& cb_id, const AddrGen& d, const uint32_t& num_cols, const uint32_t& num_rows, const uint32_t& col_offset, const uint32_t& row_offset, const uint32_t& num_pages, const uint32_t& page_size) {
cb_wait_front(cb_id, num_pages);
uint32_t l1_read_addr = get_read_ptr(cb_id);
for (uint32_t i = 0; i < num_pages; ++i) {
cb_wait_front(cb_id, 1);
uint32_t l1_read_addr = get_read_ptr(cb_id);
#ifdef RM_INTERLEAVED
uint64_t dst_noc_addr = get_noc_addr(output_page_idx, d);
noc_async_write(l1_read_addr, dst_noc_addr, page_size);
@@ -290,9 +298,10 @@ FORCE_INLINE void write_chunk(uint32_t& output_page_idx, uint32_t& col_idx, uint
}
}
#endif
noc_async_write_barrier();
cb_pop_front(cb_id, 1);
l1_read_addr += page_size;
}
noc_async_write_barrier();
cb_pop_front(cb_id, num_pages);
}

template <ShardType T>
@@ -310,18 +319,19 @@ FORCE_INLINE void read_chunk_from_input_tensor_sharded(
template <typename AddrGen>
FORCE_INLINE void read_chunk_from_input_tensor(uint32_t& input_page_idx, const uint32_t& cb_id, const AddrGen& s, const uint32_t& num_pages, const uint32_t& page_size) {
const uint32_t end_read_idx = input_page_idx + num_pages;
cb_reserve_back(cb_id, num_pages);
uint32_t local_l1_read_addr = get_write_ptr(cb_id);
for (; input_page_idx < end_read_idx; ++input_page_idx) {
cb_reserve_back(cb_id, 1);
uint32_t local_l1_read_addr = get_write_ptr(cb_id);
#ifdef RM_INTERLEAVED
uint64_t src_noc_addr = get_noc_addr(input_page_idx, s);
noc_async_read(src_noc_addr, local_l1_read_addr, page_size);
#elif defined TILE_INTERLEAVED
noc_async_read_tile(input_page_idx, s, local_l1_read_addr);
#endif
noc_async_read_barrier();
cb_push_back(cb_id, 1);
local_l1_read_addr += page_size;
}
noc_async_read_barrier();
cb_push_back(cb_id, num_pages);
}

// Same function - just different address generators? Commonize later
@@ -339,9 +349,9 @@ FORCE_INLINE void read_chunk_from_output_tensor_sharded(
// read chunk from output tensor (local chip)
template <typename AddrGen>
FORCE_INLINE void read_chunk_from_output_tensor(uint32_t& input_page_idx, uint32_t& col_idx, uint32_t& row_idx, const uint32_t& cb_id, const AddrGen& s, const uint32_t& num_cols, const uint32_t& num_rows, const uint32_t& col_offset, const uint32_t& row_offset, const uint32_t& num_pages, const uint32_t& page_size) {
cb_reserve_back(cb_id, num_pages);
uint32_t local_l1_read_addr = get_write_ptr(cb_id);
for (uint32_t i = 0; i < num_pages; ++i) {
cb_reserve_back(cb_id, 1);
uint32_t local_l1_read_addr = get_write_ptr(cb_id);
#ifdef RM_INTERLEAVED
uint64_t src_noc_addr = get_noc_addr(input_page_idx, s);
noc_async_read(src_noc_addr, local_l1_read_addr, page_size);
@@ -365,7 +375,8 @@ FORCE_INLINE void read_chunk_from_output_tensor(uint32_t& input_page_idx, uint32
}
}
#endif
noc_async_read_barrier();
cb_push_back(cb_id, 1);
local_l1_read_addr += page_size;
}
noc_async_read_barrier();
cb_push_back(cb_id, num_pages);
}
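
The change running through all of the kernel diffs above is the same: when the remainder transfer (rem_num_pages) is smaller than half the circular buffer (half_cb_n_pages), the kernels now pad with filler pages via the new push_filler_pages_to_cb and pop_filler_pages_from_cb helpers so that producer and consumer advance the CB by the same page count. Below is a minimal, host-side C++ sketch of that invariant only; it is not part of the diff, the CbModel type and the page counts are invented for illustration, and the real kernels do this bookkeeping with the cb_reserve_back/cb_push_back and cb_wait_front/cb_pop_front calls shown in the helpers above.

#include <cassert>
#include <cstdint>
#include <iostream>

// Hypothetical stand-in for the device-side circular buffer: we only track
// how many pages have been pushed and popped, which is the property the
// filler pages exist to protect.
struct CbModel {
    uint32_t pushed = 0;
    uint32_t popped = 0;
};

// Mirrors push_filler_pages_to_cb: push pages that carry no data.
void push_filler_pages(CbModel& cb, uint32_t num_pages) { cb.pushed += num_pages; }
// Mirrors pop_filler_pages_from_cb: pop pages without reading them.
void pop_filler_pages(CbModel& cb, uint32_t num_pages) { cb.popped += num_pages; }

int main() {
    constexpr uint32_t half_cb_n_pages = 8;  // assumed CB half size in pages
    constexpr uint32_t rem_num_pages = 3;    // assumed remainder chunk, smaller than a half
    static_assert(half_cb_n_pages > rem_num_pages, "remainder must fit in half the CB");

    CbModel cb;

    // Producer side: the real remainder pages, then filler up to the half boundary.
    cb.pushed += rem_num_pages;
    push_filler_pages(cb, half_cb_n_pages - rem_num_pages);

    // Consumer side: the real remainder pages, then the matching filler pop.
    cb.popped += rem_num_pages;
    pop_filler_pages(cb, half_cb_n_pages - rem_num_pages);

    // Both ends advanced by exactly one half of the CB, so subsequent full-size
    // chunks stay aligned to the half_cb_n_pages granularity.
    assert(cb.pushed == cb.popped && cb.pushed == half_cb_n_pages);
    std::cout << "pushed=" << cb.pushed << " popped=" << cb.popped << "\n";
    return 0;
}

The ASSERT(num_pages == 0 || num_pages > rem_num_pages) and ASSERT(half_cb_n_pages > rem_num_pages) checks added next to each push/pop in the diff guard this same relationship at run time on the device.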