Skip to content

Commit

Permalink
#11838: Update
Browse files Browse the repository at this point in the history
  • Loading branch information
VirdhatchaniKN committed Sep 3, 2024
1 parent 61856c8 commit 1000312
Show file tree
Hide file tree
Showing 40 changed files with 62 additions and 74 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -423,11 +423,11 @@ int main(int argc, char** argv) {
if (fp32_dest_acc_en and (out_subblock_h * out_subblock_w > 4)) {
if (out_subblock_w >= 4) {
out_subblock_h = 1;
out_subblock_w = find_max_block_size(out_subblock_w, 4);
out_subblock_w = tt::tt_metal::find_max_block_size(out_subblock_w, 4);
} else {
while (out_subblock_h * out_subblock_w > 4) {
uint32_t div = find_max_divisor(out_subblock_h, out_subblock_h-1);
out_subblock_h = find_max_block_size(out_subblock_h, div);
uint32_t div = tt::tt_metal::find_max_divisor(out_subblock_h, out_subblock_h-1);
out_subblock_h = tt::tt_metal::find_max_block_size(out_subblock_h, div);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ inline std::tuple<uint32_t, uint32_t> get_max_cores_divisible_by_tiles_per_core_
}

// Finds the maximum even divisor of val starting at start_max_div and below
inline int find_max_divisor(uint32_t val, uint32_t start_max_div) {
inline int find_max_divisor(uint32_t val, uint32_t start_max_div) {
int result = 1;
for (int find_divisor = start_max_div; find_divisor >= 1; find_divisor--) {
if (find_divisor == 7 || find_divisor == 5)
Expand All @@ -51,7 +51,7 @@ inline int find_max_divisor(uint32_t val, uint32_t start_max_div) {
return result;
}

inline std::set<CoreRange> num_cores_to_corerange_set(
inline std::set<CoreRange> num_cores_to_corerange_set(
uint32_t target_num_cores, CoreCoord grid_size, bool row_wise = false) {
uint32_t num_cores_x = grid_size.x;
uint32_t num_cores_y = grid_size.y;
Expand Down Expand Up @@ -102,7 +102,7 @@ inline std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t,
CoreCoord grid_size, uint32_t units_to_divide) {
uint32_t num_cores_x = grid_size.x, num_cores_y = grid_size.y;
auto target_num_cores = std::min(units_to_divide, num_cores_x * num_cores_y);
CoreRangeSet all_cores( num_cores_to_corerange_set(target_num_cores, grid_size));
CoreRangeSet all_cores(num_cores_to_corerange_set(target_num_cores, grid_size));

std::set<CoreRange> core_group_1_set;
std::set<CoreRange> core_group_2_set;
Expand Down
24 changes: 6 additions & 18 deletions tests/ttnn/unit_tests/operations/test_paged_update_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,7 @@ def run_test_update_cache_decode(
# Input is sharded
compute_grid_size = device.compute_with_storage_grid_size()
num_cores = num_users
shard_grid = ttnn.CoreRangeSet(
ttnn.experimental.tensor.num_cores_to_corerange_set(num_cores, compute_grid_size, True)
)
shard_grid = ttnn.CoreRangeSet(ttnn.num_cores_to_corerange_set(num_cores, compute_grid_size, True))
input_shard_spec = ttnn.ShardSpec(
shard_grid,
[
Expand Down Expand Up @@ -108,9 +106,7 @@ def test_update_cache_decode(
# Input is sharded
compute_grid_size = device.compute_with_storage_grid_size()
num_cores = num_users
shard_grid = ttnn.CoreRangeSet(
ttnn.experimental.tensor.num_cores_to_corerange_set(num_cores, compute_grid_size, True)
)
shard_grid = ttnn.CoreRangeSet(ttnn.num_cores_to_corerange_set(num_cores, compute_grid_size, True))
input_shard_spec = ttnn.ShardSpec(
shard_grid,
[
Expand Down Expand Up @@ -183,9 +179,7 @@ def test_update_cache_decode_program_cache(
# Input is sharded
compute_grid_size = device.compute_with_storage_grid_size()
num_cores = num_users
shard_grid = ttnn.CoreRangeSet(
ttnn.experimental.tensor.num_cores_to_corerange_set(num_cores, compute_grid_size, True)
)
shard_grid = ttnn.CoreRangeSet(ttnn.num_cores_to_corerange_set(num_cores, compute_grid_size, True))
input_shard_spec = ttnn.ShardSpec(
shard_grid,
[
Expand Down Expand Up @@ -227,9 +221,7 @@ def run_test_tensor_index_update_cache_decode(
# Input is sharded
compute_grid_size = device.compute_with_storage_grid_size()
num_cores = num_users
shard_grid = ttnn.CoreRangeSet(
ttnn.experimental.tensor.num_cores_to_corerange_set(num_cores, compute_grid_size, True)
)
shard_grid = ttnn.CoreRangeSet(ttnn.num_cores_to_corerange_set(num_cores, compute_grid_size, True))
input_shard_spec = ttnn.ShardSpec(
shard_grid,
[
Expand Down Expand Up @@ -367,9 +359,7 @@ def run_test_paged_update_cache_decode(
# Input is sharded
compute_grid_size = device.compute_with_storage_grid_size()
num_cores = num_users
shard_grid = ttnn.CoreRangeSet(
ttnn.experimental.tensor.num_cores_to_corerange_set(num_cores, compute_grid_size, True)
)
shard_grid = ttnn.CoreRangeSet(ttnn.num_cores_to_corerange_set(num_cores, compute_grid_size, True))
input_shard_spec = ttnn.ShardSpec(
shard_grid,
[
Expand Down Expand Up @@ -497,9 +487,7 @@ def test_paged_update_cache_decode_program_caching(
# Input is sharded
compute_grid_size = device.compute_with_storage_grid_size()
num_cores = num_users
shard_grid = ttnn.CoreRangeSet(
ttnn.experimental.tensor.num_cores_to_corerange_set(num_cores, compute_grid_size, True)
)
shard_grid = ttnn.CoreRangeSet(ttnn.num_cores_to_corerange_set(num_cores, compute_grid_size, True))
input_shard_spec = ttnn.ShardSpec(
shard_grid,
[
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ operation::ProgramWithCallbacks moreh_adam_(
const auto num_cores_y = grid.y;

auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] =
tt::tt_metal::split_work_to_cores(grid, num_tiles);
tt_metal::split_work_to_cores(grid, num_tiles);

auto arch = param_in.device()->arch();
auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc] =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ operation::ProgramWithCallbacks moreh_clip_grad_norm_step1_impl(
core_group_1,
core_group_2,
num_inputs_per_core_group_1,
num_inputs_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_inputs);
num_inputs_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_inputs);
TT_ASSERT(core_group_2.ranges().empty());
TT_ASSERT(num_inputs_per_core_group_1 == 1);
TT_ASSERT(num_inputs_per_core_group_2 == 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ operation::ProgramWithCallbacks moreh_clip_grad_norm_step3_impl(
core_group_1,
core_group_2,
num_inputs_per_core_group_1,
num_inputs_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_inputs);
num_inputs_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_inputs);
TT_ASSERT(core_group_2.ranges().empty());
TT_ASSERT(num_inputs_per_core_group_1 == 1);
TT_ASSERT(num_inputs_per_core_group_2 == 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ operation::ProgramWithCallbacks moreh_cumsum_nc(
core_group_1,
core_group_2,
num_cols_per_core_group_1,
num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_tiles_per_chip);
num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_tiles_per_chip);

////////////////////////////////////////////////////////////////////////////
// CircularBuffer Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ operation::ProgramWithCallbacks moreh_groupnorm_impl(
core_group_1,
core_group_2,
num_rows_per_core_group_1,
num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_rows);
num_rows_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_rows);

log_debug(LogTest, fmt::format("num_cores_to_be_used: {}", num_cores_to_be_used).c_str());
log_debug(LogTest, fmt::format("num_rows_per_core_group_1: {}", num_rows_per_core_group_1).c_str());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ operation::ProgramWithOptionalOutputTensors moreh_groupnorm_backward_gamma_beta_
core_group_1,
core_group_2,
num_channels_per_core_group_1,
num_channels_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_channels);
num_channels_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_channels);

log_debug(LogTest, fmt::format("num_cores_to_be_used: {}", num_cores_to_be_used).c_str());
log_debug(LogTest, fmt::format("num_channels_per_core_group_1: {}", num_channels_per_core_group_1).c_str());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ operation::ProgramWithCallbacks moreh_groupnorm_backward_input_grad_impl(
core_group_1,
core_group_2,
num_rows_per_core_group_1,
num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_rows);
num_rows_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_rows);

log_debug(LogTest, fmt::format("num_cores_to_be_used: {}", num_cores_to_be_used).c_str());
log_debug(LogTest, fmt::format("num_rows_per_core_group_1: {}", num_rows_per_core_group_1).c_str());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ std::tuple<uint32_t, CoreRangeSet, CoreRangeSet, CoreRangeSet, uint32_t, uint32_
core_group_1_t,
core_group_2_t,
num_tiles_per_core_group_1,
num_tiles_per_core_group_2] = split_work_to_cores(grid_size, units_to_divide);
num_tiles_per_core_group_2] = tt_metal::split_work_to_cores(grid_size, units_to_divide);

auto core_x_offset = core_range.start_coord.x;
auto core_y_offset = core_range.start_coord.y;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ operation::ProgramWithCallbacks moreh_layernorm_impl(
core_group_1,
core_group_2,
num_rows_per_core_group_1,
num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_outer);
num_rows_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_outer);

auto arch = input.device()->arch();
auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc] =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ operation::ProgramWithCallbacks moreh_layernorm_backward_gamma_beta_grad_impl(
core_group_1,
core_group_2,
num_cols_per_core_group_1,
num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_inner);
num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_inner);

auto arch = input.device()->arch();
auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc] =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ operation::ProgramWithCallbacks moreh_layernorm_backward_input_grad_impl(
core_group_1,
core_group_2,
num_rows_per_core_group_1,
num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_outer);
num_rows_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_outer);

auto arch = input.device()->arch();
auto [math_fidelity, math_approx_mode, fp32_dest_acc_en, packer_l1_acc] =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ operation::ProgramWithCallbacks moreh_bias_backward_multi_core_h(const Tensor &o
core_group_1,
core_group_2,
num_cols_per_core_group_1,
num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, Wt);
num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, Wt);

////////////////////////////////////////////////////////////////////////////
// CircularBuffer Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ operation::ProgramWithCallbacks moreh_matmul_multi_core(
core_group_1,
core_group_2,
num_output_tiles_per_core_group_1,
num_output_tiles_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_output_tiles);
num_output_tiles_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_output_tiles);

log_debug(LogOp, "{}:{} num_output_tiles: {}", __func__, __LINE__, num_output_tiles);
log_debug(LogOp, "{}:{} num_output_tiles_per_core_group1: {}, 2: {} ", __func__, __LINE__, num_output_tiles_per_core_group_1, num_output_tiles_per_core_group_2);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ operation::ProgramWithCallbacks moreh_mean_backward_impl(
core_group_1,
core_group_2,
num_cols_per_core_group_1,
num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_input_grad_tiles);
num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_input_grad_tiles);

////////////////////////////////////////////////////////////////////////////
// CircularBuffer Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ operation::ProgramWithCallbacks moreh_norm_h_impl(const Tensor &input, float p,
core_group_1,
core_group_2,
num_units_per_core_group_1,
num_units_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_units);
num_units_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_units);

////////////////////////////////////////////////////////////////////////////
// CircularBuffer Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ operation::ProgramWithCallbacks moreh_norm_other_impl(const Tensor &input, float
core_group_1,
core_group_2,
num_units_per_core_group_1,
num_units_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_output_tiles);
num_units_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_output_tiles);

////////////////////////////////////////////////////////////////////////////
// CircularBuffer Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ operation::ProgramWithCallbacks moreh_norm_w_impl(const Tensor &input, float p,
core_group_1,
core_group_2,
num_units_per_core_group_1,
num_units_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_units);
num_units_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_units);

////////////////////////////////////////////////////////////////////////////
// CircularBuffer Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ operation::ProgramWithCallbacks moreh_norm_backward_(
core_group_1,
core_group_2,
num_cols_per_core_group_1,
num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_input_grad_tiles);
num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_input_grad_tiles);

////////////////////////////////////////////////////////////////////////////
// CircularBuffer Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ operation::ProgramWithCallbacks moreh_sum_int_h_impl(const Tensor &input, const
core_group_1,
core_group_2,
num_cols_per_core_group_1,
num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_cols);
num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_cols);

log_debug(LogOp, "num_tiles {}, num_cols {}, num_cols_per_core_group_1 {}, num_cols_per_core_group_2 {}", num_tiles, num_cols, num_cols_per_core_group_1, num_cols_per_core_group_2);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ operation::ProgramWithCallbacks moreh_sum_int_nc_impl(const Tensor &input, const
core_group_1,
core_group_2,
num_cols_per_core_group_1,
num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_output_tiles);
num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_output_tiles);

////////////////////////////////////////////////////////////////////////////
// CircularBuffer Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ operation::ProgramWithCallbacks moreh_sum_nc_impl(const Tensor &input, const Ten
core_group_1,
core_group_2,
num_cols_per_core_group_1,
num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_output_tiles);
num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_output_tiles);

////////////////////////////////////////////////////////////////////////////
// CircularBuffer Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ operation::ProgramWithCallbacks moreh_sum_int_w_impl(const Tensor &input, const
core_group_1,
core_group_2,
num_rows_per_core_group_1,
num_rows_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_rows);
num_rows_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_rows);

log_debug(LogOp, "num_tiles {}, num_rows {}, num_rows_per_core_group_1 {}, num_rows_per_core_group_2 {}", num_tiles, num_rows, num_rows_per_core_group_1, num_rows_per_core_group_2);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ operation::ProgramWithCallbacks moreh_sum_backward_impl(
core_group_1,
core_group_2,
num_cols_per_core_group_1,
num_cols_per_core_group_2] = tt::tt_metal::split_work_to_cores(grid, num_input_grad_tiles);
num_cols_per_core_group_2] = tt_metal::split_work_to_cores(grid, num_input_grad_tiles);

////////////////////////////////////////////////////////////////////////////
// CircularBuffer Setup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -410,11 +410,11 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
if (fp32_dest_acc_en and (out_subblock_h_ntiles * out_subblock_w_ntiles > 4)) {
if (out_subblock_w_ntiles >= 4) {
out_subblock_h_ntiles = 1;
out_subblock_w_ntiles = find_max_block_size(out_subblock_w_ntiles, 4);
out_subblock_w_ntiles = tt::tt_metal::find_max_block_size(out_subblock_w_ntiles, 4);
} else {
while (out_subblock_h_ntiles * out_subblock_w_ntiles > 4) {
uint32_t div = tt::tt_metal:: find_max_divisor(out_subblock_h_ntiles, out_subblock_h_ntiles - 1);
out_subblock_h_ntiles = find_max_block_size(out_subblock_h_ntiles, div);
uint32_t div = tt::tt_metal::find_max_divisor(out_subblock_h_ntiles, out_subblock_h_ntiles - 1);
out_subblock_h_ntiles = tt::tt_metal::find_max_block_size(out_subblock_h_ntiles, div);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,11 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_width_sharded_v2_impl(
if (fp32_dest_acc_en and (out_subblock_h_ntiles * out_subblock_w_ntiles > 4)) {
if (out_subblock_w_ntiles >= 4) {
out_subblock_h_ntiles = 1;
out_subblock_w_ntiles = find_max_block_size(out_subblock_w_ntiles, 4);
out_subblock_w_ntiles = tt::tt_metal::find_max_block_size(out_subblock_w_ntiles, 4);
} else {
while (out_subblock_h_ntiles * out_subblock_w_ntiles > 4) {
uint32_t div = find_max_divisor(out_subblock_h_ntiles, out_subblock_h_ntiles - 1);
out_subblock_h_ntiles = find_max_block_size(out_subblock_h_ntiles, div);
uint32_t div = tt::tt_metal::find_max_divisor(out_subblock_h_ntiles, out_subblock_h_ntiles - 1);
out_subblock_h_ntiles = tt::tt_metal::find_max_block_size(out_subblock_h_ntiles, div);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ std::vector<Tensor> OptimizedConvNew::create_output_tensors(const std::vector<Te
if (this->memory_config.memory_layout == TensorMemoryLayout::HEIGHT_SHARDED) {
uint32_t total_height_tiles = tt::tt_metal::compute_volume(output_shape) / output_shape[-1] / TILE_HEIGHT;
uint32_t num_cores = total_height_tiles / this->parallelization_config.per_core_out_matrix_height_ntiles;
CoreRangeSet shard_grid = tt::tt_metal:: num_cores_to_corerange_set(num_cores, this->parallelization_config.grid_size, true);
CoreRangeSet shard_grid = tt::tt_metal::num_cores_to_corerange_set(num_cores, this->parallelization_config.grid_size, true);

std::array<uint32_t, 2> shard_shape = {this->parallelization_config.per_core_out_matrix_height_ntiles * TILE_HEIGHT, output_shape[-1]};
auto shard_spec = ShardSpec{shard_grid, shard_shape, ShardOrientation::ROW_MAJOR};
Expand Down
Loading

0 comments on commit 1000312

Please sign in to comment.