Skip to content

Commit

Permalink
support heuristic setting (#158)
Browse files Browse the repository at this point in the history
  • Loading branch information
carlushuang authored Mar 1, 2022
1 parent 1032106 commit 3301314
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 17 deletions.
31 changes: 20 additions & 11 deletions driver/conv_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -384,42 +384,51 @@ void launch_conv_driver(driver_t * driver, const args_t *conv_args, const std::v
double theo_conv_flop = get_theoritical_conv_flop(conv_args);
double theo_gpu_gflops = get_theoritical_gpu_gflops(sclk_mhz, driver->data_type);

auto launch = [&](const igemm_gtc_tunable_t * tunable, int index, int current_gks) -> result_t {
auto launch = [&](const igemm_gtc_tunable_t * tunable, int index, int current_gks, bool is_tunable_predicted = false) -> result_t {
igemm_gtc_tunable_t predicted_tunable;
const igemm_gtc_tunable_t * current_tunable = tunable;
if(is_tunable_predicted){
predicted_tunable = *tunable;
// in prediction, the gks will be 0, 1, 2... if the tunable supports gks; otherwise it is -1.
// here we restore the gemm_k_global_split inside the tunable
predicted_tunable.gemm_k_global_split = current_gks >= 0 ? 1 : 0;
current_tunable = &predicted_tunable;
}
if(run_only_kernel != IGEMM_RUN_ONLY_KERNEL_DEFAULT){
if(run_only_kernel != driver->get_kernel_name(tunable))
if(run_only_kernel != driver->get_kernel_name(current_tunable))
{result_t result; result.return_code = -2; return result;}
}
if(silent_not_applicable_level0){
// direction
if(direction != tunable->direction)
if(direction != current_tunable->direction)
{result_t result; result.return_code = -2; return result;}

// layout
if(in_layout == "NCHW"){
if(tunable->tensor_layout != "nchw")
if(current_tunable->tensor_layout != "nchw")
{result_t result; result.return_code = -2; return result;}
}else if(in_layout == "NHWC"){
if(tunable->tensor_layout != "nhwc")
if(current_tunable->tensor_layout != "nhwc")
{result_t result; result.return_code = -2; return result;}
}else if(in_layout == "NCHWC"){
if(tunable->tensor_layout.compare(0, 5, "nchwc") != 0)
if(current_tunable->tensor_layout.compare(0, 5, "nchwc") != 0)
{result_t result; result.return_code = -2; return result;}
auto wei_layout_config = tunable->tensor_layout.substr(6);
auto wei_layout_config = current_tunable->tensor_layout.substr(6);
if((fil_layout == "NCHWC" && wei_layout_config != "kcyxc") ||
(fil_layout == "CHWNC" && wei_layout_config != "cyxkc"))
{result_t result; result.return_code = -2; return result;}
}
}

printf("[%s:%2d] %s", direction.c_str(), index, driver->get_kernel_name(tunable).c_str());
printf("[%s:%2d] %s", direction.c_str(), index, driver->get_kernel_name(current_tunable).c_str());
fflush(stdout);

pre_func();

result_t result = driver->run(conv_args, tunable, device_input, device_weight, device_output, current_gks);
result_t result = driver->run(conv_args, current_tunable, device_input, device_weight, device_output, current_gks);

std::string gks_string = "";
if(tunable->gemm_k_global_split){
if(current_tunable->gemm_k_global_split){
gks_string = "[" + std::to_string(result.gks) + "]";
}
printf("%s", gks_string.c_str());
Expand All @@ -445,7 +454,7 @@ void launch_conv_driver(driver_t * driver, const args_t *conv_args, const std::v
result.efficiency = (gflops / theo_gpu_gflops) * 100;

if(dump_gmap)
gmap_dump(conv_args, tunable, result.gks);
gmap_dump(conv_args, current_tunable, result.gks);
return result;
};

Expand Down
33 changes: 32 additions & 1 deletion driver/igemm_bwd_gtc_driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -956,7 +956,38 @@ class igemm_bwd_gtc_t : public igemm_driver_base_t{
result.duration_ms = min_duration;
result.gks = selected_gks;
}else if(this->driver_mode == driver_mode_heuristic){
assert(0);
int gks = tunable->gemm_k_global_split ? current_gks : 0; // sync with is_tunable_predicted
size_t grid_size = get_grid_size(arg, tunable) * (1 << gks);
if(tunable->multihead){
if(tunable->tensor_layout == "nhwc"){
int gemm_m = n * h_tilda_slice * w_tilda_slice;
int gemm_n = c / group;
// This is hacky, but in MIOpen we prefer a heuristic way to set gks, so it is OK for now.
igemm_bwd_gtc_nhwc_karg_t *karg = (igemm_bwd_gtc_nhwc_karg_t *)(karg_buffer);
magic_div_u32_t mdiv_x_tilda = magic_div_u32_gen(x_tilda);
magic_div_u32_t mdiv_y_tilda = magic_div_u32_gen(y_tilda);
magic_div_u32_t mdiv_group_mn = magic_div_u32_gen(group * utility_integer_divide_ceil(gemm_n, gemm_n_per_block) * utility_integer_divide_ceil(gemm_m, gemm_m_per_block));
karg->dtile_iy = num_of_gemm > 1 ? mdiv_x_tilda.magic : 0;
karg->dtile_ix = num_of_gemm > 1 ? mdiv_x_tilda.shift : 0;
karg->dslice_y = num_of_gemm > 1 ? mdiv_y_tilda.magic : y;
karg->dslice_x = num_of_gemm > 1 ? mdiv_y_tilda.shift : x;
karg->dtile_h = num_of_gemm > 1 ? mdiv_group_mn.magic : h_tilda;
karg->dtile_w = num_of_gemm > 1 ? mdiv_group_mn.shift : w_tilda;
karg->ks = gks;
}else{
assert(0);
}

float duration = igemm_launch_kernels({
{kernel_func, karg_buffer, karg_size, {grid_size * block_size, splits, 1}, {block_size, 1, 1}}
}, bwd_prolog, bwd_postlog, this->warmup, this->repeat);

result.return_code = 0;
result.duration_ms = duration;
result.gks = gks;
}else{
assert(0); // to be supported
}
}

#ifdef IGEMM_SPLIT_KERNEL
Expand Down
11 changes: 7 additions & 4 deletions driver/igemm_fwd_gtc_driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -820,9 +820,7 @@ class igemm_fwd_gtc_t : public igemm_driver_base_t {
// printf("block:%d, grid:%d\n", block_size, grid_size);
// fflush(stdout);
}
if(tunable->tensor_layout.compare(0, 5, "nchwc") == 0){
splits = 1;
}

//printf("block:%d, grid:%d\n", block_size, grid_size);
std::vector<igemm_launch_kernel_t> kernel_launchers;
kernel_launchers.push_back({kernel_func, karg_buffer, karg_size, {grid_size * block_size, splits, 1}, {block_size, 1, 1}});
Expand Down Expand Up @@ -850,8 +848,13 @@ class igemm_fwd_gtc_t : public igemm_driver_base_t {
result.duration_ms = min_duration;
result.gks = selected_gks;
}else if(this->driver_mode == driver_mode_heuristic){
int gks = heuristic_select_gks(arg, tunable);
int gks = tunable->gemm_k_global_split ? current_gks : 0; // sync with is_tunable_predicted
size_t grid_size = get_grid_size(arg, tunable) * (1 << gks);
if(tunable->tensor_layout == "nhwc"){
// This is hacky, but in MIOpen we prefer a heuristic way to set gks, so it is OK for now.
igemm_fwd_gtc_nhwc_karg_t *karg_revalue = (igemm_fwd_gtc_nhwc_karg_t *)(karg_buffer);
karg_revalue->ks = gks;
}

float duration = igemm_launch_kernels({
{kernel_func, karg_buffer, karg_size, {grid_size * block_size, splits, 1}, {block_size, 1, 1}}
Expand Down
5 changes: 4 additions & 1 deletion driver/igemm_wrw_gtc_driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,10 @@ class igemm_wrw_gtc_t : public igemm_driver_base_t {
// std::cout << "not valid tunable config." << std::endl;
return result;
}


if(this->driver_mode == driver_mode_heuristic)
current_gks = tunable->gemm_k_global_split ? current_gks : 0;

int hi = arg->get_int("in_h");
int wi = arg->get_int("in_w");
int n = arg->get_int("batchsize");
Expand Down

0 comments on commit 3301314

Please sign in to comment.