support heuristic setting (#158)

ROCm · Mar 1, 2022 · 3301314 · 3301314
1 parent 1032106
commit 3301314
Show file tree

Hide file tree

Showing 4 changed files with 63 additions and 17 deletions.
diff --git a/driver/conv_driver.cpp b/driver/conv_driver.cpp
@@ -384,42 +384,51 @@ void launch_conv_driver(driver_t * driver, const args_t *conv_args, const std::v
     double theo_conv_flop  = get_theoritical_conv_flop(conv_args);
     double theo_gpu_gflops = get_theoritical_gpu_gflops(sclk_mhz, driver->data_type);
 
-    auto launch = [&](const igemm_gtc_tunable_t * tunable, int index, int current_gks) -> result_t {
+    auto launch = [&](const igemm_gtc_tunable_t * tunable, int index, int current_gks, bool is_tunable_predicted = false) -> result_t {
+        igemm_gtc_tunable_t predicted_tunable;
+        const igemm_gtc_tunable_t * current_tunable = tunable;
+        if(is_tunable_predicted){
+            predicted_tunable = *tunable;
+            // in prediction, the gks will be 0, 1, 2... if the tunable supports gks, otherwise it is -1.
+            // here we restore the gemm_k_global_split inside the tunable
+            predicted_tunable.gemm_k_global_split = current_gks >= 0 ? 1 : 0;
+            current_tunable = &predicted_tunable;
+        }
         if(run_only_kernel != IGEMM_RUN_ONLY_KERNEL_DEFAULT){
-            if(run_only_kernel != driver->get_kernel_name(tunable))
+            if(run_only_kernel != driver->get_kernel_name(current_tunable))
                 {result_t result; result.return_code = -2; return result;}
         }
         if(silent_not_applicable_level0){
             // direction
-            if(direction != tunable->direction)
+            if(direction != current_tunable->direction)
                 {result_t result; result.return_code = -2; return result;}
 
             // layout
             if(in_layout == "NCHW"){
-                if(tunable->tensor_layout != "nchw")
+                if(current_tunable->tensor_layout != "nchw")
                     {result_t result; result.return_code = -2; return result;}
             }else if(in_layout == "NHWC"){
-                if(tunable->tensor_layout != "nhwc")
+                if(current_tunable->tensor_layout != "nhwc")
                     {result_t result; result.return_code = -2; return result;}
             }else if(in_layout == "NCHWC"){
-                if(tunable->tensor_layout.compare(0, 5, "nchwc") != 0)
+                if(current_tunable->tensor_layout.compare(0, 5, "nchwc") != 0)
                     {result_t result; result.return_code = -2; return result;}
-                auto wei_layout_config = tunable->tensor_layout.substr(6);
+                auto wei_layout_config = current_tunable->tensor_layout.substr(6);
                 if((fil_layout == "NCHWC" && wei_layout_config != "kcyxc") || 
                     (fil_layout == "CHWNC" && wei_layout_config != "cyxkc"))
                     {result_t result; result.return_code = -2; return result;}
             }
         }
 
-        printf("[%s:%2d] %s", direction.c_str(), index, driver->get_kernel_name(tunable).c_str());
+        printf("[%s:%2d] %s", direction.c_str(), index, driver->get_kernel_name(current_tunable).c_str());
         fflush(stdout);
 
         pre_func();
 
-        result_t result = driver->run(conv_args, tunable, device_input, device_weight, device_output, current_gks);
+        result_t result = driver->run(conv_args, current_tunable, device_input, device_weight, device_output, current_gks);
 
         std::string gks_string = "";
-        if(tunable->gemm_k_global_split){
+        if(current_tunable->gemm_k_global_split){
             gks_string = "[" + std::to_string(result.gks) + "]";
         }
         printf("%s", gks_string.c_str());
@@ -445,7 +454,7 @@ void launch_conv_driver(driver_t * driver, const args_t *conv_args, const std::v
         result.efficiency = (gflops / theo_gpu_gflops) * 100;
 
         if(dump_gmap)
-            gmap_dump(conv_args, tunable, result.gks);
+            gmap_dump(conv_args, current_tunable, result.gks);
         return result;
     };
 

diff --git a/driver/igemm_bwd_gtc_driver.h b/driver/igemm_bwd_gtc_driver.h
@@ -956,7 +956,38 @@ class igemm_bwd_gtc_t : public igemm_driver_base_t{
             result.duration_ms = min_duration;
             result.gks         = selected_gks;
         }else if(this->driver_mode == driver_mode_heuristic){
-            assert(0);
+            int gks   = tunable->gemm_k_global_split ? current_gks : 0;  // sync with is_tunable_predicted
+            size_t grid_size = get_grid_size(arg, tunable) * (1 << gks);
+            if(tunable->multihead){
+                if(tunable->tensor_layout == "nhwc"){
+                    int gemm_m = n * h_tilda_slice * w_tilda_slice;
+                    int gemm_n = c / group;
+                    // This is hacky, but in MIOpen we prefer a heuristic way to set gks, so ok now.
+                    igemm_bwd_gtc_nhwc_karg_t *karg = (igemm_bwd_gtc_nhwc_karg_t *)(karg_buffer);
+                    magic_div_u32_t mdiv_x_tilda = magic_div_u32_gen(x_tilda);
+                    magic_div_u32_t mdiv_y_tilda = magic_div_u32_gen(y_tilda);
+                    magic_div_u32_t mdiv_group_mn = magic_div_u32_gen(group * utility_integer_divide_ceil(gemm_n, gemm_n_per_block) * utility_integer_divide_ceil(gemm_m, gemm_m_per_block));
+                    karg->dtile_iy = num_of_gemm > 1 ? mdiv_x_tilda.magic : 0;
+                    karg->dtile_ix = num_of_gemm > 1 ? mdiv_x_tilda.shift : 0;
+                    karg->dslice_y = num_of_gemm > 1 ? mdiv_y_tilda.magic : y;
+                    karg->dslice_x = num_of_gemm > 1 ? mdiv_y_tilda.shift : x;
+                    karg->dtile_h  = num_of_gemm > 1 ? mdiv_group_mn.magic : h_tilda;
+                    karg->dtile_w  = num_of_gemm > 1 ? mdiv_group_mn.shift : w_tilda;
+                    karg->ks       = gks;
+                }else{
+                    assert(0);
+                }
+
+                float duration = igemm_launch_kernels({
+                        {kernel_func, karg_buffer, karg_size, {grid_size * block_size, splits, 1}, {block_size, 1, 1}}
+                    }, bwd_prolog, bwd_postlog, this->warmup, this->repeat);
+
+                result.return_code = 0;
+                result.duration_ms = duration;
+                result.gks         = gks;
+            }else{
+                assert(0);  // to be supported
+            }
         }
 
 #ifdef IGEMM_SPLIT_KERNEL

diff --git a/driver/igemm_fwd_gtc_driver.h b/driver/igemm_fwd_gtc_driver.h
@@ -820,9 +820,7 @@ class igemm_fwd_gtc_t : public igemm_driver_base_t {
                     // printf("block:%d, grid:%d\n", block_size, grid_size);
                     // fflush(stdout);
                 }
-                if(tunable->tensor_layout.compare(0, 5, "nchwc") == 0){
-                    splits = 1;
-                }
+
                 //printf("block:%d, grid:%d\n", block_size, grid_size);
                 std::vector<igemm_launch_kernel_t> kernel_launchers;
                 kernel_launchers.push_back({kernel_func, karg_buffer, karg_size, {grid_size * block_size, splits, 1}, {block_size, 1, 1}});
@@ -850,8 +848,13 @@ class igemm_fwd_gtc_t : public igemm_driver_base_t {
             result.duration_ms = min_duration;
             result.gks         = selected_gks;
         }else if(this->driver_mode == driver_mode_heuristic){
-            int gks   = heuristic_select_gks(arg, tunable);
+            int gks   = tunable->gemm_k_global_split ? current_gks : 0;  // sync with is_tunable_predicted
             size_t grid_size = get_grid_size(arg, tunable) * (1 << gks);
+            if(tunable->tensor_layout == "nhwc"){
+                // This is hacky, but in MIOpen we prefer a heuristic way to set gks, so ok now.
+                igemm_fwd_gtc_nhwc_karg_t *karg_revalue = (igemm_fwd_gtc_nhwc_karg_t *)(karg_buffer);
+                karg_revalue->ks = gks;
+            }
 
             float duration = igemm_launch_kernels({
                     {kernel_func, karg_buffer, karg_size, {grid_size * block_size, splits, 1}, {block_size, 1, 1}}

diff --git a/driver/igemm_wrw_gtc_driver.h b/driver/igemm_wrw_gtc_driver.h
@@ -620,7 +620,10 @@ class igemm_wrw_gtc_t : public igemm_driver_base_t {
             // std::cout << "not valid tunable config." << std::endl;
             return result;
         }
-
+
+        if(this->driver_mode == driver_mode_heuristic)
+            current_gks = tunable->gemm_k_global_split ? current_gks : 0;
+
         int hi = arg->get_int("in_h");
         int wi = arg->get_int("in_w");
         int n = arg->get_int("batchsize");