This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Add whether the architecture has named barriers to the arch attributes
JianpingChen066 committed May 24, 2024
1 parent e5510c6 commit c5314b7
Showing 56 changed files with 723 additions and 538 deletions.
86 changes: 67 additions & 19 deletions include/common/core/arch_config.hpp
@@ -29,6 +29,7 @@ namespace gpu::xetla {
template <msg_type message_type, gpu_arch arch_tag>
struct load_store_attr_t {
static constexpr bool has_hw_block_2d = false;
+static constexpr bool has_block_1d = false;
};

template <>
@@ -93,25 +94,32 @@ inline constexpr bool arch_has_2d_load_store =

template <gpu_arch arch_tag>
struct load_store_attr_t<msg_type::block_1d, arch_tag> {
+static constexpr bool has_block_1d = true;
static constexpr uint32_t max_load_vec_len = 32;
static constexpr uint32_t max_store_vec_len = 32;
static constexpr uint32_t max_prefetch_vec_len = 32;
};

template <>
struct load_store_attr_t<msg_type::block_1d, gpu_arch::XeHpc> {
+static constexpr bool has_block_1d = true;
static constexpr uint32_t max_load_vec_len = 64;
static constexpr uint32_t max_store_vec_len = 64;
static constexpr uint32_t max_prefetch_vec_len = 64;
};

+template <gpu_arch arch_tag>
+inline constexpr bool arch_has_1d_load_store =
+load_store_attr_t<msg_type::block_1d, arch_tag>::has_block_1d;

struct dpas_attr_base_t {
static constexpr bool has_xmx = true;
static constexpr uint32_t systolic_depth = 8;
static constexpr uint32_t rcount_max = 8;
static constexpr uint32_t op_per_channel_bits = 32;
static constexpr uint32_t op_per_channel_bytes = (op_per_channel_bits >> 3);
static constexpr uint32_t op_per_channel_max = 8;
+static constexpr uint32_t k_in_bytes = systolic_depth * op_per_channel_bytes;
};

template <gpu_arch arch_tag>
@@ -121,12 +129,12 @@ struct dpas_attr_t {

template <>
struct dpas_attr_t<gpu_arch::XeHpc> : public dpas_attr_base_t {
-static constexpr uint32_t n_fixed_limit = 16;
+static constexpr uint32_t n_in_elem = 16;
};

template <>
struct dpas_attr_t<gpu_arch::XeHpg> : public dpas_attr_base_t {
-static constexpr uint32_t n_fixed_limit = 8;
+static constexpr uint32_t n_in_elem = 8;
};

template <gpu_arch arch_tag>
@@ -140,16 +148,9 @@ struct fpu_attr_t {
template <gpu_arch arch_tag>
inline constexpr bool arch_has_fpu = fpu_attr_t<arch_tag>::has_fpu;

-template <grf_mode grf_num_mode>
-struct register_nums_t {
-static constexpr uint32_t register_nums =
-(grf_num_mode == grf_mode::normal) ? 128 : 256;
-static constexpr uint32_t acc_register_nums =
-(grf_num_mode == grf_mode::normal) ? 4 : 8;
-};

template <gpu_arch arch_tag>
struct register_bytes_t;

template <>
struct register_bytes_t<gpu_arch::XeHpc> {
static constexpr uint32_t reg_in_bytes = 64;
@@ -163,6 +164,14 @@ struct register_bytes_t<gpu_arch::XeLpg> {
static constexpr uint32_t reg_in_bytes = 32;
};

+template <grf_mode grf_num_mode>
+struct register_nums_t {
+static constexpr uint32_t register_nums =
+(grf_num_mode == grf_mode::normal) ? 128 : 256;
+static constexpr uint32_t acc_register_nums =
+(grf_num_mode == grf_mode::normal) ? 4 : 8;
+};

template <grf_mode grf_num_mode, gpu_arch arch_tag>
struct register_attr_t {
static constexpr uint32_t reg_in_bytes =
@@ -175,24 +184,48 @@ struct register_attr_t {
static constexpr uint32_t grf_in_bytes = register_nums * reg_in_bytes;
};

-template <gpu_arch arch_tag, uint32_t m, class enable = void>
+template <
+gpu_arch arch_tag,
+mma_engine engine_type,
+uint32_t m,
+class enable = void>
struct mma_attr_t {};

template <gpu_arch arch_tag, uint32_t m>
-struct mma_attr_t<arch_tag, m, std::enable_if_t<arch_has_xmx<arch_tag>>> {
+struct mma_attr_t<
+arch_tag,
+mma_engine::xmx,
+m,
+std::enable_if_t<arch_has_xmx<arch_tag>>> {
using dpas_attr = dpas_attr_t<arch_tag>;
static constexpr uint32_t mma_m_in_elem =
(m > dpas_attr::rcount_max) ? dpas_attr::rcount_max : m;
-static constexpr uint32_t mma_n_in_elem = dpas_attr::n_fixed_limit;
-static constexpr uint32_t mma_k_in_bytes =
-dpas_attr::systolic_depth * dpas_attr::op_per_channel_bytes;
+static constexpr uint32_t blk_m_in_elem = 16;
+
+static constexpr uint32_t mma_n_in_elem = dpas_attr::n_in_elem;
+[[maybe_unused]] static constexpr uint32_t blk_n_in_bytes = 64;
+
+static constexpr uint32_t mma_k_in_bytes = dpas_attr::k_in_bytes;
+static constexpr uint32_t blk_k_in_bytes = mma_k_in_bytes;
};

template <gpu_arch arch_tag, uint32_t m>
-struct mma_attr_t<arch_tag, m, std::enable_if_t<!arch_has_xmx<arch_tag>>> {
+struct mma_attr_t<
+arch_tag,
+mma_engine::fpu,
+m,
+std::enable_if_t<arch_has_fpu<arch_tag>>> {
+using load_store_attr = load_store_attr_t<msg_type::block_2d, arch_tag>;
static constexpr uint32_t mma_m_in_elem = (m > 8) ? 8 : m;
-static constexpr uint32_t mma_n_in_elem = 16;
+static constexpr uint32_t blk_m_in_elem = 16;
+
static constexpr uint32_t mma_k_in_bytes = 32;
+static constexpr uint32_t blk_k_in_bytes =
+load_store_attr::max_trans_load_width_in_bytes;
+
+[[maybe_unused]] static constexpr uint32_t mma_n_in_elem = 16;
+static constexpr uint32_t blk_n_in_bytes =
+register_bytes_t<arch_tag>::reg_in_bytes;
};

template <gpu_arch arch_tag>
@@ -210,6 +243,8 @@ struct arch_attr_t<gpu_arch::XeHpc> {

static constexpr uint32_t max_wg_num = 64;
static constexpr uint32_t local_mem_size = 128 * 1024;
+static constexpr bool has_named_barrier = true;
+static constexpr bool has_atomic_add = true;
};

template <>
@@ -222,8 +257,11 @@ struct arch_attr_t<gpu_arch::XeHpg> {

using dpas_attr = dpas_attr_t<gpu_arch::XeHpg>;

-static constexpr uint32_t max_wg_num = 64;
+static constexpr uint32_t max_wg_num = 32;
static constexpr uint32_t local_mem_size = 64 * 1024;

+static constexpr bool has_named_barrier = false;
+static constexpr bool has_atomic_add = true;
};

template <>
@@ -236,10 +274,20 @@ struct arch_attr_t<gpu_arch::XeLpg> {

using dpas_attr = dpas_attr_t<gpu_arch::XeLpg>;

-static constexpr uint32_t max_wg_num = 64;
+static constexpr uint32_t max_wg_num = 32;
static constexpr uint32_t local_mem_size = 64 * 1024;
+static constexpr bool has_named_barrier = false;
+static constexpr bool has_atomic_add = true;
};

+template <gpu_arch arch_tag>
+inline constexpr bool arch_has_named_barrier =
+arch_attr_t<arch_tag>::has_named_barrier;
+
+template <gpu_arch arch_tag>
+inline constexpr bool arch_has_atomic_add =
+arch_attr_t<arch_tag>::has_atomic_add;

/// @} xetla_core_arch_config

} // namespace gpu::xetla
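The new `has_named_barrier` / `has_atomic_add` flags, together with the `arch_has_named_barrier` / `arch_has_atomic_add` variable templates, make per-architecture feature dispatch a compile-time branch. Below is a minimal, self-contained sketch of the pattern; the trait bodies are reduced copies of the ones in the diff above, while `sync_impl` and the `main` driver are hypothetical:

```cpp
#include <cstdint>
#include <cstdio>

enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };

template <gpu_arch arch_tag>
struct arch_attr_t;

template <>
struct arch_attr_t<gpu_arch::XeHpc> {
  static constexpr bool has_named_barrier = true;  // XeHpc exposes named barriers
  static constexpr bool has_atomic_add = true;
};

template <>
struct arch_attr_t<gpu_arch::XeHpg> {
  static constexpr bool has_named_barrier = false; // XeHpg uses split barriers
  static constexpr bool has_atomic_add = true;
};

template <gpu_arch arch_tag>
inline constexpr bool arch_has_named_barrier =
    arch_attr_t<arch_tag>::has_named_barrier;

// Hypothetical consumer: picks a synchronization strategy at compile time.
template <gpu_arch arch_tag>
void sync_impl() {
  if constexpr (arch_has_named_barrier<arch_tag>) {
    std::puts("named-barrier path (per-id signal/wait)");
  } else {
    std::puts("split-barrier fallback path");
  }
}

int main() {
  sync_impl<gpu_arch::XeHpc>();  // named-barrier path
  sync_impl<gpu_arch::XeHpg>();  // split-barrier fallback path
}
```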
5 changes: 5 additions & 0 deletions include/common/core/common_types.hpp
@@ -23,7 +23,12 @@
namespace gpu::xetla {
enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };

+template <gpu_arch arch_tag>
+inline constexpr bool valid_xe_arch_tag = (arch_tag <= gpu_arch::XeHpc);

enum class grf_mode : uint8_t { normal = 0, double_grf = 1 };

enum class mem_layout : uint8_t { row_major = 0, col_major = 1 };

+enum class mma_engine : uint8_t { xmx = 0, fpu = 1 };
} // namespace gpu::xetla
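`valid_xe_arch_tag` centralizes the `arch_tag <= gpu_arch::XeHpc` comparison that was previously inlined at each SFINAE guard. A minimal sketch of the idiom, with the guarded `check_arch` function being hypothetical:

```cpp
#include <cstdint>
#include <type_traits>

enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };

// One place to answer "is this a known Xe architecture?".
template <gpu_arch arch_tag>
inline constexpr bool valid_xe_arch_tag = (arch_tag <= gpu_arch::XeHpc);

// Hypothetical guard in the style of check_lsc_cache_hint():
// only instantiates for valid Xe tags.
template <gpu_arch arch_tag>
constexpr std::enable_if_t<valid_xe_arch_tag<arch_tag>, void> check_arch() {}

int main() {
  check_arch<gpu_arch::XeLpg>();            // compiles: XeLpg is a valid tag
  // check_arch<static_cast<gpu_arch>(7)>(); // would fail to compile
}
```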
5 changes: 2 additions & 3 deletions include/common/utils/common.hpp
@@ -51,7 +51,7 @@ constexpr uint32_t get_element_size_code() {
enum class lsc_action : uint8_t { prefetch, load, store, atomic };

template <lsc_action Action, cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
-constexpr std::enable_if_t<arch_tag <= gpu_arch::XeHpc, void>
+constexpr std::enable_if_t<valid_xe_arch_tag<arch_tag>, void>
check_lsc_cache_hint() {
if constexpr (Action == lsc_action::prefetch) {
// https://gfxspecs.intel.com/Predator/Home/Index/53560
@@ -153,7 +153,7 @@ get_prefetch_cache_hint_code() {
}

template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
-constexpr std::enable_if_t<arch_tag <= gpu_arch::XeHpc, uint32_t>
+constexpr std::enable_if_t<arch_tag == gpu_arch::XeHpc, uint32_t>
get_store_cache_hint_code() {
check_lsc_cache_hint<lsc_action::store, L1H, L2H, arch_tag>();
if (L1H == cache_hint::none && L2H == cache_hint::none) {
@@ -286,7 +286,6 @@ enum class store_op : uint8_t {
scattered_transpose = 3,
block_1d = 4
};
-enum class mma_engine : uint8_t { xmx = 0, fpu = 1 };
// enum class trans_mode : uint8_t { none = 0, transpose = 1 };
enum class memory_op : uint8_t { load = 0, store = 1 };
enum class tdesc_update_dir : uint8_t { x_dir = 0, y_dir = 1 };
4 changes: 2 additions & 2 deletions include/common/utils/limitation.hpp
@@ -747,7 +747,7 @@ struct check_store {
} // namespace subgroup

namespace group {
-template <gpu_arch arch = gpu_arch::XeHpc>
+template <gpu_arch arch>
struct gemm {
struct default_fpu {
template <
@@ -876,7 +876,7 @@ struct gemm {
int block_size_x_b,
int block_size_y_b>
struct check_tile_size_default {
-using mma_attr = mma_attr_t<arch, block_size_y_a>;
+using mma_attr = mma_attr_t<arch, mma_engine::xmx, tile_size_y_a>;
static constexpr int32_t mma_m = mma_attr::mma_m_in_elem;
static constexpr int32_t mma_n = mma_attr::mma_n_in_elem;
static constexpr int32_t mma_k =
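The `check_tile_size_default` change reflects the new three-parameter `mma_attr_t<arch, engine, m>` shape. A reduced, self-contained sketch of the derived values on the XeHpc/XMX path; the structs are trimmed from arch_config.hpp above, and `dpas_attr_xehpc` is a flattened stand-in for `dpas_attr_t<gpu_arch::XeHpc>`:

```cpp
#include <cstdint>

enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };
enum class mma_engine : uint8_t { xmx = 0, fpu = 1 };

struct dpas_attr_xehpc {
  static constexpr uint32_t systolic_depth = 8;
  static constexpr uint32_t rcount_max = 8;
  static constexpr uint32_t op_per_channel_bytes = 4;
  static constexpr uint32_t k_in_bytes = systolic_depth * op_per_channel_bytes;
  static constexpr uint32_t n_in_elem = 16;
};

template <gpu_arch arch_tag, mma_engine engine_type, uint32_t m>
struct mma_attr_t;

template <uint32_t m>
struct mma_attr_t<gpu_arch::XeHpc, mma_engine::xmx, m> {
  using dpas_attr = dpas_attr_xehpc;
  static constexpr uint32_t mma_m_in_elem =
      (m > dpas_attr::rcount_max) ? dpas_attr::rcount_max : m;
  static constexpr uint32_t mma_n_in_elem = dpas_attr::n_in_elem;
  static constexpr uint32_t mma_k_in_bytes = dpas_attr::k_in_bytes;
};

using attr = mma_attr_t<gpu_arch::XeHpc, mma_engine::xmx, 32>;
static_assert(attr::mma_m_in_elem == 8, "m is clamped to rcount_max");
static_assert(attr::mma_n_in_elem == 16, "n comes from dpas n_in_elem");
static_assert(attr::mma_k_in_bytes == 32, "k = systolic_depth * 4 bytes");

int main() {}
```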
36 changes: 18 additions & 18 deletions include/common/utils/raw_send_load_store.hpp
@@ -219,12 +219,12 @@ __XETLA_API void xetla_update_tdesc_offsety(
template <
typename Ty,
uint32_t N,
-cache_hint L1H = cache_hint::none,
-cache_hint L2H = cache_hint::none,
-bool transpose = false,
-bool transform = false,
-gpu_arch arch_tag = gpu_arch::XeHpc>
-__XETLA_API std::enable_if_t<arch_tag == gpu_arch::XeHpc, xetla_vector<Ty, N>>
+cache_hint L1H,
+cache_hint L2H,
+bool transpose,
+bool transform,
+gpu_arch arch_tag>
+__XETLA_API std::enable_if_t<arch_has_2d_load_store<arch_tag>, xetla_vector<Ty, N>>
xetla_tload_global(xetla_tdescriptor tdesc) {
DEBUG_INVOKE(
dbg_level::core,
@@ -273,10 +273,10 @@ xetla_tload_global(xetla_tdescriptor tdesc) {
template <
typename Ty,
uint32_t N,
-cache_hint L1H = cache_hint::none,
-cache_hint L2H = cache_hint::none,
-gpu_arch arch_tag = gpu_arch::XeHpc>
-__XETLA_API std::enable_if_t<arch_tag == gpu_arch::XeHpc, void>
+cache_hint L1H,
+cache_hint L2H,
+gpu_arch arch_tag>
+__XETLA_API std::enable_if_t<arch_has_2d_load_store<arch_tag>, void>
xetla_tstore_global(xetla_tdescriptor tdesc, xetla_vector<Ty, N> data) {
DEBUG_INVOKE(
dbg_level::core, core::block_2d<arch_tag, Ty>::check_store(tdesc));
@@ -310,10 +310,10 @@ xetla_tstore_global(xetla_tdescriptor tdesc, xetla_vector<Ty, N> data) {
///
template <
typename Ty,
-cache_hint L1H = cache_hint::cached,
-cache_hint L2H = cache_hint::cached,
-gpu_arch arch_tag = gpu_arch::XeHpc>
-__XETLA_API std::enable_if_t<arch_tag == gpu_arch::XeHpc, void>
+cache_hint L1H,
+cache_hint L2H,
+gpu_arch arch_tag>
+__XETLA_API std::enable_if_t<arch_has_2d_load_store<arch_tag>, void>
xetla_tprefetch_global(xetla_tdescriptor tdesc) {
uint32_t msg_desc = 3;
msg_desc |= 0 << 7;
@@ -350,12 +350,12 @@ xetla_tprefetch_global(xetla_tdescriptor tdesc) {
template <
typename Ty,
uint32_t N,
-cache_hint L1H = cache_hint::none,
-cache_hint L2H = cache_hint::none,
+cache_hint L1H,
+cache_hint L2H,
atomic_op Op,
-gpu_arch arch_tag = gpu_arch::XeHpc,
+gpu_arch arch_tag,
typename Toffset = uint32_t>
-__XETLA_API std::enable_if_t<arch_tag == gpu_arch::XeHpc, void>
+__XETLA_API std::enable_if_t<arch_has_2d_load_store<arch_tag>, void>
xetla_tatomic_store_global(
uint64_t base_address,
xetla_vector<Toffset, N> offset,
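Since the cache-hint, transpose/transform, and arch-tag parameters above lost their defaults, call sites must now spell every argument out, and the SFINAE guard keys on `arch_has_2d_load_store` instead of pinning XeHpc. An illustrative fragment, assuming the XeTLA headers and an already-initialized `tdesc`:

```cpp
// Before this commit (defaults: none/none cache hints, no transpose/transform,
// arch pinned to XeHpc):
//   auto v = xetla_tload_global<float, 64>(tdesc);

// After: every template argument is explicit, and any arch with 2D
// block load/store support is accepted.
auto v = xetla_tload_global<
    float, 64,
    cache_hint::cached, cache_hint::cached,
    /*transpose=*/false, /*transform=*/false,
    gpu_arch::XeHpc>(tdesc);
```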
27 changes: 5 additions & 22 deletions include/common/utils/raw_send_nbarrier.hpp
@@ -41,9 +41,9 @@ enum class nbarrier_role : uint8_t {
/// as consumer.
///
template <
-uint8_t num_producers = 1,
-uint8_t num_consumers = 1,
-gpu_arch arch_tag = gpu_arch::XeHpc,
+uint8_t num_producers,
+uint8_t num_consumers,
+gpu_arch arch_tag,
typename enable = void>
struct xetla_nbarrier_t;

@@ -52,7 +52,7 @@ struct xetla_nbarrier_t<
num_producers,
num_consumers,
arch_tag,
-std::enable_if_t<arch_tag == gpu_arch::XeHpc>> {
+std::enable_if_t<arch_has_named_barrier<arch_tag>>> {
///
/// @brief Description of named barrier object.
/// Structure is defined in
@@ -105,20 +105,7 @@ struct xetla_nbarrier_t<
num_producers,
num_consumers,
arch_tag,
-std::enable_if_t<arch_tag != gpu_arch::XeHpc>> {
-///
-/// @brief Description of named barrier object.
-/// Structure is defined in
-/// [here](https://gfxspecs.intel.com/Predator/Home/Index/57499).
-///
-// xetla_vector<uint32_t, 16> nbar;
-// uint32_t barrier_id;
-
-/// @param role is the role of subgroup when participating the barrier.
-/// @param nbarrier_id [in] is the id of the barrier.
-/// note: all subgroups participating the barrier should have the same
-/// barrier_id. Here is the bspec link
-/// https://gfxspecs.intel.com/Predator/Home/Index/54006
+std::enable_if_t<!arch_has_named_barrier<arch_tag>>> {
__XETLA_API void init_nbarrier(uint8_t, nbarrier_role) {}

/// @brief Generic work-group split barrier.
Expand All @@ -127,14 +114,10 @@ struct xetla_nbarrier_t<
__ESIMD_ENS::split_barrier<__ESIMD_ENS::split_barrier_action::signal>();
}

-/// @brief named barrier wait within subgroup.
-///
__XETLA_API void wait() {
__ESIMD_ENS::split_barrier<__ESIMD_ENS::split_barrier_action::wait>();
}

-/// @brief named barrier signal from subgroup.
-///
__XETLA_API void arrive_wait() {
arrive();
wait();
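With the fallback specialization keyed on `!arch_has_named_barrier<arch_tag>` rather than `arch_tag != gpu_arch::XeHpc`, one barrier call sequence now compiles for every architecture. An illustrative fragment, assuming the XeTLA headers and that `producer_consumer` is a member of `nbarrier_role`:

```cpp
// On XeHpc this drives a hardware named barrier; on XeHpg/XeLpg the same
// interface lowers to ESIMD split-barrier signal + wait.
xetla_nbarrier_t<16, 16, arch_tag> nbarrier;
nbarrier.init_nbarrier(/*nbarrier_id=*/0, nbarrier_role::producer_consumer);
// ... do the work this barrier orders ...
nbarrier.arrive_wait(); // signal arrival, then block until all participants arrive
```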