This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Add whether the architecture has named barriers to the arch attributes
JianpingChen066 committed May 24, 2024
1 parent e5510c6 commit c5314b7
Showing 56 changed files with 723 additions and 538 deletions.
86 changes: 67 additions & 19 deletions include/common/core/arch_config.hpp
@@ -29,6 +29,7 @@ namespace gpu::xetla {
template <msg_type message_type, gpu_arch arch_tag>
struct load_store_attr_t {
static constexpr bool has_hw_block_2d = false;
+static constexpr bool has_block_1d = false;
};

template <>
@@ -93,25 +94,32 @@ inline constexpr bool arch_has_2d_load_store =

template <gpu_arch arch_tag>
struct load_store_attr_t<msg_type::block_1d, arch_tag> {
+static constexpr bool has_block_1d = true;
static constexpr uint32_t max_load_vec_len = 32;
static constexpr uint32_t max_store_vec_len = 32;
static constexpr uint32_t max_prefetch_vec_len = 32;
};

template <>
struct load_store_attr_t<msg_type::block_1d, gpu_arch::XeHpc> {
+static constexpr bool has_block_1d = true;
static constexpr uint32_t max_load_vec_len = 64;
static constexpr uint32_t max_store_vec_len = 64;
static constexpr uint32_t max_prefetch_vec_len = 64;
};

+template <gpu_arch arch_tag>
+inline constexpr bool arch_has_1d_load_store =
+load_store_attr_t<msg_type::block_1d, arch_tag>::has_block_1d;

struct dpas_attr_base_t {
static constexpr bool has_xmx = true;
static constexpr uint32_t systolic_depth = 8;
static constexpr uint32_t rcount_max = 8;
static constexpr uint32_t op_per_channel_bits = 32;
static constexpr uint32_t op_per_channel_bytes = (op_per_channel_bits >> 3);
static constexpr uint32_t op_per_channel_max = 8;
+static constexpr uint32_t k_in_bytes = systolic_depth * op_per_channel_bytes;
};

template <gpu_arch arch_tag>
@@ -121,12 +129,12 @@ struct dpas_attr_t {

template <>
struct dpas_attr_t<gpu_arch::XeHpc> : public dpas_attr_base_t {
-static constexpr uint32_t n_fixed_limit = 16;
+static constexpr uint32_t n_in_elem = 16;
};

template <>
struct dpas_attr_t<gpu_arch::XeHpg> : public dpas_attr_base_t {
-static constexpr uint32_t n_fixed_limit = 8;
+static constexpr uint32_t n_in_elem = 8;
};

template <gpu_arch arch_tag>
@@ -140,16 +148,9 @@ struct fpu_attr_t {
template <gpu_arch arch_tag>
inline constexpr bool arch_has_fpu = fpu_attr_t<arch_tag>::has_fpu;

-template <grf_mode grf_num_mode>
-struct register_nums_t {
-static constexpr uint32_t register_nums =
-(grf_num_mode == grf_mode::normal) ? 128 : 256;
-static constexpr uint32_t acc_register_nums =
-(grf_num_mode == grf_mode::normal) ? 4 : 8;
-};

template <gpu_arch arch_tag>
struct register_bytes_t;

template <>
struct register_bytes_t<gpu_arch::XeHpc> {
static constexpr uint32_t reg_in_bytes = 64;
@@ -163,6 +164,14 @@ struct register_bytes_t<gpu_arch::XeLpg> {
static constexpr uint32_t reg_in_bytes = 32;
};

+template <grf_mode grf_num_mode>
+struct register_nums_t {
+static constexpr uint32_t register_nums =
+(grf_num_mode == grf_mode::normal) ? 128 : 256;
+static constexpr uint32_t acc_register_nums =
+(grf_num_mode == grf_mode::normal) ? 4 : 8;
+};

template <grf_mode grf_num_mode, gpu_arch arch_tag>
struct register_attr_t {
static constexpr uint32_t reg_in_bytes =
@@ -175,24 +184,48 @@ struct register_attr_t {
static constexpr uint32_t grf_in_bytes = register_nums * reg_in_bytes;
};

-template <gpu_arch arch_tag, uint32_t m, class enable = void>
+template <
+gpu_arch arch_tag,
+mma_engine engine_type,
+uint32_t m,
+class enable = void>
struct mma_attr_t {};

template <gpu_arch arch_tag, uint32_t m>
-struct mma_attr_t<arch_tag, m, std::enable_if_t<arch_has_xmx<arch_tag>>> {
+struct mma_attr_t<
+arch_tag,
+mma_engine::xmx,
+m,
+std::enable_if_t<arch_has_xmx<arch_tag>>> {
using dpas_attr = dpas_attr_t<arch_tag>;
static constexpr uint32_t mma_m_in_elem =
(m > dpas_attr::rcount_max) ? dpas_attr::rcount_max : m;
-static constexpr uint32_t mma_n_in_elem = dpas_attr::n_fixed_limit;
-static constexpr uint32_t mma_k_in_bytes =
-dpas_attr::systolic_depth * dpas_attr::op_per_channel_bytes;
+static constexpr uint32_t blk_m_in_elem = 16;
+
+static constexpr uint32_t mma_n_in_elem = dpas_attr::n_in_elem;
+[[maybe_unused]] static constexpr uint32_t blk_n_in_bytes = 64;
+
+static constexpr uint32_t mma_k_in_bytes = dpas_attr::k_in_bytes;
+static constexpr uint32_t blk_k_in_bytes = mma_k_in_bytes;
};

template <gpu_arch arch_tag, uint32_t m>
-struct mma_attr_t<arch_tag, m, std::enable_if_t<!arch_has_xmx<arch_tag>>> {
+struct mma_attr_t<
+arch_tag,
+mma_engine::fpu,
+m,
+std::enable_if_t<arch_has_fpu<arch_tag>>> {
+using load_store_attr = load_store_attr_t<msg_type::block_2d, arch_tag>;
static constexpr uint32_t mma_m_in_elem = (m > 8) ? 8 : m;
-static constexpr uint32_t mma_n_in_elem = 16;
+static constexpr uint32_t blk_m_in_elem = 16;
+
static constexpr uint32_t mma_k_in_bytes = 32;
+static constexpr uint32_t blk_k_in_bytes =
+load_store_attr::max_trans_load_width_in_bytes;
+
+[[maybe_unused]] static constexpr uint32_t mma_n_in_elem = 16;
+static constexpr uint32_t blk_n_in_bytes =
+register_bytes_t<arch_tag>::reg_in_bytes;
};

template <gpu_arch arch_tag>
@@ -210,6 +243,8 @@ struct arch_attr_t<gpu_arch::XeHpc> {

static constexpr uint32_t max_wg_num = 64;
static constexpr uint32_t local_mem_size = 128 * 1024;
+static constexpr bool has_named_barrier = true;
+static constexpr bool has_atomic_add = true;
};

template <>
@@ -222,8 +257,11 @@ struct arch_attr_t<gpu_arch::XeHpg> {

using dpas_attr = dpas_attr_t<gpu_arch::XeHpg>;

-static constexpr uint32_t max_wg_num = 64;
+static constexpr uint32_t max_wg_num = 32;
static constexpr uint32_t local_mem_size = 64 * 1024;

+static constexpr bool has_named_barrier = false;
+static constexpr bool has_atomic_add = true;
};

template <>
@@ -236,10 +274,20 @@ struct arch_attr_t<gpu_arch::XeLpg> {

using dpas_attr = dpas_attr_t<gpu_arch::XeLpg>;

-static constexpr uint32_t max_wg_num = 64;
+static constexpr uint32_t max_wg_num = 32;
static constexpr uint32_t local_mem_size = 64 * 1024;
+static constexpr bool has_named_barrier = false;
+static constexpr bool has_atomic_add = true;
};

+template <gpu_arch arch_tag>
+inline constexpr bool arch_has_named_barrier =
+arch_attr_t<arch_tag>::has_named_barrier;
+
+template <gpu_arch arch_tag>
+inline constexpr bool arch_has_atomic_add =
+arch_attr_t<arch_tag>::has_atomic_add;

/// @} xetla_core_arch_config

} // namespace gpu::xetla
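The new `has_named_barrier` / `has_atomic_add` flags, together with the `arch_has_named_barrier` / `arch_has_atomic_add` variable templates, make per-architecture feature dispatch a compile-time branch. Below is a minimal, self-contained sketch of the pattern; the trait bodies are reduced copies of the ones in the diff above, while `sync_impl` and the `main` driver are hypothetical:

```cpp
#include <cstdint>
#include <cstdio>

enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };

template <gpu_arch arch_tag>
struct arch_attr_t;

template <>
struct arch_attr_t<gpu_arch::XeHpc> {
  static constexpr bool has_named_barrier = true;  // XeHpc exposes named barriers
  static constexpr bool has_atomic_add = true;
};

template <>
struct arch_attr_t<gpu_arch::XeHpg> {
  static constexpr bool has_named_barrier = false; // XeHpg uses split barriers
  static constexpr bool has_atomic_add = true;
};

template <gpu_arch arch_tag>
inline constexpr bool arch_has_named_barrier =
    arch_attr_t<arch_tag>::has_named_barrier;

// Hypothetical consumer: picks a synchronization strategy at compile time.
template <gpu_arch arch_tag>
void sync_impl() {
  if constexpr (arch_has_named_barrier<arch_tag>) {
    std::puts("named-barrier path (per-id signal/wait)");
  } else {
    std::puts("split-barrier fallback path");
  }
}

int main() {
  sync_impl<gpu_arch::XeHpc>();  // named-barrier path
  sync_impl<gpu_arch::XeHpg>();  // split-barrier fallback path
}
```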
5 changes: 5 additions & 0 deletions include/common/core/common_types.hpp
@@ -23,7 +23,12 @@
namespace gpu::xetla {
enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };

+template <gpu_arch arch_tag>
+inline constexpr bool valid_xe_arch_tag = (arch_tag <= gpu_arch::XeHpc);

enum class grf_mode : uint8_t { normal = 0, double_grf = 1 };

enum class mem_layout : uint8_t { row_major = 0, col_major = 1 };

+enum class mma_engine : uint8_t { xmx = 0, fpu = 1 };
} // namespace gpu::xetla
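`valid_xe_arch_tag` centralizes the `arch_tag <= gpu_arch::XeHpc` comparison that was previously inlined at each SFINAE guard. A minimal sketch of the idiom, with the guarded `check_arch` function being hypothetical:

```cpp
#include <cstdint>
#include <type_traits>

enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };

// One place to answer "is this a known Xe architecture?".
template <gpu_arch arch_tag>
inline constexpr bool valid_xe_arch_tag = (arch_tag <= gpu_arch::XeHpc);

// Hypothetical guard in the style of check_lsc_cache_hint():
// only instantiates for valid Xe tags.
template <gpu_arch arch_tag>
constexpr std::enable_if_t<valid_xe_arch_tag<arch_tag>, void> check_arch() {}

int main() {
  check_arch<gpu_arch::XeLpg>();            // compiles: XeLpg is a valid tag
  // check_arch<static_cast<gpu_arch>(7)>(); // would fail to compile
}
```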
5 changes: 2 additions & 3 deletions include/common/utils/common.hpp
@@ -51,7 +51,7 @@ constexpr uint32_t get_element_size_code() {
enum class lsc_action : uint8_t { prefetch, load, store, atomic };

template <lsc_action Action, cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
-constexpr std::enable_if_t<arch_tag <= gpu_arch::XeHpc, void>
+constexpr std::enable_if_t<valid_xe_arch_tag<arch_tag>, void>
check_lsc_cache_hint() {
if constexpr (Action == lsc_action::prefetch) {
// https://gfxspecs.intel.com/Predator/Home/Index/53560
@@ -153,7 +153,7 @@ get_prefetch_cache_hint_code() {
}

template <cache_hint L1H, cache_hint L2H, gpu_arch arch_tag>
-constexpr std::enable_if_t<arch_tag <= gpu_arch::XeHpc, uint32_t>
+constexpr std::enable_if_t<arch_tag == gpu_arch::XeHpc, uint32_t>
get_store_cache_hint_code() {
check_lsc_cache_hint<lsc_action::store, L1H, L2H, arch_tag>();
if (L1H == cache_hint::none && L2H == cache_hint::none) {
@@ -286,7 +286,6 @@ enum class store_op : uint8_t {
scattered_transpose = 3,
block_1d = 4
};
-enum class mma_engine : uint8_t { xmx = 0, fpu = 1 };
// enum class trans_mode : uint8_t { none = 0, transpose = 1 };
enum class memory_op : uint8_t { load = 0, store = 1 };
enum class tdesc_update_dir : uint8_t { x_dir = 0, y_dir = 1 };
4 changes: 2 additions & 2 deletions include/common/utils/limitation.hpp
@@ -747,7 +747,7 @@ struct check_store {
} // namespace subgroup

namespace group {
-template <gpu_arch arch = gpu_arch::XeHpc>
+template <gpu_arch arch>
struct gemm {
struct default_fpu {
template <
@@ -876,7 +876,7 @@ struct gemm {
int block_size_x_b,
int block_size_y_b>
struct check_tile_size_default {
-using mma_attr = mma_attr_t<arch, block_size_y_a>;
+using mma_attr = mma_attr_t<arch, mma_engine::xmx, tile_size_y_a>;
static constexpr int32_t mma_m = mma_attr::mma_m_in_elem;
static constexpr int32_t mma_n = mma_attr::mma_n_in_elem;
static constexpr int32_t mma_k =
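The `check_tile_size_default` change reflects the new three-parameter `mma_attr_t<arch, engine, m>` shape. A reduced, self-contained sketch of the derived values on the XeHpc/XMX path; the structs are trimmed from arch_config.hpp above, and `dpas_attr_xehpc` is a flattened stand-in for `dpas_attr_t<gpu_arch::XeHpc>`:

```cpp
#include <cstdint>

enum class gpu_arch : uint8_t { XeLpg = 0, XeHpg = 1, XeHpc = 2 };
enum class mma_engine : uint8_t { xmx = 0, fpu = 1 };

struct dpas_attr_xehpc {
  static constexpr uint32_t systolic_depth = 8;
  static constexpr uint32_t rcount_max = 8;
  static constexpr uint32_t op_per_channel_bytes = 4;
  static constexpr uint32_t k_in_bytes = systolic_depth * op_per_channel_bytes;
  static constexpr uint32_t n_in_elem = 16;
};

template <gpu_arch arch_tag, mma_engine engine_type, uint32_t m>
struct mma_attr_t;

template <uint32_t m>
struct mma_attr_t<gpu_arch::XeHpc, mma_engine::xmx, m> {
  using dpas_attr = dpas_attr_xehpc;
  static constexpr uint32_t mma_m_in_elem =
      (m > dpas_attr::rcount_max) ? dpas_attr::rcount_max : m;
  static constexpr uint32_t mma_n_in_elem = dpas_attr::n_in_elem;
  static constexpr uint32_t mma_k_in_bytes = dpas_attr::k_in_bytes;
};

using attr = mma_attr_t<gpu_arch::XeHpc, mma_engine::xmx, 32>;
static_assert(attr::mma_m_in_elem == 8, "m is clamped to rcount_max");
static_assert(attr::mma_n_in_elem == 16, "n comes from dpas n_in_elem");
static_assert(attr::mma_k_in_bytes == 32, "k = systolic_depth * 4 bytes");

int main() {}
```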
36 changes: 18 additions & 18 deletions include/common/utils/raw_send_load_store.hpp
@@ -219,12 +219,12 @@ __XETLA_API void xetla_update_tdesc_offsety(
template <
typename Ty,
uint32_t N,
-cache_hint L1H = cache_hint::none,
-cache_hint L2H = cache_hint::none,
-bool transpose = false,
-bool transform = false,
-gpu_arch arch_tag = gpu_arch::XeHpc>
-__XETLA_API std::enable_if_t<arch_tag == gpu_arch::XeHpc, xetla_vector<Ty, N>>
+cache_hint L1H,
+cache_hint L2H,
+bool transpose,
+bool transform,
+gpu_arch arch_tag>
+__XETLA_API std::enable_if_t<arch_has_2d_load_store<arch_tag>, xetla_vector<Ty, N>>
xetla_tload_global(xetla_tdescriptor tdesc) {
DEBUG_INVOKE(
dbg_level::core,
@@ -273,10 +273,10 @@ xetla_tload_global(xetla_tdescriptor tdesc) {
template <
typename Ty,
uint32_t N,
-cache_hint L1H = cache_hint::none,
-cache_hint L2H = cache_hint::none,
-gpu_arch arch_tag = gpu_arch::XeHpc>
-__XETLA_API std::enable_if_t<arch_tag == gpu_arch::XeHpc, void>
+cache_hint L1H,
+cache_hint L2H,
+gpu_arch arch_tag>
+__XETLA_API std::enable_if_t<arch_has_2d_load_store<arch_tag>, void>
xetla_tstore_global(xetla_tdescriptor tdesc, xetla_vector<Ty, N> data) {
DEBUG_INVOKE(
dbg_level::core, core::block_2d<arch_tag, Ty>::check_store(tdesc));
@@ -310,10 +310,10 @@ xetla_tstore_global(xetla_tdescriptor tdesc, xetla_vector<Ty, N> data) {
///
template <
typename Ty,
-cache_hint L1H = cache_hint::cached,
-cache_hint L2H = cache_hint::cached,
-gpu_arch arch_tag = gpu_arch::XeHpc>
-__XETLA_API std::enable_if_t<arch_tag == gpu_arch::XeHpc, void>
+cache_hint L1H,
+cache_hint L2H,
+gpu_arch arch_tag>
+__XETLA_API std::enable_if_t<arch_has_2d_load_store<arch_tag>, void>
xetla_tprefetch_global(xetla_tdescriptor tdesc) {
uint32_t msg_desc = 3;
msg_desc |= 0 << 7;
@@ -350,12 +350,12 @@ xetla_tprefetch_global(xetla_tdescriptor tdesc) {
template <
typename Ty,
uint32_t N,
-cache_hint L1H = cache_hint::none,
-cache_hint L2H = cache_hint::none,
+cache_hint L1H,
+cache_hint L2H,
atomic_op Op,
-gpu_arch arch_tag = gpu_arch::XeHpc,
+gpu_arch arch_tag,
typename Toffset = uint32_t>
-__XETLA_API std::enable_if_t<arch_tag == gpu_arch::XeHpc, void>
+__XETLA_API std::enable_if_t<arch_has_2d_load_store<arch_tag>, void>
xetla_tatomic_store_global(
uint64_t base_address,
xetla_vector<Toffset, N> offset,
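Since the cache-hint, transpose/transform, and arch-tag parameters above lost their defaults, call sites must now spell every argument out, and the SFINAE guard keys on `arch_has_2d_load_store` instead of pinning XeHpc. An illustrative fragment, assuming the XeTLA headers and an already-initialized `tdesc`:

```cpp
// Before this commit (defaults: none/none cache hints, no transpose/transform,
// arch pinned to XeHpc):
//   auto v = xetla_tload_global<float, 64>(tdesc);

// After: every template argument is explicit, and any arch with 2D
// block load/store support is accepted.
auto v = xetla_tload_global<
    float, 64,
    cache_hint::cached, cache_hint::cached,
    /*transpose=*/false, /*transform=*/false,
    gpu_arch::XeHpc>(tdesc);
```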
27 changes: 5 additions & 22 deletions include/common/utils/raw_send_nbarrier.hpp
@@ -41,9 +41,9 @@ enum class nbarrier_role : uint8_t {
/// as consumer.
///
template <
-uint8_t num_producers = 1,
-uint8_t num_consumers = 1,
-gpu_arch arch_tag = gpu_arch::XeHpc,
+uint8_t num_producers,
+uint8_t num_consumers,
+gpu_arch arch_tag,
typename enable = void>
struct xetla_nbarrier_t;

@@ -52,7 +52,7 @@ struct xetla_nbarrier_t<
num_producers,
num_consumers,
arch_tag,
-std::enable_if_t<arch_tag == gpu_arch::XeHpc>> {
+std::enable_if_t<arch_has_named_barrier<arch_tag>>> {
///
/// @brief Description of named barrier object.
/// Structure is defined in
@@ -105,20 +105,7 @@ struct xetla_nbarrier_t<
num_producers,
num_consumers,
arch_tag,
-std::enable_if_t<arch_tag != gpu_arch::XeHpc>> {
-///
-/// @brief Description of named barrier object.
-/// Structure is defined in
-/// [here](https://gfxspecs.intel.com/Predator/Home/Index/57499).
-///
-// xetla_vector<uint32_t, 16> nbar;
-// uint32_t barrier_id;
-
-/// @param role is the role of subgroup when participating the barrier.
-/// @param nbarrier_id [in] is the id of the barrier.
-/// note: all subgroups participating the barrier should have the same
-/// barrier_id. Here is the bspec link
-/// https://gfxspecs.intel.com/Predator/Home/Index/54006
+std::enable_if_t<!arch_has_named_barrier<arch_tag>>> {
__XETLA_API void init_nbarrier(uint8_t, nbarrier_role) {}

/// @brief Generic work-group split barrier.
Expand All @@ -127,14 +114,10 @@ struct xetla_nbarrier_t<
__ESIMD_ENS::split_barrier<__ESIMD_ENS::split_barrier_action::signal>();
}

-/// @brief named barrier wait within subgroup.
-///
__XETLA_API void wait() {
__ESIMD_ENS::split_barrier<__ESIMD_ENS::split_barrier_action::wait>();
}

-/// @brief named barrier signal from subgroup.
-///
__XETLA_API void arrive_wait() {
arrive();
wait();
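With the fallback specialization keyed on `!arch_has_named_barrier<arch_tag>` rather than `arch_tag != gpu_arch::XeHpc`, one barrier call sequence now compiles for every architecture. An illustrative fragment, assuming the XeTLA headers and that `producer_consumer` is a member of `nbarrier_role`:

```cpp
// On XeHpc this drives a hardware named barrier; on XeHpg/XeLpg the same
// interface lowers to ESIMD split-barrier signal + wait.
xetla_nbarrier_t<16, 16, arch_tag> nbarrier;
nbarrier.init_nbarrier(/*nbarrier_id=*/0, nbarrier_role::producer_consumer);
// ... do the work this barrier orders ...
nbarrier.arrive_wait(); // signal arrival, then block until all participants arrive
```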