diff --git a/bestla/CMakeLists.txt b/bestla/CMakeLists.txt index 9b9fa4be7..9e59b1c41 100644 --- a/bestla/CMakeLists.txt +++ b/bestla/CMakeLists.txt @@ -88,14 +88,6 @@ install( if(WIN32) target_compile_definitions(${PROJECT_NAME} INTERFACE _CRT_SECURE_NO_WARNINGS NOMINMAX) - target_compile_options(${PROJECT_NAME} INTERFACE /wd4068 /wd4849 /wd6262 /wd4702 /wd4100) - #4068 ignore unroll and GCC flags - #4849 ignore collapse - #6262 ignore stack too large - #4702 unreachable code(false warning on constexpr condition) - #4100 unreferenced formal parameter - - target_link_options(${PROJECT_NAME} INTERFACE /STACK:5242880) #Stack requires up to L2 cache size endif(WIN32) diff --git a/bestla/bestla/bestla_device.h b/bestla/bestla/bestla_device.h index ebfc7d8de..6b73e5a15 100644 --- a/bestla/bestla/bestla_device.h +++ b/bestla/bestla/bestla_device.h @@ -17,12 +17,16 @@ #include #include "bestla.h" #include "xbyak/xbyak_util.h" +#include "bestla_utils.h" #ifdef _WIN32 #include #else #include #endif +#define FIXED_CACHE_SIZE ((1 << 20) - (32 << 10)) +#define FIXED_CACHE 1 + namespace bestla { namespace device { @@ -244,6 +248,9 @@ class CpuDevice { ADD_FLAG(AVX512_BF16); ADD_FLAG(AVX512_FP16); numcores = _cpu.getNumCores(Xbyak::util::IntelCpuTopologyLevel::CoreLevel); + if (mHasAMX_BF16 || mHasAMX_INT8) { + utils::request_perm_xtile_data(); + } static bool p = false; { uint32_t tmp[4]; @@ -315,6 +322,10 @@ class CpuDevice { L2Cache = _cpu.getDataCacheSize(1); numthreads = numcores; } +#if FIXED_CACHE + L2Cache = L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : L2Cache; + E_L2Cache = E_L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : E_L2Cache; +#endif } static CpuDevice* getInstance() { diff --git a/neural_speed/models/model_utils/gguf.h b/neural_speed/models/model_utils/gguf.h index 71cf1b86a..ffed28e45 100644 --- a/neural_speed/models/model_utils/gguf.h +++ b/neural_speed/models/model_utils/gguf.h @@ -135,10 +135,7 @@ enum gguf_type { }; static const char* GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = "u8", [GGUF_TYPE_INT8] = "i8", [GGUF_TYPE_UINT16] = "u16", [GGUF_TYPE_INT16] = "i16", - [GGUF_TYPE_UINT32] = "u32", [GGUF_TYPE_INT32] = "i32", [GGUF_TYPE_FLOAT32] = "f32", [GGUF_TYPE_BOOL] = "bool", - [GGUF_TYPE_STRING] = "str", [GGUF_TYPE_ARRAY] = "arr", [GGUF_TYPE_UINT64] = "u64", [GGUF_TYPE_INT64] = "i64", - [GGUF_TYPE_FLOAT64] = "f64", + "u8", "i8", "u16", "i16", "u32", "i32", "f32", "bool", "str", "arr", "u64", "i64", "f64", }; union gguf_value { @@ -201,19 +198,19 @@ struct gguf_context { #define GGUF_DEFAULT_ALIGNMENT 32 static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { - [GGUF_TYPE_UINT8] = sizeof(uint8_t), - [GGUF_TYPE_INT8] = sizeof(int8_t), - [GGUF_TYPE_UINT16] = sizeof(uint16_t), - [GGUF_TYPE_INT16] = sizeof(int16_t), - [GGUF_TYPE_UINT32] = sizeof(uint32_t), - [GGUF_TYPE_INT32] = sizeof(int32_t), - [GGUF_TYPE_FLOAT32] = sizeof(float), - [GGUF_TYPE_BOOL] = sizeof(bool), - [GGUF_TYPE_STRING] = sizeof(struct gguf_str), - [GGUF_TYPE_ARRAY] = 0, // undefined - [GGUF_TYPE_UINT64] = sizeof(uint64_t), - [GGUF_TYPE_INT64] = sizeof(int64_t), - [GGUF_TYPE_FLOAT64] = sizeof(double), + sizeof(uint8_t), + sizeof(int8_t), + sizeof(uint16_t), + sizeof(int16_t), + sizeof(uint32_t), + sizeof(int32_t), + sizeof(float), + sizeof(bool), + sizeof(struct gguf_str), + 0, // undefined + sizeof(uint64_t), + sizeof(int64_t), + sizeof(double), }; static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13"); @@ -296,6 +293,10 @@ inline static void* ggml_aligned_malloc(size_t size) { return NULL; } void* aligned_memory = NULL; +#ifdef _MSC_VER + aligned_memory = _aligned_malloc(size, GGML_MEM_ALIGN); + int result = aligned_memory ? 0 : 1; +#else #ifdef GGML_USE_CPU_HBM int result = hbw_posix_memalign(&aligned_memory, 16, size); #elif GGML_USE_METAL @@ -303,6 +304,8 @@ inline static void* ggml_aligned_malloc(size_t size) { #else int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); #endif +#endif + if (result != 0) { // Handle allocation failure const char* error_desc = "unknown allocation error";