From 5833314efd5ea6fded18d0a7304d50e813d4d8be Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 22 Aug 2023 11:26:14 +0300 Subject: [PATCH 1/4] ggml : sync latest llama.cpp (GGUF) ggml-ci --- include/ggml/ggml.h | 131 ++++- src/ggml.c | 1162 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 1247 insertions(+), 46 deletions(-) diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index 8ffeeb81d..58598ebfb 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -207,7 +207,7 @@ #define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 -#define GGML_MAX_NAME 48 +#define GGML_MAX_NAME 64 #define GGML_MAX_OP_PARAMS 32 #define GGML_DEFAULT_N_THREADS 4 @@ -215,6 +215,11 @@ #define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_ABORTED 1 +#define GGUF_MAGIC 0x46554747 // "GGUF" +#define GGUF_VERSION 1 + +#define GGUF_DEFAULT_ALIGNMENT 32 + #define GGML_UNUSED(x) (void)(x) #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) @@ -255,8 +260,9 @@ extern "C" { #endif -#ifdef __ARM_NEON - // we use the built-in 16-bit float type +#if defined(__ARM_NEON) && defined(__CUDACC__) + typedef half ggml_fp16_t; +#elif defined(__ARM_NEON) typedef __fp16 ggml_fp16_t; #else typedef uint16_t ggml_fp16_t; @@ -565,6 +571,7 @@ extern "C" { GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split); GGML_API int ggml_blck_size (enum ggml_type type); @@ -1760,6 +1767,118 @@ extern "C" { GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + // + // gguf + // + + enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, + GGUF_TYPE_COUNT, // marks the end of the enum + }; + + struct gguf_context; + + struct gguf_init_params { + bool no_alloc; + + // if not NULL, create a ggml_context and allocate the tensor data in it + struct ggml_context ** ctx; + }; + + GGML_API struct gguf_context * gguf_init_empty(void); + GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); + //GGML_API struct gguf_context * gguf_init_from_buffer(..); + + GGML_API void gguf_free(struct gguf_context * ctx); + + GGML_API const char * gguf_type_name(enum gguf_type type); + + GGML_API int gguf_get_version (struct gguf_context * ctx); + GGML_API size_t gguf_get_alignment (struct gguf_context * ctx); + GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx); + GGML_API void * gguf_get_data (struct gguf_context * ctx); + + GGML_API int gguf_get_n_kv(struct gguf_context * ctx); + GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key); + GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i); + + GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i); + GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i); + + // results are undefined if the wrong type is used for the key + GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i); + GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i); + GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i); + GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i); + GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i); + GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i); + GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i); + GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i); + GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i); + GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i); + GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i); + GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i); + + GGML_API int gguf_get_n_tensors (struct gguf_context * ctx); + GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name); + GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i); + GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i); + + // overrides existing values or adds a new one + GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); + GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); + GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val); + GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val); + GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); + GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); + GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); + GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); + GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val); + GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n); + GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n); + + // set or add KV pairs from another context + GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src); + + // manage tensor info + GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); + GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type); + GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size); + + // writing gguf files can be done in 2 ways: + // + // - write the entire gguf_context to a binary file in a single pass: + // + // gguf_write_to_file(ctx, fname); + // + // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: + // + // FILE * f = fopen(fname, "wb"); + // fseek(f, gguf_get_meta_size(ctx), SEEK_SET); + // fwrite(f, ...); + // void * data = gguf_meta_get_meta_data(ctx); + // fseek(f, 0, SEEK_SET); + // fwrite(f, data, gguf_get_meta_size(ctx)); + // free(data); + // fclose(f); + // + + // write the entire context to a binary file + GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta); + + // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding + GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx); + GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data); + // // system info // @@ -1797,6 +1916,10 @@ extern "C" { typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); typedef struct { + const char * type_name; + int blck_size; + size_t type_size; + bool is_quantized; ggml_to_float_t to_float; ggml_from_float_t from_float; ggml_from_float_t from_float_reference; @@ -1804,7 +1927,7 @@ extern "C" { enum ggml_type vec_dot_type; } ggml_type_traits_t; - ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i); + ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); #ifdef __cplusplus } diff --git a/src/ggml.c b/src/ggml.c index 6d4d13306..977933b1c 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -213,8 +213,7 @@ inline static void * ggml_aligned_malloc(size_t size) { error_desc = "insufficient memory"; break; } - GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", - __func__, error_desc, size/(1024.0*1024.0)); + GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0)); return NULL; } return aligned_memory; @@ -1643,11 +1642,37 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { + [GGML_TYPE_I8] = { + .type_name = "i8", + .blck_size = 1, + .type_size = sizeof(int8_t), + .is_quantized = false, + }, + [GGML_TYPE_I16] = { + .type_name = "i16", + .blck_size = 1, + .type_size = sizeof(int16_t), + .is_quantized = false, + }, + [GGML_TYPE_I32] = { + .type_name = "i32", + .blck_size = 1, + .type_size = sizeof(int32_t), + .is_quantized = false, + }, [GGML_TYPE_F32] = { + .type_name = "f32", + .blck_size = 1, + .type_size = sizeof(float), + .is_quantized = false, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, .vec_dot_type = GGML_TYPE_F32, }, [GGML_TYPE_F16] = { + .type_name = "f16", + .blck_size = 1, + .type_size = sizeof(ggml_fp16_t), + .is_quantized = false, .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row, .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row, @@ -1655,6 +1680,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_F16, }, [GGML_TYPE_Q4_0] = { + .type_name = "q4_0", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q4_0, .from_float = quantize_row_q4_0, .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference, @@ -1662,6 +1691,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q4_1] = { + .type_name = "q4_1", + .blck_size = QK4_1, + .type_size = sizeof(block_q4_1), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q4_1, .from_float = quantize_row_q4_1, .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference, @@ -1669,6 +1702,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q5_0] = { + .type_name = "q5_0", + .blck_size = QK5_0, + .type_size = sizeof(block_q5_0), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q5_0, .from_float = quantize_row_q5_0, .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference, @@ -1676,6 +1713,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q5_1] = { + .type_name = "q5_1", + .blck_size = QK5_1, + .type_size = sizeof(block_q5_1), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q5_1, .from_float = quantize_row_q5_1, .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference, @@ -1683,6 +1724,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_1, }, [GGML_TYPE_Q8_0] = { + .type_name = "q8_0", + .blck_size = QK8_0, + .type_size = sizeof(block_q8_0), + .is_quantized = true, .to_float = dequantize_row_q8_0, .from_float = quantize_row_q8_0, .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference, @@ -1690,12 +1735,20 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, }, [GGML_TYPE_Q8_1] = { + .type_name = "q8_1", + .blck_size = QK8_1, + .type_size = sizeof(block_q8_1), + .is_quantized = true, .from_float = quantize_row_q8_1, .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference, .vec_dot_type = GGML_TYPE_Q8_1, }, #ifdef GGML_USE_K_QUANTS [GGML_TYPE_Q2_K] = { + .type_name = "q2_K", + .blck_size = QK_K, + .type_size = sizeof(block_q2_K), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q2_K, .from_float = quantize_row_q2_K, .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference, @@ -1703,6 +1756,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q3_K] = { + .type_name = "q3_K", + .blck_size = QK_K, + .type_size = sizeof(block_q3_K), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q3_K, .from_float = quantize_row_q3_K, .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference, @@ -1710,6 +1767,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q4_K] = { + .type_name = "q4_K", + .blck_size = QK_K, + .type_size = sizeof(block_q4_K), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q4_K, .from_float = quantize_row_q4_K, .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference, @@ -1717,6 +1778,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q5_K] = { + .type_name = "q5_K", + .blck_size = QK_K, + .type_size = sizeof(block_q5_K), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q5_K, .from_float = quantize_row_q5_K, .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference, @@ -1724,6 +1789,10 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q6_K] = { + .type_name = "q6_K", + .blck_size = QK_K, + .type_size = sizeof(block_q6_K), + .is_quantized = true, .to_float = (ggml_to_float_t) dequantize_row_q6_K, .from_float = quantize_row_q6_K, .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference, @@ -1731,15 +1800,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, }, [GGML_TYPE_Q8_K] = { + .type_name = "q8_K", + .blck_size = QK_K, + .type_size = sizeof(block_q8_K), + .is_quantized = true, .from_float = quantize_row_q8_K, } #endif }; // For internal test use -ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i) { - GGML_ASSERT(i < GGML_TYPE_COUNT); - return type_traits[i]; +ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { + GGML_ASSERT(type < GGML_TYPE_COUNT); + return type_traits[type]; } @@ -4126,29 +4199,37 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) { // // is enough, but just in case, adding the second part - return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN); + return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type)); +} + +size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { + return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); } size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]; + return (nrows_split*tensor->ne[0]*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type); } int ggml_blck_size(enum ggml_type type) { - return GGML_BLCK_SIZE[type]; + return type_traits[type].blck_size; } size_t ggml_type_size(enum ggml_type type) { - return GGML_TYPE_SIZE[type]; + return type_traits[type].type_size; } float ggml_type_sizef(enum ggml_type type) { - return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type]; + return ((float)(type_traits[type].type_size))/type_traits[type].blck_size; } const char * ggml_type_name(enum ggml_type type) { - return GGML_TYPE_NAME[type]; + return type_traits[type].type_name; +} + +bool ggml_is_quantized(enum ggml_type type) { + return type_traits[type].is_quantized; } const char * ggml_op_name(enum ggml_op op) { @@ -4160,7 +4241,7 @@ const char * ggml_op_symbol(enum ggml_op op) { } size_t ggml_element_size(const struct ggml_tensor * tensor) { - return GGML_TYPE_SIZE[tensor->type]; + return ggml_type_size(tensor->type); } static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) { @@ -4198,10 +4279,6 @@ static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct (t0->ne[3] == t1->ne[3]); } -bool ggml_is_quantized(enum ggml_type type) { - return GGML_IS_QUANTIZED[type]; -} - enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { enum ggml_type wtype = GGML_TYPE_COUNT; @@ -4239,8 +4316,8 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return - tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && - tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] && + tensor->nb[0] == ggml_type_size(tensor->type) && + tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } @@ -4249,7 +4326,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return - tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && + tensor->nb[0] == ggml_type_size(tensor->type) && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } @@ -4264,7 +4341,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return - tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && + tensor->nb[0] == ggml_type_size(tensor->type) && tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } @@ -4583,7 +4660,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( size_t data_size = 0; if (data == NULL && !ctx->no_alloc) { - data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]); + data_size += ggml_type_size(type)*(ne[0]/ggml_blck_size(type)); for (int i = 1; i < n_dims; i++) { data_size *= ne[i]; } @@ -4638,8 +4715,8 @@ static struct ggml_tensor * ggml_new_tensor_impl( result->ne[i] = ne[i]; } - result->nb[0] = GGML_TYPE_SIZE[type]; - result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]); + result->nb[0] = ggml_type_size(type); + result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type)); for (int i = 2; i < GGML_MAX_DIMS; i++) { result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; } @@ -7895,7 +7972,7 @@ static void ggml_compute_forward_dup_same_cont( memcpy( ((char *) dst->data + ie0*nb0), ((char *) src0->data + ie0*nb00), - (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); + (ie1 - ie0) * ggml_type_size(src0->type)); } } @@ -7929,7 +8006,7 @@ static void ggml_compute_forward_dup_f16( if (src0->type == dst->type && ne00 == ne0 && - nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { // copy by rows const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -7987,7 +8064,7 @@ static void ggml_compute_forward_dup_f16( float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; size_t id = 0; - size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { @@ -8200,7 +8277,7 @@ static void ggml_compute_forward_dup_f32( if (src0->type == dst->type && ne00 == ne0 && - nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { // copy by rows const size_t rs = ne00*nb00; for (int64_t i03 = 0; i03 < ne03; i03++) { @@ -8239,7 +8316,7 @@ static void ggml_compute_forward_dup_f32( ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; size_t id = 0; - size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); char * dst_ptr = (char *) dst->data; for (int i03 = 0; i03 < ne03; i03++) { @@ -8651,7 +8728,7 @@ static void ggml_compute_forward_add_q_f32( ggml_from_float_t const quantize_row_q = type_traits[type].from_float; // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted @@ -8925,7 +9002,7 @@ static void ggml_compute_forward_add1_q_f32( ggml_from_float_t const quantize_row_q = type_traits[type].from_float; // we don't support permuted src0 - GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb00 == ggml_type_size(type)); // dst cannot be transposed or permuted GGML_ASSERT(nb0 <= nb1); @@ -9287,6 +9364,8 @@ static void ggml_compute_forward_mul( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now"); + switch (src0->type) { case GGML_TYPE_F32: { @@ -10779,7 +10858,7 @@ static void ggml_compute_forward_mul_mat( GGML_ASSERT(ne3 == ne13); // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]); + GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb10 == sizeof(float)); // dst cannot be transposed or permuted @@ -10865,7 +10944,7 @@ static void ggml_compute_forward_mul_mat( if (params->type == GGML_TASK_INIT) { if (src1->type != vec_dot_type) { char * wdata = params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { @@ -10885,7 +10964,7 @@ static void ggml_compute_forward_mul_mat( } const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; + const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = ne11*ne12*ne13; // src1 rows @@ -11354,7 +11433,7 @@ static void ggml_compute_forward_get_rows_q( assert( dst->ne[0] == nc); assert( dst->ne[1] == nr); - assert(src0->nb[0] == GGML_TYPE_SIZE[type]); + assert(src0->nb[0] == ggml_type_size(type)); for (int i = 0; i < nr; ++i) { const int r = ((int32_t *) src1->data)[i]; @@ -16781,7 +16860,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; + cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -16794,7 +16873,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; if (ggml_is_quantized(node->src[0]->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[0]->ne[0] * n_tasks; + cur = ggml_type_size(GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -16806,7 +16885,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { size_t cur = 0; if (ggml_is_quantized(node->src[0]->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src[1]->ne[0] * n_tasks; + cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -16889,12 +16968,12 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { // the threads are still spinning if (node->src[0]->type != GGML_TYPE_F32) { // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src[0]->ne[0]*node->src[0]->ne[1]); + cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]); } } else #endif if (node->src[1]->type != vec_dot_type) { - cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src[1])/GGML_BLCK_SIZE[vec_dot_type]; + cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type); } else { cur = 0; } @@ -17322,7 +17401,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { // compute size of intermediate results // TODO: does not take into account scratch buffers !!!! for (int i = 0; i < cgraph->n_nodes; ++i) { - size_eval += ggml_nbytes(cgraph->nodes[i]); + size_eval += ggml_nbytes_pad(cgraph->nodes[i]); } // print @@ -18723,8 +18802,8 @@ enum ggml_opt_result ggml_opt_resume( struct ggml_tensor * f) { // build forward + backward compute graphs - struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0)); - struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / GGML_TYPE_SIZE[GGML_TYPE_I32]+ (sizeof(struct ggml_cgraph) % GGML_TYPE_SIZE[GGML_TYPE_I32] ? 1 : 0)); + struct ggml_tensor * gfbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); + struct ggml_tensor * gbbuf = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(struct ggml_cgraph) / ggml_type_size(GGML_TYPE_I32)+ (sizeof(struct ggml_cgraph) % ggml_type_size(GGML_TYPE_I32) ? 1 : 0)); struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; @@ -18983,6 +19062,1005 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i //////////////////////////////////////////////////////////////////////////////// +struct gguf_str { + uint32_t n; + char * data; +}; + +static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = { + [GGUF_TYPE_UINT8] = sizeof(uint8_t), + [GGUF_TYPE_INT8] = sizeof(int8_t), + [GGUF_TYPE_UINT16] = sizeof(uint16_t), + [GGUF_TYPE_INT16] = sizeof(int16_t), + [GGUF_TYPE_UINT32] = sizeof(uint32_t), + [GGUF_TYPE_INT32] = sizeof(int32_t), + [GGUF_TYPE_FLOAT32] = sizeof(float), + [GGUF_TYPE_BOOL] = sizeof(bool), + [GGUF_TYPE_STRING] = sizeof(struct gguf_str), + [GGUF_TYPE_ARRAY] = 0, // undefined +}; +static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); + +static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = { + [GGUF_TYPE_UINT8] = "u8", + [GGUF_TYPE_INT8] = "i8", + [GGUF_TYPE_UINT16] = "u16", + [GGUF_TYPE_INT16] = "i16", + [GGUF_TYPE_UINT32] = "u32", + [GGUF_TYPE_INT32] = "i32", + [GGUF_TYPE_FLOAT32] = "f32", + [GGUF_TYPE_BOOL] = "bool", + [GGUF_TYPE_STRING] = "str", + [GGUF_TYPE_ARRAY] = "arr", +}; +static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10"); + +union gguf_value { + uint8_t uint8; + int8_t int8; + uint16_t uint16; + int16_t int16; + uint32_t uint32; + int32_t int32; + float float32; + bool bool_; + + struct gguf_str str; + + struct { + enum gguf_type type; + + uint32_t n; + void * data; + } arr; +}; + +struct gguf_kv { + struct gguf_str key; + + uint32_t n_bytes; // TODO: is this actually needed? + + enum gguf_type type; + union gguf_value value; +}; + +struct gguf_header { + uint32_t magic; + uint32_t version; + uint32_t n_tensors; + uint32_t n_kv; +}; + +struct gguf_tensor_info { + struct gguf_str name; + + uint32_t n_dims; + uint32_t ne[GGML_MAX_DIMS]; + + enum ggml_type type; + + uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT` + + // for writing API + const void * data; + size_t size; +}; + +struct gguf_context { + struct gguf_header header; + + struct gguf_kv * kv; + struct gguf_tensor_info * infos; + + size_t alignment; + size_t offset; // offset of `data` from beginning of file + size_t size; // size of `data` in bytes + + //uint8_t * padding; + void * data; +}; + +static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) { + const size_t n = fread(dst, 1, size, file); + *offset += n; + return n == size; +} + +static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { + p->n = 0; + p->data = NULL; + + bool ok = true; + + // TODO: how to avoid mallocs for strings? + ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); p->data = calloc(p->n + 1, 1); + ok = ok && gguf_fread_el(file, p->data, p->n, offset); + + return ok; +} + +struct gguf_context * gguf_init_empty(void) { + struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + + ctx->header.magic = GGUF_MAGIC; + ctx->header.version = GGUF_VERSION; + ctx->header.n_tensors = 0; + ctx->header.n_kv = 0; + + ctx->kv = NULL; + ctx->infos = NULL; + + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; + ctx->offset = 0; + ctx->size = 0; + + ctx->data = NULL; + + return ctx; +} + +struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { + FILE * file = fopen(fname, "rb"); + if (!file) { + return NULL; + } + + // offset from start of file + size_t offset = 0; + + uint32_t magic = 0; + + // check the magic before making allocations + { + gguf_fread_el(file, &magic, sizeof(magic), &offset); + + if (magic != GGUF_MAGIC) { + fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic); + fclose(file); + return NULL; + } + } + + bool ok = true; + + struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + + // read the header + { + ctx->header.magic = magic; + + ctx->kv = NULL; + ctx->infos = NULL; + ctx->data = NULL; + + ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); + ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); + ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read header\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + + // read the kv pairs + { + ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv)); + + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + //fprintf(stderr, "%s: reading kv %d\n", __func__, i); + + ok = ok && gguf_fread_str(file, &kv->key, &offset); + //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset); + ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset); + + //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); + + switch (kv->type) { + case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break; + case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break; + case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break; + case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break; + case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break; + case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break; + case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break; + case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break; + case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break; + case GGUF_TYPE_ARRAY: + { + ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); + ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + + switch (kv->value.arr.type) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_BOOL: + { + kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); + ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset); + } break; + case GGUF_TYPE_STRING: + { + kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str)); + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); + } + } break; + case GGUF_TYPE_ARRAY: + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + }; + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + }; + + if (!ok) { + break; + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read key-value pairs\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + + // read the tensor infos + { + ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + info->ne[j] = 1; + } + + ok = ok && gguf_fread_str(file, &info->name, &offset); + ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); + for (uint32_t j = 0; j < info->n_dims; ++j) { + ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); + } + ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); + ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor info\n", __func__); + fclose(file); + gguf_free(ctx); + return NULL; + } + } + } + + ctx->alignment = GGUF_DEFAULT_ALIGNMENT; + + int alignment_idx = gguf_find_key(ctx, "general.alignment"); + if (alignment_idx != -1) { + ctx->alignment = gguf_get_val_u32(ctx, alignment_idx); + } + + // we require the data section to be aligned, so take into account any padding + { + const size_t offset_pad = offset % ctx->alignment; + + if (offset_pad != 0) { + offset += ctx->alignment - offset_pad; + fseek(file, offset, SEEK_SET); + } + } + + // store the current file offset - this is where the data section starts + ctx->offset = offset; + + // compute the total size of the data section, taking into account the alignment + { + ctx->size = 0; + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + const int64_t ne = + (int64_t) info->ne[0] * + (int64_t) info->ne[1] * + (int64_t) info->ne[2] * + (int64_t) info->ne[3]; + + if (ne % ggml_blck_size(info->type) != 0) { + fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", + __func__, info->name.data, ne, ggml_blck_size(info->type)); + fclose(file); + gguf_free(ctx); + return NULL; + } + + const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type); + + ctx->size += GGML_PAD(size_cur, ctx->alignment); + } + } + + // load the tensor data only if requested + if (params.ctx != NULL) { + // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob + // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of + // the ggml_tensor structs to the appropriate locations in the binary blob + + // compute the exact size needed for the new ggml_context + const size_t mem_size = + params.no_alloc ? + (ctx->header.n_tensors )*ggml_tensor_overhead() : + (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size; + + struct ggml_init_params pdata = { + .mem_size = mem_size, + .mem_buffer = NULL, + .no_alloc = params.no_alloc, + }; + + *params.ctx = ggml_init(pdata); + + struct ggml_context * ctx_data = *params.ctx; + + struct ggml_tensor * data = NULL; + + if (params.no_alloc == false) { + data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size); + + ok = ok && data != NULL; + + // read the binary blob with the tensor data + ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset); + + if (!ok) { + fprintf(stderr, "%s: failed to read tensor data\n", __func__); + fclose(file); + ggml_free(ctx_data); + gguf_free(ctx); + return NULL; + } + + ctx->data = data->data; + } + + ggml_set_no_alloc(ctx_data, true); + + // create the tensors + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + const int64_t ne[GGML_MAX_DIMS] = { + ctx->infos[i].ne[0], + ctx->infos[i].ne[1], + ctx->infos[i].ne[2], + ctx->infos[i].ne[3], + }; + + struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne); + + ok = ok && cur != NULL; + + ggml_set_name(cur, ctx->infos[i].name.data); + + if (!ok) { + break; + } + + // point the data member to the appropriate location in the binary blob using the tensor infos + if (params.no_alloc == false) { + //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file + cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data + } + } + + if (!ok) { + fprintf(stderr, "%s: failed to read the tensor data\n", __func__); + fclose(file); + ggml_free(ctx_data); + gguf_free(ctx); + return NULL; + } + + ggml_set_no_alloc(ctx_data, params.no_alloc); + } + + fclose(file); + + return ctx; +} + +void gguf_free(struct gguf_context * ctx) { + if (ctx == NULL) { + return; + } + + if (ctx->kv) { + // free string memory - not great.. + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + if (kv->key.data) { + free(kv->key.data); + } + + if (kv->type == GGUF_TYPE_STRING) { + if (kv->value.str.data) { + free(kv->value.str.data); + } + } + + if (kv->type == GGUF_TYPE_ARRAY) { + if (kv->value.arr.data) { + if (kv->value.arr.type == GGUF_TYPE_STRING) { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j]; + if (str->data) { + free(str->data); + } + } + } + free(kv->value.arr.data); + } + } + } + + GGML_ALIGNED_FREE(ctx->kv); + } + + if (ctx->infos) { + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + if (info->name.data) { + free(info->name.data); + } + } + + GGML_ALIGNED_FREE(ctx->infos); + } + + GGML_ALIGNED_FREE(ctx); +} + +const char * gguf_type_name(enum gguf_type type) { + return GGUF_TYPE_NAME[type]; +} + +int gguf_get_version(struct gguf_context * ctx) { + return ctx->header.version; +} + +size_t gguf_get_alignment(struct gguf_context * ctx) { + return ctx->alignment; +} + +size_t gguf_get_data_offset(struct gguf_context * ctx) { + return ctx->offset; +} + +void * gguf_get_data(struct gguf_context * ctx) { + return ctx->data; +} + +int gguf_get_n_kv(struct gguf_context * ctx) { + return ctx->header.n_kv; +} + +int gguf_find_key(struct gguf_context * ctx, const char * key) { + // return -1 if key not found + int keyfound = -1; + + const int n_kv = gguf_get_n_kv(ctx); + + for (int i = 0; i < n_kv; ++i) { + if (strcmp(key, gguf_get_key(ctx, i)) == 0) { + keyfound = i; + break; + } + } + + return keyfound; +} + +const char * gguf_get_key(struct gguf_context * ctx, int i) { + return ctx->kv[i].key.data; +} + +enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) { + return ctx->kv[i].type; +} + +enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.arr.type; +} + +const void * gguf_get_arr_data(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.arr.data; +} + +const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) { + struct gguf_kv * kv = &ctx->kv[key_id]; + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i]; + return str->data; +} + +int gguf_get_arr_n(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.arr.n; +} + +uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint8; +} + +int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int8; +} + +uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint16; +} + +int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int16; +} + +uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.uint32; +} + +int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.int32; +} + +float gguf_get_val_f32(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.float32; +} + +bool gguf_get_val_bool(struct gguf_context * ctx, int i) { + return ctx->kv[i].value.bool_; +} + +const char * gguf_get_val_str (struct gguf_context * ctx, int i) { + return ctx->kv[i].value.str.data; +} + +int gguf_get_n_tensors(struct gguf_context * ctx) { + return ctx->header.n_tensors; +} + +int gguf_find_tensor(struct gguf_context * ctx, const char * name) { + // return -1 if tensor not found + int tensorfound = -1; + + const int n_tensors = gguf_get_n_tensors(ctx); + + for (int i = 0; i < n_tensors; ++i) { + if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) { + tensorfound = i; + break; + } + } + + return tensorfound; +} + +size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) { + return ctx->infos[i].offset; +} + +char * gguf_get_tensor_name(struct gguf_context * ctx, int i) { + return ctx->infos[i].name.data; +} + +// returns the index +static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { + const int idx = gguf_find_key(ctx, key); + if (idx >= 0) { + return idx; + } + + const int n_kv = gguf_get_n_kv(ctx); + + ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv)); + ctx->kv[n_kv].key.n = strlen(key) + 1; + ctx->kv[n_kv].key.data = strdup(key); + ctx->header.n_kv++; + + return n_kv; +} + +void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT8; + ctx->kv[idx].value.uint8 = val; +} + +void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT8; + ctx->kv[idx].value.int8 = val; +} + +void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT16; + ctx->kv[idx].value.uint16 = val; +} + +void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT16; + ctx->kv[idx].value.int16 = val; +} + +void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_UINT32; + ctx->kv[idx].value.uint32 = val; +} + +void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_INT32; + ctx->kv[idx].value.int32 = val; +} + +void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_FLOAT32; + ctx->kv[idx].value.float32 = val; +} + +void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_BOOL; + ctx->kv[idx].value.bool_ = val; +} + +void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_STRING; + ctx->kv[idx].value.str.n = strlen(val) + 1; + ctx->kv[idx].value.str.data = strdup(val); +} + +void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_ARRAY; + ctx->kv[idx].value.arr.type = type; + ctx->kv[idx].value.arr.n = n; + ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]); + memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]); +} + +void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) { + const int idx = gguf_get_or_add_key(ctx, key); + + ctx->kv[idx].type = GGUF_TYPE_ARRAY; + ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING; + ctx->kv[idx].value.arr.n = n; + ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str)); + for (int i = 0; i < n; i++) { + struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i]; + str->n = strlen(data[i]) + 1; + str->data = strdup(data[i]); + } +} + +// set or add KV pairs from another context +void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { + for (uint32_t i = 0; i < src->header.n_kv; i++) { + switch (src->kv[i].type) { + case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break; + case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break; + case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break; + case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break; + case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break; + case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break; + case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break; + case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break; + case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break; + case GGUF_TYPE_ARRAY: + { + if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { + const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *)); + for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { + data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; + } + gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n); + free(data); + } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) { + GGML_ASSERT(false && "nested arrays not supported"); + } else { + gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n); + } + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + } + } +} + +void gguf_add_tensor( + struct gguf_context * ctx, + const struct ggml_tensor * tensor) { + const int idx = ctx->header.n_tensors; + ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); + + ctx->infos[idx].name.n = strlen(tensor->name) + 1; + ctx->infos[idx].name.data = strdup(tensor->name); + + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + ctx->infos[idx].ne[i] = 1; + } + + ctx->infos[idx].n_dims = tensor->n_dims; + for (int i = 0; i < tensor->n_dims; i++) { + ctx->infos[idx].ne[i] = tensor->ne[i]; + } + + ctx->infos[idx].type = tensor->type; + ctx->infos[idx].offset = 0; + ctx->infos[idx].data = tensor->data; + ctx->infos[idx].size = ggml_nbytes(tensor); + + if (ctx->header.n_tensors > 0) { + ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment); + } + + ctx->header.n_tensors++; +} + +void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) { + const int idx = gguf_find_tensor(ctx, name); + if (idx < 0) { + GGML_ASSERT(false && "tensor not found"); + } + + ctx->infos[idx].type = type; +} + +void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) { + const int idx = gguf_find_tensor(ctx, name); + if (idx < 0) { + GGML_ASSERT(false && "tensor not found"); + } + + ctx->infos[idx].data = data; + ctx->infos[idx].size = size; + + // update offsets + for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) { + ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment); + } +} + +//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) { +// fwrite(&val->n, sizeof(val->n), 1, file); +// fwrite(val->data, sizeof(char), val->n, file); +//} +// +//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) { +// fwrite(val, sizeof(char), size, file); +//} + +struct gguf_buf { + void * data; + size_t size; + size_t offset; +}; + +static struct gguf_buf gguf_buf_init(size_t size) { + struct gguf_buf buf = { + /*buf.data =*/ size == 0 ? NULL : malloc(size), + /*buf.size =*/ size, + /*buf.offset =*/ 0, + }; + + return buf; +} + +static void gguf_buf_free(struct gguf_buf buf) { + if (buf.data) { + free(buf.data); + } +} + +static void gguf_buf_grow(struct gguf_buf * buf, size_t size) { + if (buf->offset + size > buf->size) { + buf->size = 1.5*(buf->offset + size); + if (buf->data) { + buf->data = realloc(buf->data, buf->size); + } + } +} + +static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) { + gguf_buf_grow(buf, sizeof(val->n) + val->n); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n)); + } + buf->offset += sizeof(val->n); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val->data, val->n); + } + buf->offset += val->n; +} + +static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) { + gguf_buf_grow(buf, el_size); + + if (buf->data) { + memcpy((char *) buf->data + buf->offset, val, el_size); + } + buf->offset += el_size; +} + +static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) { + // write header + gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic)); + gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version)); + gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors)); + gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv)); + + // write key-value pairs + for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { + struct gguf_kv * kv = &ctx->kv[i]; + + gguf_bwrite_str(buf, &kv->key); + gguf_bwrite_el (buf, &kv->type, sizeof(kv->type)); + + switch (kv->type) { + case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break; + case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break; + case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break; + case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break; + case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break; + case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break; + case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break; + case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break; + case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break; + case GGUF_TYPE_ARRAY: + { + gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type)); + gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) ); + + switch (kv->value.arr.type) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_BOOL: + { + gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]); + } break; + case GGUF_TYPE_STRING: + { + for (uint32_t j = 0; j < kv->value.arr.n; ++j) { + gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]); + } + } break; + case GGUF_TYPE_ARRAY: + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break; + }; + } break; + case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); + }; + } + + // write tensor infos + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + gguf_bwrite_str(buf, &info->name); + gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims)); + for (uint32_t j = 0; j < info->n_dims; ++j) { + gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j])); + } + gguf_bwrite_el(buf, &info->type, sizeof(info->type)); + gguf_bwrite_el(buf, &info->offset, sizeof(info->offset)); + } + + // we require the data section to be aligned, so take into account any padding + { + const size_t offset = buf->offset; + const size_t offset_pad = GGML_PAD(offset, ctx->alignment); + + if (offset_pad != offset) { + uint8_t pad = 0; + for (size_t i = 0; i < offset_pad - offset; ++i) { + gguf_bwrite_el(buf, &pad, sizeof(pad)); + } + } + } + + if (only_meta) { + return; + } + + size_t offset = 0; + + // write tensor data + for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { + struct gguf_tensor_info * info = &ctx->infos[i]; + + const size_t size = info->size; + const size_t size_pad = GGML_PAD(size, ctx->alignment); + + gguf_bwrite_el(buf, info->data, size); + + if (size_pad != size) { + uint8_t pad = 0; + for (size_t j = 0; j < size_pad - size; ++j) { + gguf_bwrite_el(buf, &pad, sizeof(pad)); + } + } + + GGML_ASSERT(offset == info->offset); + + offset += size_pad; + } +} + +void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) { + FILE * file = fopen(fname, "wb"); + if (!file) { + GGML_ASSERT(false && "failed to open file for writing"); + } + + struct gguf_buf buf = gguf_buf_init(16*1024); + + gguf_write_to_buf(ctx, &buf, only_meta); + + fwrite(buf.data, 1, buf.offset, file); + + gguf_buf_free(buf); + + fclose(file); +} + +size_t gguf_get_meta_size(struct gguf_context * ctx) { + // no allocs - only compute size + struct gguf_buf buf = gguf_buf_init(0); + + gguf_write_to_buf(ctx, &buf, true); + + return buf.offset; +} + +void gguf_get_meta_data(struct gguf_context * ctx, void * data) { + struct gguf_buf buf = gguf_buf_init(16*1024); + + gguf_write_to_buf(ctx, &buf, true); + + memcpy(data, buf.data, buf.offset); + + gguf_buf_free(buf); +} + +//////////////////////////////////////////////////////////////////////////////// + int ggml_cpu_has_avx(void) { #if defined(__AVX__) return 1; From bcc3d058aa170a7495203b4bdecf995e359947b4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 22 Aug 2023 11:31:36 +0300 Subject: [PATCH 2/4] ggml : sync GPU backends ggml-ci --- src/ggml-cuda.cu | 1332 +++++++++++++++++++++++++++++------------- src/ggml-cuda.h | 42 +- src/ggml-metal.h | 12 +- src/ggml-metal.m | 216 +++---- src/ggml-metal.metal | 971 +++++++++++++++--------------- 5 files changed, 1529 insertions(+), 1044 deletions(-) diff --git a/src/ggml-cuda.cu b/src/ggml-cuda.cu index 9d42efb0d..5b415c646 100644 --- a/src/ggml-cuda.cu +++ b/src/ggml-cuda.cu @@ -14,6 +14,7 @@ #include "ggml.h" #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define CC_TURING 700 #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -262,10 +263,6 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_QUANTIZE_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 -#ifndef GGML_CUDA_MMQ_Y -#define GGML_CUDA_MMQ_Y 64 -#endif // GGML_CUDA_MMQ_Y - // dmmv = dequantize_mul_mat_vec #ifndef GGML_CUDA_DMMV_X #define GGML_CUDA_DMMV_X 32 @@ -285,6 +282,20 @@ struct ggml_tensor_extra_gpu { cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs }; +static int g_device_count = -1; +static int g_main_device = 0; +static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; +static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; +static bool g_mul_mat_q = false; + +static void * g_scratch_buffer = nullptr; +static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default +static size_t g_scratch_offset = 0; + +static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; + +static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr }; + static __global__ void add_f32(const float * x, const float * y, float * dst, const int kx, const int ky) { const int i = blockDim.x*blockIdx.x + threadIdx.x; @@ -1388,6 +1399,7 @@ template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp // second part effectively subtracts 8 from each quant value return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1425,6 +1437,7 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1460,6 +1473,7 @@ template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp // second part effectively subtracts 16 from each quant value return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1505,6 +1519,7 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp return sumi*d5d8 + m5s8 / (QI5_1 / vdr); #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1526,6 +1541,7 @@ template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp return d8_0*d8_1 * sumi; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1549,13 +1565,14 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp #else const float2 dm8f = __half22float2(dm8); const float2 ds8f = __half22float2(ds8); - const float d8d8 = dm8.x * ds8.x; - const float m8s8 = dm8.y * ds8.y; + const float d8d8 = dm8f.x * ds8f.x; + const float m8s8 = dm8f.y * ds8f.y; #endif // GGML_CUDA_F16 // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it return sumi*d8d8 + m8s8 / (QI8_1 / vdr); #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1591,6 +1608,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( return dm2f.x*sumf_d - dm2f.y*sumf_m; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1628,6 +1646,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1668,6 +1687,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( return d3 * sumf; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1693,6 +1713,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( return d3*d8 * sumi; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1726,12 +1747,12 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } // contiguous u/y values -// also used for q5_K static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { @@ -1741,19 +1762,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( float sumf_m = 0.0f; #pragma unroll - for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) { + for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { int sumi_d = 0; #pragma unroll - for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) { - sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product - sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product } - const float2 ds8f = __half22float2(ds8[i0 / 4]); + const float2 ds8f = __half22float2(ds8[i]); - sumf_d += ds8f.x * (sc[i0/4] * sumi_d); - sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val } const float2 dm4f = __half22float2(dm4); @@ -1761,6 +1781,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( return dm4f.x*sumf_d - dm4f.y*sumf_m; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1769,7 +1790,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( #define VDR_Q5_K_Q8_1_MMQ 8 // contiguous v/x values -static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl( +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { @@ -1801,6 +1822,41 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl( return dm5f.x*sumf_d - dm5f.y*sumf_m; #else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1831,6 +1887,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( return d*sumf; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1862,6 +1919,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( return d6 * sumf_d; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } @@ -1884,21 +1942,21 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_0) + GGML_CUDA_MMQ_Y/QI4_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q4_0( +template static __device__ __forceinline__ void load_tiles_q4_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -1910,7 +1968,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -1920,39 +1978,30 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; } -// const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; -// const int kbxd = k % blocks_per_tile_x_row; + const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = k % blocks_per_tile_x_row; -// #pragma unroll -// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_0) { -// FIXME out-of-bounds -// const int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; -// if (i >= GGML_CUDA_MMQ_Y) { -// return; -// } + if (need_check) { + i = min(i, i_max); + } -// const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; -// x_dm[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd].x = bxi->d; -// } + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } } static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_0_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const float * x_dmf = (float *) x_dm; @@ -1960,13 +2009,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_0]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; } return vec_dot_q4_0_q8_1_impl (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], - y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q4_1_q8_1( @@ -1987,21 +2036,21 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_1) + GGML_CUDA_MMQ_Y/QI4_1]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; *x_ql = tile_x_qs; *x_dm = tile_x_dm; } -template static __device__ __forceinline__ void load_tiles_q4_1( +template static __device__ __forceinline__ void load_tiles_q4_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2011,7 +2060,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_1 * bx0 = (block_q4_1 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2027,7 +2076,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_1) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; if (need_check) { @@ -2044,27 +2093,19 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_1_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); int u[2*VDR_Q4_1_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI4_1]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; } return vec_dot_q4_1_q8_1_impl (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], - y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_0_q8_1( @@ -2087,21 +2128,21 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_0) + GGML_CUDA_MMQ_Y/QI5_0]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; *x_ql = tile_x_ql; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q5_0( +template static __device__ __forceinline__ void load_tiles_q5_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2111,7 +2152,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_0 * bx0 = (block_q5_0 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2147,7 +2188,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_0) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; if (need_check) { @@ -2164,14 +2205,6 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q5_0_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; const float * x_dmf = (const float *) x_dm; @@ -2181,12 +2214,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_0]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; } return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_1_q8_1( @@ -2209,21 +2242,21 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); } -static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_1) + GGML_CUDA_MMQ_Y/QI5_1]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; } -template static __device__ __forceinline__ void load_tiles_q5_1( +template static __device__ __forceinline__ void load_tiles_q5_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2233,7 +2266,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_1 * bx0 = (block_q5_1 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2266,7 +2299,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_1) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; if (need_check) { @@ -2283,14 +2316,6 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q5_1_Q8_1_MMQ == 0); - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; @@ -2298,12 +2323,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * (2*WARP_SIZE) + kyqs + l]; - u[2*l+1] = y_qs[j * (2*WARP_SIZE) + kyqs + l + QI5_1]; + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; } return vec_dot_q8_1_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]); + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q8_0_q8_1( @@ -2323,21 +2348,21 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, bq8_1->ds.x); } -static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ float tile_x_d[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI8_0) + GGML_CUDA_MMQ_Y/QI8_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; } -template static __device__ __forceinline__ void load_tiles_q8_0( +template static __device__ __forceinline__ void load_tiles_q8_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2348,7 +2373,7 @@ template static __device__ __forceinline__ void load_tiles_q8_ const block_q8_0 * bx0 = (block_q8_0 *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2358,41 +2383,29 @@ template static __device__ __forceinline__ void load_tiles_q8_ const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); - x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbx] = bxi->d; } -// const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; -// const int kbxd = k % blocks_per_tile_x_row; + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = k % blocks_per_tile_x_row; -// #pragma unroll -// for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI8_0) { -// FIXME out-of-bounds -// const int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; -// #if GGML_CUDA_MMQ_Y < 64 -// if (i >= GGML_CUDA_MMQ_Y) { -// return; -// } -// #endif // GGML_CUDA_MMQ_Y < 64 + if (need_check) { + i = min(i, i_max); + } -// const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; -// x_dm[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd].x = bxi->d; -// } + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } } static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q8_0_Q8_1_MMQ == 0); - const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; @@ -2424,23 +2437,23 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); } -static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI2_K) + GGML_CUDA_MMQ_Y/QI2_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q2_K( +template static __device__ __forceinline__ void load_tiles_q2_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2450,7 +2463,7 @@ template static __device__ __forceinline__ void load_tiles_q2_ const block_q2_K * bx0 = (block_q2_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2466,8 +2479,8 @@ template static __device__ __forceinline__ void load_tiles_q2_ const int kbxd = k % blocks_per_tile_x_row; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI2_K) { - int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2479,7 +2492,7 @@ template static __device__ __forceinline__ void load_tiles_q2_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); if (need_check) { @@ -2496,14 +2509,6 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q2_K_Q8_1_MMQ == 0); - const int kbx = k / QI2_K; const int ky = (k % QI2_K) * QR2_K; const float * y_df = (const float *) y_ds; @@ -2520,7 +2525,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; - const int index_y = j * (QR2_K*WARP_SIZE) + QR2_K*k; + const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); } @@ -2551,12 +2556,12 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); } -static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI3_K) + GGML_CUDA_MMQ_Y/QI3_K]; - __shared__ int tile_x_qh[GGML_CUDA_MMQ_Y * (WARP_SIZE/2) + GGML_CUDA_MMQ_Y/2]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/4) + GGML_CUDA_MMQ_Y/4]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; + __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -2564,12 +2569,12 @@ static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q3_K( +template static __device__ __forceinline__ void load_tiles_q3_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2579,7 +2584,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ const block_q3_K * bx0 = (block_q3_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2596,8 +2601,8 @@ template static __device__ __forceinline__ void load_tiles_q3_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI3_K) { - int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2609,7 +2614,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 2) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); if (need_check) { @@ -2623,7 +2628,7 @@ template static __device__ __forceinline__ void load_tiles_q3_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 4) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); if (need_check) { @@ -2652,14 +2657,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q3_K_Q8_1_MMQ == 0); - const int kbx = k / QI3_K; const int ky = (k % QI3_K) * QR3_K; const float * x_dmf = (const float *) x_dm; @@ -2681,7 +2678,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( v[l] = __vsubss4(vll, vlh); } - const int index_y = j * (QR3_K*WARP_SIZE) + k*QR3_K; + const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); } @@ -2772,29 +2769,30 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( return dall * sumf_d - dmin * sumf_m; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif } -static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI4_K) + GGML_CUDA_MMQ_Y/QI4_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q4_K( +template static __device__ __forceinline__ void load_tiles_q4_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2804,7 +2802,7 @@ template static __device__ __forceinline__ void load_tiles_q4_ const block_q4_K * bx0 = (block_q4_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -2820,8 +2818,8 @@ template static __device__ __forceinline__ void load_tiles_q4_ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI4_K) { - int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2833,8 +2831,8 @@ template static __device__ __forceinline__ void load_tiles_q4_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; if (need_check) { i = min(i, i_max); @@ -2858,26 +2856,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q4_K_Q8_1_MMQ == 0); - - int v[QR4_K*VDR_Q4_K_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) { - v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F; - v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F; - } - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); - const int index_y = j * (QR4_K*WARP_SIZE) + QR4_K*k; - return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); + const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q5_K_q8_1( @@ -2924,7 +2907,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( u[2*i+1] = q8[4]; } - return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8); + return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); #else @@ -2963,29 +2946,30 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( return d * sumf_d; #else + assert(false); return 0.0f; // only to satisfy the compiler #endif // __CUDA_ARCH__ >= MIN_CC_DP4A #endif } -static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI5_K) + GGML_CUDA_MMQ_Y/QI5_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q5_K( +template static __device__ __forceinline__ void load_tiles_q5_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -2995,7 +2979,7 @@ template static __device__ __forceinline__ void load_tiles_q5_ const block_q5_K * bx0 = (block_q5_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -3024,8 +3008,8 @@ template static __device__ __forceinline__ void load_tiles_q5_ const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI5_K) { - int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -3037,8 +3021,8 @@ template static __device__ __forceinline__ void load_tiles_q5_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; if (need_check) { i = min(i, i_max); @@ -3062,19 +3046,12 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q5_K_Q8_1_MMQ == 0); - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); - const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; - const int index_y = j * (QR5_K*WARP_SIZE) + QR5_K*k; - return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; + return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q6_K_q8_1( @@ -3103,23 +3080,23 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); } -static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { +template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[GGML_CUDA_MMQ_Y * (2*WARP_SIZE) + GGML_CUDA_MMQ_Y]; - __shared__ half2 tile_x_dm[GGML_CUDA_MMQ_Y * (WARP_SIZE/QI6_K) + GGML_CUDA_MMQ_Y/QI6_K]; - __shared__ int tile_x_sc[GGML_CUDA_MMQ_Y * (WARP_SIZE/8) + GGML_CUDA_MMQ_Y/8]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; *x_sc = tile_x_sc; } -template static __device__ __forceinline__ void load_tiles_q6_K( +template static __device__ __forceinline__ void load_tiles_q6_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { __builtin_assume(i_offset >= 0); - __builtin_assume(i_offset < 8); + __builtin_assume(i_offset < nwarps); __builtin_assume(k >= 0); __builtin_assume(k < WARP_SIZE); @@ -3129,7 +3106,7 @@ template static __device__ __forceinline__ void load_tiles_q6_ const block_q6_K * bx0 = (block_q6_K *) vx; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8) { + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { int i = i0 + i_offset; if (need_check) { @@ -3159,8 +3136,8 @@ template static __device__ __forceinline__ void load_tiles_q6_ float * x_dmf = (float *) x_dm; #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * QI6_K) { - int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { + int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; if (need_check) { i = min(i, i_max); @@ -3172,8 +3149,8 @@ template static __device__ __forceinline__ void load_tiles_q6_ } #pragma unroll - for (int i0 = 0; i0 < GGML_CUDA_MMQ_Y; i0 += 8 * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % GGML_CUDA_MMQ_Y; + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; if (need_check) { i = min(i, i_max); @@ -3189,27 +3166,19 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - __builtin_assume(i >= 0); - __builtin_assume(i < GGML_CUDA_MMQ_Y); - __builtin_assume(j >= 0); - __builtin_assume(j < WARP_SIZE); - __builtin_assume(k >= 0); - __builtin_assume(k < WARP_SIZE); - __builtin_assume(k % VDR_Q6_K_Q8_1_MMQ == 0); - const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); - const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; - const int index_y = j * (QR6_K*WARP_SIZE) + QR6_K*k; + const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; + const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); } -template -static __global__ void mul_mat_q( +static __device__ __forceinline__ void mul_mat_q( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { @@ -3222,14 +3191,10 @@ static __global__ void mul_mat_q( const int & ncols_dst = ncols_y; - const int tid_x = threadIdx.x; - const int tid_y = threadIdx.y; - - const int row_dst_0 = blockIdx.x*GGML_CUDA_MMQ_Y; + const int row_dst_0 = blockIdx.x*mmq_y; const int & row_x_0 = row_dst_0; - const int row_dst = row_dst_0 + tid_x; - const int col_dst_0 = blockIdx.y*WARP_SIZE; + const int col_dst_0 = blockIdx.y*mmq_x; const int & col_y_0 = col_dst_0; int * tile_x_ql = nullptr; @@ -3239,84 +3204,444 @@ static __global__ void mul_mat_q( allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - const int blocks_per_tile_y_col = qr*WARP_SIZE/QI8_1; + __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; + __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; - __shared__ int tile_y_qs[(WARP_SIZE) * (qr*WARP_SIZE)]; - __shared__ half2 tile_y_ds[(WARP_SIZE) * blocks_per_tile_y_col]; - - float sum[GGML_CUDA_MMQ_Y/WARP_SIZE][4] = {0.0f}; + float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f}; for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, - tid_y, nrows_x-row_x_0-1, tid_x, blocks_per_row_x); + threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x); +#pragma unroll for (int ir = 0; ir < qr; ++ir) { - const int kqs = ir*WARP_SIZE + tid_x; + const int kqs = ir*WARP_SIZE + threadIdx.x; const int kbxd = kqs / QI8_1; - for (int i = 0; i < WARP_SIZE; i += 8) { - const int col_y_eff = min(col_y_0 + tid_y + i, ncols_y-1); // to prevent out-of-bounds memory accesses +#pragma unroll + for (int i = 0; i < mmq_x; i += nwarps) { + const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; - tile_y_qs[(tid_y + i) * (qr*WARP_SIZE) + kqs] = get_int_from_int8_aligned(by0->qs, tid_x % QI8_1); + const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE; + tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); } - } - for (int ids0 = 0; ids0 < WARP_SIZE; ids0 += 8 * (WARP_SIZE/blocks_per_tile_y_col)) { - const int ids = (ids0 + tid_y * (WARP_SIZE/blocks_per_tile_y_col) + tid_x / blocks_per_tile_y_col) % WARP_SIZE; - const int kby = tid_x % blocks_per_tile_y_col; - const int col_y_eff = min(col_y_0 + ids, ncols_y-1); - - // if the sum is not needed it's faster to transform the scale to f32 ahead of time - const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kby].ds; - half2 * dsi_dst = &tile_y_ds[ids * (qr*WARP_SIZE/QI8_1) + kby]; - if (need_sum) { - *dsi_dst = *dsi_src; - } else { - float * dfi_dst = (float *) dsi_dst; - *dfi_dst = (*dsi_src).x; +#pragma unroll + for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { + const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; + const int kby = threadIdx.x % (WARP_SIZE/QI8_1); + const int col_y_eff = min(col_y_0 + ids, ncols_y-1); + + // if the sum is not needed it's faster to transform the scale to f32 ahead of time + const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds; + half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; + if (need_sum) { + *dsi_dst = *dsi_src; + } else { + float * dfi_dst = (float *) dsi_dst; + *dfi_dst = (*dsi_src).x; + } } - } - __syncthreads(); + __syncthreads(); -#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal +// #pragma unroll // unrolling this loop causes too much register pressure + for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { #pragma unroll -#endif // __CUDA_ARCH__ >= 700 - for (int k = 0; k < WARP_SIZE; k += vdr) { + for (int j = 0; j < mmq_x; j += nwarps) { #pragma unroll - for (int j = 0; j < WARP_SIZE; j += 8) { -#pragma unroll - for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) { - sum[i/WARP_SIZE][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, - tid_x + i, tid_y + j, k); + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + sum[i/WARP_SIZE][j/nwarps] += vec_dot( + tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, + threadIdx.x + i, threadIdx.y + j, k); + } } } - } - - __syncthreads(); - } - - if (row_dst >= nrows_dst) { - return; + __syncthreads(); + } } - for (int j = 0; j < WARP_SIZE; j += 8) { - const int col_dst = col_dst_0 + j + tid_y; +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { + const int col_dst = col_dst_0 + j + threadIdx.y; if (col_dst >= ncols_dst) { return; } - for (int i = 0; i < GGML_CUDA_MMQ_Y; i += WARP_SIZE) { - dst[col_dst*nrows_dst + row_dst + i] = sum[i/WARP_SIZE][j/8]; +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + const int row_dst = row_dst_0 + threadIdx.x + i; + + if (row_dst >= nrows_dst) { + continue; + } + + dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps]; } } } +#define MMQ_X_Q4_0_AMPERE 64 +#define MMQ_Y_Q4_0_AMPERE 128 +#define NWARPS_Q4_0_AMPERE 4 +#define MMQ_X_Q4_0_PASCAL 64 +#define MMQ_Y_Q4_0_PASCAL 64 +#define NWARPS_Q4_0_PASCAL 8 + +template static __global__ void mul_mat_q4_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q4_0_AMPERE; + const int mmq_y = MMQ_Y_Q4_0_AMPERE; + const int nwarps = NWARPS_Q4_0_AMPERE; + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_0_PASCAL; + const int mmq_y = MMQ_Y_Q4_0_PASCAL; + const int nwarps = NWARPS_Q4_0_PASCAL; + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_0_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q4_1_AMPERE 64 +#define MMQ_Y_Q4_1_AMPERE 128 +#define NWARPS_Q4_1_AMPERE 4 +#define MMQ_X_Q4_1_PASCAL 64 +#define MMQ_Y_Q4_1_PASCAL 64 +#define NWARPS_Q4_1_PASCAL 8 + +template static __global__ void +#if __CUDA_ARCH__ < CC_TURING + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_TURING + mul_mat_q4_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q4_1_AMPERE; + const int mmq_y = MMQ_Y_Q4_1_AMPERE; + const int nwarps = NWARPS_Q4_1_AMPERE; + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_1_PASCAL; + const int mmq_y = MMQ_Y_Q4_1_PASCAL; + const int nwarps = NWARPS_Q4_1_PASCAL; + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_1_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q5_0_AMPERE 128 +#define MMQ_Y_Q5_0_AMPERE 64 +#define NWARPS_Q5_0_AMPERE 4 +#define MMQ_X_Q5_0_PASCAL 64 +#define MMQ_Y_Q5_0_PASCAL 64 +#define NWARPS_Q5_0_PASCAL 8 + +template static __global__ void mul_mat_q5_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q5_0_AMPERE; + const int mmq_y = MMQ_Y_Q5_0_AMPERE; + const int nwarps = NWARPS_Q5_0_AMPERE; + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_0_PASCAL; + const int mmq_y = MMQ_Y_Q5_0_PASCAL; + const int nwarps = NWARPS_Q5_0_PASCAL; + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_0_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q5_1_AMPERE 128 +#define MMQ_Y_Q5_1_AMPERE 64 +#define NWARPS_Q5_1_AMPERE 4 +#define MMQ_X_Q5_1_PASCAL 64 +#define MMQ_Y_Q5_1_PASCAL 64 +#define NWARPS_Q5_1_PASCAL 8 + +template static __global__ void mul_mat_q5_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q5_1_AMPERE; + const int mmq_y = MMQ_Y_Q5_1_AMPERE; + const int nwarps = NWARPS_Q5_1_AMPERE; + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_1_PASCAL; + const int mmq_y = MMQ_Y_Q5_1_PASCAL; + const int nwarps = NWARPS_Q5_1_PASCAL; + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_1_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q8_0_AMPERE 128 +#define MMQ_Y_Q8_0_AMPERE 64 +#define NWARPS_Q8_0_AMPERE 4 +#define MMQ_X_Q8_0_PASCAL 64 +#define MMQ_Y_Q8_0_PASCAL 64 +#define NWARPS_Q8_0_PASCAL 8 + +template static __global__ void mul_mat_q8_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q8_0_AMPERE; + const int mmq_y = MMQ_Y_Q8_0_AMPERE; + const int nwarps = NWARPS_Q8_0_AMPERE; + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q8_0_PASCAL; + const int mmq_y = MMQ_Y_Q8_0_PASCAL; + const int nwarps = NWARPS_Q8_0_PASCAL; + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q8_0_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q2_K_AMPERE 64 +#define MMQ_Y_Q2_K_AMPERE 128 +#define NWARPS_Q2_K_AMPERE 4 +#define MMQ_X_Q2_K_PASCAL 64 +#define MMQ_Y_Q2_K_PASCAL 64 +#define NWARPS_Q2_K_PASCAL 8 + +template static __global__ void mul_mat_q2_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q2_K_AMPERE; + const int mmq_y = MMQ_Y_Q2_K_AMPERE; + const int nwarps = NWARPS_Q2_K_AMPERE; + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q2_K_PASCAL; + const int mmq_y = MMQ_Y_Q2_K_PASCAL; + const int nwarps = NWARPS_Q2_K_PASCAL; + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q2_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q3_K_AMPERE 128 +#define MMQ_Y_Q3_K_AMPERE 128 +#define NWARPS_Q3_K_AMPERE 4 +#define MMQ_X_Q3_K_PASCAL 64 +#define MMQ_Y_Q3_K_PASCAL 64 +#define NWARPS_Q3_K_PASCAL 8 + +template static __global__ void +#if __CUDA_ARCH__ < CC_TURING + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_TURING + mul_mat_q3_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q3_K_AMPERE; + const int mmq_y = MMQ_Y_Q3_K_AMPERE; + const int nwarps = NWARPS_Q3_K_AMPERE; + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q3_K_PASCAL; + const int mmq_y = MMQ_Y_Q3_K_PASCAL; + const int nwarps = NWARPS_Q3_K_PASCAL; + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q3_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q4_K_AMPERE 64 +#define MMQ_Y_Q4_K_AMPERE 128 +#define NWARPS_Q4_K_AMPERE 4 +#define MMQ_X_Q4_K_PASCAL 64 +#define MMQ_Y_Q4_K_PASCAL 64 +#define NWARPS_Q4_K_PASCAL 8 + +template static __global__ void +#if __CUDA_ARCH__ < CC_TURING + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_TURING + mul_mat_q4_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q4_K_AMPERE; + const int mmq_y = MMQ_Y_Q4_K_AMPERE; + const int nwarps = NWARPS_Q4_K_AMPERE; + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_K_PASCAL; + const int mmq_y = MMQ_Y_Q4_K_PASCAL; + const int nwarps = NWARPS_Q4_K_PASCAL; + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q4_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q5_K_AMPERE 64 +#define MMQ_Y_Q5_K_AMPERE 128 +#define NWARPS_Q5_K_AMPERE 4 +#define MMQ_X_Q5_K_PASCAL 64 +#define MMQ_Y_Q5_K_PASCAL 64 +#define NWARPS_Q5_K_PASCAL 8 + +template static __global__ void mul_mat_q5_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q5_K_AMPERE; + const int mmq_y = MMQ_Y_Q5_K_AMPERE; + const int nwarps = NWARPS_Q5_K_AMPERE; + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_K_PASCAL; + const int mmq_y = MMQ_Y_Q5_K_PASCAL; + const int nwarps = NWARPS_Q5_K_PASCAL; + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q5_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + +#define MMQ_X_Q6_K_AMPERE 64 +#define MMQ_Y_Q6_K_AMPERE 64 +#define NWARPS_Q6_K_AMPERE 4 +#define MMQ_X_Q6_K_PASCAL 64 +#define MMQ_Y_Q6_K_PASCAL 64 +#define NWARPS_Q6_K_PASCAL 8 + +template static __global__ void +#if __CUDA_ARCH__ < CC_TURING + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_TURING + mul_mat_q6_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if __CUDA_ARCH__ >= CC_TURING + const int mmq_x = MMQ_X_Q6_K_AMPERE; + const int mmq_y = MMQ_Y_Q6_K_AMPERE; + const int nwarps = NWARPS_Q6_K_AMPERE; + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q6_K_PASCAL; + const int mmq_y = MMQ_Y_Q6_K_PASCAL; + const int nwarps = NWARPS_Q6_K_PASCAL; + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + (void) vec_dot_q6_K_q8_1_mul_mat; + assert(false); +#endif // __CUDA_ARCH__ >= CC_TURING +} + template static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { const int row = blockIdx.y*blockDim.y + threadIdx.y; @@ -4014,17 +4339,36 @@ static void ggml_mul_mat_q4_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q4_0_AMPERE; + mmq_y = MMQ_Y_Q4_0_AMPERE; + nwarps = NWARPS_Q4_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_0_PASCAL; + mmq_y = MMQ_Y_Q4_0_PASCAL; + nwarps = NWARPS_Q4_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q4_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4032,17 +4376,36 @@ static void ggml_mul_mat_q4_1_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q4_1_AMPERE; + mmq_y = MMQ_Y_Q4_1_AMPERE; + nwarps = NWARPS_Q4_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_1_PASCAL; + mmq_y = MMQ_Y_Q4_1_PASCAL; + nwarps = NWARPS_Q4_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q4_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4050,17 +4413,36 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q5_0_AMPERE; + mmq_y = MMQ_Y_Q5_0_AMPERE; + nwarps = NWARPS_Q5_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_0_PASCAL; + mmq_y = MMQ_Y_Q5_0_PASCAL; + nwarps = NWARPS_Q5_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q5_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4068,17 +4450,36 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q5_1_AMPERE; + mmq_y = MMQ_Y_Q5_1_AMPERE; + nwarps = NWARPS_Q5_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_1_PASCAL; + mmq_y = MMQ_Y_Q5_1_PASCAL; + nwarps = NWARPS_Q5_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q5_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4086,17 +4487,36 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q8_0_AMPERE; + mmq_y = MMQ_Y_Q8_0_AMPERE; + nwarps = NWARPS_Q8_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q8_0_PASCAL; + mmq_y = MMQ_Y_Q8_0_PASCAL; + nwarps = NWARPS_Q8_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q8_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q8_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4104,17 +4524,36 @@ static void ggml_mul_mat_q2_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q2_K_AMPERE; + mmq_y = MMQ_Y_Q2_K_AMPERE; + nwarps = NWARPS_Q2_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q2_K_PASCAL; + mmq_y = MMQ_Y_Q2_K_PASCAL; + nwarps = NWARPS_Q2_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q2_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q2_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4122,17 +4561,36 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q3_K_AMPERE; + mmq_y = MMQ_Y_Q3_K_AMPERE; + nwarps = NWARPS_Q3_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q3_K_PASCAL; + mmq_y = MMQ_Y_Q3_K_PASCAL; + nwarps = NWARPS_Q3_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q3_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q3_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4140,17 +4598,36 @@ static void ggml_mul_mat_q4_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q4_K_AMPERE; + mmq_y = MMQ_Y_Q4_K_AMPERE; + nwarps = NWARPS_Q4_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_K_PASCAL; + mmq_y = MMQ_Y_Q4_K_PASCAL; + nwarps = NWARPS_Q4_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q4_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4158,17 +4635,36 @@ static void ggml_mul_mat_q5_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q5_K_AMPERE; + mmq_y = MMQ_Y_Q5_K_AMPERE; + nwarps = NWARPS_Q5_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_K_PASCAL; + mmq_y = MMQ_Y_Q5_K_PASCAL; + nwarps = NWARPS_Q5_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q5_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4176,17 +4672,36 @@ static void ggml_mul_mat_q6_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - const int block_num_x = (nrows_x + GGML_CUDA_MMQ_Y - 1) / GGML_CUDA_MMQ_Y; - const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE; + int id; + CUDA_CHECK(cudaGetDevice(&id)); + const int compute_capability = g_compute_capabilities[id]; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_TURING) { + mmq_x = MMQ_X_Q6_K_AMPERE; + mmq_y = MMQ_Y_Q6_K_AMPERE; + nwarps = NWARPS_Q6_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q6_K_PASCAL; + mmq_y = MMQ_Y_Q6_K_PASCAL; + nwarps = NWARPS_Q6_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); - if (nrows_x % GGML_CUDA_MMQ_Y == 0) { - mul_mat_q, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q6_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } else { - mul_mat_q, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - <<>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + const bool need_check = true; + mul_mat_q6_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); } } @@ -4361,20 +4876,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) { } -static void * g_scratch_buffer = nullptr; -static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default -static size_t g_scratch_offset = 0; - -static int g_device_count = -1; -static int g_main_device = 0; -static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; -static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; -static bool g_mul_mat_q = false; - -static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; - -static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr }; - void ggml_init_cublas() { static bool initialized = false; @@ -4730,6 +5231,37 @@ inline void ggml_cuda_op_mul_mat_q( (void) i1; } +static int64_t get_row_rounding(ggml_type type) { + int max_compute_capability = INT_MIN; + for (int id = 0; id < g_device_count; ++id) { + if (max_compute_capability < g_compute_capabilities[id] + && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + max_compute_capability = g_compute_capabilities[id]; + } + } + + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return max_compute_capability >= CC_TURING ? 128 : 64; + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 64; + case GGML_TYPE_F16: + return 1; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + return max_compute_capability >= CC_TURING ? 128 : 64; + case GGML_TYPE_Q6_K: + return 64; + default: + GGML_ASSERT(false); + } +} + inline void ggml_cuda_op_mul_mat_vec( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i, float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1, @@ -5130,14 +5662,16 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm int64_t row_low, row_high; if (split) { + const int64_t rounding = get_row_rounding(src0->type); + row_low = id == 0 ? 0 : nrows0*g_tensor_split[id]; - row_low -= row_low % GGML_CUDA_MMQ_Y; + row_low -= row_low % rounding; if (id == g_device_count - 1) { row_high = nrows0; } else { row_high = nrows0*g_tensor_split[id + 1]; - row_high -= row_high % GGML_CUDA_MMQ_Y; + row_high -= row_high % rounding; } } else { row_low = 0; @@ -5616,14 +6150,16 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { row_low = 0; row_high = nrows; } else if (backend == GGML_BACKEND_GPU_SPLIT) { + const int64_t rounding = get_row_rounding(tensor->type); + row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; - row_low -= row_low % GGML_CUDA_MMQ_Y; + row_low -= row_low % rounding; if (id == g_device_count - 1) { row_high = nrows; } else { row_high = nrows*g_tensor_split[id + 1]; - row_high -= row_high % GGML_CUDA_MMQ_Y; + row_high -= row_high % rounding; } } else { GGML_ASSERT(false); @@ -5933,3 +6469,15 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func(tensor->src[0], tensor->src[1], tensor); return true; } + +int ggml_cuda_get_device_count() { + int device_count; + CUDA_CHECK(cudaGetDeviceCount(&device_count)); + return device_count; +} + +void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); + snprintf(description, description_size, "%s", prop.name); +} diff --git a/src/ggml-cuda.h b/src/ggml-cuda.h index 72d7afa46..cad05f5fa 100644 --- a/src/ggml-cuda.h +++ b/src/ggml-cuda.h @@ -8,29 +8,25 @@ extern "C" { #define GGML_CUDA_MAX_DEVICES 16 -void ggml_init_cublas(void); -void ggml_cuda_set_tensor_split(const float * tensor_split); - -void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); - -// TODO: export these with GGML_API -void * ggml_cuda_host_malloc(size_t size); -void ggml_cuda_host_free(void * ptr); - -void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); - -void ggml_cuda_free_data(struct ggml_tensor * tensor); -void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); -void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); -void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); -void ggml_cuda_set_main_device(int main_device); -void ggml_cuda_set_mul_mat_q(bool mul_mat_q); -void ggml_cuda_set_scratch_size(size_t scratch_size); -void ggml_cuda_free_scratch(void); -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); +GGML_API void ggml_init_cublas(void); +GGML_API void * ggml_cuda_host_malloc(size_t size); +GGML_API void ggml_cuda_host_free(void * ptr); + +GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split); +GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); +GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); +GGML_API void ggml_cuda_set_main_device(int main_device); +GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q); +GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size); +GGML_API void ggml_cuda_free_scratch(void); +GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); + +GGML_API int ggml_cuda_get_device_count(void); +GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size); #ifdef __cplusplus } diff --git a/src/ggml-metal.h b/src/ggml-metal.h index 16f1a0caa..00202b787 100644 --- a/src/ggml-metal.h +++ b/src/ggml-metal.h @@ -38,6 +38,9 @@ struct ggml_metal_context; struct ggml_metal_context * ggml_metal_init(int n_cb); void ggml_metal_free(struct ggml_metal_context * ctx); +void * ggml_metal_host_malloc(size_t n); +void ggml_metal_host_free (void * data); + // set the number of command buffers to use void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); @@ -63,10 +66,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * // try to find operations that can be run concurrently in the graph // you should run it again if the topology of your graph changes -void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); +void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem); + +// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized +int ggml_metal_if_optimized(struct ggml_metal_context * ctx); -// if the graph has been optimized for concurrently dispatch -bool ggml_metal_if_optimized(struct ggml_metal_context * ctx); +// output the concur_list for ggml_alloc +int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); // same as ggml_graph_compute but uses Metal // creates gf->n_threads command buffers in parallel diff --git a/src/ggml-metal.m b/src/ggml-metal.m index b47a98e21..835c5f297 100644 --- a/src/ggml-metal.m +++ b/src/ggml-metal.m @@ -5,7 +5,6 @@ #import #import -#import #undef MIN #undef MAX @@ -79,6 +78,14 @@ GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); GGML_METAL_DECL_KERNEL(rope); GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(cpy_f32_f16); @@ -110,13 +117,6 @@ @implementation GGMLMetalClass ctx->n_buffers = 0; ctx->concur_list_len = 0; - // determine if we can use MPS - if (MPSSupportsMTLDevice(ctx->device)) { - fprintf(stderr, "%s: using MPS\n", __func__); - } else { - fprintf(stderr, "%s: not using MPS\n", __func__); - GGML_ASSERT(false && "MPS not supported"); - } #if 0 // compile from source string and show compile log @@ -126,7 +126,7 @@ @implementation GGMLMetalClass ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; if (error) { fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); - exit(1); + return NULL; } } #else @@ -144,7 +144,7 @@ @implementation GGMLMetalClass NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); - exit(1); + return NULL; } #ifdef GGML_QKK_64 @@ -156,17 +156,22 @@ @implementation GGMLMetalClass #endif if (error) { fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); - exit(1); + return NULL; } } #endif // load kernels { + NSError * error = nil; #define GGML_METAL_ADD_KERNEL(name) \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ - ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \ - fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); + ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ + fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \ + if (error) { \ + fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + return NULL; \ + } GGML_METAL_ADD_KERNEL(add); GGML_METAL_ADD_KERNEL(add_row); @@ -196,6 +201,14 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); GGML_METAL_ADD_KERNEL(rope); GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(cpy_f32_f16); @@ -224,15 +237,31 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { free(ctx); } +void * ggml_metal_host_malloc(size_t n) { + void * data = NULL; + const int result = posix_memalign((void **) &data, getpagesize(), n); + if (result != 0) { + fprintf(stderr, "%s: error: posix_memalign failed\n", __func__); + return NULL; + } + + return data; +} + +void ggml_metal_host_free(void * data) { + free(data); +} + void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { ctx->n_cb = n_cb; } -bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) { - if (ctx->concur_list_len) { - return true; - } - return false; +int ggml_metal_if_optimized(struct ggml_metal_context * ctx) { + return ctx->concur_list_len; +} + +int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { + return ctx->concur_list; } // finds the Metal buffer that contains the tensor data on the GPU device @@ -375,7 +404,7 @@ void ggml_metal_get_tensor( void ggml_metal_graph_find_concurrency( struct ggml_metal_context * ctx, - struct ggml_cgraph * gf) { + struct ggml_cgraph * gf, bool check_mem) { int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time int nodes_unused[GGML_MAX_CONCUR]; @@ -422,7 +451,7 @@ void ggml_metal_graph_find_concurrency( } } } - if (exe_flag) { + if (exe_flag && check_mem) { // check if nodes[i]'s data will be overwritten by a node before nodes[i]. // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3] int64_t data_start = (int64_t) gf->nodes[i]->data; @@ -506,7 +535,7 @@ void ggml_metal_graph_compute( id command_buffer = command_buffers[cb_idx]; - id encoder = nil; + id encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; const int node_start = (cb_idx + 0) * n_nodes_per_cb; const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb; @@ -515,10 +544,6 @@ void ggml_metal_graph_compute( const int i = has_concur ? ctx->concur_list[ind] : ind; if (i == -1) { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - continue; - } [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers]; continue; } @@ -592,10 +617,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_ADD: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - if (ggml_nelements(src1) == ne10) { // src1 is a row [encoder setComputePipelineState:ctx->pipeline_add_row]; @@ -613,10 +634,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_MUL: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - if (ggml_nelements(src1) == ne10) { // src1 is a row [encoder setComputePipelineState:ctx->pipeline_mul_row]; @@ -634,10 +651,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_SCALE: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - const float scale = *(const float *) src1->data; [encoder setComputePipelineState:ctx->pipeline_scale]; @@ -653,10 +666,6 @@ void ggml_metal_graph_compute( switch (ggml_get_unary_op(gf->nodes[i])) { case GGML_UNARY_OP_SILU: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - [encoder setComputePipelineState:ctx->pipeline_silu]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -667,10 +676,6 @@ void ggml_metal_graph_compute( } break; case GGML_UNARY_OP_RELU: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - [encoder setComputePipelineState:ctx->pipeline_relu]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -681,10 +686,6 @@ void ggml_metal_graph_compute( } break; case GGML_UNARY_OP_GELU: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - [encoder setComputePipelineState:ctx->pipeline_gelu]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; @@ -701,10 +702,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_SOFT_MAX: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - const int nth = 32; [encoder setComputePipelineState:ctx->pipeline_soft_max]; @@ -719,10 +716,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_DIAG_MASK_INF: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - const int n_past = ((int32_t *)(dst->op_params))[0]; [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; @@ -740,53 +733,43 @@ void ggml_metal_graph_compute( GGML_ASSERT(ne00 == ne10); // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere + uint gqa = ne12/ne02; GGML_ASSERT(ne03 == ne13); + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && - (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) { - - if (encoder != nil) { - [encoder endEncoding]; - encoder = nil; - } - - MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; - MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16; - - // for F32 x F32 we use MPS - MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor - matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt]; - - MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor - matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt]; - - MPSMatrixDescriptor * desc = [MPSMatrixDescriptor - matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32]; - - MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] - initWithDevice:ctx->device transposeLeft:false transposeRight:true - resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0]; - - // we need to do ne12 multiplications - // TODO: is there a way to do this in parallel - currently very slow .. - // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS - for (int64_t i02 = 0; i02 < ne12; ++i02) { - size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now - size_t offs_src1_cur = offs_src1 + i02*nb12; - size_t offs_dst_cur = offs_dst + i02*nb2; - - MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0]; - MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1]; - MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst_cur descriptor:desc ]; - - [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; - } - } else { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; + src1t == GGML_TYPE_F32 && + [ctx->device supportsFamily:MTLGPUFamilyApple7] && + ne00%32 == 0 && + ne11 > 1) { + switch (src0->type) { + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; + case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break; + case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break; + case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break; + case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break; + case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break; + case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break; + case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break; + default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9]; + [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10]; + [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } - + else { int nth0 = 32; int nth1 = 1; @@ -885,23 +868,24 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14]; [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15]; [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16]; + [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q3_K) { #ifdef GGML_QKK_64 - [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; #else - [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; #endif } else if (src0t == GGML_TYPE_Q5_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q6_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; @@ -910,10 +894,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_GET_ROWS: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - switch (src0->type) { case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break; case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break; @@ -939,10 +919,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_RMS_NORM: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -962,10 +938,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_NORM: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - const float eps = 1e-5f; const int nth = 256; @@ -984,10 +956,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_ALIBI: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - GGML_ASSERT((src0t == GGML_TYPE_F32)); const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); @@ -1027,10 +995,6 @@ void ggml_metal_graph_compute( } break; case GGML_OP_ROPE: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; @@ -1071,10 +1035,6 @@ void ggml_metal_graph_compute( case GGML_OP_CPY: case GGML_OP_CONT: { - if (encoder == nil) { - encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; - } - const int nth = 32; switch (src0t) { diff --git a/src/ggml-metal.metal b/src/ggml-metal.metal index 8d26b5ec2..ce3541f4b 100644 --- a/src/ggml-metal.metal +++ b/src/ggml-metal.metal @@ -18,47 +18,6 @@ typedef struct { uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; -static void dequantize_row_q4_0(device const block_q4_0 * x, device float * y, int k) { - const int qk = QK4_0; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const half d = x[i].d; - - for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0x0F) - 8; - const int x1 = (x[i].qs[j] >> 4) - 8; - - y[i*qk + j + 0 ] = x0*d; - y[i*qk + j + qk/2] = x1*d; - } - } -} - -static void dequantize_row_q4_1(device const block_q4_1 * x, device float * y, int k) { - const int qk = QK4_1; - - assert(k % qk == 0); - - const int nb = k / qk; - - for (int i = 0; i < nb; i++) { - const half d = x[i].d; - const half m = x[i].m; - - for (int j = 0; j < qk/2; ++j) { - const int x0 = (x[i].qs[j] & 0x0F); - const int x1 = (x[i].qs[j] >> 4); - - y[i*qk + j + 0 ] = x0*d + m; - y[i*qk + j + qk/2] = x1*d + m; - } - } -} - kernel void kernel_add( device const float * src0, device const float * src1, @@ -219,54 +178,6 @@ kernel void kernel_diag_mask_inf( } } -kernel void kernel_get_rows_f16( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; - - for (int j = 0; j < ne00; j++) { - dst[i*nb1 + j] = ((device half *) ((device char *) src0 + r*nb01))[j]; - } -} - -kernel void kernel_get_rows_q4_0( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; - - dequantize_row_q4_0( - (device const block_q4_0 *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} - -kernel void kernel_get_rows_q4_1( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; - - dequantize_row_q4_1( - (device const block_q4_1 *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} - kernel void kernel_norm( device const void * src0, device float * dst, @@ -432,14 +343,16 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre // N_DST, so this is another explicit assumption of the implementation. template void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst, - int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01, - uint2 tgpig, uint tiisg, uint sgitg) { + int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa, + uint3 tgpig, uint tiisg, uint sgitg) { const int nb = ne00/QK4_0; const int r0 = tgpig.x; const int r1 = tgpig.y; + const int im = tgpig.z; const int first_row = (r0 * nsg + sgitg) * nr; - device const block_q_type * x = (device const block_q_type *) src0 + first_row * nb; - device const float * y = (device const float *) src1 + r1*ne10; + const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + device const block_q_type * x = (device const block_q_type *) src0 + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; float yl[16]; // src1 vector cache float sumf[nr]={0.f}; @@ -470,7 +383,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device for (int row = 0; row < nr; ++row) { const float tot = simd_sum(sumf[row]); if (tiisg == 0 && first_row + row < ne01) { - dst[r1*ne0 + first_row + row] = tot; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; } } } @@ -480,13 +393,17 @@ kernel void kernel_mul_mat_q4_0_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, constant int64_t & ne01[[buffer(4)]], - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); } kernel void kernel_mul_mat_q4_1_f32( @@ -494,13 +411,17 @@ kernel void kernel_mul_mat_q4_1_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, constant int64_t & ne01[[buffer(4)]], - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg); + mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); } kernel void kernel_mul_mat_f16_f32( @@ -869,354 +790,6 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { return r; } -//========================================== dequantization ============================= - -static void dequantize_row_q2_K(device const block_q2_K * x, device float * y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; - - for (int i = 0; i < nb; i++) { - - const float d = x[i].d; - const float min = x[i].dmin; - - device const uint8_t * q = x[i].qs; - -#if QK_K == 256 - int is = 0; - float dl, ml; - for (int n = 0; n < QK_K; n += 128) { - int shift = 0; - for (int j = 0; j < 4; ++j) { - - uint8_t sc = x[i].scales[is++]; - dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; - - sc = x[i].scales[is++]; - dl = d * (sc & 0xF); ml = min * (sc >> 4); - for (int l = 0; l < 16; ++l) *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; - - shift += 2; - } - q += 32; - } -#else - float dl1 = d * (x[i].scales[0] & 0xF), ml1 = min * (x[i].scales[0] >> 4); - float dl2 = d * (x[i].scales[1] & 0xF), ml2 = min * (x[i].scales[1] >> 4); - float dl3 = d * (x[i].scales[2] & 0xF), ml3 = min * (x[i].scales[2] >> 4); - float dl4 = d * (x[i].scales[3] & 0xF), ml4 = min * (x[i].scales[3] >> 4); - for (int l = 0; l < 16; ++l) { - y[l+ 0] = dl1 * ((q[l] >> 0) & 3) - ml1; - y[l+16] = dl2 * ((q[l] >> 2) & 3) - ml2; - y[l+32] = dl3 * ((q[l] >> 4) & 3) - ml3; - y[l+48] = dl4 * ((q[l] >> 6) & 3) - ml4; - } - y += QK_K; -#endif - - } -} - -static void dequantize_row_q3_K(device const block_q3_K * x, device float * y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; - -#if QK_K == 256 - - const uint16_t kmask1 = 0x0303; - const uint16_t kmask2 = 0x0f0f; - - uint16_t aux[8]; - thread const int8_t * scales = (thread const int8_t*)aux; - - for (int i = 0; i < nb; i++) { - - const float d_all = (float)(x[i].d); - - device const uint8_t * q = x[i].qs; - device const uint8_t * h = x[i].hmask; - uint8_t m = 1; - - device const uint16_t * a = (device const uint16_t *)x[i].scales; - aux[0] = (a[0] & kmask2) | (((a[4] >> 0) & kmask1) << 4); - aux[1] = (a[1] & kmask2) | (((a[5] >> 0) & kmask1) << 4); - aux[2] = (a[2] & kmask2) | (((a[4] >> 2) & kmask1) << 4); - aux[3] = (a[3] & kmask2) | (((a[5] >> 2) & kmask1) << 4); - aux[4] = ((a[0] >> 4) & kmask2) | (((a[4] >> 4) & kmask1) << 4); - aux[5] = ((a[1] >> 4) & kmask2) | (((a[5] >> 4) & kmask1) << 4); - aux[6] = ((a[2] >> 4) & kmask2) | (((a[4] >> 6) & kmask1) << 4); - aux[7] = ((a[3] >> 4) & kmask2) | (((a[5] >> 6) & kmask1) << 4); - - int is = 0; - float dl; - for (int n = 0; n < QK_K; n += 128) { - int shift = 0; - for (int j = 0; j < 4; ++j) { - - dl = d_all * (scales[is++] - 32); - for (int l = 0; l < 16; ++l) { - *y++ = dl * ((int8_t)((q[l+ 0] >> shift) & 3) - ((h[l+ 0] & m) ? 0 : 4)); - } - - dl = d_all * (scales[is++] - 32); - for (int l = 0; l < 16; ++l) { - *y++ = dl * ((int8_t)((q[l+16] >> shift) & 3) - ((h[l+16] & m) ? 0 : 4)); - } - - shift += 2; - m <<= 1; - } - q += 32; - } - } -#else - for (int i = 0; i < nb; i++) { - - const float d_all = (float)(x[i].d); - - device const uint8_t * q = x[i].qs; - device const uint8_t * hm = x[i].hmask; - - const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); - const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); - const float d3 = d_all * ((x[i].scales[1] & 0xF) - 8); - const float d4 = d_all * ((x[i].scales[1] >> 4) - 8); - - for (int l = 0; l < 8; ++l) { - uint8_t h = hm[l]; - y[l+ 0] = d1 * ((int8_t)((q[l+0] >> 0) & 3) - ((h & 0x01) ? 0 : 4)); - y[l+ 8] = d1 * ((int8_t)((q[l+8] >> 0) & 3) - ((h & 0x02) ? 0 : 4)); - y[l+16] = d2 * ((int8_t)((q[l+0] >> 2) & 3) - ((h & 0x04) ? 0 : 4)); - y[l+24] = d2 * ((int8_t)((q[l+8] >> 2) & 3) - ((h & 0x08) ? 0 : 4)); - y[l+32] = d3 * ((int8_t)((q[l+0] >> 4) & 3) - ((h & 0x10) ? 0 : 4)); - y[l+40] = d3 * ((int8_t)((q[l+8] >> 4) & 3) - ((h & 0x20) ? 0 : 4)); - y[l+48] = d4 * ((int8_t)((q[l+0] >> 6) & 3) - ((h & 0x40) ? 0 : 4)); - y[l+56] = d4 * ((int8_t)((q[l+8] >> 6) & 3) - ((h & 0x80) ? 0 : 4)); - } - y += QK_K; - } -#endif - -} - -static void dequantize_row_q4_K(device const block_q4_K * x, device float * y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; - - for (int i = 0; i < nb; i++) { - - device const uint8_t * q = x[i].qs; - -#if QK_K == 256 - const float d = x[i].d; - const float min = x[i].dmin; - - device const uint8_t * scales = x[i].scales; - - int is = 0; - for (int j = 0; j < QK_K; j += 64) { - const uchar4 sc = get_scale_min_k4(is, scales); - const float d1 = d * sc[0]; const float m1 = min * sc[1]; - const float d2 = d * sc[2]; const float m2 = min * sc[3]; - for (int l = 0; l < 32; ++l) *y++ = d1 * (q[l] & 0xF) - m1; - for (int l = 0; l < 32; ++l) *y++ = d2 * (q[l] >> 4) - m2; - q += 32; is += 2; - } -#else - device const uint8_t * s = x[i].scales; - device const half2 * dh = (device const half2 *)x[i].d; - const float2 d = (float2)dh[0]; - const float d1 = d[0] * (s[0] & 0xF); - const float d2 = d[0] * (s[1] & 0xF); - const float m1 = d[1] * (s[0] >> 4); - const float m2 = d[1] * (s[1] >> 4); - for (int l = 0; l < 32; ++l) { - y[l+ 0] = d1 * (q[l] & 0xF) - m1; - y[l+32] = d2 * (q[l] >> 4) - m2; - } - y += QK_K; -#endif - - } -} - -static void dequantize_row_q5_K(device const block_q5_K * x, device float * y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; - -#if QK_K == 256 - for (int i = 0; i < nb; i++) { - - const float d = (float)(x[i].d); - const float min = (float)(x[i].dmin); - - device const uint8_t * ql = x[i].qs; - device const uint8_t * qh = x[i].qh; - - int is = 0; - uint8_t u1 = 1, u2 = 2; - for (int j = 0; j < QK_K; j += 64) { - const uchar4 sc = get_scale_min_k4(is, x[i].scales); - const float d1 = d * sc[0]; const float m1 = min * sc[1]; - const float d2 = d * sc[2]; const float m2 = min * sc[3]; - for (int l = 0; l < 32; ++l) *y++ = d1 * ((ql[l] & 0xF) + (qh[l] & u1 ? 16 : 0)) - m1; - for (int l = 0; l < 32; ++l) *y++ = d2 * ((ql[l] >> 4) + (qh[l] & u2 ? 16 : 0)) - m2; - ql += 32; is += 2; - u1 <<= 2; u2 <<= 2; - } - } -#else - for (int i = 0; i < nb; i++) { - - const float d = (float)x[i].d; - - device const uint8_t * ql = x[i].qs; - device const uint8_t * qh = x[i].qh; - device const int8_t * sc = x[i].scales; - - for (int l = 0; l < 8; ++l) { - y[l+ 0] = d * sc[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); - y[l+ 8] = d * sc[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); - y[l+16] = d * sc[1] * ((ql[l+16] & 0xF) - (qh[l] & 0x04 ? 0 : 16)); - y[l+24] = d * sc[1] * ((ql[l+24] & 0xF) - (qh[l] & 0x08 ? 0 : 16)); - y[l+32] = d * sc[2] * ((ql[l+ 0] >> 4) - (qh[l] & 0x10 ? 0 : 16)); - y[l+40] = d * sc[2] * ((ql[l+ 8] >> 4) - (qh[l] & 0x20 ? 0 : 16)); - y[l+48] = d * sc[3] * ((ql[l+16] >> 4) - (qh[l] & 0x40 ? 0 : 16)); - y[l+56] = d * sc[3] * ((ql[l+24] >> 4) - (qh[l] & 0x80 ? 0 : 16)); - } - y += QK_K; - } -#endif - -} - -static void dequantize_row_q6_K(device const block_q6_K * x, device float * y, int k) { - assert(k % QK_K == 0); - const int nb = k / QK_K; - - for (int i = 0; i < nb; i++) { - - device const uint8_t * ql = x[i].ql; - device const uint8_t * qh = x[i].qh; - device const int8_t * sc = x[i].scales; - - const float d = x[i].d; - -#if QK_K == 256 - for (int n = 0; n < QK_K; n += 128) { - for (int l = 0; l < 32; ++l) { - int is = l/16; - const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - y[l + 0] = d * sc[is + 0] * q1; - y[l + 32] = d * sc[is + 2] * q2; - y[l + 64] = d * sc[is + 4] * q3; - y[l + 96] = d * sc[is + 6] * q4; - } - y += 128; - ql += 64; - qh += 32; - sc += 8; - } -#else - for (int l = 0; l < 16; ++l) { - const int8_t q1 = (int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - const int8_t q2 = (int8_t)((ql[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - const int8_t q3 = (int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - const int8_t q4 = (int8_t)((ql[l+16] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - y[l+ 0] = d * sc[0] * q1; - y[l+16] = d * sc[1] * q2; - y[l+32] = d * sc[2] * q3; - y[l+48] = d * sc[3] * q4; - } - y += 64; -#endif - } -} - -kernel void kernel_get_rows_q2_K( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; - - dequantize_row_q2_K( - (device const block_q2_K *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} - -kernel void kernel_get_rows_q3_K( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; - - dequantize_row_q3_K( - (device const block_q3_K *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} - -kernel void kernel_get_rows_q4_K( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; - - dequantize_row_q4_K( - (device const block_q4_K *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} - -kernel void kernel_get_rows_q5_K( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; - - dequantize_row_q5_K( - (device const block_q5_K *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} - -kernel void kernel_get_rows_q6_K( - device const void * src0, - device const int * src1, - device float * dst, - constant int64_t & ne00, - constant uint64_t & nb01, - constant uint64_t & nb1, - uint tpig[[thread_position_in_grid]]) { - const int i = tpig; - const int r = ((device int32_t *) src1)[i]; - - dequantize_row_q6_K( - (device const block_q6_K *) ((device char *) src0 + r*nb01), - (device float *) ((device char *) dst + i*nb1), ne00); -} - //====================================== dot products ========================= kernel void kernel_mul_mat_q2_K_f32( @@ -1224,21 +797,27 @@ kernel void kernel_mul_mat_q2_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, constant int64_t & ne01[[buffer(4)]], - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; + const int r2 = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int ib_row = first_row * nb; - device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row; - device const float * y = (device const float *) src1 + r1*ne10; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q2_K * x = (device const block_q2_K *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; float yl[32]; float sumf[N_DST]={0.f}, all_sum; @@ -1351,7 +930,7 @@ kernel void kernel_mul_mat_q2_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + first_row + row] = all_sum; + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; } } } @@ -1362,10 +941,14 @@ kernel void kernel_mul_mat_q3_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, - constant int64_t & ne1, - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1373,11 +956,12 @@ kernel void kernel_mul_mat_q3_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; + const int64_t r2 = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; - - device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb; - device const float * yy = (device const float *) src1 + r1*ne10; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q3_K * x = (device const block_q3_K *) src0 + first_row*nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; float yl[16]; @@ -1465,7 +1049,7 @@ kernel void kernel_mul_mat_q3_K_f32( const float sumf = (sumf1[row] - 32.f*sumf2[row]) / (1 << shift); const float tot = simd_sum(sumf); if (tiisg == 0) { - dst[r1*ne0 + first_row + row] = tot; + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot; } } } @@ -1475,10 +1059,14 @@ kernel void kernel_mul_mat_q3_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, - constant int64_t & ne1, - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1486,11 +1074,12 @@ kernel void kernel_mul_mat_q3_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; + const int64_t r2 = tgpig.z; const int row = 2 * r0 + sgitg; - - device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb; - device const float * yy = (device const float *) src1 + r1*ne10; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; const int ix = tiisg/4; const int il = 4 * (tiisg%4);// 0, 4, 8, 12 const int im = il/8; // 0, 0, 1, 1 @@ -1529,7 +1118,7 @@ kernel void kernel_mul_mat_q3_K_f32( const float tot = simd_sum(sumf); if (tiisg == 0) { - dst[r1*ne0 + row] = tot; + dst[r1*ne0 + r2*ne0*ne1 + row] = tot; } } @@ -1541,10 +1130,14 @@ kernel void kernel_mul_mat_q4_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, constant int64_t & ne01[[buffer(4)]], - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1560,10 +1153,12 @@ kernel void kernel_mul_mat_q4_K_f32( const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; + const int r2 = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int ib_row = first_row * nb; - device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row; - device const float * y = (device const float *) src1 + r1*ne10; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; float yl[16]; float yh[16]; float sumf[N_DST]={0.f}, all_sum; @@ -1630,7 +1225,7 @@ kernel void kernel_mul_mat_q4_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + first_row + row] = all_sum; + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = all_sum; } } } @@ -1640,10 +1235,14 @@ kernel void kernel_mul_mat_q4_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, constant int64_t & ne01[[buffer(4)]], - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1653,10 +1252,12 @@ kernel void kernel_mul_mat_q4_K_f32( const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; + const int r2 = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; const int ib_row = first_row * nb; - device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row; - device const float * y = (device const float *) src1 + r1*ne10; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; float yl[8]; float yh[8]; float sumf[N_DST]={0.f}, all_sum; @@ -1712,7 +1313,7 @@ kernel void kernel_mul_mat_q4_K_f32( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + first_row + row] = all_sum; + dst[r1*ne0+ r2*ne0*ne1 + first_row + row] = all_sum; } } } @@ -1723,9 +1324,14 @@ kernel void kernel_mul_mat_q5_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1733,11 +1339,12 @@ kernel void kernel_mul_mat_q5_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; + const int r2 = tgpig.z; const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2; - - device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb; - device const float * yy = (device const float *) src1 + r1*ne10; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; float sumf[2]={0.f}; @@ -1871,7 +1478,7 @@ kernel void kernel_mul_mat_q5_K_f32( for (int row = 0; row < 2; ++row) { const float tot = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + first_row + row] = tot; + dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = tot; } } @@ -1882,9 +1489,14 @@ kernel void kernel_mul_mat_q6_K_f32( device const float * src1, device float * dst, constant int64_t & ne00, - constant int64_t & ne10, - constant int64_t & ne0, - uint2 tgpig[[threadgroup_position_in_grid]], + constant int64_t & ne01[[buffer(4)]], + constant int64_t & ne02[[buffer(5)]], + constant int64_t & ne10[[buffer(9)]], + constant int64_t & ne12[[buffer(11)]], + constant int64_t & ne0[[buffer(15)]], + constant int64_t & ne1[[buffer(16)]], + constant uint & gqa[[buffer(17)]], + uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -1897,11 +1509,12 @@ kernel void kernel_mul_mat_q6_K_f32( const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; + const int r2 = tgpig.z; const int row = 2 * r0 + sgitg; - - device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb; //r0*nb; - device const float * yy = (device const float *) src1 + r1*ne10; + const uint offset0 = r2/gqa*(nb*ne0); + device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb + offset0; + device const float * yy = (device const float *) src1 + r1*ne10 + r2*ne00*ne1; float sumf = 0; @@ -1967,6 +1580,368 @@ kernel void kernel_mul_mat_q6_K_f32( const float tot = simd_sum(sumf); if (tiisg == 0) { - dst[r1*ne0 + row] = tot; + dst[r1*ne0 + r2*ne0*ne1 + row] = tot; + } +} + +//============================= templates and their specializations ============================= + +template +void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) { + half4x4 temp = *(((device half4x4 *)src)); + for (int i = 0; i < 16; i++){ + reg[i/4][i%4] = temp[i/4][i%4]; } } + +template +void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 1); + const half d = il ? (xb->d / 16.h) : xb->d; + const half m = il ? (-8.h * 16.h) : -8.h; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = il ? 0xF000 : 0x0F00; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)] = (((qs[i] & mask0)) + m) * d; + reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) + m) * d; + } +} + +template +void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) { + device const uint16_t * qs = ((device const uint16_t *)xb + 2); + const half d = il ? (xb->d / 16.h) : xb->d; + const half m = xb->m; + const ushort mask0 = il ? 0x00F0 : 0x000F; + const ushort mask1 = il ? 0xF000 : 0x0F00; + + for (int i=0;i<8;i++) { + reg[i/2][2*(i%2)] = (((qs[i] & mask0)) * d) + m; + reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) * d) + m; + } +} + +template +void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { + const half d = xb->d; + const half min = xb->dmin; + device const uint8_t * q = (device const uint8_t *)xb->qs; + half dl, ml; + uint8_t sc = xb->scales[il]; + +#if QK_K == 256 + q = q + 32*(il/8) + 16*(il&1); + il = (il/2)%4; +#endif + half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template +void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg) { + const float d_all = (float)(xb->d); + device const uint8_t * q = (device const uint8_t *)xb->qs; + device const uint8_t * h = (device const uint8_t *)xb->hmask; + device const int8_t * scales = (device const int8_t *)xb->scales; + +#if QK_K == 256 + q = q + 32 * (il/8) + 16 * (il&1); + h = h + 16 * (il&1); + uint8_t m = 1 << (il/2); + uint16_t kmask1 = (il/4)>1 ? ((il/4)>2 ? 192 : 48) : \ + ((il/4)>0 ? 12 : 3); + uint16_t kmask2 = il/8 ? 0xF0 : 0x0F; + uint16_t scale_2 = scales[il%8], scale_1 = scales[8 + il%4]; + int16_t dl_int = (il/4)&1 ? (scale_2&kmask2) | ((scale_1&kmask1) << 2) : \ + (scale_2&kmask2) | ((scale_1&kmask1) << 4); + float dl = il<8 ? d_all * (dl_int - 32.f) : d_all * (dl_int / 16.f - 32.f); + + il = (il/2)%4; + float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i] & m) ? 0 : 4.f/coef)); + } +#else + float kcoef = il&1 ? 1.f/16.f : 1.f; + uint16_t kmask = il&1 ? 0xF0 : 0x0F; + float dl = d_all * ((scales[il/2] & kmask) * kcoef - 8); + float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h); + uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + uint8_t m = 1<<(il*2); + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef)); + } +#endif +} + +template +void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) { + device const uint8_t * q = xb->qs; + +#if QK_K == 256 + const float d = (float)(xb->d); + const float min = (float)(xb->dmin); + short is = (il/4) * 2; + q = q + (il/4) * 32 + 16 * (il&1); + il = il%4; + const uchar4 sc = get_scale_min_k4(is, xb->scales); + const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h; + const float ml = il<2 ? min * sc[1] : min * sc[3]; +#else + q = q + 16 * (il&1); + device const uint8_t * s = xb->scales; + device const half2 * dh = (device const half2 *)xb->d; + const float2 d = (float2)dh[0]; + const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h; + const float ml = il<2 ? d[1] * (s[0]>>4) : d[1 ]* (s[1]>>4); +#endif + const ushort mask = il<2 ? 0x0F : 0xF0; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * (q[i] & mask) - ml; + } +} + +template +void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg) { + device const uint8_t * q = xb->qs; + device const uint8_t * qh = xb->qh; + +#if QK_K == 256 + const float d = (float)(xb->d); + const float min = (float)(xb->dmin); + short is = (il/4) * 2; + q = q + 32 * (il/4) + 16 * (il&1); + qh = qh + 16 * (il&1); + uint8_t ul = 1 << (il/2); + il = il%4; + const uchar4 sc = get_scale_min_k4(is, xb->scales); + const float dl = il<2 ? d * sc[0] : d * sc[2]/16.h; + const float ml = il<2 ? min * sc[1] : min * sc[3]; + + const ushort mask = il<2 ? 0x0F : 0xF0; + const float qh_val = il<2 ? 16.f : 256.f; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml; + } +#else + q = q + 16 * (il&1); + device const int8_t * s = xb->scales; + const float dl = xb->d * s[il]; + uint8_t m = 1<<(il*2); + const float coef = il<2 ? 1.f : 1.f/16.f; + const ushort mask = il<2 ? 0x0F : 0xF0; + for (int i = 0; i < 16; ++i) { + reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef)); + } +#endif +} + +template +void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg) { + const float d_all = (float)(xb->d); + device const uint8_t * ql = (device const uint8_t *)xb->ql; + device const uint8_t * qh = (device const uint8_t *)xb->qh; + device const int8_t * scales = (device const int8_t *)xb->scales; + +#if QK_K == 256 + ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1); + qh = qh + 32*(il/8) + 16*(il&1); + float sc = scales[(il%2) + 2 * ((il/2))]; + il = (il/2)%4; +#else + ql = ql + 16 * (il&1); + float sc = scales[il]; +#endif + for (int i = 0; i < 16; ++i) { + uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3); + uint16_t kmask2 = il>1 ? 0xF0 : 0x0F; + const float coef = il>1 ? 1.f/16.f : 1.f; + float q = il&1 ? ((ql[i]&kmask2)|((qh[i]&kmask1)<<2)) - 32.f/coef : \ + ((ql[i]&kmask2)|((qh[i]&kmask1)<<4)) - 32.f/coef; + reg[i/4][i%4] = d_all * sc * q * coef; + } +} + +template +kernel void kernel_get_rows( + device const void * src0, + device const int * src1, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant uint64_t & nb1, + uint tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tptg[[threads_per_threadgroup]]) { + const int i = tgpig; + const int r = ((device int32_t *) src1)[i]; + + for (int ind = tiitg; ind < ne00/16; ind += tptg) { + float4x4 temp; + dequantize_func( + ((device const block_q *) ((device char *) src0 + r*nb01)) + ind/nl, ind%nl, temp); + *(((device float4x4 *) ((device char *) dst + i*nb1)) + ind) = temp; + } +} + +#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A +#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A +#define BLOCK_SIZE_K 32 +#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A +#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B +#define THREAD_PER_BLOCK 128 +#define THREAD_PER_ROW 2 // 2 thread for each row in matrix A to load numbers +#define THREAD_PER_COL 4 // 4 thread for each row in matrix B to load numbers +#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8 +#define SG_MAT_ROW 8 + +// each block_q contains 16*nl weights +template +kernel void kernel_mul_mm(device const uchar * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne02, + constant int64_t & nb01, + constant int64_t & nb02, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & gqa, + threadgroup uchar * shared_memory [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + threadgroup half * sa = ((threadgroup half *)shared_memory); + threadgroup float * sb = (threadgroup float *)(shared_memory + 4096); + + const uint r0 = tgpig.y; + const uint r1 = tgpig.x; + const uint im = tgpig.z; + // if this block is of 64x32 shape or smaller + short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; + short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + // a thread shouldn't load data outside of the matrix + short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; + short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; + + simdgroup_half8x8 ma[4]; + simdgroup_float8x8 mb[2]; + simdgroup_float8x8 c_res[8]; + for (int i = 0; i < 8; i++){ + c_res[i] = make_filled_simdgroup_matrix(0.f); + } + + short il = (tiitg % THREAD_PER_ROW); + uint offset0 = im/gqa*nb02; ushort offset1 = il/nl; + device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; + device const float * y = src1 + (r1 * BLOCK_SIZE_N + thread_col) * ne00 \ + + BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL) + im * ne00 * ne1; + + for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { + //load data and store to threadgroup memory + half4x4 temp_a; + dequantize_func(x, il, temp_a); + threadgroup_barrier(mem_flags::mem_threadgroup); + #pragma unroll(16) + for (int i = 0; i < 16; i++) { + *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ + + 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \ + + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + } + *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \ + = *((device float2x4 *)y); + il = (il + 2 < nl) ? il + 2 : il % 2; + x = (il < 2) ? x + (2+nl-1)/nl : x; + y += BLOCK_SIZE_K; + + threadgroup_barrier(mem_flags::mem_threadgroup); + //load matrices from threadgroup memory and conduct outer products + threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); + threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + #pragma unroll(4) + for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { + #pragma unroll(4) + for (int i = 0; i < 4; i++) { + simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i); + } + simdgroup_barrier(mem_flags::mem_none); + #pragma unroll(2) + for (int i = 0; i < 2; i++) { + simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i); + } + + lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; + lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; + #pragma unroll(8) + for (int i = 0; i < 8; i++){ + simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]); + } + } + } + + if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) { + device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \ + + (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0; + for (int i = 0; i < 8; i++) { + simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0); + } + } else { + // block is smaller than 64x32, we should avoid writing data outside of the matrix + threadgroup_barrier(mem_flags::mem_threadgroup); + threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ + + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; + for (int i = 0; i < 8; i++) { + simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; + if (sgitg==0) { + for (int i = 0; i < n_rows; i++) { + for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) { + *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M); + } + } + } + } +} + +#if QK_K == 256 +#define QK_NL 16 +#else +#define QK_NL 4 +#endif + +typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \ + constant uint64_t &, constant uint64_t &, uint, uint, uint); + +template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q2_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q3_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows; + +typedef void (mat_mm_t)(device const uchar *, device const float *, device float *, constant int64_t &,\ + constant int64_t &, constant int64_t &, constant int64_t &, constant int64_t &, \ + constant int64_t &, constant int64_t &, constant uint &, threadgroup uchar *, uint3, uint, uint); + +template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; From 9bdb10b139caf90d300920d5be1bfe7c6a2988ec Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 22 Aug 2023 11:35:35 +0300 Subject: [PATCH 3/4] ggml : sync ggml-alloc ggml-ci --- include/ggml/ggml-alloc.h | 4 ++++ scripts/sync-llama.sh | 2 ++ src/ggml-alloc.c | 48 +++++++++++++++++++++++++++++++-------- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/include/ggml/ggml-alloc.h b/include/ggml/ggml-alloc.h index a5ec8f87a..14a4350ac 100644 --- a/include/ggml/ggml-alloc.h +++ b/include/ggml/ggml-alloc.h @@ -10,6 +10,10 @@ extern "C" { GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); +// tell the allocator to parse nodes following the order described in the list +// you should call this if your graph are optimized to execute out-of-order +GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n); + GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); diff --git a/scripts/sync-llama.sh b/scripts/sync-llama.sh index 22d5fbecd..db7ee49a6 100755 --- a/scripts/sync-llama.sh +++ b/scripts/sync-llama.sh @@ -1,6 +1,7 @@ #!/bin/bash cp -rpv ../llama.cpp/ggml.c src/ggml.c +cp -rpv ../llama.cpp/ggml-alloc.c src/ggml-alloc.c cp -rpv ../llama.cpp/ggml-cuda.h src/ggml-cuda.h cp -rpv ../llama.cpp/ggml-cuda.cu src/ggml-cuda.cu cp -rpv ../llama.cpp/ggml-opencl.h src/ggml-opencl.h @@ -9,6 +10,7 @@ cp -rpv ../llama.cpp/ggml-metal.h src/ggml-metal.h cp -rpv ../llama.cpp/ggml-metal.m src/ggml-metal.m cp -rpv ../llama.cpp/ggml-metal.metal src/ggml-metal.metal cp -rpv ../llama.cpp/ggml.h include/ggml/ggml.h +cp -rpv ../llama.cpp/ggml-alloc.h include/ggml/ggml-alloc.h cp -rpv ../llama.cpp/tests/test-opt.cpp tests/test-opt.cpp cp -rpv ../llama.cpp/tests/test-grad0.cpp tests/test-grad0.cpp diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c index 71339a582..f06f9a3c1 100644 --- a/src/ggml-alloc.c +++ b/src/ggml-alloc.c @@ -67,6 +67,8 @@ struct ggml_allocr { struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE]; size_t max_size; bool measure; + int parse_seq[GGML_MAX_NODES]; + bool has_parse_seq; #ifdef GGML_ALLOCATOR_DEBUG struct ggml_tensor * allocated_tensors[1024]; @@ -111,10 +113,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) size_t max_avail = 0; - // find the best fitting free block + // find the best fitting free block besides the last block int best_fit_block = -1; size_t best_fit_size = SIZE_MAX; - for (int i = 0; i < alloc->n_free_blocks; i++) { + for (int i = 0; i < alloc->n_free_blocks - 1; i++) { struct free_block * block = &alloc->free_blocks[i]; max_avail = MAX(max_avail, block->size); if (block->size >= size && block->size <= best_fit_size) { @@ -126,10 +128,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) AT_PRINTF("block %d\n", best_fit_block); if (best_fit_block == -1) { - fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", - __func__, size, max_avail); - GGML_ASSERT(!"not enough space in the buffer"); + // the last block is our last resort + struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; + if (block->size >= size) { + best_fit_block = alloc->n_free_blocks - 1; + max_avail = MAX(max_avail, block->size); + } else { + fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n", + __func__, size, max_avail); + GGML_ASSERT(!"not enough space in the buffer"); return; + } } struct free_block * block = &alloc->free_blocks[best_fit_block]; void * addr = block->addr; @@ -229,6 +238,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t alloc->n_free_blocks++; } +void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) { + int pos = 0; + for (int i = 0; i < n; i++) { + if (list[i] != -1) { + alloc->parse_seq[pos] = list[i]; + pos++; + } + } + alloc->has_parse_seq = true; +} + void ggml_allocr_reset(struct ggml_allocr * alloc) { alloc->n_free_blocks = 1; size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); @@ -248,8 +268,10 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) /*.hash_table = */ {{0}}, /*.max_size = */ 0, /*.measure = */ false, + /*.parse_seq = */ {0}, + /*.has_parse_seq = */ false, #ifdef GGML_ALLOCATOR_DEBUG - /* .allocated_tensors = */ {0}, + /*.allocated_tensors = */ = {0}, #endif }; @@ -275,8 +297,10 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { /*.hash_table = */ {{0}}, /*.max_size = */ 0, /*.measure = */ true, + /*.parse_seq = */ {0}, + /*.has_parse_seq = */ false, #ifdef GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ {0}, + /*.allocated_tensors = */ = {0}, #endif }; @@ -473,7 +497,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n( allocate_node(alloc, input); } } - for (int i = 0; i < gf->n_nodes; i++) { + for (int ind = 0; ind < gf->n_nodes; ind++) { + int i; + if (alloc->has_parse_seq) { + i = alloc->parse_seq[ind]; + } else { + i = ind; + } struct ggml_tensor * node = gf->nodes[i]; // allocate parents (leafs) @@ -517,7 +547,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n( struct ggml_tensor * view_src = get_view_source(parent); struct hash_node * view_src_hn = hash_get(ht, view_src); view_src_hn->n_views -= 1; - AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); + AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views); if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { ggml_allocator_free_tensor(alloc, view_src); } From adab318633ddc28d3f7c80efa77253fb589487d5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 22 Aug 2023 11:37:29 +0300 Subject: [PATCH 4/4] ggml : remove obosolete constant arrays (type traits) ggml-ci --- src/ggml.c | 89 ------------------------------------------------------ 1 file changed, 89 deletions(-) diff --git a/src/ggml.c b/src/ggml.c index 977933b1c..7b5922e36 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -3725,95 +3725,6 @@ inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) { // data types // -static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = 1, - [GGML_TYPE_F16] = 1, - [GGML_TYPE_Q4_0] = QK4_0, - [GGML_TYPE_Q4_1] = QK4_1, - [GGML_TYPE_Q5_0] = QK5_0, - [GGML_TYPE_Q5_1] = QK5_1, - [GGML_TYPE_Q8_0] = QK8_0, - [GGML_TYPE_Q8_1] = QK8_1, -#ifdef GGML_USE_K_QUANTS - [GGML_TYPE_Q2_K] = QK_K, - [GGML_TYPE_Q3_K] = QK_K, - [GGML_TYPE_Q4_K] = QK_K, - [GGML_TYPE_Q5_K] = QK_K, - [GGML_TYPE_Q6_K] = QK_K, - [GGML_TYPE_Q8_K] = QK_K, -#endif - [GGML_TYPE_I8] = 1, - [GGML_TYPE_I16] = 1, - [GGML_TYPE_I32] = 1, -}; -static_assert(GGML_TYPE_COUNT == 19, "GGML_BLCK_SIZE is outdated"); - -static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = sizeof(float), - [GGML_TYPE_F16] = sizeof(ggml_fp16_t), - [GGML_TYPE_Q4_0] = sizeof(block_q4_0), - [GGML_TYPE_Q4_1] = sizeof(block_q4_1), - [GGML_TYPE_Q5_0] = sizeof(block_q5_0), - [GGML_TYPE_Q5_1] = sizeof(block_q5_1), - [GGML_TYPE_Q8_0] = sizeof(block_q8_0), - [GGML_TYPE_Q8_1] = sizeof(block_q8_1), -#ifdef GGML_USE_K_QUANTS - [GGML_TYPE_Q2_K] = sizeof(block_q2_K), - [GGML_TYPE_Q3_K] = sizeof(block_q3_K), - [GGML_TYPE_Q4_K] = sizeof(block_q4_K), - [GGML_TYPE_Q5_K] = sizeof(block_q5_K), - [GGML_TYPE_Q6_K] = sizeof(block_q6_K), - [GGML_TYPE_Q8_K] = sizeof(block_q8_K), -#endif - [GGML_TYPE_I8] = sizeof(int8_t), - [GGML_TYPE_I16] = sizeof(int16_t), - [GGML_TYPE_I32] = sizeof(int32_t), -}; -static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_SIZE is outdated"); - - -static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = "f32", - [GGML_TYPE_F16] = "f16", - [GGML_TYPE_Q4_0] = "q4_0", - [GGML_TYPE_Q4_1] = "q4_1", - [GGML_TYPE_Q5_0] = "q5_0", - [GGML_TYPE_Q5_1] = "q5_1", - [GGML_TYPE_Q8_0] = "q8_0", - [GGML_TYPE_Q8_1] = "q8_1", - [GGML_TYPE_Q2_K] = "q2_K", - [GGML_TYPE_Q3_K] = "q3_K", - [GGML_TYPE_Q4_K] = "q4_K", - [GGML_TYPE_Q5_K] = "q5_K", - [GGML_TYPE_Q6_K] = "q6_K", - [GGML_TYPE_Q8_K] = "q8_K", - [GGML_TYPE_I8] = "i8", - [GGML_TYPE_I16] = "i16", - [GGML_TYPE_I32] = "i32", -}; -static_assert(GGML_TYPE_COUNT == 19, "GGML_TYPE_NAME is outdated"); - -static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = false, - [GGML_TYPE_F16] = false, - [GGML_TYPE_Q4_0] = true, - [GGML_TYPE_Q4_1] = true, - [GGML_TYPE_Q5_0] = true, - [GGML_TYPE_Q5_1] = true, - [GGML_TYPE_Q8_0] = true, - [GGML_TYPE_Q8_1] = true, - [GGML_TYPE_Q2_K] = true, - [GGML_TYPE_Q3_K] = true, - [GGML_TYPE_Q4_K] = true, - [GGML_TYPE_Q5_K] = true, - [GGML_TYPE_Q6_K] = true, - [GGML_TYPE_Q8_K] = true, - [GGML_TYPE_I8] = false, - [GGML_TYPE_I16] = false, - [GGML_TYPE_I32] = false, -}; -static_assert(GGML_TYPE_COUNT == 19, "GGML_IS_QUANTIZED is outdated"); - static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "NONE",