diff --git a/README.md b/README.md index 7030201a..6ea6e577 100644 --- a/README.md +++ b/README.md @@ -637,9 +637,9 @@ int main() { simsimd_f32_t vector_a[1536]; simsimd_f32_t vector_b[1536]; simsimd_kernel_punned_t distance_function = simsimd_metric_punned( - simsimd_cos_k, // Metric kind, like the angular cosine distance - simsimd_datatype_f32_k, // Data type, like: f16, f32, f64, i8, b8, and complex variants - simsimd_cap_any_k); // Which CPU capabilities are we allowed to use + simsimd_cos_k, // Metric kind, like the angular cosine distance + simsimd_f32_k, // Data type, like: f16, f32, f64, i8, b8, complex variants, etc. + simsimd_cap_any_k); // Which CPU capabilities are we allowed to use simsimd_distance_t distance; distance_function(vector_a, vector_b, 1536, &distance); return 0; diff --git a/c/lib.c b/c/lib.c index d0bc1b88..f554c132 100644 --- a/c/lib.c +++ b/c/lib.c @@ -55,105 +55,105 @@ extern "C" { // If no metric is found, it returns NaN. We can obtain NaN by dividing 0.0 by 0.0, but that annoys // the MSVC compiler. Instead we can directly write-in the signaling NaN (0x7FF0000000000001) // or the qNaN (0x7FF8000000000000). 
-#define SIMSIMD_DECLARATION_DENSE(name, extension, type) \ - SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ - simsimd_size_t n, simsimd_distance_t *results) { \ - static simsimd_metric_dense_punned_t metric = 0; \ - if (metric == 0) { \ - simsimd_capability_t used_capability; \ - simsimd_find_kernel_punned(simsimd_##name##_k, simsimd_datatype_##extension##_k, simsimd_capabilities(), \ - simsimd_cap_any_k, (simsimd_kernel_punned_t *)&metric, &used_capability); \ - if (!metric) { \ - *(simsimd_u64_t *)results = 0x7FF0000000000001ull; \ - return; \ - } \ - } \ - metric(a, b, n, results); \ +#define SIMSIMD_DECLARATION_DENSE(name, extension, type) \ + SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ + simsimd_size_t n, simsimd_distance_t *results) { \ + static simsimd_dense_metric_t metric = 0; \ + if (metric == 0) { \ + simsimd_capability_t used_capability; \ + simsimd_find_kernel(simsimd_##name##_k, simsimd_##extension##_k, simsimd_capabilities(), \ + simsimd_cap_any_k, (simsimd_kernel_punned_t *)&metric, &used_capability); \ + if (!metric) { \ + *(simsimd_u64_t *)results = 0x7FF0000000000001ull; \ + return; \ + } \ + } \ + metric(a, b, n, results); \ } -#define SIMSIMD_DECLARATION_SPARSE(name, extension, type) \ - SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ - simsimd_size_t a_length, simsimd_size_t b_length, \ - simsimd_distance_t *result) { \ - static simsimd_metric_sparse_punned_t metric = 0; \ - if (metric == 0) { \ - simsimd_capability_t used_capability; \ - simsimd_find_kernel_punned(simsimd_##name##_k, simsimd_datatype_##extension##_k, simsimd_capabilities(), \ - simsimd_cap_any_k, (simsimd_kernel_punned_t *)(&metric), &used_capability); \ - if (!metric) { \ - *(simsimd_u64_t *)result = 0x7FF0000000000001ull; \ - return; \ - } \ - } \ - metric(a, b, a_length, b_length, 
result); \ +#define SIMSIMD_DECLARATION_SPARSE(name, extension, type) \ + SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ + simsimd_size_t a_length, simsimd_size_t b_length, \ + simsimd_distance_t *result) { \ + static simsimd_sparse_metric_t metric = 0; \ + if (metric == 0) { \ + simsimd_capability_t used_capability; \ + simsimd_find_kernel(simsimd_##name##_k, simsimd_##extension##_k, simsimd_capabilities(), \ + simsimd_cap_any_k, (simsimd_kernel_punned_t *)(&metric), &used_capability); \ + if (!metric) { \ + *(simsimd_u64_t *)result = 0x7FF0000000000001ull; \ + return; \ + } \ + } \ + metric(a, b, a_length, b_length, result); \ } -#define SIMSIMD_DECLARATION_CURVED(name, extension, type) \ - SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ - simsimd_##type##_t const *c, simsimd_size_t n, \ - simsimd_distance_t *result) { \ - static simsimd_metric_curved_punned_t metric = 0; \ - if (metric == 0) { \ - simsimd_capability_t used_capability; \ - simsimd_find_kernel_punned(simsimd_##name##_k, simsimd_datatype_##extension##_k, simsimd_capabilities(), \ - simsimd_cap_any_k, (simsimd_kernel_punned_t *)(&metric), &used_capability); \ - if (!metric) { \ - *(simsimd_u64_t *)result = 0x7FF0000000000001ull; \ - return; \ - } \ - } \ - metric(a, b, c, n, result); \ +#define SIMSIMD_DECLARATION_CURVED(name, extension, type) \ + SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ + simsimd_##type##_t const *c, simsimd_size_t n, \ + simsimd_distance_t *result) { \ + static simsimd_curved_metric_t metric = 0; \ + if (metric == 0) { \ + simsimd_capability_t used_capability; \ + simsimd_find_kernel(simsimd_##name##_k, simsimd_##extension##_k, simsimd_capabilities(), \ + simsimd_cap_any_k, (simsimd_kernel_punned_t *)(&metric), &used_capability); \ + if (!metric) { \ + *(simsimd_u64_t *)result = 
0x7FF0000000000001ull; \ + return; \ + } \ + } \ + metric(a, b, c, n, result); \ } -#define SIMSIMD_DECLARATION_SCALE(name, extension, type) \ - SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_size_t n, \ - simsimd_distance_t alpha, simsimd_distance_t beta, \ - simsimd_##type##_t *result) { \ - static simsimd_kernel_scale_punned_t metric = 0; \ - if (metric == 0) { \ - simsimd_capability_t used_capability; \ - simsimd_find_kernel_punned(simsimd_##name##_k, simsimd_datatype_##extension##_k, simsimd_capabilities(), \ - simsimd_cap_any_k, (simsimd_kernel_punned_t *)(&metric), &used_capability); \ - } \ - metric(a, n, alpha, beta, result); \ +#define SIMSIMD_DECLARATION_SCALE(name, extension, type) \ + SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_size_t n, \ + simsimd_distance_t alpha, simsimd_distance_t beta, \ + simsimd_##type##_t *result) { \ + static simsimd_elementwise_scale_t metric = 0; \ + if (metric == 0) { \ + simsimd_capability_t used_capability; \ + simsimd_find_kernel(simsimd_##name##_k, simsimd_##extension##_k, simsimd_capabilities(), \ + simsimd_cap_any_k, (simsimd_kernel_punned_t *)(&metric), &used_capability); \ + } \ + metric(a, n, alpha, beta, result); \ } -#define SIMSIMD_DECLARATION_SUM(name, extension, type) \ - SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ - simsimd_size_t n, simsimd_##type##_t *result) { \ - static simsimd_kernel_sum_punned_t metric = 0; \ - if (metric == 0) { \ - simsimd_capability_t used_capability; \ - simsimd_find_kernel_punned(simsimd_##name##_k, simsimd_datatype_##extension##_k, simsimd_capabilities(), \ - simsimd_cap_any_k, (simsimd_kernel_punned_t *)(&metric), &used_capability); \ - } \ - metric(a, b, n, result); \ +#define SIMSIMD_DECLARATION_SUM(name, extension, type) \ + SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const 
*b, \ + simsimd_size_t n, simsimd_##type##_t *result) { \ + static simsimd_elementwise_sum_t metric = 0; \ + if (metric == 0) { \ + simsimd_capability_t used_capability; \ + simsimd_find_kernel(simsimd_##name##_k, simsimd_##extension##_k, simsimd_capabilities(), \ + simsimd_cap_any_k, (simsimd_kernel_punned_t *)(&metric), &used_capability); \ + } \ + metric(a, b, n, result); \ } -#define SIMSIMD_DECLARATION_WSUM(name, extension, type) \ - SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ - simsimd_size_t n, simsimd_distance_t alpha, \ - simsimd_distance_t beta, simsimd_##type##_t *result) { \ - static simsimd_kernel_wsum_punned_t metric = 0; \ - if (metric == 0) { \ - simsimd_capability_t used_capability; \ - simsimd_find_kernel_punned(simsimd_##name##_k, simsimd_datatype_##extension##_k, simsimd_capabilities(), \ - simsimd_cap_any_k, (simsimd_kernel_punned_t *)(&metric), &used_capability); \ - } \ - metric(a, b, n, alpha, beta, result); \ +#define SIMSIMD_DECLARATION_WSUM(name, extension, type) \ + SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ + simsimd_size_t n, simsimd_distance_t alpha, \ + simsimd_distance_t beta, simsimd_##type##_t *result) { \ + static simsimd_elementwise_wsum_t metric = 0; \ + if (metric == 0) { \ + simsimd_capability_t used_capability; \ + simsimd_find_kernel(simsimd_##name##_k, simsimd_##extension##_k, simsimd_capabilities(), \ + simsimd_cap_any_k, (simsimd_kernel_punned_t *)(&metric), &used_capability); \ + } \ + metric(a, b, n, alpha, beta, result); \ } -#define SIMSIMD_DECLARATION_FMA(name, extension, type) \ - SIMSIMD_DYNAMIC void simsimd_##name##_##extension( \ - simsimd_##type##_t const *a, simsimd_##type##_t const *b, simsimd_##type##_t const *c, simsimd_size_t n, \ - simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_##type##_t *result) { \ - static simsimd_kernel_fma_punned_t metric = 0; \ - if 
(metric == 0) { \ - simsimd_capability_t used_capability; \ - simsimd_find_kernel_punned(simsimd_##name##_k, simsimd_datatype_##extension##_k, simsimd_capabilities(), \ - simsimd_cap_any_k, (simsimd_kernel_punned_t *)(&metric), &used_capability); \ - } \ - metric(a, b, c, n, alpha, beta, result); \ +#define SIMSIMD_DECLARATION_FMA(name, extension, type) \ + SIMSIMD_DYNAMIC void simsimd_##name##_##extension( \ + simsimd_##type##_t const *a, simsimd_##type##_t const *b, simsimd_##type##_t const *c, simsimd_size_t n, \ + simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_##type##_t *result) { \ + static simsimd_elementwise_fma_t metric = 0; \ + if (metric == 0) { \ + simsimd_capability_t used_capability; \ + simsimd_find_kernel(simsimd_##name##_k, simsimd_##extension##_k, simsimd_capabilities(), \ + simsimd_cap_any_k, (simsimd_kernel_punned_t *)(&metric), &used_capability); \ + } \ + metric(a, b, c, n, alpha, beta, result); \ } // Dot products @@ -370,14 +370,14 @@ SIMSIMD_DYNAMIC simsimd_capability_t simsimd_capabilities(void) { return static_capabilities; } -SIMSIMD_DYNAMIC void simsimd_find_kernel_punned( // - simsimd_metric_kind_t kind, // - simsimd_datatype_t datatype, // - simsimd_capability_t supported, // - simsimd_capability_t allowed, // - simsimd_kernel_punned_t *kernel_output, // +SIMSIMD_DYNAMIC void simsimd_find_kernel( // + simsimd_metric_kind_t kind, // + simsimd_datatype_t datatype, // + simsimd_capability_t supported, // + simsimd_capability_t allowed, // + simsimd_kernel_punned_t *kernel_output, // simsimd_capability_t *capability_output) { - _simsimd_find_kernel_punned_implementation(kind, datatype, supported, allowed, kernel_output, capability_output); + _simsimd_find_kernel_implementation(kind, datatype, supported, allowed, kernel_output, capability_output); } #ifdef __cplusplus diff --git a/golang/simsimd.go b/golang/simsimd.go index 86465508..2ff5b4a5 100644 --- a/golang/simsimd.go +++ b/golang/simsimd.go @@ -8,12 +8,12 @@ package 
simsimd #include "../include/simsimd/simsimd.h" #include -inline static simsimd_f32_t cosine_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_cosine_k, simsimd_datatype_i8_k, simsimd_cap_any_k)(a, b, d, d); } -inline static simsimd_f32_t cosine_f32(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_cosine_k, simsimd_datatype_f32_k, simsimd_cap_any_k)(a, b, d, d); } -inline static simsimd_f32_t inner_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_inner_k, simsimd_datatype_i8_k, simsimd_cap_any_k)(a, b, d, d); } -inline static simsimd_f32_t inner_f32(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_inner_k, simsimd_datatype_f32_k, simsimd_cap_any_k)(a, b, d, d); } -inline static simsimd_f32_t sqeuclidean_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_sqeuclidean_k, simsimd_datatype_i8_k, simsimd_cap_any_k)(a, b, d, d); } -inline static simsimd_f32_t sqeuclidean_f32(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_sqeuclidean_k, simsimd_datatype_f32_k, simsimd_cap_any_k)(a, b, d, d); } +inline static simsimd_f32_t cosine_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_cosine_k, simsimd_i8_k, simsimd_cap_any_k)(a, b, d, d); } +inline static simsimd_f32_t cosine_f32(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_cosine_k, simsimd_f32_k, simsimd_cap_any_k)(a, b, d, d); } +inline static simsimd_f32_t inner_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_inner_k, simsimd_i8_k, simsimd_cap_any_k)(a, b, d, d); } +inline static simsimd_f32_t inner_f32(simsimd_f32_t 
const* a, simsimd_f32_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_inner_k, simsimd_f32_k, simsimd_cap_any_k)(a, b, d, d); } +inline static simsimd_f32_t sqeuclidean_i8(simsimd_i8_t const* a, simsimd_i8_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_sqeuclidean_k, simsimd_i8_k, simsimd_cap_any_k)(a, b, d, d); } +inline static simsimd_f32_t sqeuclidean_f32(simsimd_f32_t const* a, simsimd_f32_t const* b, simsimd_size_t d) { return simsimd_metric_punned(simsimd_sqeuclidean_k, simsimd_f32_k, simsimd_cap_any_k)(a, b, d, d); } */ import "C" diff --git a/include/simsimd/elementwise.h b/include/simsimd/elementwise.h index 95c0b57a..e55c698f 100644 --- a/include/simsimd/elementwise.h +++ b/include/simsimd/elementwise.h @@ -4,7 +4,7 @@ * @author Ash Vardanian * @date October 16, 2024 * - * Contains following element-wise operations: + * Contains following @b Unary/Binary/Ternary element-wise operations: * - Scale (Multiply) with Shift: R[i] = Alpha * A[i] + Beta * - Sum (Add): R[i] = A[i] + B[i] * - WSum or Weighted-Sum: R[i] = Alpha * A[i] + Beta * B[i] @@ -1211,6 +1211,110 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_haswell( } } +SIMSIMD_PUBLIC void simsimd_sum_i16_haswell(simsimd_i16_t const *a, simsimd_i16_t const *b, simsimd_size_t n, + simsimd_i16_t *result) { + // The main loop: + simsimd_size_t i = 0; + for (; i + 16 <= n; i += 16) { + __m256i a_vec = _mm256_lddqu_si256((__m256i *)(a + i)); + __m256i b_vec = _mm256_lddqu_si256((__m256i *)(b + i)); + __m256i sum_vec = _mm256_adds_epi16(a_vec, b_vec); + _mm256_storeu_si256((__m256i *)(result + i), sum_vec); + } + + // The tail: + for (; i < n; ++i) { + simsimd_i64_t ai = a[i], bi = b[i]; + simsimd_i64_t sum = ai + bi; + _simsimd_i64_to_i16(&sum, result + i); + } +} + +SIMSIMD_PUBLIC void simsimd_scale_i16_haswell(simsimd_i16_t const *a, simsimd_size_t n, simsimd_distance_t alpha, + simsimd_distance_t beta, simsimd_i16_t *result) { + + simsimd_f32_t alpha_f32 = 
(simsimd_f32_t)alpha; + simsimd_f32_t beta_f32 = (simsimd_f32_t)beta; + __m256 alpha_vec = _mm256_set1_ps(alpha_f32); + __m256 beta_vec = _mm256_set1_ps(beta_f32); + + // The main loop: + simsimd_size_t i = 0; + for (; i + 8 <= n; i += 8) { + __m256 a_vec = _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm_lddqu_si128((__m128i *)(a + i)))); + // Scaling is unary, so only `a` is loaded; `alpha` and `beta` were broadcast into vectors above. + __m256 sum_vec = _mm256_fmadd_ps(a_vec, alpha_vec, beta_vec); + __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec); + sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-32768)); + sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(32767)); + __m128i sum_i16_vec = + _mm_packs_epi32(_mm256_castsi256_si128(sum_i32_vec), _mm256_extracti128_si256(sum_i32_vec, 1)); + _mm_storeu_si128((__m128i *)(result + i), sum_i16_vec); + } + + // The tail: + for (; i < n; ++i) { + simsimd_f32_t ai = a[i]; + simsimd_f32_t sum = alpha_f32 * ai + beta_f32; + _simsimd_f32_to_i16(&sum, result + i); + } +} + +SIMSIMD_PUBLIC void simsimd_fma_i16_haswell( // + simsimd_i16_t const *a, simsimd_i16_t const *b, simsimd_i16_t const *c, simsimd_size_t n, // + simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i16_t *result) { +#if 0 + simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha; + simsimd_f32_t beta_f32 = (simsimd_f32_t)beta; + __m256 alpha_vec = _mm256_set1_ps(alpha_f32); + __m256 beta_vec = _mm256_set1_ps(beta_f32); + int sum_i32s[8], a_i32s[8], b_i32s[8], c_i32s[8]; + + // The main loop: + simsimd_size_t i = 0; + for (; i + 8 <= n; i += 8) { + //? Handling loads and stores with SIMD is tricky. Not because of upcasting, but the + //? downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day. 
+ a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], // + a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7]; + b_i32s[0] = b[i + 0], b_i32s[1] = b[i + 1], b_i32s[2] = b[i + 2], b_i32s[3] = b[i + 3], // + b_i32s[4] = b[i + 4], b_i32s[5] = b[i + 5], b_i32s[6] = b[i + 6], b_i32s[7] = b[i + 7]; + c_i32s[0] = c[i + 0], c_i32s[1] = c[i + 1], c_i32s[2] = c[i + 2], c_i32s[3] = c[i + 3], // + c_i32s[4] = c[i + 4], c_i32s[5] = c[i + 5], c_i32s[6] = c[i + 6], c_i32s[7] = c[i + 7]; + //! NOTE(review): this remark was written for 8-bit inputs; the inputs here are 16-bit, so the "50% faster" + //! estimate for avoiding the slow `_mm256_cvtepi32_ps` instruction is unconfirmed for this kernel. + __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s)); + __m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s)); + __m256 c_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)c_i32s)); + // The normal part. + __m256 ab_vec = _mm256_mul_ps(a_vec, b_vec); + __m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec); + __m256 sum_vec = _mm256_fmadd_ps(c_vec, beta_vec, ab_scaled_vec); + // Instead of serial calls to expensive `_simsimd_f32_to_i16`, convert and clip to the i16 range with SIMD. + __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec); + sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-32768)); + sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(32767)); + // Export into a serial buffer. 
+ _mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec); + result[i + 0] = (simsimd_i16_t)sum_i32s[0]; + result[i + 1] = (simsimd_i16_t)sum_i32s[1]; + result[i + 2] = (simsimd_i16_t)sum_i32s[2]; + result[i + 3] = (simsimd_i16_t)sum_i32s[3]; + result[i + 4] = (simsimd_i16_t)sum_i32s[4]; + result[i + 5] = (simsimd_i16_t)sum_i32s[5]; + result[i + 6] = (simsimd_i16_t)sum_i32s[6]; + result[i + 7] = (simsimd_i16_t)sum_i32s[7]; + } + + // The tail: + for (; i < n; ++i) { + simsimd_f32_t ai = a[i], bi = b[i], ci = c[i]; + simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci; + _simsimd_f32_to_i16(&sum, result + i); + } +#endif +} + #pragma clang attribute pop #pragma GCC pop_options #endif // SIMSIMD_TARGET_HASWELL diff --git a/include/simsimd/simsimd.h b/include/simsimd/simsimd.h index 0c145693..9bd0becc 100644 --- a/include/simsimd/simsimd.h +++ b/include/simsimd/simsimd.h @@ -211,30 +211,30 @@ typedef enum { * interfaces. */ typedef enum { - simsimd_datatype_unknown_k = 0, ///< Unknown data type - simsimd_datatype_b8_k = 1 << 1, ///< Single-bit values packed into 8-bit words - simsimd_datatype_b1x8_k = simsimd_datatype_b8_k, ///< Single-bit values packed into 8-bit words - simsimd_datatype_i4x2_k = 1 << 19, ///< 4-bit signed integers packed into 8-bit words - - simsimd_datatype_i8_k = 1 << 2, ///< 8-bit signed integer - simsimd_datatype_i16_k = 1 << 3, ///< 16-bit signed integer - simsimd_datatype_i32_k = 1 << 4, ///< 32-bit signed integer - simsimd_datatype_i64_k = 1 << 5, ///< 64-bit signed integer - - simsimd_datatype_u8_k = 1 << 6, ///< 8-bit unsigned integer - simsimd_datatype_u16_k = 1 << 7, ///< 16-bit unsigned integer - simsimd_datatype_u32_k = 1 << 8, ///< 32-bit unsigned integer - simsimd_datatype_u64_k = 1 << 9, ///< 64-bit unsigned integer - - simsimd_datatype_f64_k = 1 << 10, ///< Double precision floating point - simsimd_datatype_f32_k = 1 << 11, ///< Single precision floating point - simsimd_datatype_f16_k = 1 << 12, ///< Half precision floating point 
- simsimd_datatype_bf16_k = 1 << 13, ///< Brain floating point - - simsimd_datatype_f64c_k = 1 << 20, ///< Complex double precision floating point - simsimd_datatype_f32c_k = 1 << 21, ///< Complex single precision floating point - simsimd_datatype_f16c_k = 1 << 22, ///< Complex half precision floating point - simsimd_datatype_bf16c_k = 1 << 23, ///< Complex brain floating point + simsimd_datatype_unknown_k = 0, ///< Unknown data type + simsimd_b8_k = 1 << 1, ///< Single-bit values packed into 8-bit words + simsimd_b1x8_k = simsimd_b8_k, ///< Single-bit values packed into 8-bit words + simsimd_i4x2_k = 1 << 19, ///< 4-bit signed integers packed into 8-bit words + + simsimd_i8_k = 1 << 2, ///< 8-bit signed integer + simsimd_i16_k = 1 << 3, ///< 16-bit signed integer + simsimd_i32_k = 1 << 4, ///< 32-bit signed integer + simsimd_i64_k = 1 << 5, ///< 64-bit signed integer + + simsimd_u8_k = 1 << 6, ///< 8-bit unsigned integer + simsimd_u16_k = 1 << 7, ///< 16-bit unsigned integer + simsimd_u32_k = 1 << 8, ///< 32-bit unsigned integer + simsimd_u64_k = 1 << 9, ///< 64-bit unsigned integer + + simsimd_f64_k = 1 << 10, ///< Double precision floating point + simsimd_f32_k = 1 << 11, ///< Single precision floating point + simsimd_f16_k = 1 << 12, ///< Half precision floating point + simsimd_bf16_k = 1 << 13, ///< Brain floating point + + simsimd_f64c_k = 1 << 20, ///< Complex double precision floating point + simsimd_f32c_k = 1 << 21, ///< Complex single precision floating point + simsimd_f16c_k = 1 << 22, ///< Complex half precision floating point + simsimd_bf16c_k = 1 << 23, ///< Complex brain floating point } simsimd_datatype_t; typedef enum { @@ -252,24 +252,24 @@ typedef enum { */ SIMSIMD_PUBLIC simsimd_datatype_family_k simsimd_datatype_family(simsimd_datatype_t dtype) { switch (dtype) { - case simsimd_datatype_f64_k: return simsimd_datatype_float_family_k; - case simsimd_datatype_f32_k: return simsimd_datatype_float_family_k; - case simsimd_datatype_f16_k: return 
simsimd_datatype_float_family_k; - case simsimd_datatype_bf16_k: return simsimd_datatype_float_family_k; - case simsimd_datatype_f64c_k: return simsimd_datatype_complex_float_family_k; - case simsimd_datatype_f32c_k: return simsimd_datatype_complex_float_family_k; - case simsimd_datatype_f16c_k: return simsimd_datatype_complex_float_family_k; - case simsimd_datatype_bf16c_k: return simsimd_datatype_complex_float_family_k; - case simsimd_datatype_b8_k: return simsimd_datatype_binary_family_k; - case simsimd_datatype_u8_k: return simsimd_datatype_uint_family_k; - case simsimd_datatype_u16_k: return simsimd_datatype_uint_family_k; - case simsimd_datatype_u32_k: return simsimd_datatype_uint_family_k; - case simsimd_datatype_u64_k: return simsimd_datatype_uint_family_k; - case simsimd_datatype_i8_k: return simsimd_datatype_int_family_k; - case simsimd_datatype_i16_k: return simsimd_datatype_int_family_k; - case simsimd_datatype_i32_k: return simsimd_datatype_int_family_k; - case simsimd_datatype_i64_k: return simsimd_datatype_int_family_k; - case simsimd_datatype_i4x2_k: return simsimd_datatype_int_family_k; + case simsimd_f64_k: return simsimd_datatype_float_family_k; + case simsimd_f32_k: return simsimd_datatype_float_family_k; + case simsimd_f16_k: return simsimd_datatype_float_family_k; + case simsimd_bf16_k: return simsimd_datatype_float_family_k; + case simsimd_f64c_k: return simsimd_datatype_complex_float_family_k; + case simsimd_f32c_k: return simsimd_datatype_complex_float_family_k; + case simsimd_f16c_k: return simsimd_datatype_complex_float_family_k; + case simsimd_bf16c_k: return simsimd_datatype_complex_float_family_k; + case simsimd_b8_k: return simsimd_datatype_binary_family_k; + case simsimd_u8_k: return simsimd_datatype_uint_family_k; + case simsimd_u16_k: return simsimd_datatype_uint_family_k; + case simsimd_u32_k: return simsimd_datatype_uint_family_k; + case simsimd_u64_k: return simsimd_datatype_uint_family_k; + case simsimd_i8_k: return 
simsimd_datatype_int_family_k; + case simsimd_i16_k: return simsimd_datatype_int_family_k; + case simsimd_i32_k: return simsimd_datatype_int_family_k; + case simsimd_i64_k: return simsimd_datatype_int_family_k; + case simsimd_i4x2_k: return simsimd_datatype_int_family_k; default: return simsimd_datatype_unknown_family_k; } } @@ -285,7 +285,7 @@ SIMSIMD_PUBLIC simsimd_datatype_family_k simsimd_datatype_family(simsimd_datatyp * @param[out] d Output value as a double-precision float. * In complex dot-products @b two scalars are exported for the real and imaginary parts. */ -typedef void (*simsimd_metric_dense_punned_t)(void const *a, void const *b, simsimd_size_t n, simsimd_distance_t *d); +typedef void (*simsimd_dense_metric_t)(void const *a, void const *b, simsimd_size_t n, simsimd_distance_t *d); /** * @brief Type-punned function pointer for sparse vector representations and similarity measures. @@ -296,9 +296,9 @@ typedef void (*simsimd_metric_dense_punned_t)(void const *a, void const *b, sims * @param[in] b_length Number of scalar words in the second input array. * @param[out] d Output value as a double-precision float, generally without decimals. */ -typedef void (*simsimd_metric_sparse_punned_t)(void const *a, void const *b, // - simsimd_size_t a_length, simsimd_size_t b_length, // - simsimd_distance_t *d); +typedef void (*simsimd_sparse_metric_t)(void const *a, void const *b, // + simsimd_size_t a_length, simsimd_size_t b_length, // + simsimd_distance_t *d); /** * @brief Type-punned function pointer for curved vector spaces and similarity measures. @@ -309,8 +309,8 @@ typedef void (*simsimd_metric_sparse_punned_t)(void const *a, void const *b, * @param[in] n Number of scalar words in the input arrays. * @param[out] d Output value as a double-precision float. 
*/ -typedef void (*simsimd_metric_curved_punned_t)(void const *a, void const *b, void const *c, // - simsimd_size_t n, simsimd_distance_t *d); +typedef void (*simsimd_curved_metric_t)(void const *a, void const *b, void const *c, // + simsimd_size_t n, simsimd_distance_t *d); /** * @brief Type-punned function pointer for Scaling & Shifting operations on dense vector representations. @@ -322,8 +322,8 @@ typedef void (*simsimd_metric_curved_punned_t)(void const *a, void const *b, voi * @param[in] beta Scaling factor for the third array. * @param[out] y Output value in the same precision as the input arrays. */ -typedef void (*simsimd_kernel_scale_punned_t)(void const *a, simsimd_size_t n, simsimd_distance_t alpha, - simsimd_distance_t beta, void *y); +typedef void (*simsimd_elementwise_scale_t)(void const *a, simsimd_size_t n, simsimd_distance_t alpha, + simsimd_distance_t beta, void *y); /** * @brief Type-punned function pointer for element-wise Sum operations on dense vector representations. @@ -334,8 +334,7 @@ typedef void (*simsimd_kernel_scale_punned_t)(void const *a, simsimd_size_t n, s * @param[in] n Number of scalar words in the input arrays. * @param[out] y Output value in the same precision as the input arrays. */ -typedef void (*simsimd_kernel_sum_punned_t)(void const *a, void const *b, // - simsimd_size_t n, void *y); +typedef void (*simsimd_elementwise_sum_t)(void const *a, void const *b, simsimd_size_t n, void *y); /** * @brief Type-punned function pointer for Weighted Sum operations on dense vector representations. @@ -348,9 +347,8 @@ typedef void (*simsimd_kernel_sum_punned_t)(void const *a, void const *b, // * @param[in] beta Scaling factor for the second array. * @param[out] y Output value in the same precision as the input arrays. 
*/ -typedef void (*simsimd_kernel_wsum_punned_t)(void const *a, void const *b, // - simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, - void *y); +typedef void (*simsimd_elementwise_wsum_t)(void const *a, void const *b, simsimd_size_t n, simsimd_distance_t alpha, + simsimd_distance_t beta, void *y); /** * @brief Type-punned function pointer for FMA operations on dense vector representations. @@ -364,35 +362,34 @@ typedef void (*simsimd_kernel_wsum_punned_t)(void const *a, void const *b, // * @param[in] beta Scaling factor for the third array. * @param[out] y Output value in the same precision as the input arrays. */ -typedef void (*simsimd_kernel_fma_punned_t)(void const *a, void const *b, void const *c, // - simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, - void *y); +typedef void (*simsimd_elementwise_fma_t)(void const *a, void const *b, void const *c, simsimd_size_t n, + simsimd_distance_t alpha, simsimd_distance_t beta, void *y); /** * @brief Type-punned function pointer for a SimSIMD public interface. * - * Can be a `simsimd_metric_dense_punned_t`, `simsimd_metric_sparse_punned_t`, `simsimd_metric_curved_punned_t`, - * `simsimd_kernel_fma_punned_t`, or `simsimd_kernel_wsum_punned_t`. + * Can be a `simsimd_dense_metric_t`, `simsimd_sparse_metric_t`, `simsimd_curved_metric_t`, + * `simsimd_elementwise_fma_t`, or `simsimd_elementwise_wsum_t`. 
*/ typedef void (*simsimd_kernel_punned_t)(void *); #if SIMSIMD_DYNAMIC_DISPATCH SIMSIMD_DYNAMIC simsimd_capability_t simsimd_capabilities(void); -SIMSIMD_DYNAMIC void simsimd_find_kernel_punned( // - simsimd_metric_kind_t kind, // - simsimd_datatype_t datatype, // - simsimd_capability_t supported, // - simsimd_capability_t allowed, // - simsimd_kernel_punned_t *kernel_output, // +SIMSIMD_DYNAMIC void simsimd_find_kernel( // + simsimd_metric_kind_t kind, // + simsimd_datatype_t datatype, // + simsimd_capability_t supported, // + simsimd_capability_t allowed, // + simsimd_kernel_punned_t *kernel_output, // simsimd_capability_t *capability_output); #else SIMSIMD_PUBLIC simsimd_capability_t simsimd_capabilities(void); -SIMSIMD_PUBLIC void simsimd_find_kernel_punned( // - simsimd_metric_kind_t kind, // - simsimd_datatype_t datatype, // - simsimd_capability_t supported, // - simsimd_capability_t allowed, // - simsimd_kernel_punned_t *kernel_output, // +SIMSIMD_PUBLIC void simsimd_find_kernel( // + simsimd_metric_kind_t kind, // + simsimd_datatype_t datatype, // + simsimd_capability_t supported, // + simsimd_capability_t allowed, // + simsimd_kernel_punned_t *kernel_output, // simsimd_capability_t *capability_output); #endif @@ -604,8 +601,8 @@ SIMSIMD_PUBLIC simsimd_capability_t _simsimd_capabilities_implementation(void) { #pragma clang diagnostic ignored "-Wvolatile" #endif -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f64(simsimd_capability_t v, simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_f64(simsimd_capability_t v, simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t m_t; #if SIMSIMD_TARGET_SVE if (v & simsimd_cap_sve_k) switch (k) { @@ -666,8 +663,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f64(simsimd_capability_t v, si } } -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f32(simsimd_capability_t v, 
simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_f32(simsimd_capability_t v, simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t m_t; #if SIMSIMD_TARGET_SVE if (v & simsimd_cap_sve_k) switch (k) { @@ -740,8 +737,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f32(simsimd_capability_t v, si } } -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f16(simsimd_capability_t v, simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_f16(simsimd_capability_t v, simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t m_t; #if SIMSIMD_TARGET_SVE_F16 if (v & simsimd_cap_sve_k) switch (k) { @@ -820,8 +817,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f16(simsimd_capability_t v, si } } -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_bf16(simsimd_capability_t v, simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_bf16(simsimd_capability_t v, simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t m_t; #if SIMSIMD_TARGET_SVE_BF16 if (v & simsimd_cap_sve_bf16_k) switch (k) { @@ -896,8 +893,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_bf16(simsimd_capability_t v, s } } -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_i8(simsimd_capability_t v, simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_i8(simsimd_capability_t v, simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t m_t; #if SIMSIMD_TARGET_NEON_I8 if (v & simsimd_cap_neon_i8_k) switch (k) { @@ -960,8 +957,8 @@ SIMSIMD_INTERNAL void 
_simsimd_find_kernel_punned_i8(simsimd_capability_t v, sim default: break; } } -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_u8(simsimd_capability_t v, simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_u8(simsimd_capability_t v, simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t m_t; #if SIMSIMD_TARGET_NEON_I8 if (v & simsimd_cap_neon_i8_k) switch (k) { @@ -1025,8 +1022,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_u8(simsimd_capability_t v, sim } } -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_b8(simsimd_capability_t v, simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_b8(simsimd_capability_t v, simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t m_t; #if SIMSIMD_TARGET_SVE if (v & simsimd_cap_sve_k) switch (k) { @@ -1063,8 +1060,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_b8(simsimd_capability_t v, sim } } -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f64c(simsimd_capability_t v, simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_f64c(simsimd_capability_t v, simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t m_t; #if SIMSIMD_TARGET_SVE if (v & simsimd_cap_sve_k) switch (k) { @@ -1087,8 +1084,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f64c(simsimd_capability_t v, s } } -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f32c(simsimd_capability_t v, simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_f32c(simsimd_capability_t v, simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t 
m_t; #if SIMSIMD_TARGET_SVE if (v & simsimd_cap_sve_k) switch (k) { @@ -1125,8 +1122,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f32c(simsimd_capability_t v, s } } -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f16c(simsimd_capability_t v, simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_f16c(simsimd_capability_t v, simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t m_t; #if SIMSIMD_TARGET_SVE_F16 if (v & simsimd_cap_sve_k) switch (k) { @@ -1163,8 +1160,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f16c(simsimd_capability_t v, s } } -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_bf16c(simsimd_capability_t v, simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_bf16c(simsimd_capability_t v, simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t m_t; #if SIMSIMD_TARGET_NEON_BF16 if (v & simsimd_cap_neon_bf16_k) switch (k) { @@ -1187,8 +1184,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_bf16c(simsimd_capability_t v, } } -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_u16(simsimd_capability_t v, simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_u16(simsimd_capability_t v, simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t m_t; #if SIMSIMD_TARGET_SVE2 if (v & simsimd_cap_sve2_k) switch (k) { @@ -1226,8 +1223,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_u16(simsimd_capability_t v, si } } -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_u32(simsimd_capability_t v, simsimd_metric_kind_t k, - simsimd_kernel_punned_t *m, simsimd_capability_t *c) { +SIMSIMD_INTERNAL void _simsimd_find_kernel_u32(simsimd_capability_t v, 
simsimd_metric_kind_t k, + simsimd_kernel_punned_t *m, simsimd_capability_t *c) { typedef simsimd_kernel_punned_t m_t; #if SIMSIMD_TARGET_SVE2 if (v & simsimd_cap_sve2_k) switch (k) { @@ -1270,12 +1267,12 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_u32(simsimd_capability_t v, si * @param kernel_output Output variable for the selected similarity function. * @param capability_output Output variable for the utilized hardware capabilities. */ -SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_implementation( // - simsimd_metric_kind_t kind, // - simsimd_datatype_t datatype, // - simsimd_capability_t supported, // - simsimd_capability_t allowed, // - simsimd_kernel_punned_t *kernel_output, // +SIMSIMD_INTERNAL void _simsimd_find_kernel_implementation( // + simsimd_metric_kind_t kind, // + simsimd_datatype_t datatype, // + simsimd_capability_t supported, // + simsimd_capability_t allowed, // + simsimd_kernel_punned_t *kernel_output, // simsimd_capability_t *capability_output) { // Modern compilers abso-freaking-lutely love optimizing-out my logic! 
@@ -1293,26 +1290,26 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_implementation( // switch (datatype) { - case simsimd_datatype_f64_k: _simsimd_find_kernel_punned_f64(viable, kind, m, c); return; - case simsimd_datatype_f32_k: _simsimd_find_kernel_punned_f32(viable, kind, m, c); return; - case simsimd_datatype_f16_k: _simsimd_find_kernel_punned_f16(viable, kind, m, c); return; - case simsimd_datatype_bf16_k: _simsimd_find_kernel_punned_bf16(viable, kind, m, c); return; - case simsimd_datatype_i8_k: _simsimd_find_kernel_punned_i8(viable, kind, m, c); return; - case simsimd_datatype_u8_k: _simsimd_find_kernel_punned_u8(viable, kind, m, c); return; - case simsimd_datatype_b8_k: _simsimd_find_kernel_punned_b8(viable, kind, m, c); return; - case simsimd_datatype_f32c_k: _simsimd_find_kernel_punned_f32c(viable, kind, m, c); return; - case simsimd_datatype_f64c_k: _simsimd_find_kernel_punned_f64c(viable, kind, m, c); return; - case simsimd_datatype_f16c_k: _simsimd_find_kernel_punned_f16c(viable, kind, m, c); return; - case simsimd_datatype_bf16c_k: _simsimd_find_kernel_punned_bf16c(viable, kind, m, c); return; - case simsimd_datatype_u16_k: _simsimd_find_kernel_punned_u16(viable, kind, m, c); return; - case simsimd_datatype_u32_k: _simsimd_find_kernel_punned_u32(viable, kind, m, c); return; + case simsimd_f64_k: _simsimd_find_kernel_f64(viable, kind, m, c); return; + case simsimd_f32_k: _simsimd_find_kernel_f32(viable, kind, m, c); return; + case simsimd_f16_k: _simsimd_find_kernel_f16(viable, kind, m, c); return; + case simsimd_bf16_k: _simsimd_find_kernel_bf16(viable, kind, m, c); return; + case simsimd_i8_k: _simsimd_find_kernel_i8(viable, kind, m, c); return; + case simsimd_u8_k: _simsimd_find_kernel_u8(viable, kind, m, c); return; + case simsimd_b8_k: _simsimd_find_kernel_b8(viable, kind, m, c); return; + case simsimd_f32c_k: _simsimd_find_kernel_f32c(viable, kind, m, c); return; + case simsimd_f64c_k: _simsimd_find_kernel_f64c(viable, kind, m, c); return; 
+ case simsimd_f16c_k: _simsimd_find_kernel_f16c(viable, kind, m, c); return; + case simsimd_bf16c_k: _simsimd_find_kernel_bf16c(viable, kind, m, c); return; + case simsimd_u16_k: _simsimd_find_kernel_u16(viable, kind, m, c); return; + case simsimd_u32_k: _simsimd_find_kernel_u32(viable, kind, m, c); return; // These data-types are not supported yet - case simsimd_datatype_i4x2_k: break; - case simsimd_datatype_i16_k: break; - case simsimd_datatype_i32_k: break; - case simsimd_datatype_i64_k: break; - case simsimd_datatype_u64_k: break; + case simsimd_i4x2_k: break; + case simsimd_i16_k: break; + case simsimd_i32_k: break; + case simsimd_i64_k: break; + case simsimd_u64_k: break; case simsimd_datatype_unknown_k: break; default: break; } @@ -1351,7 +1348,7 @@ SIMSIMD_PUBLIC simsimd_kernel_punned_t simsimd_metric_punned( // simsimd_kernel_punned_t result = 0; simsimd_capability_t c = simsimd_cap_serial_k; simsimd_capability_t supported = simsimd_capabilities(); - simsimd_find_kernel_punned(kind, datatype, supported, allowed, &result, &c); + simsimd_find_kernel(kind, datatype, supported, allowed, &result, &c); return result; } @@ -1563,14 +1560,14 @@ SIMSIMD_PUBLIC int simsimd_uses_turin(void) { return _SIMSIMD_TARGET_X86 && SIMS SIMSIMD_PUBLIC int simsimd_uses_sierra(void) { return _SIMSIMD_TARGET_X86 && SIMSIMD_TARGET_SIERRA; } SIMSIMD_PUBLIC int simsimd_uses_dynamic_dispatch(void) { return 0; } SIMSIMD_PUBLIC simsimd_capability_t simsimd_capabilities(void) { return _simsimd_capabilities_implementation(); } -SIMSIMD_PUBLIC void simsimd_find_kernel_punned( // +SIMSIMD_PUBLIC void simsimd_find_kernel( // simsimd_metric_kind_t kind, // simsimd_datatype_t datatype, // simsimd_capability_t supported, // simsimd_capability_t allowed, // simsimd_kernel_punned_t* kernel_output, // simsimd_capability_t* capability_output) { - _simsimd_find_kernel_punned_implementation(kind, datatype, supported, allowed, kernel_output, capability_output); + 
_simsimd_find_kernel_implementation(kind, datatype, supported, allowed, kernel_output, capability_output); } // clang-format on diff --git a/javascript/lib.c b/javascript/lib.c index 3964012c..147eb99f 100644 --- a/javascript/lib.c +++ b/javascript/lib.c @@ -46,17 +46,17 @@ napi_value dense(napi_env env, napi_callback_info info, simsimd_metric_kind_t me } if (datatype == simsimd_datatype_unknown_k) switch (type_a) { - case napi_float64_array: datatype = simsimd_datatype_f64_k; break; - case napi_float32_array: datatype = simsimd_datatype_f32_k; break; - case napi_int8_array: datatype = simsimd_datatype_i8_k; break; - case napi_uint8_array: datatype = simsimd_datatype_u8_k; break; + case napi_float64_array: datatype = simsimd_f64_k; break; + case napi_float32_array: datatype = simsimd_f32_k; break; + case napi_int8_array: datatype = simsimd_i8_k; break; + case napi_uint8_array: datatype = simsimd_u8_k; break; default: break; } - simsimd_metric_dense_punned_t metric = NULL; + simsimd_dense_metric_t metric = NULL; simsimd_capability_t capability = simsimd_cap_serial_k; - simsimd_find_kernel_punned(metric_kind, datatype, static_capabilities, simsimd_cap_any_k, - (simsimd_kernel_punned_t *)&metric, &capability); + simsimd_find_kernel(metric_kind, datatype, static_capabilities, simsimd_cap_any_k, + (simsimd_kernel_punned_t *)&metric, &capability); if (metric == NULL) { napi_throw_error(env, NULL, "Unsupported datatype for given metric"); return NULL; @@ -89,10 +89,10 @@ napi_value api_js(napi_env env, napi_callback_info info) { return dense(env, info, simsimd_js_k, simsimd_datatype_unknown_k); } napi_value api_hamming(napi_env env, napi_callback_info info) { - return dense(env, info, simsimd_hamming_k, simsimd_datatype_b8_k); + return dense(env, info, simsimd_hamming_k, simsimd_b8_k); } napi_value api_jaccard(napi_env env, napi_callback_info info) { - return dense(env, info, simsimd_jaccard_k, simsimd_datatype_b8_k); + return dense(env, info, simsimd_jaccard_k, 
simsimd_b8_k); } napi_value Init(napi_env env, napi_value exports) { diff --git a/python/lib.c b/python/lib.c index 13450efd..ae54e1c5 100644 --- a/python/lib.c +++ b/python/lib.c @@ -220,8 +220,8 @@ int same_string(char const *a, char const *b) { return strcmp(a, b) == 0; } /// @brief Helper method to check if a logical datatype is complex and should be represented as two scalars. /// @return 1 if the datatype is complex, 0 otherwise. int is_complex(simsimd_datatype_t datatype) { - return datatype == simsimd_datatype_f32c_k || datatype == simsimd_datatype_f64c_k || - datatype == simsimd_datatype_f16c_k || datatype == simsimd_datatype_bf16c_k; + return datatype == simsimd_f32c_k || datatype == simsimd_f64c_k || datatype == simsimd_f16c_k || + datatype == simsimd_bf16c_k; } /// @brief Converts a Python-ic datatype string to a logical datatype, normalizing the format. @@ -234,50 +234,50 @@ simsimd_datatype_t python_string_to_datatype(char const *name) { if (same_string(name, "float32") || same_string(name, "f32") || // SimSIMD-specific same_string(name, "f4") || same_string(name, "as_scalar, &as_f16, sizeof(simsimd_f16_t)); - parsed->datatype = simsimd_datatype_f16_k; + parsed->datatype = simsimd_f16_k; } else if (as_float == (simsimd_f64_t)(simsimd_f32_t)as_float) { simsimd_f32_t as_f32 = (simsimd_f32_t)as_float; memcpy(parsed->as_scalar, &as_f32, sizeof(simsimd_f32_t)); - parsed->datatype = simsimd_datatype_f32_k; + parsed->datatype = simsimd_f32_k; } else { memcpy(parsed->as_scalar, &as_float, sizeof(simsimd_f64_t)); - parsed->datatype = simsimd_datatype_f64_k; + parsed->datatype = simsimd_f64_k; } return 1; } @@ -683,36 +683,36 @@ int parse_buffer_or_scalar_argument(PyObject *obj, Py_buffer *buffer, BufferOrSc if (as_integral == (simsimd_u64_t)(simsimd_u8_t)as_integral) { simsimd_u8_t as_u8 = (simsimd_u8_t)as_integral; memcpy(parsed->as_scalar, &as_u8, sizeof(simsimd_u8_t)); - parsed->datatype = simsimd_datatype_u8_k; + parsed->datatype = simsimd_u8_k; } else if 
(as_integral == (simsimd_u64_t)(simsimd_u16_t)as_integral) { simsimd_u16_t as_u16 = (simsimd_u16_t)as_integral; memcpy(parsed->as_scalar, &as_u16, sizeof(simsimd_u16_t)); - parsed->datatype = simsimd_datatype_u16_k; + parsed->datatype = simsimd_u16_k; } else if (as_integral == (simsimd_u64_t)(simsimd_u32_t)as_integral) { simsimd_u32_t as_u32 = (simsimd_u32_t)as_integral; memcpy(parsed->as_scalar, &as_u32, sizeof(simsimd_u32_t)); - parsed->datatype = simsimd_datatype_u32_k; + parsed->datatype = simsimd_u32_k; } else if (as_integral == (simsimd_i64_t)(simsimd_i8_t)as_integral) { simsimd_i8_t as_i8 = (simsimd_i8_t)as_integral; memcpy(parsed->as_scalar, &as_i8, sizeof(simsimd_i8_t)); - parsed->datatype = simsimd_datatype_i8_k; + parsed->datatype = simsimd_i8_k; } else if (as_integral == (simsimd_i64_t)(simsimd_i16_t)as_integral) { simsimd_i16_t as_i16 = (simsimd_i16_t)as_integral; memcpy(parsed->as_scalar, &as_i16, sizeof(simsimd_i16_t)); - parsed->datatype = simsimd_datatype_i16_k; + parsed->datatype = simsimd_i16_k; } else if (as_integral == (simsimd_i64_t)(simsimd_i32_t)as_integral) { simsimd_i32_t as_i32 = (simsimd_i32_t)as_integral; memcpy(parsed->as_scalar, &as_i32, sizeof(simsimd_i32_t)); - parsed->datatype = simsimd_datatype_i32_k; + parsed->datatype = simsimd_i32_k; } else { memcpy(parsed->as_scalar, &as_integral, sizeof(simsimd_i64_t)); - parsed->datatype = simsimd_datatype_i64_k; + parsed->datatype = simsimd_i64_k; } return 1; } @@ -924,7 +924,7 @@ static PyObject *implement_dense_metric( // // 3. double precision float (or its complex variant) if (out_dtype == simsimd_datatype_unknown_k) { if (out_obj) { out_dtype = out_parsed.datatype; } - else { out_dtype = is_complex(dtype) ? simsimd_datatype_f64c_k : simsimd_datatype_f64_k; } + else { out_dtype = is_complex(dtype) ? 
simsimd_f64c_k : simsimd_f64_k; } } // Make sure the return datatype is complex if the input datatype is complex, and the same for real numbers @@ -947,10 +947,10 @@ static PyObject *implement_dense_metric( // } // Look up the metric and the capability - simsimd_metric_dense_punned_t metric = NULL; + simsimd_dense_metric_t metric = NULL; simsimd_capability_t capability = simsimd_cap_serial_k; - simsimd_find_kernel_punned(metric_kind, dtype, static_capabilities, simsimd_cap_any_k, - (simsimd_kernel_punned_t *)&metric, &capability); + simsimd_find_kernel(metric_kind, dtype, static_capabilities, simsimd_cap_any_k, (simsimd_kernel_punned_t *)&metric, + &capability); if (!metric) { PyErr_Format( // PyExc_LookupError, @@ -1152,10 +1152,10 @@ static PyObject *implement_curved_metric( // if (dtype == simsimd_datatype_unknown_k) dtype = a_parsed.datatype; // Look up the metric and the capability - simsimd_metric_curved_punned_t metric = NULL; + simsimd_curved_metric_t metric = NULL; simsimd_capability_t capability = simsimd_cap_serial_k; - simsimd_find_kernel_punned(metric_kind, dtype, static_capabilities, simsimd_cap_any_k, - (simsimd_kernel_punned_t *)&metric, &capability); + simsimd_find_kernel(metric_kind, dtype, static_capabilities, simsimd_cap_any_k, (simsimd_kernel_punned_t *)&metric, + &capability); if (!metric) { PyErr_Format( // PyExc_LookupError, @@ -1212,10 +1212,10 @@ static PyObject *implement_sparse_metric( // } simsimd_datatype_t dtype = a_parsed.datatype; - simsimd_metric_sparse_punned_t metric = NULL; + simsimd_sparse_metric_t metric = NULL; simsimd_capability_t capability = simsimd_cap_serial_k; - simsimd_find_kernel_punned(metric_kind, dtype, static_capabilities, simsimd_cap_any_k, - (simsimd_kernel_punned_t *)&metric, &capability); + simsimd_find_kernel(metric_kind, dtype, static_capabilities, simsimd_cap_any_k, (simsimd_kernel_punned_t *)&metric, + &capability); if (!metric) { PyErr_Format( // PyExc_LookupError, "Unsupported metric '%c' and datatype 
combination ('%s'/'%s' and '%s'/'%s')", @@ -1284,7 +1284,7 @@ static PyObject *implement_cdist( // // 3. double precision float (or its complex variant) if (out_dtype == simsimd_datatype_unknown_k) { if (out_obj) { out_dtype = out_parsed.datatype; } - else { out_dtype = is_complex(dtype) ? simsimd_datatype_f64c_k : simsimd_datatype_f64_k; } + else { out_dtype = is_complex(dtype) ? simsimd_f64c_k : simsimd_f64_k; } } // Make sure the return datatype is complex if the input datatype is complex, and the same for real numbers @@ -1307,10 +1307,10 @@ static PyObject *implement_cdist( // } // Look up the metric and the capability - simsimd_metric_dense_punned_t metric = NULL; + simsimd_dense_metric_t metric = NULL; simsimd_capability_t capability = simsimd_cap_serial_k; - simsimd_find_kernel_punned(metric_kind, dtype, static_capabilities, simsimd_cap_any_k, - (simsimd_kernel_punned_t *)&metric, &capability); + simsimd_find_kernel(metric_kind, dtype, static_capabilities, simsimd_cap_any_k, (simsimd_kernel_punned_t *)&metric, + &capability); if (!metric) { PyErr_Format( // PyExc_LookupError, "Unsupported metric '%c' and datatype combination ('%s'/'%s' and '%s'/'%s')", @@ -1450,7 +1450,7 @@ static PyObject *implement_pointer_access(simsimd_metric_kind_t metric_kind, PyO simsimd_kernel_punned_t metric = NULL; simsimd_capability_t capability = simsimd_cap_serial_k; - simsimd_find_kernel_punned(metric_kind, datatype, static_capabilities, simsimd_cap_any_k, &metric, &capability); + simsimd_find_kernel(metric_kind, datatype, static_capabilities, simsimd_cap_any_k, &metric, &capability); if (metric == NULL) { PyErr_SetString(PyExc_LookupError, "No such metric"); return NULL; @@ -1996,11 +1996,11 @@ static PyObject *api_scale(PyObject *self, PyObject *const *args, Py_ssize_t con if (dtype == simsimd_datatype_unknown_k) dtype = a_parsed.datatype; // Look up the kernel and the capability - simsimd_kernel_scale_punned_t kernel = NULL; + simsimd_elementwise_scale_t kernel = NULL; 
simsimd_capability_t capability = simsimd_cap_serial_k; simsimd_kernel_kind_t const kernel_kind = simsimd_scale_k; - simsimd_find_kernel_punned(kernel_kind, dtype, static_capabilities, simsimd_cap_any_k, - (simsimd_kernel_punned_t *)&kernel, &capability); + simsimd_find_kernel(kernel_kind, dtype, static_capabilities, simsimd_cap_any_k, (simsimd_kernel_punned_t *)&kernel, + &capability); if (!kernel) { PyErr_Format( // PyExc_LookupError, @@ -2158,11 +2158,11 @@ static PyObject *api_sum(PyObject *self, PyObject *const *args, Py_ssize_t const if (dtype == simsimd_datatype_unknown_k) dtype = a_parsed.datatype; // Look up the kernel and the capability - simsimd_kernel_sum_punned_t kernel = NULL; + simsimd_elementwise_sum_t kernel = NULL; simsimd_capability_t capability = simsimd_cap_serial_k; simsimd_kernel_kind_t const kernel_kind = simsimd_sum_k; - simsimd_find_kernel_punned(kernel_kind, dtype, static_capabilities, simsimd_cap_any_k, - (simsimd_kernel_punned_t *)&kernel, &capability); + simsimd_find_kernel(kernel_kind, dtype, static_capabilities, simsimd_cap_any_k, (simsimd_kernel_punned_t *)&kernel, + &capability); if (!kernel) { PyErr_Format( // PyExc_LookupError, @@ -2336,11 +2336,11 @@ static PyObject *api_wsum(PyObject *self, PyObject *const *args, Py_ssize_t cons if (dtype == simsimd_datatype_unknown_k) dtype = a_parsed.datatype; // Look up the kernel and the capability - simsimd_kernel_wsum_punned_t kernel = NULL; + simsimd_elementwise_wsum_t kernel = NULL; simsimd_capability_t capability = simsimd_cap_serial_k; simsimd_kernel_kind_t const kernel_kind = simsimd_wsum_k; - simsimd_find_kernel_punned(kernel_kind, dtype, static_capabilities, simsimd_cap_any_k, - (simsimd_kernel_punned_t *)&kernel, &capability); + simsimd_find_kernel(kernel_kind, dtype, static_capabilities, simsimd_cap_any_k, (simsimd_kernel_punned_t *)&kernel, + &capability); if (!kernel) { PyErr_Format( // PyExc_LookupError, @@ -2521,11 +2521,11 @@ static PyObject *api_fma(PyObject *self, PyObject 
*const *args, Py_ssize_t const if (dtype == simsimd_datatype_unknown_k) dtype = a_parsed.datatype; // Look up the kernel and the capability - simsimd_kernel_fma_punned_t kernel = NULL; + simsimd_elementwise_fma_t kernel = NULL; simsimd_capability_t capability = simsimd_cap_serial_k; simsimd_kernel_kind_t const kernel_kind = simsimd_fma_k; - simsimd_find_kernel_punned(kernel_kind, dtype, static_capabilities, simsimd_cap_any_k, - (simsimd_kernel_punned_t *)&kernel, &capability); + simsimd_find_kernel(kernel_kind, dtype, static_capabilities, simsimd_cap_any_k, (simsimd_kernel_punned_t *)&kernel, + &capability); if (!kernel) { PyErr_Format( // PyExc_LookupError, @@ -2772,126 +2772,126 @@ void apply_scale_to_each_continuous_slice( static binary_kernel_t elementwise_sadd(simsimd_datatype_t dtype) { switch (dtype) { - case simsimd_datatype_u64_k: return (binary_kernel_t)&_simsimd_u64_sadd; - case simsimd_datatype_u32_k: return (binary_kernel_t)&_simsimd_u32_sadd; - case simsimd_datatype_u16_k: return (binary_kernel_t)&_simsimd_u16_sadd; - case simsimd_datatype_u8_k: return (binary_kernel_t)&_simsimd_u8_sadd; - case simsimd_datatype_i64_k: return (binary_kernel_t)&_simsimd_i64_sadd; - case simsimd_datatype_i32_k: return (binary_kernel_t)&_simsimd_i32_sadd; - case simsimd_datatype_i16_k: return (binary_kernel_t)&_simsimd_i16_sadd; - case simsimd_datatype_i8_k: return (binary_kernel_t)&_simsimd_i8_sadd; - case simsimd_datatype_f64_k: return (binary_kernel_t)&_simsimd_f64_sadd; - case simsimd_datatype_f32_k: return (binary_kernel_t)&_simsimd_f32_sadd; - case simsimd_datatype_f16_k: return (binary_kernel_t)&_simsimd_f16_sadd; - case simsimd_datatype_bf16_k: return (binary_kernel_t)&_simsimd_bf16_sadd; + case simsimd_u64_k: return (binary_kernel_t)&_simsimd_u64_sadd; + case simsimd_u32_k: return (binary_kernel_t)&_simsimd_u32_sadd; + case simsimd_u16_k: return (binary_kernel_t)&_simsimd_u16_sadd; + case simsimd_u8_k: return (binary_kernel_t)&_simsimd_u8_sadd; + case 
simsimd_i64_k: return (binary_kernel_t)&_simsimd_i64_sadd; + case simsimd_i32_k: return (binary_kernel_t)&_simsimd_i32_sadd; + case simsimd_i16_k: return (binary_kernel_t)&_simsimd_i16_sadd; + case simsimd_i8_k: return (binary_kernel_t)&_simsimd_i8_sadd; + case simsimd_f64_k: return (binary_kernel_t)&_simsimd_f64_sadd; + case simsimd_f32_k: return (binary_kernel_t)&_simsimd_f32_sadd; + case simsimd_f16_k: return (binary_kernel_t)&_simsimd_f16_sadd; + case simsimd_bf16_k: return (binary_kernel_t)&_simsimd_bf16_sadd; default: return NULL; } } static unary_kernel_t elementwise_upcast_to_f64(simsimd_datatype_t dtype) { switch (dtype) { - case simsimd_datatype_u64_k: return (unary_kernel_t)&_simsimd_u64_to_f64; - case simsimd_datatype_u32_k: return (unary_kernel_t)&_simsimd_u32_to_f64; - case simsimd_datatype_u16_k: return (unary_kernel_t)&_simsimd_u16_to_f64; - case simsimd_datatype_u8_k: return (unary_kernel_t)&_simsimd_u8_to_f64; - case simsimd_datatype_i64_k: return (unary_kernel_t)&_simsimd_i64_to_f64; - case simsimd_datatype_i32_k: return (unary_kernel_t)&_simsimd_i32_to_f64; - case simsimd_datatype_i16_k: return (unary_kernel_t)&_simsimd_i16_to_f64; - case simsimd_datatype_i8_k: return (unary_kernel_t)&_simsimd_i8_to_f64; - case simsimd_datatype_f64_k: return (unary_kernel_t)&_simsimd_f64_to_f64; - case simsimd_datatype_f32_k: return (unary_kernel_t)&_simsimd_f32_to_f64; - case simsimd_datatype_f16_k: return (unary_kernel_t)&_simsimd_f16_to_f64; - case simsimd_datatype_bf16_k: return (unary_kernel_t)&_simsimd_bf16_to_f64; + case simsimd_u64_k: return (unary_kernel_t)&_simsimd_u64_to_f64; + case simsimd_u32_k: return (unary_kernel_t)&_simsimd_u32_to_f64; + case simsimd_u16_k: return (unary_kernel_t)&_simsimd_u16_to_f64; + case simsimd_u8_k: return (unary_kernel_t)&_simsimd_u8_to_f64; + case simsimd_i64_k: return (unary_kernel_t)&_simsimd_i64_to_f64; + case simsimd_i32_k: return (unary_kernel_t)&_simsimd_i32_to_f64; + case simsimd_i16_k: return 
(unary_kernel_t)&_simsimd_i16_to_f64; + case simsimd_i8_k: return (unary_kernel_t)&_simsimd_i8_to_f64; + case simsimd_f64_k: return (unary_kernel_t)&_simsimd_f64_to_f64; + case simsimd_f32_k: return (unary_kernel_t)&_simsimd_f32_to_f64; + case simsimd_f16_k: return (unary_kernel_t)&_simsimd_f16_to_f64; + case simsimd_bf16_k: return (unary_kernel_t)&_simsimd_bf16_to_f64; default: return NULL; } } static unary_kernel_t elementwise_upcast_to_i64(simsimd_datatype_t dtype) { switch (dtype) { - case simsimd_datatype_u64_k: return (unary_kernel_t)&_simsimd_u64_to_i64; - case simsimd_datatype_u32_k: return (unary_kernel_t)&_simsimd_u32_to_i64; - case simsimd_datatype_u16_k: return (unary_kernel_t)&_simsimd_u16_to_i64; - case simsimd_datatype_u8_k: return (unary_kernel_t)&_simsimd_u8_to_i64; - case simsimd_datatype_i64_k: return (unary_kernel_t)&_simsimd_i64_to_i64; - case simsimd_datatype_i32_k: return (unary_kernel_t)&_simsimd_i32_to_i64; - case simsimd_datatype_i16_k: return (unary_kernel_t)&_simsimd_i16_to_i64; - case simsimd_datatype_i8_k: return (unary_kernel_t)&_simsimd_i8_to_i64; - case simsimd_datatype_f64_k: return NULL; - case simsimd_datatype_f32_k: return NULL; - case simsimd_datatype_f16_k: return NULL; - case simsimd_datatype_bf16_k: return NULL; + case simsimd_u64_k: return (unary_kernel_t)&_simsimd_u64_to_i64; + case simsimd_u32_k: return (unary_kernel_t)&_simsimd_u32_to_i64; + case simsimd_u16_k: return (unary_kernel_t)&_simsimd_u16_to_i64; + case simsimd_u8_k: return (unary_kernel_t)&_simsimd_u8_to_i64; + case simsimd_i64_k: return (unary_kernel_t)&_simsimd_i64_to_i64; + case simsimd_i32_k: return (unary_kernel_t)&_simsimd_i32_to_i64; + case simsimd_i16_k: return (unary_kernel_t)&_simsimd_i16_to_i64; + case simsimd_i8_k: return (unary_kernel_t)&_simsimd_i8_to_i64; + case simsimd_f64_k: return NULL; + case simsimd_f32_k: return NULL; + case simsimd_f16_k: return NULL; + case simsimd_bf16_k: return NULL; default: return NULL; } } static unary_kernel_t 
elementwise_upcast_to_u64(simsimd_datatype_t dtype) { switch (dtype) { - case simsimd_datatype_u64_k: return (unary_kernel_t)&_simsimd_u64_to_u64; - case simsimd_datatype_u32_k: return (unary_kernel_t)&_simsimd_u32_to_u64; - case simsimd_datatype_u16_k: return (unary_kernel_t)&_simsimd_u16_to_u64; - case simsimd_datatype_u8_k: return (unary_kernel_t)&_simsimd_u8_to_u64; - case simsimd_datatype_i64_k: return (unary_kernel_t)&_simsimd_i64_to_u64; - case simsimd_datatype_i32_k: return (unary_kernel_t)&_simsimd_i32_to_u64; - case simsimd_datatype_i16_k: return (unary_kernel_t)&_simsimd_i16_to_u64; - case simsimd_datatype_i8_k: return (unary_kernel_t)&_simsimd_i8_to_u64; - case simsimd_datatype_f64_k: return NULL; - case simsimd_datatype_f32_k: return NULL; - case simsimd_datatype_f16_k: return NULL; - case simsimd_datatype_bf16_k: return NULL; + case simsimd_u64_k: return (unary_kernel_t)&_simsimd_u64_to_u64; + case simsimd_u32_k: return (unary_kernel_t)&_simsimd_u32_to_u64; + case simsimd_u16_k: return (unary_kernel_t)&_simsimd_u16_to_u64; + case simsimd_u8_k: return (unary_kernel_t)&_simsimd_u8_to_u64; + case simsimd_i64_k: return (unary_kernel_t)&_simsimd_i64_to_u64; + case simsimd_i32_k: return (unary_kernel_t)&_simsimd_i32_to_u64; + case simsimd_i16_k: return (unary_kernel_t)&_simsimd_i16_to_u64; + case simsimd_i8_k: return (unary_kernel_t)&_simsimd_i8_to_u64; + case simsimd_f64_k: return NULL; + case simsimd_f32_k: return NULL; + case simsimd_f16_k: return NULL; + case simsimd_bf16_k: return NULL; default: return NULL; } } static unary_kernel_t elementwise_downcast_from_f64(simsimd_datatype_t dtype) { switch (dtype) { - case simsimd_datatype_u64_k: return (unary_kernel_t)&_simsimd_f64_to_u64; - case simsimd_datatype_u32_k: return (unary_kernel_t)&_simsimd_f64_to_u32; - case simsimd_datatype_u16_k: return (unary_kernel_t)&_simsimd_f64_to_u16; - case simsimd_datatype_u8_k: return (unary_kernel_t)&_simsimd_f64_to_u8; - case simsimd_datatype_i64_k: return 
(unary_kernel_t)&_simsimd_f64_to_i64; - case simsimd_datatype_i32_k: return (unary_kernel_t)&_simsimd_f64_to_i32; - case simsimd_datatype_i16_k: return (unary_kernel_t)&_simsimd_f64_to_i16; - case simsimd_datatype_i8_k: return (unary_kernel_t)&_simsimd_f64_to_i8; - case simsimd_datatype_f64_k: return (unary_kernel_t)&_simsimd_f64_to_f64; - case simsimd_datatype_f32_k: return (unary_kernel_t)&_simsimd_f64_to_f32; - case simsimd_datatype_f16_k: return (unary_kernel_t)&_simsimd_f64_to_f16; - case simsimd_datatype_bf16_k: return (unary_kernel_t)&_simsimd_f64_to_bf16; + case simsimd_u64_k: return (unary_kernel_t)&_simsimd_f64_to_u64; + case simsimd_u32_k: return (unary_kernel_t)&_simsimd_f64_to_u32; + case simsimd_u16_k: return (unary_kernel_t)&_simsimd_f64_to_u16; + case simsimd_u8_k: return (unary_kernel_t)&_simsimd_f64_to_u8; + case simsimd_i64_k: return (unary_kernel_t)&_simsimd_f64_to_i64; + case simsimd_i32_k: return (unary_kernel_t)&_simsimd_f64_to_i32; + case simsimd_i16_k: return (unary_kernel_t)&_simsimd_f64_to_i16; + case simsimd_i8_k: return (unary_kernel_t)&_simsimd_f64_to_i8; + case simsimd_f64_k: return (unary_kernel_t)&_simsimd_f64_to_f64; + case simsimd_f32_k: return (unary_kernel_t)&_simsimd_f64_to_f32; + case simsimd_f16_k: return (unary_kernel_t)&_simsimd_f64_to_f16; + case simsimd_bf16_k: return (unary_kernel_t)&_simsimd_f64_to_bf16; default: return NULL; } } static unary_kernel_t elementwise_downcast_from_i64(simsimd_datatype_t dtype) { switch (dtype) { - case simsimd_datatype_u64_k: return (unary_kernel_t)&_simsimd_i64_to_u64; - case simsimd_datatype_u32_k: return (unary_kernel_t)&_simsimd_i64_to_u32; - case simsimd_datatype_u16_k: return (unary_kernel_t)&_simsimd_i64_to_u16; - case simsimd_datatype_u8_k: return (unary_kernel_t)&_simsimd_i64_to_u8; - case simsimd_datatype_i64_k: return (unary_kernel_t)&_simsimd_i64_to_i64; - case simsimd_datatype_i32_k: return (unary_kernel_t)&_simsimd_i64_to_i32; - case simsimd_datatype_i16_k: return 
(unary_kernel_t)&_simsimd_i64_to_i16; - case simsimd_datatype_i8_k: return (unary_kernel_t)&_simsimd_i64_to_i8; - case simsimd_datatype_f64_k: return (unary_kernel_t)&_simsimd_i64_to_f64; - case simsimd_datatype_f32_k: return (unary_kernel_t)&_simsimd_i64_to_f32; - case simsimd_datatype_f16_k: return (unary_kernel_t)&_simsimd_i64_to_f16; - case simsimd_datatype_bf16_k: return (unary_kernel_t)&_simsimd_i64_to_bf16; + case simsimd_u64_k: return (unary_kernel_t)&_simsimd_i64_to_u64; + case simsimd_u32_k: return (unary_kernel_t)&_simsimd_i64_to_u32; + case simsimd_u16_k: return (unary_kernel_t)&_simsimd_i64_to_u16; + case simsimd_u8_k: return (unary_kernel_t)&_simsimd_i64_to_u8; + case simsimd_i64_k: return (unary_kernel_t)&_simsimd_i64_to_i64; + case simsimd_i32_k: return (unary_kernel_t)&_simsimd_i64_to_i32; + case simsimd_i16_k: return (unary_kernel_t)&_simsimd_i64_to_i16; + case simsimd_i8_k: return (unary_kernel_t)&_simsimd_i64_to_i8; + case simsimd_f64_k: return (unary_kernel_t)&_simsimd_i64_to_f64; + case simsimd_f32_k: return (unary_kernel_t)&_simsimd_i64_to_f32; + case simsimd_f16_k: return (unary_kernel_t)&_simsimd_i64_to_f16; + case simsimd_bf16_k: return (unary_kernel_t)&_simsimd_i64_to_bf16; default: return NULL; } } static unary_kernel_t elementwise_downcast_from_u64(simsimd_datatype_t dtype) { switch (dtype) { - case simsimd_datatype_u64_k: return (unary_kernel_t)&_simsimd_u64_to_u64; - case simsimd_datatype_u32_k: return (unary_kernel_t)&_simsimd_u64_to_u32; - case simsimd_datatype_u16_k: return (unary_kernel_t)&_simsimd_u64_to_u16; - case simsimd_datatype_u8_k: return (unary_kernel_t)&_simsimd_u64_to_u8; - case simsimd_datatype_i64_k: return (unary_kernel_t)&_simsimd_u64_to_i64; - case simsimd_datatype_i32_k: return (unary_kernel_t)&_simsimd_u64_to_i32; - case simsimd_datatype_i16_k: return (unary_kernel_t)&_simsimd_u64_to_i16; - case simsimd_datatype_i8_k: return (unary_kernel_t)&_simsimd_u64_to_i8; - case simsimd_datatype_f64_k: return 
(unary_kernel_t)&_simsimd_u64_to_f64; - case simsimd_datatype_f32_k: return (unary_kernel_t)&_simsimd_u64_to_f32; - case simsimd_datatype_f16_k: return (unary_kernel_t)&_simsimd_u64_to_f16; - case simsimd_datatype_bf16_k: return (unary_kernel_t)&_simsimd_u64_to_bf16; + case simsimd_u64_k: return (unary_kernel_t)&_simsimd_u64_to_u64; + case simsimd_u32_k: return (unary_kernel_t)&_simsimd_u64_to_u32; + case simsimd_u16_k: return (unary_kernel_t)&_simsimd_u64_to_u16; + case simsimd_u8_k: return (unary_kernel_t)&_simsimd_u64_to_u8; + case simsimd_i64_k: return (unary_kernel_t)&_simsimd_u64_to_i64; + case simsimd_i32_k: return (unary_kernel_t)&_simsimd_u64_to_i32; + case simsimd_i16_k: return (unary_kernel_t)&_simsimd_u64_to_i16; + case simsimd_i8_k: return (unary_kernel_t)&_simsimd_u64_to_i8; + case simsimd_f64_k: return (unary_kernel_t)&_simsimd_u64_to_f64; + case simsimd_f32_k: return (unary_kernel_t)&_simsimd_u64_to_f32; + case simsimd_f16_k: return (unary_kernel_t)&_simsimd_u64_to_f16; + case simsimd_bf16_k: return (unary_kernel_t)&_simsimd_u64_to_bf16; default: return NULL; } } @@ -3178,16 +3178,16 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const size_t integral_size = a_family == simsimd_datatype_float_family_k ? b_itemsize : a_itemsize; if (float_size <= integral_size) { //? 
No 128-bit float on most platforms - if (max_itemsize == 8) { ab_dtype = simsimd_datatype_f64_k; } - else if (max_itemsize == 4) { ab_dtype = simsimd_datatype_f64_k; } - else if (max_itemsize == 2) { ab_dtype = simsimd_datatype_f32_k; } - else if (max_itemsize == 1) { ab_dtype = simsimd_datatype_f16_k; } + if (max_itemsize == 8) { ab_dtype = simsimd_f64_k; } + else if (max_itemsize == 4) { ab_dtype = simsimd_f64_k; } + else if (max_itemsize == 2) { ab_dtype = simsimd_f32_k; } + else if (max_itemsize == 1) { ab_dtype = simsimd_f16_k; } } else { - if (max_itemsize == 8) { ab_dtype = simsimd_datatype_f64_k; } - else if (max_itemsize == 4) { ab_dtype = simsimd_datatype_f32_k; } - else if (max_itemsize == 2) { ab_dtype = simsimd_datatype_f16_k; } - else if (max_itemsize == 1) { ab_dtype = simsimd_datatype_f16_k; } + if (max_itemsize == 8) { ab_dtype = simsimd_f64_k; } + else if (max_itemsize == 4) { ab_dtype = simsimd_f32_k; } + else if (max_itemsize == 2) { ab_dtype = simsimd_f16_k; } + else if (max_itemsize == 1) { ab_dtype = simsimd_f16_k; } } } // If only one of the operands is a unsigned, and the second is a signed integral of same size, @@ -3200,16 +3200,16 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const size_t signed_size = a_family == simsimd_datatype_int_family_k ? a_itemsize : b_itemsize; if (signed_size <= unsigned_size) { //? 
No 128-bit integer on most platforms - if (max_itemsize == 8) { ab_dtype = simsimd_datatype_i64_k; } - else if (max_itemsize == 4) { ab_dtype = simsimd_datatype_i64_k; } - else if (max_itemsize == 2) { ab_dtype = simsimd_datatype_i32_k; } - else if (max_itemsize == 1) { ab_dtype = simsimd_datatype_i16_k; } + if (max_itemsize == 8) { ab_dtype = simsimd_i64_k; } + else if (max_itemsize == 4) { ab_dtype = simsimd_i64_k; } + else if (max_itemsize == 2) { ab_dtype = simsimd_i32_k; } + else if (max_itemsize == 1) { ab_dtype = simsimd_i16_k; } } else { - if (max_itemsize == 8) { ab_dtype = simsimd_datatype_i64_k; } - else if (max_itemsize == 4) { ab_dtype = simsimd_datatype_i32_k; } - else if (max_itemsize == 2) { ab_dtype = simsimd_datatype_i16_k; } - else if (max_itemsize == 1) { ab_dtype = simsimd_datatype_i16_k; } + if (max_itemsize == 8) { ab_dtype = simsimd_i64_k; } + else if (max_itemsize == 4) { ab_dtype = simsimd_i32_k; } + else if (max_itemsize == 2) { ab_dtype = simsimd_i16_k; } + else if (max_itemsize == 1) { ab_dtype = simsimd_i16_k; } } } // For boolean and complex types, we don't yet have a clear policy. 
@@ -3335,11 +3335,11 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const out_continuous_dimensions && ins_continuous_dimensions) { // Look up the kernel and the capability - simsimd_kernel_sum_punned_t kernel = NULL; + simsimd_elementwise_sum_t kernel = NULL; simsimd_capability_t capability = simsimd_cap_serial_k; simsimd_kernel_kind_t const kernel_kind = simsimd_sum_k; - simsimd_find_kernel_punned(kernel_kind, ab_dtype, static_capabilities, simsimd_cap_any_k, - (simsimd_kernel_punned_t *)&kernel, &capability); + simsimd_find_kernel(kernel_kind, ab_dtype, static_capabilities, simsimd_cap_any_k, + (simsimd_kernel_punned_t *)&kernel, &capability); if (!kernel) { PyErr_Format( // PyExc_LookupError, "Unsupported kernel '%c' and datatype combination across inputs ('%s' and '%s')", @@ -3374,11 +3374,11 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const if ((is_tensor_a_with_scalar_b || is_tensor_b_with_scalar_b) && (out_continuous_dimensions && ins_continuous_dimensions)) { // Look up the kernel and the capability - simsimd_kernel_scale_punned_t kernel = NULL; + simsimd_elementwise_scale_t kernel = NULL; simsimd_capability_t capability = simsimd_cap_serial_k; simsimd_kernel_kind_t const kernel_kind = simsimd_scale_k; - simsimd_find_kernel_punned(kernel_kind, ab_dtype, static_capabilities, simsimd_cap_any_k, - (simsimd_kernel_punned_t *)&kernel, &capability); + simsimd_find_kernel(kernel_kind, ab_dtype, static_capabilities, simsimd_cap_any_k, + (simsimd_kernel_punned_t *)&kernel, &capability); if (!kernel) { PyErr_Format( // PyExc_LookupError, "Unsupported kernel '%c' and datatype combination across inputs ('%s' and '%s')", @@ -3425,7 +3425,7 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const simsimd_datatype_family(b_parsed.datatype) == simsimd_datatype_float_family_k) { unary_kernel_t a_upcast_ptr = elementwise_upcast_to_f64(a_parsed.datatype); unary_kernel_t b_upcast_ptr = 
elementwise_upcast_to_f64(b_parsed.datatype); - binary_kernel_t elementwise_sadd_ptr = elementwise_sadd(simsimd_datatype_f64_k); + binary_kernel_t elementwise_sadd_ptr = elementwise_sadd(simsimd_f64_k); unary_kernel_t out_downcast_ptr = elementwise_downcast_from_f64(out_parsed.datatype); apply_elementwise_casting_binary_operation_to_each_scalar( // &a_parsed, &b_parsed, &out_parsed, // @@ -3436,7 +3436,7 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const simsimd_datatype_family(b_parsed.datatype) == simsimd_datatype_uint_family_k) { unary_kernel_t a_upcast_ptr = elementwise_upcast_to_u64(a_parsed.datatype); unary_kernel_t b_upcast_ptr = elementwise_upcast_to_u64(b_parsed.datatype); - binary_kernel_t elementwise_sadd_ptr = elementwise_sadd(simsimd_datatype_u64_k); + binary_kernel_t elementwise_sadd_ptr = elementwise_sadd(simsimd_u64_k); unary_kernel_t out_downcast_ptr = elementwise_downcast_from_u64(out_parsed.datatype); apply_elementwise_casting_binary_operation_to_each_scalar( // &a_parsed, &b_parsed, &out_parsed, // @@ -3446,7 +3446,7 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const else { unary_kernel_t a_upcast_ptr = elementwise_upcast_to_i64(a_parsed.datatype); unary_kernel_t b_upcast_ptr = elementwise_upcast_to_i64(b_parsed.datatype); - binary_kernel_t elementwise_sadd_ptr = elementwise_sadd(simsimd_datatype_i64_k); + binary_kernel_t elementwise_sadd_ptr = elementwise_sadd(simsimd_i64_k); unary_kernel_t out_downcast_ptr = elementwise_downcast_from_i64(out_parsed.datatype); apply_elementwise_casting_binary_operation_to_each_scalar( // &a_parsed, &b_parsed, &out_parsed, // diff --git a/scripts/bench.cxx b/scripts/bench.cxx index 5e595283..ccaf0230 100644 --- a/scripts/bench.cxx +++ b/scripts/bench.cxx @@ -40,23 +40,23 @@ namespace bm = benchmark; // clang-format off template struct datatype_enum_to_type_gt { using value_t = void; }; -template <> struct datatype_enum_to_type_gt { using value_t = 
simsimd_f64_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_f32_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_f16_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_bf16_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_f64_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_f32_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_f16_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_bf16_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_b8_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_i8_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_u8_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_i16_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_u16_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_i32_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_u32_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_i64_t; }; -template <> struct datatype_enum_to_type_gt { using value_t = simsimd_u64_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_f64_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_f32_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_f16_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_bf16_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_f64_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_f32_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_f16_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_bf16_t; }; +template <> struct 
datatype_enum_to_type_gt { using value_t = simsimd_b8_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_i8_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_u8_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_i16_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_u16_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_i32_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_u32_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_i64_t; }; +template <> struct datatype_enum_to_type_gt { using value_t = simsimd_u64_t; }; // clang-format on template @@ -72,12 +72,11 @@ template struct vector_gt { using scalar_t = typename datatype_enum_to_type_gt::value_t; using compressed16_t = unsigned short; - static constexpr bool is_integral = - datatype_ak == datatype_ak == simsimd_datatype_b8_k || // - datatype_ak == simsimd_datatype_i8_k || datatype_ak == simsimd_datatype_u8_k || // - datatype_ak == simsimd_datatype_i16_k || datatype_ak == simsimd_datatype_u16_k || // - datatype_ak == simsimd_datatype_i32_k || datatype_ak == simsimd_datatype_u32_k || - datatype_ak == simsimd_datatype_i64_k || datatype_ak == simsimd_datatype_u64_k; + static constexpr bool is_integral = datatype_ak == datatype_ak == simsimd_b8_k || // + datatype_ak == simsimd_i8_k || datatype_ak == simsimd_u8_k || // + datatype_ak == simsimd_i16_k || datatype_ak == simsimd_u16_k || // + datatype_ak == simsimd_i32_k || datatype_ak == simsimd_u32_k || + datatype_ak == simsimd_i64_k || datatype_ak == simsimd_u64_k; static constexpr std::size_t cacheline_length = 64; scalar_t *buffer_ = nullptr; @@ -141,7 +140,7 @@ struct vector_gt { 0b011111111110000000000000000000000000000000000000000000000000000; #if !SIMSIMD_NATIVE_BF16 - if constexpr (datatype_ak == simsimd_datatype_bf16_k || datatype_ak == simsimd_datatype_bf16c_k) { + if constexpr 
(datatype_ak == simsimd_bf16_k || datatype_ak == simsimd_bf16c_k) { simsimd_f32_t f32 = static_cast(from); simsimd_f32_to_bf16(&f32, &to); if ((to & exponent_mask_bf16) == exponent_mask_bf16) to = 0; @@ -150,7 +149,7 @@ struct vector_gt { } #endif #if !SIMSIMD_NATIVE_F16 - if constexpr (datatype_ak == simsimd_datatype_f16_k || datatype_ak == simsimd_datatype_f16c_k) { + if constexpr (datatype_ak == simsimd_f16_k || datatype_ak == simsimd_f16c_k) { simsimd_f32_t f32 = static_cast(from); simsimd_f32_to_f16(&f32, &to); if ((to & exponent_mask_f16) == exponent_mask_f16) to = 0; @@ -168,14 +167,14 @@ struct vector_gt { */ static double uncompress(scalar_t const &from) noexcept { #if !SIMSIMD_NATIVE_BF16 - if constexpr (datatype_ak == simsimd_datatype_bf16_k || datatype_ak == simsimd_datatype_bf16c_k) { + if constexpr (datatype_ak == simsimd_bf16_k || datatype_ak == simsimd_bf16c_k) { simsimd_f32_t f32; simsimd_bf16_to_f32((simsimd_bf16_t const *)&from, &f32); return f32; } #endif #if !SIMSIMD_NATIVE_F16 - if constexpr (datatype_ak == simsimd_datatype_f16_k || datatype_ak == simsimd_datatype_f16c_k) { + if constexpr (datatype_ak == simsimd_f16_k || datatype_ak == simsimd_f16c_k) { simsimd_f32_t f32; simsimd_f16_to_f32((simsimd_f16_t const *)&from, &f32); return f32; @@ -498,8 +497,8 @@ constexpr std::size_t function_args_count(void (*function)(function_args_at...)) * @param dimensions The number of dimensions in the vectors. 
*/ template -void measure_fma(bm::State &state, kernel_at kernel, kernel_at baseline, l2_metric_at l2_metric, - std::size_t dimensions) { +void measure_elementwise(bm::State &state, kernel_at kernel, kernel_at baseline, l2_metric_at l2_metric, + std::size_t dimensions) { using pair_t = pair_at; using vector_t = typename pair_at::vector_t; @@ -584,11 +583,11 @@ void dense_(std::string name, metric_at *distance_func, metric_at *baseline_func ->Threads(default_threads); } -template -void fma_(std::string name, kernel_at *kernel_func, kernel_at *baseline_func, l2_metric_at *l2_metric_func) { +template +void elementwise_(std::string name, kernel_at *kernel_func, kernel_at *baseline_func, l2_metric_at *l2_metric_func) { using pair_t = vectors_pair_gt; std::string bench_name = name + "<" + std::to_string(dense_dimensions) + "d>"; - bm::RegisterBenchmark(bench_name.c_str(), measure_fma, kernel_func, + bm::RegisterBenchmark(bench_name.c_str(), measure_elementwise, kernel_func, baseline_func, l2_metric_func, dense_dimensions) ->MinTime(default_seconds) ->Threads(default_threads); @@ -708,24 +707,24 @@ int main(int argc, char **argv) { bm::Initialize(&argc, argv); if (bm::ReportUnrecognizedArguments(argc, argv)) return 1; - constexpr simsimd_datatype_t b8_k = simsimd_datatype_b8_k; - constexpr simsimd_datatype_t i4x2_k = simsimd_datatype_i4x2_k; - constexpr simsimd_datatype_t i8_k = simsimd_datatype_i8_k; - constexpr simsimd_datatype_t i16_k = simsimd_datatype_i16_k; - constexpr simsimd_datatype_t i32_k = simsimd_datatype_i32_k; - constexpr simsimd_datatype_t i64_k = simsimd_datatype_i64_k; - constexpr simsimd_datatype_t u8_k = simsimd_datatype_u8_k; - constexpr simsimd_datatype_t u16_k = simsimd_datatype_u16_k; - constexpr simsimd_datatype_t u32_k = simsimd_datatype_u32_k; - constexpr simsimd_datatype_t u64_k = simsimd_datatype_u64_k; - constexpr simsimd_datatype_t f64_k = simsimd_datatype_f64_k; - constexpr simsimd_datatype_t f32_k = simsimd_datatype_f32_k; - constexpr 
simsimd_datatype_t f16_k = simsimd_datatype_f16_k; - constexpr simsimd_datatype_t bf16_k = simsimd_datatype_bf16_k; - constexpr simsimd_datatype_t f64c_k = simsimd_datatype_f64c_k; - constexpr simsimd_datatype_t f32c_k = simsimd_datatype_f32c_k; - constexpr simsimd_datatype_t f16c_k = simsimd_datatype_f16c_k; - constexpr simsimd_datatype_t bf16c_k = simsimd_datatype_bf16c_k; + constexpr simsimd_datatype_t b8_k = simsimd_b8_k; + constexpr simsimd_datatype_t i4x2_k = simsimd_i4x2_k; + constexpr simsimd_datatype_t i8_k = simsimd_i8_k; + constexpr simsimd_datatype_t i16_k = simsimd_i16_k; + constexpr simsimd_datatype_t i32_k = simsimd_i32_k; + constexpr simsimd_datatype_t i64_k = simsimd_i64_k; + constexpr simsimd_datatype_t u8_k = simsimd_u8_k; + constexpr simsimd_datatype_t u16_k = simsimd_u16_k; + constexpr simsimd_datatype_t u32_k = simsimd_u32_k; + constexpr simsimd_datatype_t u64_k = simsimd_u64_k; + constexpr simsimd_datatype_t f64_k = simsimd_f64_k; + constexpr simsimd_datatype_t f32_k = simsimd_f32_k; + constexpr simsimd_datatype_t f16_k = simsimd_f16_k; + constexpr simsimd_datatype_t bf16_k = simsimd_bf16_k; + constexpr simsimd_datatype_t f64c_k = simsimd_f64c_k; + constexpr simsimd_datatype_t f32c_k = simsimd_f32c_k; + constexpr simsimd_datatype_t f16c_k = simsimd_f16c_k; + constexpr simsimd_datatype_t bf16c_k = simsimd_bf16c_k; #if SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS @@ -772,10 +771,10 @@ int main(int argc, char **argv) { sparse_("intersect_u16_neon", simsimd_intersect_u16_neon, simsimd_intersect_u16_accurate); sparse_("intersect_u32_neon", simsimd_intersect_u32_neon, simsimd_intersect_u32_accurate); - fma_("fma_f32_neon", simsimd_fma_f32_neon, simsimd_fma_f32_accurate, simsimd_l2_f32_accurate); - fma_("wsum_f32_neon", simsimd_wsum_f32_neon, simsimd_wsum_f32_accurate, simsimd_l2_f32_accurate); - fma_("fma_f32_serial", simsimd_fma_f32_serial, simsimd_fma_f32_accurate, simsimd_l2_f32_accurate); - fma_("wsum_f32_serial", simsimd_wsum_f32_serial, 
simsimd_wsum_f32_accurate, simsimd_l2_f32_accurate); + elementwise_("fma_f32_neon", simsimd_fma_f32_neon, simsimd_fma_f32_accurate, simsimd_l2_f32_accurate); + elementwise_("wsum_f32_neon", simsimd_wsum_f32_neon, simsimd_wsum_f32_accurate, simsimd_l2_f32_accurate); + elementwise_("fma_f32_serial", simsimd_fma_f32_serial, simsimd_fma_f32_accurate, simsimd_l2_f32_accurate); + elementwise_("wsum_f32_serial", simsimd_wsum_f32_serial, simsimd_wsum_f32_accurate, simsimd_l2_f32_accurate); #endif @@ -793,14 +792,14 @@ int main(int argc, char **argv) { curved_("bilinear_f16_neon", simsimd_bilinear_f16_neon, simsimd_bilinear_f16_accurate); curved_("mahalanobis_f16_neon", simsimd_mahalanobis_f16_neon, simsimd_mahalanobis_f16_accurate); - fma_("fma_f16_neon", simsimd_fma_f16_neon, simsimd_fma_f16_accurate, simsimd_l2_f16_accurate); - fma_("wsum_f16_neon", simsimd_wsum_f16_neon, simsimd_wsum_f16_accurate, simsimd_l2_f16_accurate); + elementwise_("fma_f16_neon", simsimd_fma_f16_neon, simsimd_fma_f16_accurate, simsimd_l2_f16_accurate); + elementwise_("wsum_f16_neon", simsimd_wsum_f16_neon, simsimd_wsum_f16_accurate, simsimd_l2_f16_accurate); // FMA kernels for `u8` on NEON use `f16` arithmetic - fma_("fma_u8_neon", simsimd_fma_u8_neon, simsimd_fma_u8_accurate, simsimd_l2_u8_serial); - fma_("wsum_u8_neon", simsimd_wsum_u8_neon, simsimd_wsum_u8_accurate, simsimd_l2_u8_serial); - fma_("fma_i8_neon", simsimd_fma_i8_neon, simsimd_fma_i8_accurate, simsimd_l2_i8_serial); - fma_("wsum_i8_neon", simsimd_wsum_i8_neon, simsimd_wsum_i8_accurate, simsimd_l2_i8_serial); + elementwise_("fma_u8_neon", simsimd_fma_u8_neon, simsimd_fma_u8_accurate, simsimd_l2_u8_serial); + elementwise_("wsum_u8_neon", simsimd_wsum_u8_neon, simsimd_wsum_u8_accurate, simsimd_l2_u8_serial); + elementwise_("fma_i8_neon", simsimd_fma_i8_neon, simsimd_fma_i8_accurate, simsimd_l2_i8_serial); + elementwise_("wsum_i8_neon", simsimd_wsum_i8_neon, simsimd_wsum_i8_accurate, simsimd_l2_i8_serial); #endif #if 
SIMSIMD_TARGET_NEON_BF16 @@ -815,8 +814,9 @@ int main(int argc, char **argv) { curved_("bilinear_bf16_neon", simsimd_bilinear_bf16_neon, simsimd_bilinear_bf16_accurate); curved_("mahalanobis_bf16_neon", simsimd_mahalanobis_bf16_neon, simsimd_mahalanobis_bf16_accurate); - fma_("fma_bf16_neon", simsimd_fma_bf16_neon, simsimd_fma_bf16_accurate, simsimd_l2_bf16_accurate); - fma_("wsum_bf16_neon", simsimd_wsum_bf16_neon, simsimd_wsum_bf16_accurate, simsimd_l2_bf16_accurate); + elementwise_("fma_bf16_neon", simsimd_fma_bf16_neon, simsimd_fma_bf16_accurate, simsimd_l2_bf16_accurate); + elementwise_("wsum_bf16_neon", simsimd_wsum_bf16_neon, simsimd_wsum_bf16_accurate, + simsimd_l2_bf16_accurate); #endif #if SIMSIMD_TARGET_SVE @@ -904,18 +904,22 @@ int main(int argc, char **argv) { curved_("bilinear_bf16_haswell", simsimd_bilinear_bf16_haswell, simsimd_bilinear_bf16_accurate); curved_("mahalanobis_bf16_haswell", simsimd_mahalanobis_bf16_haswell, simsimd_mahalanobis_bf16_accurate); - fma_("fma_f64_haswell", simsimd_fma_f64_haswell, simsimd_fma_f64_serial, simsimd_l2_f64_serial); - fma_("wsum_f64_haswell", simsimd_wsum_f64_haswell, simsimd_wsum_f64_serial, simsimd_l2_f64_serial); - fma_("fma_f32_haswell", simsimd_fma_f32_haswell, simsimd_fma_f32_accurate, simsimd_l2_f32_accurate); - fma_("wsum_f32_haswell", simsimd_wsum_f32_haswell, simsimd_wsum_f32_accurate, simsimd_l2_f32_accurate); - fma_("fma_f16_haswell", simsimd_fma_f16_haswell, simsimd_fma_f16_accurate, simsimd_l2_f16_accurate); - fma_("wsum_f16_haswell", simsimd_wsum_f16_haswell, simsimd_wsum_f16_accurate, simsimd_l2_f16_accurate); - fma_("fma_bf16_haswell", simsimd_fma_bf16_haswell, simsimd_fma_bf16_accurate, simsimd_l2_bf16_accurate); - fma_("wsum_bf16_haswell", simsimd_wsum_bf16_haswell, simsimd_wsum_bf16_accurate, simsimd_l2_bf16_accurate); - fma_("fma_i8_haswell", simsimd_fma_i8_haswell, simsimd_fma_i8_accurate, simsimd_l2_i8_serial); - fma_("wsum_i8_haswell", simsimd_wsum_i8_haswell, simsimd_wsum_i8_accurate, 
simsimd_l2_i8_serial); - fma_("fma_u8_haswell", simsimd_fma_u8_haswell, simsimd_fma_u8_accurate, simsimd_l2_u8_serial); - fma_("wsum_u8_haswell", simsimd_wsum_u8_haswell, simsimd_wsum_u8_accurate, simsimd_l2_u8_serial); + elementwise_("fma_f64_haswell", simsimd_fma_f64_haswell, simsimd_fma_f64_serial, simsimd_l2_f64_serial); + elementwise_("wsum_f64_haswell", simsimd_wsum_f64_haswell, simsimd_wsum_f64_serial, simsimd_l2_f64_serial); + elementwise_("fma_f32_haswell", simsimd_fma_f32_haswell, simsimd_fma_f32_accurate, simsimd_l2_f32_accurate); + elementwise_("wsum_f32_haswell", simsimd_wsum_f32_haswell, simsimd_wsum_f32_accurate, + simsimd_l2_f32_accurate); + elementwise_("fma_f16_haswell", simsimd_fma_f16_haswell, simsimd_fma_f16_accurate, simsimd_l2_f16_accurate); + elementwise_("wsum_f16_haswell", simsimd_wsum_f16_haswell, simsimd_wsum_f16_accurate, + simsimd_l2_f16_accurate); + elementwise_("fma_bf16_haswell", simsimd_fma_bf16_haswell, simsimd_fma_bf16_accurate, + simsimd_l2_bf16_accurate); + elementwise_("wsum_bf16_haswell", simsimd_wsum_bf16_haswell, simsimd_wsum_bf16_accurate, + simsimd_l2_bf16_accurate); + elementwise_("fma_i8_haswell", simsimd_fma_i8_haswell, simsimd_fma_i8_accurate, simsimd_l2_i8_serial); + elementwise_("wsum_i8_haswell", simsimd_wsum_i8_haswell, simsimd_wsum_i8_accurate, simsimd_l2_i8_serial); + elementwise_("fma_u8_haswell", simsimd_fma_u8_haswell, simsimd_fma_u8_accurate, simsimd_l2_u8_serial); + elementwise_("wsum_u8_haswell", simsimd_wsum_u8_haswell, simsimd_wsum_u8_accurate, simsimd_l2_u8_serial); #endif @@ -943,10 +947,10 @@ int main(int argc, char **argv) { dense_("dot_f16c_sapphire", simsimd_dot_f16c_sapphire, simsimd_dot_f16c_accurate); dense_("vdot_f16c_sapphire", simsimd_vdot_f16c_sapphire, simsimd_vdot_f16c_accurate); - fma_("fma_u8_sapphire", simsimd_fma_u8_sapphire, simsimd_fma_u8_accurate, simsimd_l2_u8_serial); - fma_("wsum_u8_sapphire", simsimd_wsum_u8_sapphire, simsimd_wsum_u8_accurate, simsimd_l2_u8_serial); - 
fma_("fma_i8_sapphire", simsimd_fma_i8_sapphire, simsimd_fma_i8_accurate, simsimd_l2_i8_serial); - fma_("wsum_i8_sapphire", simsimd_wsum_i8_sapphire, simsimd_wsum_i8_accurate, simsimd_l2_i8_serial); + elementwise_("fma_u8_sapphire", simsimd_fma_u8_sapphire, simsimd_fma_u8_accurate, simsimd_l2_u8_serial); + elementwise_("wsum_u8_sapphire", simsimd_wsum_u8_sapphire, simsimd_wsum_u8_accurate, simsimd_l2_u8_serial); + elementwise_("fma_i8_sapphire", simsimd_fma_i8_sapphire, simsimd_fma_i8_accurate, simsimd_l2_i8_serial); + elementwise_("wsum_i8_sapphire", simsimd_wsum_i8_sapphire, simsimd_wsum_i8_accurate, simsimd_l2_i8_serial); #endif #if SIMSIMD_TARGET_ICE @@ -990,12 +994,15 @@ int main(int argc, char **argv) { dense_("dot_f64c_skylake", simsimd_dot_f64c_skylake, simsimd_dot_f64c_serial); dense_("vdot_f64c_skylake", simsimd_vdot_f64c_skylake, simsimd_vdot_f64c_serial); - fma_("fma_f64_skylake", simsimd_fma_f64_skylake, simsimd_fma_f64_serial, simsimd_l2_f64_serial); - fma_("wsum_f64_skylake", simsimd_wsum_f64_skylake, simsimd_wsum_f64_serial, simsimd_l2_f64_serial); - fma_("fma_f32_skylake", simsimd_fma_f32_skylake, simsimd_fma_f32_accurate, simsimd_l2_f32_accurate); - fma_("wsum_f32_skylake", simsimd_wsum_f32_skylake, simsimd_wsum_f32_accurate, simsimd_l2_f32_accurate); - fma_("fma_bf16_skylake", simsimd_fma_bf16_skylake, simsimd_fma_bf16_accurate, simsimd_l2_bf16_accurate); - fma_("wsum_bf16_skylake", simsimd_wsum_bf16_skylake, simsimd_wsum_bf16_accurate, simsimd_l2_bf16_accurate); + elementwise_("fma_f64_skylake", simsimd_fma_f64_skylake, simsimd_fma_f64_serial, simsimd_l2_f64_serial); + elementwise_("wsum_f64_skylake", simsimd_wsum_f64_skylake, simsimd_wsum_f64_serial, simsimd_l2_f64_serial); + elementwise_("fma_f32_skylake", simsimd_fma_f32_skylake, simsimd_fma_f32_accurate, simsimd_l2_f32_accurate); + elementwise_("wsum_f32_skylake", simsimd_wsum_f32_skylake, simsimd_wsum_f32_accurate, + simsimd_l2_f32_accurate); + elementwise_("fma_bf16_skylake", 
simsimd_fma_bf16_skylake, simsimd_fma_bf16_accurate, + simsimd_l2_bf16_accurate); + elementwise_("wsum_bf16_skylake", simsimd_wsum_bf16_skylake, simsimd_wsum_bf16_accurate, + simsimd_l2_bf16_accurate); #endif @@ -1064,12 +1071,12 @@ int main(int argc, char **argv) { dense_("hamming_b8_serial", simsimd_hamming_b8_serial, simsimd_hamming_b8_serial); dense_("jaccard_b8_serial", simsimd_jaccard_b8_serial, simsimd_jaccard_b8_serial); - fma_("fma_f16_serial", simsimd_fma_f16_serial, simsimd_fma_f16_accurate, simsimd_l2_f16_accurate); - fma_("wsum_f16_serial", simsimd_wsum_f16_serial, simsimd_wsum_f16_accurate, simsimd_l2_f16_accurate); - fma_("fma_u8_serial", simsimd_fma_u8_serial, simsimd_fma_u8_accurate, simsimd_l2_u8_serial); - fma_("wsum_u8_serial", simsimd_wsum_u8_serial, simsimd_wsum_u8_accurate, simsimd_l2_u8_serial); - fma_("fma_i8_serial", simsimd_fma_i8_serial, simsimd_fma_i8_accurate, simsimd_l2_i8_serial); - fma_("wsum_i8_serial", simsimd_wsum_i8_serial, simsimd_wsum_i8_accurate, simsimd_l2_i8_serial); + elementwise_("fma_f16_serial", simsimd_fma_f16_serial, simsimd_fma_f16_accurate, simsimd_l2_f16_accurate); + elementwise_("wsum_f16_serial", simsimd_wsum_f16_serial, simsimd_wsum_f16_accurate, simsimd_l2_f16_accurate); + elementwise_("fma_u8_serial", simsimd_fma_u8_serial, simsimd_fma_u8_accurate, simsimd_l2_u8_serial); + elementwise_("wsum_u8_serial", simsimd_wsum_u8_serial, simsimd_wsum_u8_accurate, simsimd_l2_u8_serial); + elementwise_("fma_i8_serial", simsimd_fma_i8_serial, simsimd_fma_i8_accurate, simsimd_l2_i8_serial); + elementwise_("wsum_i8_serial", simsimd_wsum_i8_serial, simsimd_wsum_i8_accurate, simsimd_l2_i8_serial); bm::RunSpecifiedBenchmarks(); bm::Shutdown(); diff --git a/swift/SimSIMD.swift b/swift/SimSIMD.swift index 9ac7dae7..a96358ad 100644 --- a/swift/SimSIMD.swift +++ b/swift/SimSIMD.swift @@ -2,13 +2,13 @@ import CSimSIMD public protocol SimSIMD { static var dataType: simsimd_datatype_t { get } - static var cosine: 
simsimd_metric_dense_punned_t { get } - static var dotProduct: simsimd_metric_dense_punned_t { get } - static var squaredEuclidean: simsimd_metric_dense_punned_t { get } + static var cosine: simsimd_dense_metric_t { get } + static var dotProduct: simsimd_dense_metric_t { get } + static var squaredEuclidean: simsimd_dense_metric_t { get } } extension Int8: SimSIMD { - public static let dataType = simsimd_datatype_i8_k + public static let dataType = simsimd_i8_k public static let cosine = find(kind: simsimd_cosine_k, dataType: dataType) public static let dotProduct = find(kind: simsimd_dot_k, dataType: dataType) public static let squaredEuclidean = find(kind: simsimd_sqeuclidean_k, dataType: dataType) @@ -16,21 +16,21 @@ extension Int8: SimSIMD { @available(macOS 11.0, iOS 14.0, tvOS 14.0, watchOS 7.0, *) extension Float16: SimSIMD { - public static let dataType = simsimd_datatype_f16_k + public static let dataType = simsimd_f16_k public static let cosine = find(kind: simsimd_cosine_k, dataType: dataType) public static let dotProduct = find(kind: simsimd_dot_k, dataType: dataType) public static let squaredEuclidean = find(kind: simsimd_sqeuclidean_k, dataType: dataType) } extension Float32: SimSIMD { - public static let dataType = simsimd_datatype_f32_k + public static let dataType = simsimd_f32_k public static let cosine = find(kind: simsimd_cosine_k, dataType: dataType) public static let dotProduct = find(kind: simsimd_inner_k, dataType: dataType) public static let squaredEuclidean = find(kind: simsimd_sqeuclidean_k, dataType: dataType) } extension Float64: SimSIMD { - public static let dataType = simsimd_datatype_f64_k + public static let dataType = simsimd_f64_k public static let cosine = find(kind: simsimd_cosine_k, dataType: dataType) public static let dotProduct = find(kind: simsimd_dot_k, dataType: dataType) public static let squaredEuclidean = find(kind: simsimd_sqeuclidean_k, dataType: dataType) @@ -71,7 +71,7 @@ extension RandomAccessCollection where 
Element: SimSIMD { } @inlinable @inline(__always) -func perform(_ metric: simsimd_metric_dense_punned_t, a: A, b: B) -> Double? where A: Sequence, B: Sequence, A.Element == B.Element { +func perform(_ metric: simsimd_dense_metric_t, a: A, b: B) -> Double? where A: Sequence, B: Sequence, A.Element == B.Element { var distance: simsimd_distance_t = 0 let result = a.withContiguousStorageIfAvailable { a in b.withContiguousStorageIfAvailable { b in @@ -118,14 +118,14 @@ extension simsimd_capability_t: OptionSet, CustomStringConvertible { } @inline(__always) -private func find(kind: simsimd_metric_kind_t, dataType: simsimd_datatype_t) -> simsimd_metric_dense_punned_t { - var output: simsimd_metric_dense_punned_t? +private func find(kind: simsimd_metric_kind_t, dataType: simsimd_datatype_t) -> simsimd_dense_metric_t { + var output: simsimd_dense_metric_t? var used = simsimd_capability_t.any // Use `withUnsafeMutablePointer` to safely cast `output` to the required pointer type. withUnsafeMutablePointer(to: &output) { outputPtr in // Cast the pointer to `UnsafeMutablePointer` let castedPtr = outputPtr.withMemoryRebound(to: Optional.self, capacity: 1) { $0 } - simsimd_find_kernel_punned(kind, dataType, .available, .any, castedPtr, &used) + simsimd_find_kernel(kind, dataType, .available, .any, castedPtr, &used) } guard let output else { fatalError("Could not find function \(kind) for \(dataType)") } return output