From 14fd5d3e11bd5ef008cd09ae7354f1daa660d93d Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Tue, 5 Nov 2024 22:14:06 +0000 Subject: [PATCH] Improve: Type-casting logic --- include/simsimd/curved.h | 93 +++++---- include/simsimd/dot.h | 75 +++---- include/simsimd/elementwise.h | 282 +++++++++++++------------- include/simsimd/probability.h | 42 ++-- include/simsimd/simsimd.h | 37 ++++ include/simsimd/sparse.h | 24 ++- include/simsimd/spatial.h | 66 +++---- include/simsimd/types.h | 363 +++++++++++++++++++--------------- python/lib.c | 126 ++++-------- 9 files changed, 589 insertions(+), 519 deletions(-) diff --git a/include/simsimd/curved.h b/include/simsimd/curved.h index 59a99fe6..ef9ef2df 100644 --- a/include/simsimd/curved.h +++ b/include/simsimd/curved.h @@ -96,13 +96,13 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_sapphire(simsimd_f16_t const* a, sim SIMSIMD_PUBLIC void simsimd_bilinear_##input_type##_##name( \ simsimd_##input_type##_t const *a, simsimd_##input_type##_t const *b, simsimd_##input_type##_t const *c, \ simsimd_size_t n, simsimd_distance_t *result) { \ - simsimd_##accumulator_type##_t sum = 0; \ + simsimd_##accumulator_type##_t sum = 0, a_i, b_j, c_ij; \ for (simsimd_size_t i = 0; i != n; ++i) { \ simsimd_##accumulator_type##_t partial = 0; \ - simsimd_##accumulator_type##_t a_i = load_and_convert(a + i); \ + load_and_convert(a + i, &a_i); \ for (simsimd_size_t j = 0; j != n; ++j) { \ - simsimd_##accumulator_type##_t b_j = load_and_convert(b + j); \ - simsimd_##accumulator_type##_t c_ij = load_and_convert(c + i * n + j); \ + load_and_convert(b + j, &b_j); \ + load_and_convert(c + i * n + j, &c_ij); \ partial += c_ij * b_j; \ } \ sum += a_i * partial; \ @@ -114,13 +114,17 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_sapphire(simsimd_f16_t const* a, sim SIMSIMD_PUBLIC void simsimd_mahalanobis_##input_type##_##name( \ simsimd_##input_type##_t const *a, simsimd_##input_type##_t const *b, simsimd_##input_type##_t const *c, \ simsimd_size_t n, simsimd_distance_t *result) { \ - simsimd_##accumulator_type##_t sum = 0; \ + simsimd_##accumulator_type##_t sum = 0, a_i, a_j, b_i, b_j, c_ij; \ for (simsimd_size_t i = 0; i != n; ++i) { \ simsimd_##accumulator_type##_t partial = 0; \ - simsimd_##accumulator_type##_t diff_i = load_and_convert(a + i) - load_and_convert(b + i); \ + load_and_convert(a + i, &a_i); \ + load_and_convert(b + i, &b_i); \ + simsimd_##accumulator_type##_t diff_i = a_i - b_i; \ for (simsimd_size_t j = 0; j != n; ++j) { \ - simsimd_##accumulator_type##_t diff_j = load_and_convert(a + j) - load_and_convert(b + j); \ - simsimd_##accumulator_type##_t c_ij = load_and_convert(c + i * n + j); \ + load_and_convert(a + j, &a_j); \ + load_and_convert(b + j, &b_j); \ + simsimd_##accumulator_type##_t diff_j = a_j - b_j; \ + load_and_convert(c + i * n + j, &c_ij); \ partial += c_ij * diff_j; \ } \ sum += diff_i * partial; \ @@ -128,26 +132,26 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_f16_sapphire(simsimd_f16_t const* a, sim *result = (simsimd_distance_t)SIMSIMD_SQRT(sum); \ } -SIMSIMD_MAKE_BILINEAR(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_bilinear_f64_serial -SIMSIMD_MAKE_MAHALANOBIS(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_mahalanobis_f64_serial +SIMSIMD_MAKE_BILINEAR(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_bilinear_f64_serial +SIMSIMD_MAKE_MAHALANOBIS(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_mahalanobis_f64_serial -SIMSIMD_MAKE_BILINEAR(serial, f32, f32, SIMSIMD_DEREFERENCE) // 
simsimd_bilinear_f32_serial -SIMSIMD_MAKE_MAHALANOBIS(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_mahalanobis_f32_serial +SIMSIMD_MAKE_BILINEAR(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_bilinear_f32_serial +SIMSIMD_MAKE_MAHALANOBIS(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_mahalanobis_f32_serial -SIMSIMD_MAKE_BILINEAR(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_bilinear_f16_serial -SIMSIMD_MAKE_MAHALANOBIS(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_mahalanobis_f16_serial +SIMSIMD_MAKE_BILINEAR(serial, f16, f32, simsimd_f16_to_f32) // simsimd_bilinear_f16_serial +SIMSIMD_MAKE_MAHALANOBIS(serial, f16, f32, simsimd_f16_to_f32) // simsimd_mahalanobis_f16_serial -SIMSIMD_MAKE_BILINEAR(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_bilinear_bf16_serial -SIMSIMD_MAKE_MAHALANOBIS(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_mahalanobis_bf16_serial +SIMSIMD_MAKE_BILINEAR(serial, bf16, f32, simsimd_bf16_to_f32) // simsimd_bilinear_bf16_serial +SIMSIMD_MAKE_MAHALANOBIS(serial, bf16, f32, simsimd_bf16_to_f32) // simsimd_mahalanobis_bf16_serial -SIMSIMD_MAKE_BILINEAR(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_bilinear_f32_accurate -SIMSIMD_MAKE_MAHALANOBIS(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_mahalanobis_f32_accurate +SIMSIMD_MAKE_BILINEAR(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_bilinear_f32_accurate +SIMSIMD_MAKE_MAHALANOBIS(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_mahalanobis_f32_accurate -SIMSIMD_MAKE_BILINEAR(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_bilinear_f16_accurate -SIMSIMD_MAKE_MAHALANOBIS(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_mahalanobis_f16_accurate +SIMSIMD_MAKE_BILINEAR(accurate, f16, f64, _simsimd_f16_to_f64) // simsimd_bilinear_f16_accurate +SIMSIMD_MAKE_MAHALANOBIS(accurate, f16, f64, _simsimd_f16_to_f64) // simsimd_mahalanobis_f16_accurate -SIMSIMD_MAKE_BILINEAR(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_bilinear_bf16_accurate -SIMSIMD_MAKE_MAHALANOBIS(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_mahalanobis_bf16_accurate +SIMSIMD_MAKE_BILINEAR(accurate, bf16, f64, _simsimd_bf16_to_f64) // simsimd_bilinear_bf16_accurate +SIMSIMD_MAKE_MAHALANOBIS(accurate, bf16, f64, _simsimd_bf16_to_f64) // simsimd_mahalanobis_bf16_accurate #if _SIMSIMD_TARGET_ARM #if SIMSIMD_TARGET_NEON @@ -313,7 +317,9 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16_neon(simsimd_bf16_t const *a, simsimd_ simsimd_bf16_t const *c, simsimd_size_t n, simsimd_distance_t *result) { float32x4_t sum_vec = vdupq_n_f32(0); for (simsimd_size_t i = 0; i != n; ++i) { - float32x4_t a_vec = vdupq_n_f32(simsimd_bf16_to_f32(a + i)); + simsimd_f32_t a_i; + simsimd_bf16_to_f32(a + i, &a_i); + float32x4_t a_vec = vdupq_n_f32(a_i); float32x4_t partial_sum_vec = vdupq_n_f32(0); for (simsimd_size_t j = 0; j + 8 <= n; j += 8) { bfloat16x8_t b_vec = vld1q_bf16((simsimd_bf16_for_arm_simd_t const *)(b + j)); @@ -329,7 +335,8 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16_neon(simsimd_bf16_t const *a, simsimd_ simsimd_size_t tail_start = n - tail_length; if (tail_length) { for (simsimd_size_t i = 0; i != n; ++i) { - simsimd_f32_t a_i = simsimd_bf16_to_f32(a + i); + simsimd_f32_t a_i; + simsimd_bf16_to_f32(a + i, &a_i); bfloat16x8_t b_vec = _simsimd_partial_load_bf16x8_neon(b + tail_start, tail_length); bfloat16x8_t c_vec = _simsimd_partial_load_bf16x8_neon(c + i * n + tail_start, tail_length); simsimd_f32_t partial_sum = vaddvq_f32(vbfdotq_f32(vdupq_n_f32(0), b_vec, c_vec)); @@ -345,8 +352,9 @@ 
SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_neon(simsimd_bf16_t const *a, simsi simsimd_distance_t *result) { float32x4_t sum_vec = vdupq_n_f32(0); for (simsimd_size_t i = 0; i != n; ++i) { - simsimd_f32_t a_i = simsimd_bf16_to_f32(a + i); - simsimd_f32_t b_i = simsimd_bf16_to_f32(b + i); + simsimd_f32_t a_i, b_i; + simsimd_bf16_to_f32(a + i, &a_i); + simsimd_bf16_to_f32(b + i, &b_i); float32x4_t diff_i_vec = vdupq_n_f32(a_i - b_i); float32x4_t partial_sum_vec = vdupq_n_f32(0); for (simsimd_size_t j = 0; j + 8 <= n; j += 8) { @@ -376,8 +384,9 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_neon(simsimd_bf16_t const *a, simsi simsimd_size_t tail_start = n - tail_length; if (tail_length) { for (simsimd_size_t i = 0; i != n; ++i) { - simsimd_f32_t a_i = simsimd_bf16_to_f32(a + i); - simsimd_f32_t b_i = simsimd_bf16_to_f32(b + i); + simsimd_f32_t a_i, b_i; + simsimd_bf16_to_f32(a + i, &a_i); + simsimd_bf16_to_f32(b + i, &b_i); simsimd_f32_t diff_i = a_i - b_i; bfloat16x8_t a_j_vec = _simsimd_partial_load_bf16x8_neon(a + tail_start, tail_length); bfloat16x8_t b_j_vec = _simsimd_partial_load_bf16x8_neon(b + tail_start, tail_length); @@ -489,7 +498,9 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16_haswell(simsimd_bf16_t const *a, simsi __m256 sum_vec = _mm256_setzero_ps(); for (simsimd_size_t i = 0; i != n; ++i) { // The `simsimd_bf16_to_f32` is cheaper than `_simsimd_bf16x8_to_f32x8_haswell` - __m256 a_vec = _mm256_set1_ps(simsimd_bf16_to_f32(a + i)); + simsimd_f32_t a_i; + simsimd_bf16_to_f32(a + i, &a_i); + __m256 a_vec = _mm256_set1_ps(a_i); __m256 partial_sum_vec = _mm256_setzero_ps(); for (simsimd_size_t j = 0; j + 8 <= n; j += 8) { __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell(_mm_lddqu_si128((__m128i const *)(b + j))); @@ -505,7 +516,8 @@ SIMSIMD_PUBLIC void simsimd_bilinear_bf16_haswell(simsimd_bf16_t const *a, simsi simsimd_size_t tail_start = n - tail_length; if (tail_length) { for (simsimd_size_t i = 0; i != n; ++i) { - simsimd_f32_t a_i = simsimd_bf16_to_f32(a + i); + simsimd_f32_t a_i; + simsimd_bf16_to_f32(a + i, &a_i); __m256 b_vec = _simsimd_bf16x8_to_f32x8_haswell( // _simsimd_partial_load_bf16x8_haswell(b + tail_start, tail_length)); __m256 c_vec = _simsimd_bf16x8_to_f32x8_haswell( // @@ -523,9 +535,10 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_haswell(simsimd_bf16_t const *a, si simsimd_distance_t *result) { __m256 sum_vec = _mm256_setzero_ps(); for (simsimd_size_t i = 0; i != n; ++i) { - __m256 diff_i_vec = _mm256_sub_ps( // - _mm256_set1_ps(simsimd_bf16_to_f32(a + i)), // - _mm256_set1_ps(simsimd_bf16_to_f32(b + i))); + simsimd_f32_t a_i, b_i; + simsimd_bf16_to_f32(a + i, &a_i); + simsimd_bf16_to_f32(b + i, &b_i); + __m256 diff_i_vec = _mm256_set1_ps(a_i - b_i); __m256 partial_sum_vec = _mm256_setzero_ps(); for (simsimd_size_t j = 0; j + 8 <= n; j += 8) { __m256 diff_j_vec = _mm256_sub_ps( // @@ -543,7 +556,10 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_haswell(simsimd_bf16_t const *a, si simsimd_size_t tail_start = n - tail_length; if (tail_length) { for (simsimd_size_t i = 0; i != n; ++i) { - simsimd_f32_t diff_i = simsimd_bf16_to_f32(a + i) - simsimd_bf16_to_f32(b + i); + simsimd_f32_t a_i, b_i; + simsimd_bf16_to_f32(a + i, &a_i); + simsimd_bf16_to_f32(b + i, &b_i); + simsimd_f32_t diff_i = a_i - b_i; __m256 diff_j_vec = _mm256_sub_ps( // _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(a + tail_start, tail_length)), _simsimd_bf16x8_to_f32x8_haswell(_simsimd_partial_load_bf16x8_haswell(b + tail_start, tail_length))); @@ -651,7 +667,9 @@ 
SIMSIMD_PUBLIC void simsimd_bilinear_bf16_genoa(simsimd_bf16_t const *a, simsimd __mmask32 tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, tail_length); for (simsimd_size_t i = 0; i != n; ++i) { - __m512 a_vec = _mm512_set1_ps(simsimd_bf16_to_f32(a + i)); + simsimd_f32_t a_i; + simsimd_bf16_to_f32(a + i, &a_i); + __m512 a_vec = _mm512_set1_ps(a_i); __m512 partial_sum_vec = _mm512_setzero_ps(); __m512i b_vec, c_vec; simsimd_size_t j = 0; @@ -683,7 +701,10 @@ SIMSIMD_PUBLIC void simsimd_mahalanobis_bf16_genoa(simsimd_bf16_t const *a, sims __mmask32 tail_mask = (__mmask32)_bzhi_u32(0xFFFFFFFF, tail_length); for (simsimd_size_t i = 0; i != n; ++i) { - __m512 diff_i_vec = _mm512_set1_ps(simsimd_bf16_to_f32(a + i) - simsimd_bf16_to_f32(b + i)); + simsimd_f32_t a_i, b_i; + simsimd_bf16_to_f32(a + i, &a_i); + simsimd_bf16_to_f32(b + i, &b_i); + __m512 diff_i_vec = _mm512_set1_ps(a_i - b_i); __m512 partial_sum_vec = _mm512_setzero_ps(); __m512i a_j_vec, b_j_vec, diff_j_vec, c_vec; simsimd_size_t j = 0; diff --git a/include/simsimd/dot.h b/include/simsimd/dot.h index 556940b1..1280c499 100644 --- a/include/simsimd/dot.h +++ b/include/simsimd/dot.h @@ -20,6 +20,9 @@ * - Arm: NEON, SVE * - x86: Haswell, Ice Lake, Skylake, Genoa, Sapphire * + * ! When dealing with complex numbers, the dot product exports two results: the real and imaginary parts. + * ? When dealing with low-precision input numbers, the dot product is still computed with higher precision. + * * x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/ * Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/ */ @@ -157,10 +160,10 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_sierra(simsimd_i8_t const* a, simsimd_i8_t co SIMSIMD_PUBLIC void simsimd_dot_##input_type##_##name(simsimd_##input_type##_t const *a, \ simsimd_##input_type##_t const *b, simsimd_size_t n, \ simsimd_distance_t *result) { \ - simsimd_##accumulator_type##_t ab = 0; \ + simsimd_##accumulator_type##_t ab = 0, ai, bi; \ for (simsimd_size_t i = 0; i != n; ++i) { \ - simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \ - simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \ + load_and_convert(a + i, &ai); \ + load_and_convert(b + i, &bi); \ ab += ai * bi; \ } \ *result = ab; \ @@ -170,12 +173,12 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_sierra(simsimd_i8_t const* a, simsimd_i8_t co SIMSIMD_PUBLIC void simsimd_dot_##input_type##c_##name(simsimd_##input_type##_t const *a, \ simsimd_##input_type##_t const *b, simsimd_size_t n, \ simsimd_distance_t *results) { \ - simsimd_##accumulator_type##_t ab_real = 0, ab_imag = 0; \ + simsimd_##accumulator_type##_t ab_real = 0, ab_imag = 0, ar, br, ai, bi; \ for (simsimd_size_t i = 0; i + 2 <= n; i += 2) { \ - simsimd_##accumulator_type##_t ar = load_and_convert(a + i); \ - simsimd_##accumulator_type##_t br = load_and_convert(b + i); \ - simsimd_##accumulator_type##_t ai = load_and_convert(a + i + 1); \ - simsimd_##accumulator_type##_t bi = load_and_convert(b + i + 1); \ + load_and_convert(a + i, &ar); \ + load_and_convert(b + i, &br); \ + load_and_convert(a + i + 1, &ai); \ + load_and_convert(b + i + 1, &bi); \ ab_real += ar * br - ai * bi; \ ab_imag += ar * bi + ai * br; \ } \ @@ -187,12 +190,12 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_sierra(simsimd_i8_t const* a, simsimd_i8_t co SIMSIMD_PUBLIC void simsimd_vdot_##input_type##c_##name(simsimd_##input_type##_t const *a, \ simsimd_##input_type##_t const *b, simsimd_size_t n, \ simsimd_distance_t *results) { \ - 
simsimd_##accumulator_type##_t ab_real = 0, ab_imag = 0; \ + simsimd_##accumulator_type##_t ab_real = 0, ab_imag = 0, ar, br, ai, bi; \ for (simsimd_size_t i = 0; i + 2 <= n; i += 2) { \ - simsimd_##accumulator_type##_t ar = load_and_convert(a + i); \ - simsimd_##accumulator_type##_t br = load_and_convert(b + i); \ - simsimd_##accumulator_type##_t ai = load_and_convert(a + i + 1); \ - simsimd_##accumulator_type##_t bi = load_and_convert(b + i + 1); \ + load_and_convert(a + i, &ar); \ + load_and_convert(b + i, &br); \ + load_and_convert(a + i + 1, &ai); \ + load_and_convert(b + i + 1, &bi); \ ab_real += ar * br + ai * bi; \ ab_imag += ar * bi - ai * br; \ } \ @@ -200,36 +203,36 @@ SIMSIMD_PUBLIC void simsimd_dot_i8_sierra(simsimd_i8_t const* a, simsimd_i8_t co results[1] = ab_imag; \ } -SIMSIMD_MAKE_DOT(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_dot_f64_serial -SIMSIMD_MAKE_COMPLEX_DOT(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_dot_f64c_serial -SIMSIMD_MAKE_COMPLEX_VDOT(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_vdot_f64c_serial +SIMSIMD_MAKE_DOT(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_dot_f64_serial +SIMSIMD_MAKE_COMPLEX_DOT(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_dot_f64c_serial +SIMSIMD_MAKE_COMPLEX_VDOT(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_vdot_f64c_serial -SIMSIMD_MAKE_DOT(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_dot_f32_serial -SIMSIMD_MAKE_COMPLEX_DOT(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_dot_f32c_serial -SIMSIMD_MAKE_COMPLEX_VDOT(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_vdot_f32c_serial +SIMSIMD_MAKE_DOT(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_dot_f32_serial +SIMSIMD_MAKE_COMPLEX_DOT(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_dot_f32c_serial +SIMSIMD_MAKE_COMPLEX_VDOT(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_vdot_f32c_serial -SIMSIMD_MAKE_DOT(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_dot_f16_serial -SIMSIMD_MAKE_COMPLEX_DOT(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_dot_f16c_serial -SIMSIMD_MAKE_COMPLEX_VDOT(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_vdot_f16c_serial +SIMSIMD_MAKE_DOT(serial, f16, f32, simsimd_f16_to_f32) // simsimd_dot_f16_serial +SIMSIMD_MAKE_COMPLEX_DOT(serial, f16, f32, simsimd_f16_to_f32) // simsimd_dot_f16c_serial +SIMSIMD_MAKE_COMPLEX_VDOT(serial, f16, f32, simsimd_f16_to_f32) // simsimd_vdot_f16c_serial -SIMSIMD_MAKE_DOT(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_dot_bf16_serial -SIMSIMD_MAKE_COMPLEX_DOT(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_dot_bf16c_serial -SIMSIMD_MAKE_COMPLEX_VDOT(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_vdot_bf16c_serial +SIMSIMD_MAKE_DOT(serial, bf16, f32, simsimd_bf16_to_f32) // simsimd_dot_bf16_serial +SIMSIMD_MAKE_COMPLEX_DOT(serial, bf16, f32, simsimd_bf16_to_f32) // simsimd_dot_bf16c_serial +SIMSIMD_MAKE_COMPLEX_VDOT(serial, bf16, f32, simsimd_bf16_to_f32) // simsimd_vdot_bf16c_serial -SIMSIMD_MAKE_DOT(serial, i8, i64, SIMSIMD_DEREFERENCE) // simsimd_dot_i8_serial -SIMSIMD_MAKE_DOT(serial, u8, i64, SIMSIMD_DEREFERENCE) // simsimd_dot_u8_serial +SIMSIMD_MAKE_DOT(serial, i8, i64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_dot_i8_serial +SIMSIMD_MAKE_DOT(serial, u8, i64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_dot_u8_serial -SIMSIMD_MAKE_DOT(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_dot_f32_accurate -SIMSIMD_MAKE_COMPLEX_DOT(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_dot_f32c_accurate -SIMSIMD_MAKE_COMPLEX_VDOT(accurate, f32, f64, SIMSIMD_DEREFERENCE) 
// simsimd_vdot_f32c_accurate +SIMSIMD_MAKE_DOT(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_dot_f32_accurate +SIMSIMD_MAKE_COMPLEX_DOT(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_dot_f32c_accurate +SIMSIMD_MAKE_COMPLEX_VDOT(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_vdot_f32c_accurate -SIMSIMD_MAKE_DOT(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_dot_f16_accurate -SIMSIMD_MAKE_COMPLEX_DOT(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_dot_f16c_accurate -SIMSIMD_MAKE_COMPLEX_VDOT(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_vdot_f16c_accurate +SIMSIMD_MAKE_DOT(accurate, f16, f64, _simsimd_f16_to_f64) // simsimd_dot_f16_accurate +SIMSIMD_MAKE_COMPLEX_DOT(accurate, f16, f64, _simsimd_f16_to_f64) // simsimd_dot_f16c_accurate +SIMSIMD_MAKE_COMPLEX_VDOT(accurate, f16, f64, _simsimd_f16_to_f64) // simsimd_vdot_f16c_accurate -SIMSIMD_MAKE_DOT(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_dot_bf16_accurate -SIMSIMD_MAKE_COMPLEX_DOT(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_dot_bf16c_accurate -SIMSIMD_MAKE_COMPLEX_VDOT(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_vdot_bf16c_accurate +SIMSIMD_MAKE_DOT(accurate, bf16, f64, _simsimd_bf16_to_f64) // simsimd_dot_bf16_accurate +SIMSIMD_MAKE_COMPLEX_DOT(accurate, bf16, f64, _simsimd_bf16_to_f64) // simsimd_dot_bf16c_accurate +SIMSIMD_MAKE_COMPLEX_VDOT(accurate, bf16, f64, _simsimd_bf16_to_f64) // simsimd_vdot_bf16c_accurate #if _SIMSIMD_TARGET_ARM #if SIMSIMD_TARGET_NEON diff --git a/include/simsimd/elementwise.h b/include/simsimd/elementwise.h index 94438c33..9779f355 100644 --- a/include/simsimd/elementwise.h +++ b/include/simsimd/elementwise.h @@ -192,38 +192,41 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_sapphire(simsimd_u8_t const *a, simsimd_u8_t simsimd_##input_type##_t *result) { \ simsimd_##accumulator_type##_t alpha_cast = (simsimd_##accumulator_type##_t)alpha; \ simsimd_##accumulator_type##_t beta_cast = (simsimd_##accumulator_type##_t)beta; \ + simsimd_##accumulator_type##_t ai, sum; \ for (simsimd_size_t i = 0; i != n; ++i) { \ - simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \ - simsimd_##accumulator_type##_t sum = (simsimd_##accumulator_type##_t)(alpha_cast * ai + beta_cast); \ - convert_and_store(sum, result + i); \ + load_and_convert(a + i, &ai); \ + sum = (simsimd_##accumulator_type##_t)(alpha_cast * ai + beta_cast); \ + convert_and_store(&sum, result + i); \ } \ } #define SIMSIMD_MAKE_SUM(name, input_type, accumulator_type, load_and_convert, convert_and_store) \ SIMSIMD_PUBLIC void simsimd_sum_##input_type##_##name(simsimd_##input_type##_t const *a, \ simsimd_##input_type##_t const *b, simsimd_size_t n, \ simsimd_##input_type##_t *result) { \ + simsimd_##accumulator_type##_t ai, bi, sum; \ for (simsimd_size_t i = 0; i != n; ++i) { \ - simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \ - simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \ - simsimd_##accumulator_type##_t sum = ai + bi; \ - convert_and_store(sum, result + i); \ + load_and_convert(a + i, &ai); \ + load_and_convert(b + i, &bi); \ + sum = ai + bi; \ + convert_and_store(&sum, result + i); \ } \ } -#define SIMSIMD_MAKE_WSUM(name, input_type, accumulator_type, load_and_convert, convert_and_store) \ - SIMSIMD_PUBLIC void simsimd_wsum_##input_type##_##name( \ - simsimd_##input_type##_t const *a, simsimd_##input_type##_t const *b, simsimd_size_t n, \ - simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_##input_type##_t *result) { \ - simsimd_##accumulator_type##_t 
alpha_cast = (simsimd_##accumulator_type##_t)alpha; \ - simsimd_##accumulator_type##_t beta_cast = (simsimd_##accumulator_type##_t)beta; \ - for (simsimd_size_t i = 0; i != n; ++i) { \ - simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \ - simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \ - simsimd_##accumulator_type##_t ai_scaled = (simsimd_##accumulator_type##_t)(ai * alpha_cast); \ - simsimd_##accumulator_type##_t bi_scaled = (simsimd_##accumulator_type##_t)(bi * beta_cast); \ - simsimd_##accumulator_type##_t sum = ai_scaled + bi_scaled; \ - convert_and_store(sum, result + i); \ - } \ +#define SIMSIMD_MAKE_WSUM(name, input_type, accumulator_type, load_and_convert, convert_and_store) \ + SIMSIMD_PUBLIC void simsimd_wsum_##input_type##_##name( \ + simsimd_##input_type##_t const *a, simsimd_##input_type##_t const *b, simsimd_size_t n, \ + simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_##input_type##_t *result) { \ + simsimd_##accumulator_type##_t alpha_cast = (simsimd_##accumulator_type##_t)alpha; \ + simsimd_##accumulator_type##_t beta_cast = (simsimd_##accumulator_type##_t)beta; \ + simsimd_##accumulator_type##_t ai, bi, ai_scaled, bi_scaled, sum; \ + for (simsimd_size_t i = 0; i != n; ++i) { \ + load_and_convert(a + i, &ai); \ + load_and_convert(b + i, &bi); \ + ai_scaled = ai * alpha_cast; \ + bi_scaled = bi * beta_cast; \ + sum = ai_scaled + bi_scaled; \ + convert_and_store(&sum, result + i); \ + } \ } #define SIMSIMD_MAKE_FMA(name, input_type, accumulator_type, load_and_convert, convert_and_store) \ @@ -232,78 +235,79 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_sapphire(simsimd_u8_t const *a, simsimd_u8_t simsimd_size_t n, simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_##input_type##_t *result) { \ simsimd_##accumulator_type##_t alpha_cast = (simsimd_##accumulator_type##_t)alpha; \ simsimd_##accumulator_type##_t beta_cast = (simsimd_##accumulator_type##_t)beta; \ + simsimd_##accumulator_type##_t ai, bi, ci, abi_scaled, ci_scaled, sum; \ for (simsimd_size_t i = 0; i != n; ++i) { \ - simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \ - simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \ - simsimd_##accumulator_type##_t ci = load_and_convert(c + i); \ - simsimd_##accumulator_type##_t abi_scaled = (simsimd_##accumulator_type##_t)(ai * bi * alpha_cast); \ - simsimd_##accumulator_type##_t ci_scaled = (simsimd_##accumulator_type##_t)(ci * beta_cast); \ - simsimd_##accumulator_type##_t sum = abi_scaled + ci_scaled; \ - convert_and_store(sum, result + i); \ + load_and_convert(a + i, &ai); \ + load_and_convert(b + i, &bi); \ + load_and_convert(c + i, &ci); \ + abi_scaled = ai * bi * alpha_cast; \ + ci_scaled = ci * beta_cast; \ + sum = abi_scaled + ci_scaled; \ + convert_and_store(&sum, result + i); \ } \ } -SIMSIMD_MAKE_SUM(serial, f64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_sum_f64_serial -SIMSIMD_MAKE_SUM(serial, f32, f32, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_sum_f32_serial -SIMSIMD_MAKE_SUM(serial, f16, f32, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16) // simsimd_sum_f16_serial -SIMSIMD_MAKE_SUM(serial, bf16, f32, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_TO_BF16) // simsimd_sum_bf16_serial -SIMSIMD_MAKE_SUM(serial, i8, i64, SIMSIMD_DEREFERENCE, SIMSIMD_I64_TO_I8) // simsimd_sum_i8_serial -SIMSIMD_MAKE_SUM(serial, u8, i64, SIMSIMD_DEREFERENCE, SIMSIMD_I64_TO_U8) // simsimd_sum_u8_serial -SIMSIMD_MAKE_SUM(serial, i16, i64, SIMSIMD_DEREFERENCE, SIMSIMD_I64_TO_I16) // simsimd_sum_i16_serial -SIMSIMD_MAKE_SUM(serial, 
u16, i64, SIMSIMD_DEREFERENCE, SIMSIMD_I64_TO_U16) // simsimd_sum_u16_serial -SIMSIMD_MAKE_SUM(serial, i32, i64, SIMSIMD_DEREFERENCE, SIMSIMD_I64_TO_I32) // simsimd_sum_i32_serial -SIMSIMD_MAKE_SUM(serial, u32, i64, SIMSIMD_DEREFERENCE, SIMSIMD_I64_TO_U32) // simsimd_sum_u32_serial -SIMSIMD_MAKE_SUM(serial, i64, i64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_sum_i64_serial -SIMSIMD_MAKE_SUM(serial, u64, u64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_sum_u64_serial - -SIMSIMD_MAKE_SUM(accurate, f32, f64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_sum_f32_accurate -SIMSIMD_MAKE_SUM(accurate, f16, f64, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16) // simsimd_sum_f16_accurate -SIMSIMD_MAKE_SUM(accurate, bf16, f64, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_TO_BF16) // simsimd_sum_bf16_accurate - -SIMSIMD_MAKE_SCALE(serial, f64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_scale_f64_serial -SIMSIMD_MAKE_SCALE(serial, f32, f32, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_scale_f32_serial -SIMSIMD_MAKE_SCALE(serial, f16, f32, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16) // simsimd_scale_f16_serial -SIMSIMD_MAKE_SCALE(serial, bf16, f32, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_TO_BF16) // simsimd_scale_bf16_serial -SIMSIMD_MAKE_SCALE(serial, i8, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_TO_I8) // simsimd_scale_i8_serial -SIMSIMD_MAKE_SCALE(serial, u8, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_TO_U8) // simsimd_scale_u8_serial - -SIMSIMD_MAKE_SCALE(accurate, f32, f64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_scale_f32_accurate -SIMSIMD_MAKE_SCALE(accurate, f16, f64, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16) // simsimd_scale_f16_accurate -SIMSIMD_MAKE_SCALE(accurate, bf16, f64, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_TO_BF16) // simsimd_scale_bf16_accurate -SIMSIMD_MAKE_SCALE(accurate, i8, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_I8) // simsimd_scale_i8_accurate -SIMSIMD_MAKE_SCALE(accurate, u8, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_U8) // simsimd_scale_u8_accurate - -SIMSIMD_MAKE_WSUM(serial, f64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_wsum_f64_serial -SIMSIMD_MAKE_WSUM(serial, f32, f32, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_wsum_f32_serial -SIMSIMD_MAKE_WSUM(serial, f16, f32, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16) // simsimd_wsum_f16_serial -SIMSIMD_MAKE_WSUM(serial, bf16, f32, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_TO_BF16) // simsimd_wsum_bf16_serial -SIMSIMD_MAKE_WSUM(serial, i8, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_TO_I8) // simsimd_wsum_i8_serial -SIMSIMD_MAKE_WSUM(serial, u8, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_TO_U8) // simsimd_wsum_u8_serial - -SIMSIMD_MAKE_WSUM(accurate, f32, f64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_wsum_f32_accurate -SIMSIMD_MAKE_WSUM(accurate, f16, f64, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16) // simsimd_wsum_f16_accurate -SIMSIMD_MAKE_WSUM(accurate, bf16, f64, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_TO_BF16) // simsimd_wsum_bf16_accurate -SIMSIMD_MAKE_WSUM(accurate, i8, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_I8) // simsimd_wsum_i8_accurate -SIMSIMD_MAKE_WSUM(accurate, u8, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_U8) // simsimd_wsum_u8_accurate - -SIMSIMD_MAKE_FMA(serial, f64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_fma_f64_serial -SIMSIMD_MAKE_FMA(serial, f32, f32, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_fma_f32_serial -SIMSIMD_MAKE_FMA(serial, f16, f32, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16) // simsimd_fma_f16_serial -SIMSIMD_MAKE_FMA(serial, bf16, f32, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_TO_BF16) // simsimd_fma_bf16_serial 
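Throughout this patch, the value-returning converter macros of the old code (SIMSIMD_DEREFERENCE, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16, ...) give way to converters that take a source pointer and a destination pointer, so the same macro argument serves both widening loads and narrowing stores. A minimal sketch of that calling convention, assuming the real definitions in include/simsimd/types.h (whose hunks are not part of this excerpt) follow the shape below; names with a demo_ prefix are stand-ins:

#include <string.h> /* memcpy for safe type punning */

/* Stand-ins for the pointer-based converters this patch adopts; the real
 * simsimd_bf16_to_f32 and _SIMSIMD_ASSIGN_1_TO_2 live in types.h. */
typedef unsigned short demo_bf16_t; /* assumption: bf16 stored as a raw 16-bit word */

static void demo_bf16_to_f32(demo_bf16_t const *src, float *dst) {
    unsigned int bits = (unsigned int)*src << 16; /* bf16 is the upper half of an f32 */
    memcpy(dst, &bits, sizeof(*dst));             /* write the widened value through the output pointer */
}

/* Identity "conversion" used when the input already matches the accumulator type: */
#define DEMO_ASSIGN_1_TO_2(first, second) (*(second) = *(first))

With this shape, load_and_convert(a + i, &ai) and convert_and_store(&sum, result + i) are both ordinary statements, which is what lets the rewritten serial macros declare their scratch variables once and reuse them across iterations.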
-SIMSIMD_MAKE_FMA(serial, i8, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_TO_I8) // simsimd_fma_i8_serial -SIMSIMD_MAKE_FMA(serial, u8, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_TO_U8) // simsimd_fma_u8_serial -SIMSIMD_MAKE_FMA(serial, i16, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_I16) // simsimd_fma_i16_serial -SIMSIMD_MAKE_FMA(serial, u16, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_U16) // simsimd_fma_u16_serial -SIMSIMD_MAKE_FMA(serial, i32, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_I32) // simsimd_fma_i32_serial -SIMSIMD_MAKE_FMA(serial, u32, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_U32) // simsimd_fma_u32_serial -SIMSIMD_MAKE_FMA(serial, i64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_I64) // simsimd_fma_i64_serial -SIMSIMD_MAKE_FMA(serial, u64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_U64) // simsimd_fma_u64_serial - -SIMSIMD_MAKE_FMA(accurate, f32, f64, SIMSIMD_DEREFERENCE, SIMSIMD_EXPORT) // simsimd_fma_f32_accurate -SIMSIMD_MAKE_FMA(accurate, f16, f64, SIMSIMD_F16_TO_F32, SIMSIMD_F32_TO_F16) // simsimd_fma_f16_accurate -SIMSIMD_MAKE_FMA(accurate, bf16, f64, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_TO_BF16) // simsimd_fma_bf16_accurate -SIMSIMD_MAKE_FMA(accurate, i8, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_I8) // simsimd_fma_i8_accurate -SIMSIMD_MAKE_FMA(accurate, u8, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F64_TO_U8) // simsimd_fma_u8_accurate +SIMSIMD_MAKE_SUM(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_sum_f64_serial +SIMSIMD_MAKE_SUM(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_sum_f32_serial +SIMSIMD_MAKE_SUM(serial, f16, f32, simsimd_f16_to_f32, simsimd_f32_to_f16) // simsimd_sum_f16_serial +SIMSIMD_MAKE_SUM(serial, bf16, f32, simsimd_bf16_to_f32, simsimd_f32_to_bf16) // simsimd_sum_bf16_serial +SIMSIMD_MAKE_SUM(serial, i8, i64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_i64_to_i8) // simsimd_sum_i8_serial +SIMSIMD_MAKE_SUM(serial, u8, i64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_i64_to_u8) // simsimd_sum_u8_serial +SIMSIMD_MAKE_SUM(serial, i16, i64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_i64_to_i16) // simsimd_sum_i16_serial +SIMSIMD_MAKE_SUM(serial, u16, i64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_i64_to_u16) // simsimd_sum_u16_serial +SIMSIMD_MAKE_SUM(serial, i32, i64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_i64_to_i32) // simsimd_sum_i32_serial +SIMSIMD_MAKE_SUM(serial, u32, i64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_i64_to_u32) // simsimd_sum_u32_serial +SIMSIMD_MAKE_SUM(serial, i64, i64, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_sum_i64_serial +SIMSIMD_MAKE_SUM(serial, u64, u64, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_sum_u64_serial + +SIMSIMD_MAKE_SUM(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_sum_f32_accurate +SIMSIMD_MAKE_SUM(accurate, f16, f64, _simsimd_f16_to_f64, _simsimd_f64_to_f16) // simsimd_sum_f16_accurate +SIMSIMD_MAKE_SUM(accurate, bf16, f64, _simsimd_bf16_to_f64, _simsimd_f64_to_bf16) // simsimd_sum_bf16_accurate + +SIMSIMD_MAKE_SCALE(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_scale_f64_serial +SIMSIMD_MAKE_SCALE(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_scale_f32_serial +SIMSIMD_MAKE_SCALE(serial, f16, f32, simsimd_f16_to_f32, simsimd_f32_to_f16) // simsimd_scale_f16_serial +SIMSIMD_MAKE_SCALE(serial, bf16, f32, simsimd_bf16_to_f32, simsimd_f32_to_bf16) // simsimd_scale_bf16_serial +SIMSIMD_MAKE_SCALE(serial, i8, f32, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f32_to_i8) // simsimd_scale_i8_serial +SIMSIMD_MAKE_SCALE(serial, u8, 
f32, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f32_to_u8) // simsimd_scale_u8_serial + +SIMSIMD_MAKE_SCALE(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_scale_f32_accurate +SIMSIMD_MAKE_SCALE(accurate, f16, f64, _simsimd_f16_to_f64, _simsimd_f64_to_f16) // simsimd_scale_f16_accurate +SIMSIMD_MAKE_SCALE(accurate, bf16, f64, _simsimd_bf16_to_f64, _simsimd_f64_to_bf16) // simsimd_scale_bf16_accurate +SIMSIMD_MAKE_SCALE(accurate, i8, f64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f64_to_i8) // simsimd_scale_i8_accurate +SIMSIMD_MAKE_SCALE(accurate, u8, f64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f64_to_u8) // simsimd_scale_u8_accurate + +SIMSIMD_MAKE_WSUM(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_wsum_f64_serial +SIMSIMD_MAKE_WSUM(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_wsum_f32_serial +SIMSIMD_MAKE_WSUM(serial, f16, f32, simsimd_f16_to_f32, simsimd_f32_to_f16) // simsimd_wsum_f16_serial +SIMSIMD_MAKE_WSUM(serial, bf16, f32, simsimd_bf16_to_f32, simsimd_f32_to_bf16) // simsimd_wsum_bf16_serial +SIMSIMD_MAKE_WSUM(serial, i8, f32, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f32_to_i8) // simsimd_wsum_i8_serial +SIMSIMD_MAKE_WSUM(serial, u8, f32, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f32_to_u8) // simsimd_wsum_u8_serial + +SIMSIMD_MAKE_WSUM(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_wsum_f32_accurate +SIMSIMD_MAKE_WSUM(accurate, f16, f64, _simsimd_f16_to_f64, _simsimd_f64_to_f16) // simsimd_wsum_f16_accurate +SIMSIMD_MAKE_WSUM(accurate, bf16, f64, _simsimd_bf16_to_f64, _simsimd_f64_to_bf16) // simsimd_wsum_bf16_accurate +SIMSIMD_MAKE_WSUM(accurate, i8, f64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f64_to_i8) // simsimd_wsum_i8_accurate +SIMSIMD_MAKE_WSUM(accurate, u8, f64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f64_to_u8) // simsimd_wsum_u8_accurate + +SIMSIMD_MAKE_FMA(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_fma_f64_serial +SIMSIMD_MAKE_FMA(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_fma_f32_serial +SIMSIMD_MAKE_FMA(serial, f16, f32, simsimd_f16_to_f32, simsimd_f32_to_f16) // simsimd_fma_f16_serial +SIMSIMD_MAKE_FMA(serial, bf16, f32, simsimd_bf16_to_f32, simsimd_f32_to_bf16) // simsimd_fma_bf16_serial +SIMSIMD_MAKE_FMA(serial, i8, f32, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f32_to_i8) // simsimd_fma_i8_serial +SIMSIMD_MAKE_FMA(serial, u8, f32, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f32_to_u8) // simsimd_fma_u8_serial +SIMSIMD_MAKE_FMA(serial, i16, f64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f64_to_i16) // simsimd_fma_i16_serial +SIMSIMD_MAKE_FMA(serial, u16, f64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f64_to_u16) // simsimd_fma_u16_serial +SIMSIMD_MAKE_FMA(serial, i32, f64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f64_to_i32) // simsimd_fma_i32_serial +SIMSIMD_MAKE_FMA(serial, u32, f64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f64_to_u32) // simsimd_fma_u32_serial +SIMSIMD_MAKE_FMA(serial, i64, f64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f64_to_i64) // simsimd_fma_i64_serial +SIMSIMD_MAKE_FMA(serial, u64, f64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f64_to_u64) // simsimd_fma_u64_serial + +SIMSIMD_MAKE_FMA(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_fma_f32_accurate +SIMSIMD_MAKE_FMA(accurate, f16, f64, _simsimd_f16_to_f64, _simsimd_f64_to_f16) // simsimd_fma_f16_accurate +SIMSIMD_MAKE_FMA(accurate, bf16, f64, _simsimd_bf16_to_f64, _simsimd_f64_to_bf16) // simsimd_fma_bf16_accurate +SIMSIMD_MAKE_FMA(accurate, i8, f64, _SIMSIMD_ASSIGN_1_TO_2, 
_simsimd_f64_to_i8) // simsimd_fma_i8_accurate +SIMSIMD_MAKE_FMA(accurate, u8, f64, _SIMSIMD_ASSIGN_1_TO_2, _simsimd_f64_to_u8) // simsimd_fma_u8_accurate #if _SIMSIMD_TARGET_X86 #if SIMSIMD_TARGET_HASWELL @@ -470,10 +474,11 @@ SIMSIMD_PUBLIC void simsimd_sum_f16_haswell(simsimd_f16_t const *a, simsimd_f16_ // The tail: for (; i < n; ++i) { - simsimd_f32_t ai = SIMSIMD_F16_TO_F32(a + i); - simsimd_f32_t bi = SIMSIMD_F16_TO_F32(b + i); + simsimd_f32_t ai, bi; + simsimd_f16_to_f32(a + i, &ai); + simsimd_f16_to_f32(b + i, &bi); simsimd_f32_t sum = ai + bi; - SIMSIMD_F32_TO_F16(sum, result + i); + simsimd_f32_to_f16(&sum, result + i); } } @@ -496,9 +501,10 @@ SIMSIMD_PUBLIC void simsimd_scale_f16_haswell(simsimd_f16_t const *a, simsimd_si // The tail: for (; i < n; ++i) { - simsimd_f32_t ai = SIMSIMD_F16_TO_F32(a + i); + simsimd_f32_t ai; + simsimd_f16_to_f32(a + i, &ai); simsimd_f32_t sum = alpha_f32 * ai + beta_f32; - SIMSIMD_F32_TO_F16(sum, result + i); + simsimd_f32_to_f16(&sum, result + i); } } @@ -542,10 +548,11 @@ SIMSIMD_PUBLIC void simsimd_wsum_f16_haswell( // // The tail: for (; i < n; ++i) { - simsimd_f32_t ai = SIMSIMD_F16_TO_F32(a + i); - simsimd_f32_t bi = SIMSIMD_F16_TO_F32(b + i); + simsimd_f32_t ai, bi; + simsimd_f16_to_f32(a + i, &ai); + simsimd_f16_to_f32(b + i, &bi); simsimd_f32_t sum = alpha_f32 * ai + beta_f32 * bi; - SIMSIMD_F32_TO_F16(sum, result + i); + simsimd_f32_to_f16(&sum, result + i); } } @@ -565,10 +572,11 @@ SIMSIMD_PUBLIC void simsimd_sum_bf16_haswell(simsimd_bf16_t const *a, simsimd_bf // The tail: for (; i < n; ++i) { - simsimd_f32_t ai = SIMSIMD_BF16_TO_F32(a + i); - simsimd_f32_t bi = SIMSIMD_BF16_TO_F32(b + i); + simsimd_f32_t ai, bi; + simsimd_bf16_to_f32(a + i, &ai); + simsimd_bf16_to_f32(b + i, &bi); simsimd_f32_t sum = ai + bi; - SIMSIMD_F32_TO_BF16(sum, result + i); + simsimd_f32_to_bf16(&sum, result + i); } } @@ -591,9 +599,10 @@ SIMSIMD_PUBLIC void simsimd_scale_bf16_haswell(simsimd_bf16_t const *a, simsimd_ // The tail: for (; i < n; ++i) { - simsimd_f32_t ai = SIMSIMD_BF16_TO_F32(a + i); + simsimd_f32_t ai; + simsimd_bf16_to_f32(a + i, &ai); simsimd_f32_t sum = alpha_f32 * ai + beta_f32; - SIMSIMD_F32_TO_BF16(sum, result + i); + simsimd_f32_to_bf16(&sum, result + i); } } @@ -637,10 +646,11 @@ SIMSIMD_PUBLIC void simsimd_wsum_bf16_haswell( // // The tail: for (; i < n; ++i) { - simsimd_f32_t ai = SIMSIMD_BF16_TO_F32(a + i); - simsimd_f32_t bi = SIMSIMD_BF16_TO_F32(b + i); + simsimd_f32_t ai, bi; + simsimd_bf16_to_f32(a + i, &ai); + simsimd_bf16_to_f32(b + i, &bi); simsimd_f32_t sum = alpha_f32 * ai + beta_f32 * bi; - SIMSIMD_F32_TO_BF16(sum, result + i); + simsimd_f32_to_bf16(&sum, result + i); } } @@ -716,11 +726,12 @@ SIMSIMD_PUBLIC void simsimd_fma_f16_haswell( // // The tail: for (; i < n; ++i) { - simsimd_f32_t ai = SIMSIMD_F16_TO_F32(a + i); - simsimd_f32_t bi = SIMSIMD_F16_TO_F32(b + i); - simsimd_f32_t ci = SIMSIMD_F16_TO_F32(c + i); + simsimd_f32_t ai, bi, ci; + simsimd_f16_to_f32(a + i, &ai); + simsimd_f16_to_f32(b + i, &bi); + simsimd_f16_to_f32(c + i, &ci); simsimd_f32_t sum = alpha * ai * bi + beta * ci; - SIMSIMD_F32_TO_F16(sum, result + i); + simsimd_f32_to_f16(&sum, result + i); } } @@ -750,11 +761,12 @@ SIMSIMD_PUBLIC void simsimd_fma_bf16_haswell( / // The tail: for (; i < n; ++i) { - simsimd_f32_t ai = SIMSIMD_BF16_TO_F32(a + i); - simsimd_f32_t bi = SIMSIMD_BF16_TO_F32(b + i); - simsimd_f32_t ci = SIMSIMD_BF16_TO_F32(c + i); + simsimd_f32_t ai, bi, ci; + simsimd_bf16_to_f32(a + i, &ai); + simsimd_bf16_to_f32(b + i, &bi); + 
simsimd_bf16_to_f32(c + i, &ci); simsimd_f32_t sum = alpha * ai * bi + beta * ci; - SIMSIMD_F32_TO_BF16(sum, result + i); + simsimd_f32_to_bf16(&sum, result + i); } } @@ -773,7 +785,7 @@ SIMSIMD_PUBLIC void simsimd_sum_i8_haswell(simsimd_i8_t const *a, simsimd_i8_t c for (; i < n; ++i) { simsimd_f32_t ai = a[i], bi = b[i]; simsimd_f32_t sum = ai + bi; - SIMSIMD_F32_TO_I8(sum, result + i); + _simsimd_f32_to_i8(&sum, result + i); } } @@ -798,7 +810,7 @@ SIMSIMD_PUBLIC void simsimd_scale_i8_haswell(simsimd_i8_t const *a, simsimd_size __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s)); // The normal part. __m256 sum_vec = _mm256_fmadd_ps(a_vec, alpha_vec, beta_vec); - // Instead of serial calls to expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD. + // Instead of serial calls to expensive `_simsimd_f32_to_u8`, convert and clip with SIMD. __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec); sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-128)); sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(127)); @@ -818,7 +830,7 @@ SIMSIMD_PUBLIC void simsimd_scale_i8_haswell(simsimd_i8_t const *a, simsimd_size for (; i < n; ++i) { simsimd_f32_t ai = a[i]; simsimd_f32_t sum = alpha_f32 * ai + beta_f32; - SIMSIMD_F32_TO_I8(sum, result + i); + _simsimd_f32_to_i8(&sum, result + i); } } @@ -864,7 +876,7 @@ SIMSIMD_PUBLIC void simsimd_wsum_i8_haswell( // // The normal part. __m256 a_scaled_vec = _mm256_mul_ps(a_vec, alpha_vec); __m256 sum_vec = _mm256_fmadd_ps(b_vec, beta_vec, a_scaled_vec); - // Instead of serial calls to expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD. + // Instead of serial calls to expensive `_simsimd_f32_to_u8`, convert and clip with SIMD. __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec); sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-128)); sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(127)); @@ -884,7 +896,7 @@ SIMSIMD_PUBLIC void simsimd_wsum_i8_haswell( // for (; i < n; ++i) { simsimd_f32_t ai = a[i], bi = b[i]; simsimd_f32_t sum = alpha_f32 * ai + beta_f32 * bi; - SIMSIMD_F32_TO_I8(sum, result + i); + _simsimd_f32_to_i8(&sum, result + i); } } @@ -903,7 +915,7 @@ SIMSIMD_PUBLIC void simsimd_sum_u8_haswell(simsimd_u8_t const *a, simsimd_u8_t c for (; i < n; ++i) { simsimd_f32_t ai = a[i], bi = b[i]; simsimd_f32_t sum = ai + bi; - SIMSIMD_F32_TO_U8(sum, result + i); + _simsimd_f32_to_u8(&sum, result + i); } } @@ -928,7 +940,7 @@ SIMSIMD_PUBLIC void simsimd_scale_u8_haswell(simsimd_u8_t const *a, simsimd_size __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s)); // The normal part. __m256 sum_vec = _mm256_fmadd_ps(a_vec, alpha_vec, beta_vec); - // Instead of serial calls to expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD. + // Instead of serial calls to expensive `_simsimd_f32_to_u8`, convert and clip with SIMD. __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec); sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(0)); sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(255)); @@ -948,7 +960,7 @@ SIMSIMD_PUBLIC void simsimd_scale_u8_haswell(simsimd_u8_t const *a, simsimd_size for (; i < n; ++i) { simsimd_f32_t ai = a[i]; simsimd_f32_t sum = alpha_f32 * ai + beta_f32; - SIMSIMD_F32_TO_U8(sum, result + i); + _simsimd_f32_to_u8(&sum, result + i); } } @@ -994,7 +1006,7 @@ SIMSIMD_PUBLIC void simsimd_wsum_u8_haswell( // // The normal part. 
__m256 a_scaled_vec = _mm256_mul_ps(a_vec, alpha_vec); __m256 sum_vec = _mm256_fmadd_ps(b_vec, beta_vec, a_scaled_vec); - // Instead of serial calls to expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD. + // Instead of serial calls to expensive `_simsimd_f32_to_u8`, convert and clip with SIMD. __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec); sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(0)); sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(255)); @@ -1014,7 +1026,7 @@ SIMSIMD_PUBLIC void simsimd_wsum_u8_haswell( // for (; i < n; ++i) { simsimd_f32_t ai = a[i], bi = b[i]; simsimd_f32_t sum = alpha_f32 * ai + beta_f32 * bi; - SIMSIMD_F32_TO_U8(sum, result + i); + _simsimd_f32_to_u8(&sum, result + i); } } @@ -1048,7 +1060,7 @@ SIMSIMD_PUBLIC void simsimd_fma_i8_haswell( __m256 ab_vec = _mm256_mul_ps(a_vec, b_vec); __m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec); __m256 sum_vec = _mm256_fmadd_ps(c_vec, beta_vec, ab_scaled_vec); - // Instead of serial calls to expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD. + // Instead of serial calls to expensive `_simsimd_f32_to_u8`, convert and clip with SIMD. __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec); sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-128)); sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(127)); @@ -1068,7 +1080,7 @@ SIMSIMD_PUBLIC void simsimd_fma_i8_haswell( for (; i < n; ++i) { simsimd_f32_t ai = a[i], bi = b[i], ci = c[i]; simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci; - SIMSIMD_F32_TO_I8(sum, result + i); + _simsimd_f32_to_i8(&sum, result + i); } } @@ -1102,7 +1114,7 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_haswell( __m256 ab_vec = _mm256_mul_ps(a_vec, b_vec); __m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec); __m256 sum_vec = _mm256_fmadd_ps(c_vec, beta_vec, ab_scaled_vec); - // Instead of serial calls to expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD. + // Instead of serial calls to expensive `_simsimd_f32_to_u8`, convert and clip with SIMD. 
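The scalar helper these comments reference, _simsimd_f32_to_u8, is defined in types.h outside this excerpt; presumably it rounds and saturates one value at a time, which is the same work the _mm256_max_epi32 / _mm256_min_epi32 pair just below performs for eight lanes at once. A hedged scalar sketch under that assumption:

/* Hypothetical scalar equivalent of the saturating u8 store that the SIMD path
 * avoids calling per element. */
static void demo_f32_to_u8(float const *src, unsigned char *dst) {
    float x = *src;
    if (x < 0.0f) x = 0.0f;           /* clamp, mirroring _mm256_max_epi32(..., 0) */
    if (x > 255.0f) x = 255.0f;       /* ... and _mm256_min_epi32(..., 255) */
    *dst = (unsigned char)(x + 0.5f); /* round to nearest for non-negative inputs */
}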
__m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec); sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(0)); sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(255)); @@ -1122,7 +1134,7 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_haswell( for (; i < n; ++i) { simsimd_f32_t ai = a[i], bi = b[i], ci = c[i]; simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci; - SIMSIMD_F32_TO_U8(sum, result + i); + _simsimd_f32_to_u8(&sum, result + i); } } @@ -2302,7 +2314,7 @@ SIMSIMD_PUBLIC void simsimd_sum_u8_neon(simsimd_u8_t const *a, simsimd_u8_t cons } // The tail: - for (; i < n; ++i) { SIMSIMD_F32_TO_U8(a[i] + b[i], result + i); } + for (; i < n; ++i) { _simsimd_f32_to_u8(a[i] + b[i], result + i); } } SIMSIMD_PUBLIC void simsimd_scale_u8_neon(simsimd_u8_t const *a, simsimd_size_t n, simsimd_distance_t alpha, @@ -2323,7 +2335,7 @@ SIMSIMD_PUBLIC void simsimd_scale_u8_neon(simsimd_u8_t const *a, simsimd_size_t } // The tail: - for (; i < n; ++i) { SIMSIMD_F32_TO_U8(alpha_f16 * a[i] + beta_f16, result + i); } + for (; i < n; ++i) { _simsimd_f32_to_u8(alpha_f16 * a[i] + beta_f16, result + i); } } SIMSIMD_PUBLIC void simsimd_wsum_u8_neon( // @@ -2364,7 +2376,7 @@ SIMSIMD_PUBLIC void simsimd_wsum_u8_neon( // } // The tail: - for (; i < n; ++i) { SIMSIMD_F32_TO_U8(alpha_f16 * a[i] + beta_f16 * b[i], result + i); } + for (; i < n; ++i) { _simsimd_f32_to_u8(alpha_f16 * a[i] + beta_f16 * b[i], result + i); } } SIMSIMD_PUBLIC void simsimd_fma_u8_neon( // @@ -2390,7 +2402,7 @@ SIMSIMD_PUBLIC void simsimd_fma_u8_neon( // } // The tail: - for (; i < n; ++i) { SIMSIMD_F32_TO_U8(alpha_f16 * a[i] * b[i] + beta_f16 * c[i], result + i); } + for (; i < n; ++i) { _simsimd_f32_to_u8(alpha_f16 * a[i] * b[i] + beta_f16 * c[i], result + i); } } SIMSIMD_PUBLIC void simsimd_sum_i8_neon(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n, @@ -2405,7 +2417,7 @@ SIMSIMD_PUBLIC void simsimd_sum_i8_neon(simsimd_i8_t const *a, simsimd_i8_t cons } // The tail: - for (; i < n; ++i) { SIMSIMD_F32_TO_I8(a[i] + b[i], result + i); } + for (; i < n; ++i) { _simsimd_f32_to_i8(a[i] + b[i], result + i); } } SIMSIMD_PUBLIC void simsimd_scale_i8_neon(simsimd_i8_t const *a, simsimd_size_t n, simsimd_distance_t alpha, @@ -2426,7 +2438,7 @@ SIMSIMD_PUBLIC void simsimd_scale_i8_neon(simsimd_i8_t const *a, simsimd_size_t } // The tail: - for (; i < n; ++i) { SIMSIMD_F32_TO_I8(alpha_f16 * a[i] + beta_f16, result + i); } + for (; i < n; ++i) { _simsimd_f32_to_i8(alpha_f16 * a[i] + beta_f16, result + i); } } SIMSIMD_PUBLIC void simsimd_wsum_i8_neon( // @@ -2467,7 +2479,7 @@ SIMSIMD_PUBLIC void simsimd_wsum_i8_neon( // } // The tail: - for (; i < n; ++i) { SIMSIMD_F32_TO_I8(alpha_f16 * a[i] + beta_f16 * b[i], result + i); } + for (; i < n; ++i) { _simsimd_f32_to_i8(alpha_f16 * a[i] + beta_f16 * b[i], result + i); } } SIMSIMD_PUBLIC void simsimd_fma_i8_neon( // @@ -2493,7 +2505,7 @@ SIMSIMD_PUBLIC void simsimd_fma_i8_neon( // } // The tail: - for (; i < n; ++i) { SIMSIMD_F32_TO_I8(alpha_f16 * a[i] * b[i] + beta_f16 * c[i], result + i); } + for (; i < n; ++i) { _simsimd_f32_to_i8(alpha_f16 * a[i] * b[i] + beta_f16 * c[i], result + i); } } #pragma clang attribute pop diff --git a/include/simsimd/probability.h b/include/simsimd/probability.h index 2865aa32..8458c51f 100644 --- a/include/simsimd/probability.h +++ b/include/simsimd/probability.h @@ -87,10 +87,10 @@ SIMSIMD_PUBLIC void simsimd_js_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_ SIMSIMD_PUBLIC void simsimd_kl_##input_type##_##name(simsimd_##input_type##_t const 
*a, \ simsimd_##input_type##_t const *b, simsimd_size_t n, \ simsimd_distance_t *result) { \ - simsimd_##accumulator_type##_t d = 0; \ + simsimd_##accumulator_type##_t d = 0, ai, bi; \ for (simsimd_size_t i = 0; i != n; ++i) { \ - simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \ - simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \ + load_and_convert(a + i, &ai); \ + load_and_convert(b + i, &bi); \ d += ai * SIMSIMD_LOG((ai + epsilon) / (bi + epsilon)); \ } \ *result = (simsimd_distance_t)d; \ @@ -100,37 +100,37 @@ SIMSIMD_PUBLIC void simsimd_js_f16_sapphire(simsimd_f16_t const* a, simsimd_f16_ SIMSIMD_PUBLIC void simsimd_js_##input_type##_##name(simsimd_##input_type##_t const *a, \ simsimd_##input_type##_t const *b, simsimd_size_t n, \ simsimd_distance_t *result) { \ - simsimd_##accumulator_type##_t d = 0; \ + simsimd_##accumulator_type##_t d = 0, ai, bi, mi; \ for (simsimd_size_t i = 0; i != n; ++i) { \ - simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \ - simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \ - simsimd_##accumulator_type##_t mi = (ai + bi) / 2; \ + load_and_convert(a + i, &ai); \ + load_and_convert(b + i, &bi); \ + mi = (ai + bi) / 2; \ d += ai * SIMSIMD_LOG((ai + epsilon) / (mi + epsilon)); \ d += bi * SIMSIMD_LOG((bi + epsilon) / (mi + epsilon)); \ } \ *result = (simsimd_distance_t)d / 2; \ } -SIMSIMD_MAKE_KL(serial, f64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f64_serial -SIMSIMD_MAKE_JS(serial, f64, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f64_serial +SIMSIMD_MAKE_KL(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f64_serial +SIMSIMD_MAKE_JS(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f64_serial -SIMSIMD_MAKE_KL(serial, f32, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f32_serial -SIMSIMD_MAKE_JS(serial, f32, f32, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f32_serial +SIMSIMD_MAKE_KL(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f32_serial +SIMSIMD_MAKE_JS(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f32_serial -SIMSIMD_MAKE_KL(serial, f16, f32, SIMSIMD_F16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f16_serial -SIMSIMD_MAKE_JS(serial, f16, f32, SIMSIMD_F16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f16_serial +SIMSIMD_MAKE_KL(serial, f16, f32, simsimd_f16_to_f32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f16_serial +SIMSIMD_MAKE_JS(serial, f16, f32, simsimd_f16_to_f32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f16_serial -SIMSIMD_MAKE_KL(serial, bf16, f32, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_bf16_serial -SIMSIMD_MAKE_JS(serial, bf16, f32, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_bf16_serial +SIMSIMD_MAKE_KL(serial, bf16, f32, simsimd_bf16_to_f32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_bf16_serial +SIMSIMD_MAKE_JS(serial, bf16, f32, simsimd_bf16_to_f32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_bf16_serial -SIMSIMD_MAKE_KL(accurate, f32, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f32_accurate -SIMSIMD_MAKE_JS(accurate, f32, f64, SIMSIMD_DEREFERENCE, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f32_accurate +SIMSIMD_MAKE_KL(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f32_accurate +SIMSIMD_MAKE_JS(accurate, f32, 
f64, _SIMSIMD_ASSIGN_1_TO_2, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f32_accurate -SIMSIMD_MAKE_KL(accurate, f16, f64, SIMSIMD_F16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f16_accurate -SIMSIMD_MAKE_JS(accurate, f16, f64, SIMSIMD_F16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f16_accurate +SIMSIMD_MAKE_KL(accurate, f16, f64, _simsimd_f16_to_f64, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_f16_accurate +SIMSIMD_MAKE_JS(accurate, f16, f64, _simsimd_f16_to_f64, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_f16_accurate -SIMSIMD_MAKE_KL(accurate, bf16, f64, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_bf16_accurate -SIMSIMD_MAKE_JS(accurate, bf16, f64, SIMSIMD_BF16_TO_F32, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_bf16_accurate +SIMSIMD_MAKE_KL(accurate, bf16, f64, _simsimd_bf16_to_f64, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_kl_bf16_accurate +SIMSIMD_MAKE_JS(accurate, bf16, f64, _simsimd_bf16_to_f64, SIMSIMD_F32_DIVISION_EPSILON) // simsimd_js_bf16_accurate #if _SIMSIMD_TARGET_ARM #if SIMSIMD_TARGET_NEON diff --git a/include/simsimd/simsimd.h b/include/simsimd/simsimd.h index 153bacfe..f1f95974 100644 --- a/include/simsimd/simsimd.h +++ b/include/simsimd/simsimd.h @@ -235,6 +235,43 @@ typedef enum { simsimd_datatype_bf16c_k = 1 << 23, ///< Complex brain floating point } simsimd_datatype_t; +typedef enum { + simsimd_datatype_unknown_family_k = 0, + simsimd_datatype_binary_famiily_k, + simsimd_datatype_float_family_k, + simsimd_datatype_complex_float_family_k, + simsimd_datatype_int_family_k, + simsimd_datatype_uint_family_k, +} simsimd_datatype_family_k; + +/** + * @brief Classifies the family of the datatype. + * @return The family of the datatype. + */ +SIMSIMD_PUBLIC simsimd_datatype_family_k simsimd_datatype_family(simsimd_datatype_t dtype) { + switch (dtype) { + case simsimd_datatype_f64_k: return simsimd_datatype_float_family_k; + case simsimd_datatype_f32_k: return simsimd_datatype_float_family_k; + case simsimd_datatype_f16_k: return simsimd_datatype_float_family_k; + case simsimd_datatype_bf16_k: return simsimd_datatype_float_family_k; + case simsimd_datatype_f64c_k: return simsimd_datatype_complex_float_family_k; + case simsimd_datatype_f32c_k: return simsimd_datatype_complex_float_family_k; + case simsimd_datatype_f16c_k: return simsimd_datatype_complex_float_family_k; + case simsimd_datatype_bf16c_k: return simsimd_datatype_complex_float_family_k; + case simsimd_datatype_b8_k: return simsimd_datatype_binary_famiily_k; + case simsimd_datatype_u8_k: return simsimd_datatype_uint_family_k; + case simsimd_datatype_u16_k: return simsimd_datatype_uint_family_k; + case simsimd_datatype_u32_k: return simsimd_datatype_uint_family_k; + case simsimd_datatype_u64_k: return simsimd_datatype_uint_family_k; + case simsimd_datatype_i8_k: return simsimd_datatype_int_family_k; + case simsimd_datatype_i16_k: return simsimd_datatype_int_family_k; + case simsimd_datatype_i32_k: return simsimd_datatype_int_family_k; + case simsimd_datatype_i64_k: return simsimd_datatype_int_family_k; + case simsimd_datatype_i4x2_k: return simsimd_datatype_int_family_k; + default: return simsimd_datatype_unknown_family_k; + } +} + /** * @brief Type-punned function pointer for dense vector representations and simplest similarity measures. * diff --git a/include/simsimd/sparse.h b/include/simsimd/sparse.h index 493828bb..c60b3433 100644 --- a/include/simsimd/sparse.h +++ b/include/simsimd/sparse.h @@ -42,6 +42,9 @@ * j += ai >= bj; * } * + * ! 
When dealing with weighted intersections, the kernel exports two results: the count and weights dot product. + * ? When dealing with low-precision weights, the dot product is still computed with higher precision. + * * x86 intrinsics: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/ * Arm intrinsics: https://developer.arm.com/architectures/instruction-sets/intrinsics/ */ @@ -181,15 +184,15 @@ SIMSIMD_MAKE_INTERSECT_LINEAR(accurate, u32, size) // simsimd_intersect_u32_accu simsimd_##weight_type##_t const *a_weights, simsimd_##weight_type##_t const *b_weights, \ simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t *results) { \ simsimd_##counter_type##_t intersection_size = 0; \ - simsimd_##accumulator_type##_t weights_product = 0; \ + simsimd_##accumulator_type##_t weights_product = 0, awi, bwj; \ simsimd_size_t i = 0, j = 0; \ while (i != a_length && j != b_length) { \ simsimd_##input_type##_t ai = a[i]; \ simsimd_##input_type##_t bj = b[j]; \ int matches = ai == bj; \ - simsimd_##counter_type##_t awi = load_and_convert(a_weights + i); \ - simsimd_##counter_type##_t bwi = load_and_convert(b_weights + i); \ - weights_product += matches * awi * bwi; \ + load_and_convert(a_weights + i, &awi); \ + load_and_convert(b_weights + j, &bwj); \ + weights_product += matches * awi * bwj; \ intersection_size += matches; \ i += ai < bj; \ j += ai >= bj; \ @@ -199,9 +202,9 @@ SIMSIMD_MAKE_INTERSECT_LINEAR(accurate, u32, size) // simsimd_intersect_u32_accu } SIMSIMD_MAKE_INTERSECT_WEIGHTED(accurate, spdot_counts, u16, size, i16, i64, - SIMSIMD_DEREFERENCE) // simsimd_spdot_counts_u16_accurate + _SIMSIMD_ASSIGN_1_TO_2) // simsimd_spdot_counts_u16_accurate SIMSIMD_MAKE_INTERSECT_WEIGHTED(accurate, spdot_weights, u16, size, bf16, f64, - SIMSIMD_BF16_TO_F32) // simsimd_spdot_weights_u16_accurate + _simsimd_bf16_to_f64) // simsimd_spdot_weights_u16_accurate #define SIMSIMD_MAKE_INTERSECT_GALLOPING(name, input_type, counter_type) \ SIMSIMD_PUBLIC simsimd_size_t simsimd_galloping_search_##input_type(simsimd_##input_type##_t const *array, \ @@ -254,9 +257,9 @@ SIMSIMD_MAKE_INTERSECT_WEIGHTED(accurate, spdot_weights, u16, size, bf16, f64, SIMSIMD_MAKE_INTERSECT_GALLOPING(serial, u16, size) // simsimd_intersect_u16_serial SIMSIMD_MAKE_INTERSECT_GALLOPING(serial, u32, size) // simsimd_intersect_u32_serial SIMSIMD_MAKE_INTERSECT_WEIGHTED(serial, spdot_counts, u16, size, i16, i32, - SIMSIMD_DEREFERENCE) // simsimd_spdot_counts_u16_serial + _SIMSIMD_ASSIGN_1_TO_2) // simsimd_spdot_counts_u16_serial SIMSIMD_MAKE_INTERSECT_WEIGHTED(serial, spdot_weights, u16, size, bf16, f32, - SIMSIMD_BF16_TO_F32) // simsimd_spdot_weights_u16_serial + simsimd_bf16_to_f32) // simsimd_spdot_weights_u16_serial /* The AVX-512 implementations are inspired by the "Faster-Than-Native Alternatives * for x86 VP2INTERSECT Instructions" paper by Guille Diez-Canas, 2022. 
@@ -336,7 +339,8 @@ SIMSIMD_INTERNAL simsimd_u32_t _simsimd_intersect_u16x32_ice(__m512i a, __m512i __mmask32 nm72 = _mm512_mask_cmpneq_epi16_mask(nm62, a2, b31); __mmask32 nm73 = _mm512_mask_cmpneq_epi16_mask(nm63, a3, b31); - return ~(simsimd_u32_t)(nm70 & simsimd_u32_rol(nm71, 8) & simsimd_u32_rol(nm72, 16) & simsimd_u32_ror(nm73, 8)); + return ~(simsimd_u32_t)(nm70 & _simsimd_u32_rol(&nm71, 8) & _simsimd_u32_rol(&nm72, 16) & + _simsimd_u32_ror(&nm73, 8)); } /** @@ -387,7 +391,7 @@ SIMSIMD_INTERNAL simsimd_u16_t _simsimd_intersect_u32x16_ice(__m512i a, __m512i __mmask16 nm2 = _mm512_mask_cmpneq_epi32_mask(nm22, a2, b3); __mmask16 nm3 = _mm512_mask_cmpneq_epi32_mask(nm23, a3, b3); - return ~(simsimd_u16_t)(nm0 & simsimd_u16_rol(nm1, 4) & simsimd_u16_rol(nm2, 8) & simsimd_u16_ror(nm3, 4)); + return ~(simsimd_u16_t)(nm0 & _simsimd_u16_rol(&nm1, 4) & _simsimd_u16_rol(&nm2, 8) & _simsimd_u16_ror(&nm3, 4)); } SIMSIMD_PUBLIC void simsimd_intersect_u16_ice( // diff --git a/include/simsimd/spatial.h b/include/simsimd/spatial.h index e03e5a78..5c6a9687 100644 --- a/include/simsimd/spatial.h +++ b/include/simsimd/spatial.h @@ -177,10 +177,10 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_sierra(simsimd_i8_t const* a, simsimd_i8_t co SIMSIMD_PUBLIC void simsimd_l2sq_##input_type##_##name(simsimd_##input_type##_t const *a, \ simsimd_##input_type##_t const *b, simsimd_size_t n, \ simsimd_distance_t *result) { \ - simsimd_##accumulator_type##_t d2 = 0; \ + simsimd_##accumulator_type##_t d2 = 0, ai, bi; \ for (simsimd_size_t i = 0; i != n; ++i) { \ - simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \ - simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \ + load_and_convert(a + i, &ai); \ + load_and_convert(b + i, &bi); \ d2 += (ai - bi) * (ai - bi); \ } \ *result = d2; \ @@ -198,10 +198,10 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_sierra(simsimd_i8_t const* a, simsimd_i8_t co SIMSIMD_PUBLIC void simsimd_cos_##input_type##_##name(simsimd_##input_type##_t const *a, \ simsimd_##input_type##_t const *b, simsimd_size_t n, \ simsimd_distance_t *result) { \ - simsimd_##accumulator_type##_t ab = 0, a2 = 0, b2 = 0; \ + simsimd_##accumulator_type##_t ab = 0, a2 = 0, b2 = 0, ai, bi; \ for (simsimd_size_t i = 0; i != n; ++i) { \ - simsimd_##accumulator_type##_t ai = load_and_convert(a + i); \ - simsimd_##accumulator_type##_t bi = load_and_convert(b + i); \ + load_and_convert(a + i, &ai); \ + load_and_convert(b + i, &bi); \ ab += ai * bi; \ a2 += ai * ai; \ b2 += bi * bi; \ @@ -214,41 +214,41 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_sierra(simsimd_i8_t const* a, simsimd_i8_t co } \ } -SIMSIMD_MAKE_COS(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_cos_f64_serial -SIMSIMD_MAKE_L2SQ(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_l2sq_f64_serial -SIMSIMD_MAKE_L2(serial, f64, f64, SIMSIMD_DEREFERENCE) // simsimd_l2_f64_serial +SIMSIMD_MAKE_COS(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_cos_f64_serial +SIMSIMD_MAKE_L2SQ(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_l2sq_f64_serial +SIMSIMD_MAKE_L2(serial, f64, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_l2_f64_serial -SIMSIMD_MAKE_COS(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_cos_f32_serial -SIMSIMD_MAKE_L2SQ(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_l2sq_f32_serial -SIMSIMD_MAKE_L2(serial, f32, f32, SIMSIMD_DEREFERENCE) // simsimd_l2_f32_serial +SIMSIMD_MAKE_COS(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_cos_f32_serial +SIMSIMD_MAKE_L2SQ(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_l2sq_f32_serial 
+SIMSIMD_MAKE_L2(serial, f32, f32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_l2_f32_serial -SIMSIMD_MAKE_COS(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_cos_f16_serial -SIMSIMD_MAKE_L2SQ(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_l2sq_f16_serial -SIMSIMD_MAKE_L2(serial, f16, f32, SIMSIMD_F16_TO_F32) // simsimd_l2_f16_serial +SIMSIMD_MAKE_COS(serial, f16, f32, simsimd_f16_to_f32) // simsimd_cos_f16_serial +SIMSIMD_MAKE_L2SQ(serial, f16, f32, simsimd_f16_to_f32) // simsimd_l2sq_f16_serial +SIMSIMD_MAKE_L2(serial, f16, f32, simsimd_f16_to_f32) // simsimd_l2_f16_serial -SIMSIMD_MAKE_COS(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_cos_bf16_serial -SIMSIMD_MAKE_L2SQ(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_l2sq_bf16_serial -SIMSIMD_MAKE_L2(serial, bf16, f32, SIMSIMD_BF16_TO_F32) // simsimd_l2_bf16_serial +SIMSIMD_MAKE_COS(serial, bf16, f32, simsimd_bf16_to_f32) // simsimd_cos_bf16_serial +SIMSIMD_MAKE_L2SQ(serial, bf16, f32, simsimd_bf16_to_f32) // simsimd_l2sq_bf16_serial +SIMSIMD_MAKE_L2(serial, bf16, f32, simsimd_bf16_to_f32) // simsimd_l2_bf16_serial -SIMSIMD_MAKE_COS(serial, i8, i32, SIMSIMD_DEREFERENCE) // simsimd_cos_i8_serial -SIMSIMD_MAKE_L2SQ(serial, i8, i32, SIMSIMD_DEREFERENCE) // simsimd_l2sq_i8_serial -SIMSIMD_MAKE_L2(serial, i8, i32, SIMSIMD_DEREFERENCE) // simsimd_l2_i8_serial +SIMSIMD_MAKE_COS(serial, i8, i32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_cos_i8_serial +SIMSIMD_MAKE_L2SQ(serial, i8, i32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_l2sq_i8_serial +SIMSIMD_MAKE_L2(serial, i8, i32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_l2_i8_serial -SIMSIMD_MAKE_COS(serial, u8, i32, SIMSIMD_DEREFERENCE) // simsimd_cos_u8_serial -SIMSIMD_MAKE_L2SQ(serial, u8, i32, SIMSIMD_DEREFERENCE) // simsimd_l2sq_u8_serial -SIMSIMD_MAKE_L2(serial, u8, i32, SIMSIMD_DEREFERENCE) // simsimd_l2_u8_serial +SIMSIMD_MAKE_COS(serial, u8, i32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_cos_u8_serial +SIMSIMD_MAKE_L2SQ(serial, u8, i32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_l2sq_u8_serial +SIMSIMD_MAKE_L2(serial, u8, i32, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_l2_u8_serial -SIMSIMD_MAKE_COS(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_cos_f32_accurate -SIMSIMD_MAKE_L2SQ(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_l2sq_f32_accurate -SIMSIMD_MAKE_L2(accurate, f32, f64, SIMSIMD_DEREFERENCE) // simsimd_l2_f32_accurate +SIMSIMD_MAKE_COS(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_cos_f32_accurate +SIMSIMD_MAKE_L2SQ(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_l2sq_f32_accurate +SIMSIMD_MAKE_L2(accurate, f32, f64, _SIMSIMD_ASSIGN_1_TO_2) // simsimd_l2_f32_accurate -SIMSIMD_MAKE_COS(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_cos_f16_accurate -SIMSIMD_MAKE_L2SQ(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_l2sq_f16_accurate -SIMSIMD_MAKE_L2(accurate, f16, f64, SIMSIMD_F16_TO_F32) // simsimd_l2_f16_accurate +SIMSIMD_MAKE_COS(accurate, f16, f64, _simsimd_f16_to_f64) // simsimd_cos_f16_accurate +SIMSIMD_MAKE_L2SQ(accurate, f16, f64, _simsimd_f16_to_f64) // simsimd_l2sq_f16_accurate +SIMSIMD_MAKE_L2(accurate, f16, f64, _simsimd_f16_to_f64) // simsimd_l2_f16_accurate -SIMSIMD_MAKE_COS(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_cos_bf16_accurate -SIMSIMD_MAKE_L2SQ(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_l2sq_bf16_accurate -SIMSIMD_MAKE_L2(accurate, bf16, f64, SIMSIMD_BF16_TO_F32) // simsimd_l2_bf16_accurate +SIMSIMD_MAKE_COS(accurate, bf16, f64, _simsimd_bf16_to_f64) // simsimd_cos_bf16_accurate +SIMSIMD_MAKE_L2SQ(accurate, bf16, f64, _simsimd_bf16_to_f64) // 
simsimd_l2sq_bf16_accurate +SIMSIMD_MAKE_L2(accurate, bf16, f64, _simsimd_bf16_to_f64) // simsimd_l2_bf16_accurate #if _SIMSIMD_TARGET_ARM #if SIMSIMD_TARGET_NEON diff --git a/include/simsimd/types.h b/include/simsimd/types.h index b3e98544..b1f4a32c 100644 --- a/include/simsimd/types.h +++ b/include/simsimd/types.h @@ -286,8 +286,9 @@ /** * @brief Similat to `static_assert`, but compatible with C 99. + * In C the `_Static_assert` is only available with C 11 and later. */ -#define SIMSIMD_STATIC_ASSERT(expr, msg) typedef char static_assert_##msg[(expr) ? 1 : -1] +#define _SIMSIMD_STATIC_ASSERT(expr, msg) typedef char static_assert_##msg[(expr) ? 1 : -1] #ifdef __cplusplus extern "C" { @@ -411,145 +412,21 @@ typedef unsigned short simsimd_bf16_t; /* * Let's make sure the sizes of the types are as expected. - * In C the `_Static_assert` is only available with C 11 and later. */ -#define SIMSIMD_STATIC_ASSERT(cond, msg) typedef char static_assertion_##msg[(cond) ? 1 : -1] -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_b8_t) == 1, simsimd_b8_t_must_be_1_byte); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i4x2_t) == 1, simsimd_i4x2_t_must_be_1_byte); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i8_t) == 1, simsimd_i8_t_must_be_1_byte); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_u8_t) == 1, simsimd_u8_t_must_be_1_byte); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i16_t) == 2, simsimd_i16_t_must_be_2_bytes); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_u16_t) == 2, simsimd_u16_t_must_be_2_bytes); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i32_t) == 4, simsimd_i32_t_must_be_4_bytes); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_u32_t) == 4, simsimd_u32_t_must_be_4_bytes); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i64_t) == 8, simsimd_i64_t_must_be_8_bytes); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_u64_t) == 8, simsimd_u64_t_must_be_8_bytes); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_f32_t) == 4, simsimd_f32_t_must_be_4_bytes); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_f64_t) == 8, simsimd_f64_t_must_be_8_bytes); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_f16_t) == 2, simsimd_f16_t_must_be_2_bytes); -SIMSIMD_STATIC_ASSERT(sizeof(simsimd_bf16_t) == 2, simsimd_bf16_t_must_be_2_bytes); - -#define SIMSIMD_DEREFERENCE(x) (*(x)) -#define SIMSIMD_EXPORT(x, y) *(y) = x - -/** - * @brief Returns the value of the half-precision floating-point number, - * potentially decompressed into single-precision. - */ -#if !defined(SIMSIMD_F16_TO_F32) -#if SIMSIMD_NATIVE_F16 -#define SIMSIMD_F16_TO_F32(x) (SIMSIMD_DEREFERENCE(x)) -#define SIMSIMD_F32_TO_F16(x, y) (SIMSIMD_EXPORT(x, y)) -#else -#define SIMSIMD_F16_TO_F32(x) (simsimd_f16_to_f32(x)) -#define SIMSIMD_F32_TO_F16(x, y) (simsimd_f32_to_f16(x, y)) -#endif -#endif - -/** - * @brief Returns the value of the half-precision brain floating-point number, - * potentially decompressed into single-precision. 
- */ -#if !defined(SIMSIMD_BF16_TO_F32) -#if SIMSIMD_NATIVE_BF16 -#define SIMSIMD_BF16_TO_F32(x) (SIMSIMD_DEREFERENCE(x)) -#define SIMSIMD_F32_TO_BF16(x, y) (SIMSIMD_EXPORT(x, y)) -#else -#define SIMSIMD_BF16_TO_F32(x) (simsimd_bf16_to_f32(x)) -#define SIMSIMD_F32_TO_BF16(x, y) (simsimd_f32_to_bf16(x, y)) -#endif -#endif - -#if !defined(SIMSIMD_F32_TO_I8) -#define SIMSIMD_F32_TO_I8(x, y) *(y) = (simsimd_i8_t)fminf(fmaxf(roundf(x), -128), 127) -#endif -#if !defined(SIMSIMD_F32_TO_U8) -#define SIMSIMD_F32_TO_U8(x, y) *(y) = (simsimd_u8_t)fminf(fmaxf(roundf(x), 0), 255) -#endif -#if !defined(SIMSIMD_F64_TO_I8) -#define SIMSIMD_F64_TO_I8(x, y) *(y) = (simsimd_i8_t)fmin(fmax(round(x), -128), 127) -#endif -#if !defined(SIMSIMD_F64_TO_U8) -#define SIMSIMD_F64_TO_U8(x, y) *(y) = (simsimd_u8_t)fmin(fmax(round(x), 0), 255) -#endif - -/** - * @brief Converts floating pointer numbers to integers, clamping them to the range of signed - * and unsigned low-resolution integers, and rounding them to the nearest integer. - * - * In C++ the analogous solution with STL could be: `std::clamp(std::round(x), -128, 127)`. - * In C, using the standard library: `fminf(fmaxf(roundf(x), -128), 127)`. - */ -#if !defined(SIMSIMD_F32_TO_I8) -#define SIMSIMD_F32_TO_I8(x, y) \ - *(y) = (simsimd_i8_t)((x) > 127 ? 127 : ((x) < -128 ? -128 : (int)((x) + ((x) < 0 ? -0.5f : 0.5f)))) -#endif -#if !defined(SIMSIMD_F32_TO_U8) -#define SIMSIMD_F32_TO_U8(x, y) \ - *(y) = (simsimd_u8_t)((x) > 255 ? 255 : ((x) < 0 ? 0 : (int)((x) + ((x) < 0 ? -0.5f : 0.5f)))) -#endif -#if !defined(SIMSIMD_F64_TO_I8) -#define SIMSIMD_F64_TO_I8(x, y) \ - *(y) = (simsimd_i8_t)((x) > 127 ? 127 : ((x) < -128 ? -128 : (int)((x) + ((x) < 0 ? -0.5 : 0.5)))) -#endif -#if !defined(SIMSIMD_F64_TO_U8) -#define SIMSIMD_F64_TO_U8(x, y) \ - *(y) = (simsimd_u8_t)((x) > 255 ? 255 : ((x) < 0 ? 0 : (int)((x) + ((x) < 0 ? -0.5 : 0.5)))) -#endif -#if !defined(SIMSIMD_F64_TO_I16) -#define SIMSIMD_F64_TO_I16(x, y) \ - *(y) = (simsimd_i16_t)((x) > 32767 ? 32767 : ((x) < -32768 ? -32768 : (int)((x) + ((x) < 0 ? -0.5 : 0.5)))) -#endif -#if !defined(SIMSIMD_F64_TO_U16) -#define SIMSIMD_F64_TO_U16(x, y) \ - *(y) = (simsimd_u16_t)((x) > 65535 ? 65535 : ((x) < 0 ? 0 : (int)((x) + ((x) < 0 ? -0.5 : 0.5)))) -#endif -#if !defined(SIMSIMD_F64_TO_I32) -#define SIMSIMD_F64_TO_I32(x, y) \ - *(y) = (simsimd_i32_t)((x) > 2147483647 ? 2147483647 \ - : ((x) < -2147483648 ? -2147483648 : (int)((x) + ((x) < 0 ? -0.5 : 0.5)))) -#endif -#if !defined(SIMSIMD_F64_TO_U32) -#define SIMSIMD_F64_TO_U32(x, y) \ - *(y) = (simsimd_u32_t)((x) > 4294967295 ? 4294967295 : ((x) < 0 ? 0 : (unsigned int)((x) + ((x) < 0 ? -0.5 : 0.5)))) -#endif -#if !defined(SIMSIMD_F64_TO_I64) -#define SIMSIMD_F64_TO_I64(x, y) \ - *(y) = (simsimd_i64_t)((x) > 9223372036854775807.0 \ - ? 9223372036854775807 \ - : ((x) < -9223372036854775808.0 ? -9223372036854775808 \ - : (long long)((x) + ((x) < 0 ? -0.5 : 0.5)))) -#endif -#if !defined(SIMSIMD_F64_TO_U64) -#define SIMSIMD_F64_TO_U64(x, y) \ - *(y) = (simsimd_u64_t)((x) > 18446744073709551615.0 \ - ? 18446744073709551615ULL \ - : ((x) < 0 ? 0 : (unsigned long long)((x) + ((x) < 0 ? -0.5 : 0.5)))) -#endif - -/** - * @brief Converts high-resolution signed integers to low-resolution signed and unsigned integers, - * clamping them to indicate saturation. - */ -#if !defined(SIMSIMD_I64_TO_I8) -#define SIMSIMD_I64_TO_I8(x, y) *(y) = (simsimd_i8_t)((x) > 127 ? 127 : ((x) < -128 ? 
-128 : (x))) -#endif -#if !defined(SIMSIMD_I64_TO_U8) -#define SIMSIMD_I64_TO_U8(x, y) *(y) = (simsimd_u8_t)((x) > 255 ? 255 : ((x) < 0 ? 0 : (x))) -#endif -#if !defined(SIMSIMD_I64_TO_I16) -#define SIMSIMD_I64_TO_I16(x, y) *(y) = (simsimd_i16_t)((x) > 32767 ? 32767 : ((x) < -32768 ? -32768 : (x))) -#endif -#if !defined(SIMSIMD_I64_TO_U16) -#define SIMSIMD_I64_TO_U16(x, y) *(y) = (simsimd_u16_t)((x) > 65535 ? 65535 : ((x) < 0 ? 0 : (x))) -#endif -#if !defined(SIMSIMD_I64_TO_I32) -#define SIMSIMD_I64_TO_I32(x, y) \ - *(y) = (simsimd_i32_t)((x) > 2147483647 ? 2147483647 : ((x) < -2147483648 ? -2147483648 : (x))) -#endif -#if !defined(SIMSIMD_I64_TO_U32) -#define SIMSIMD_I64_TO_U32(x, y) *(y) = (simsimd_u32_t)((x) > 4294967295 ? 4294967295 : ((x) < 0 ? 0 : (x))) -#endif +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_b8_t) == 1, simsimd_b8_t_must_be_1_byte); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i4x2_t) == 1, simsimd_i4x2_t_must_be_1_byte); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i8_t) == 1, simsimd_i8_t_must_be_1_byte); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_u8_t) == 1, simsimd_u8_t_must_be_1_byte); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i16_t) == 2, simsimd_i16_t_must_be_2_bytes); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_u16_t) == 2, simsimd_u16_t_must_be_2_bytes); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i32_t) == 4, simsimd_i32_t_must_be_4_bytes); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_u32_t) == 4, simsimd_u32_t_must_be_4_bytes); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_i64_t) == 8, simsimd_i64_t_must_be_8_bytes); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_u64_t) == 8, simsimd_u64_t_must_be_8_bytes); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_f32_t) == 4, simsimd_f32_t_must_be_4_bytes); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_f64_t) == 8, simsimd_f64_t_must_be_8_bytes); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_f16_t) == 2, simsimd_f16_t_must_be_2_bytes); +_SIMSIMD_STATIC_ASSERT(sizeof(simsimd_bf16_t) == 2, simsimd_bf16_t_must_be_2_bytes); /** @brief Convenience type for half-precision floating-point type conversions. */ typedef union { @@ -590,6 +467,8 @@ SIMSIMD_PUBLIC simsimd_f32_t simsimd_approximate_log(simsimd_f32_t number) { return x - x2 / 2 + x3 / 3; } +#define _SIMSIMD_ASSIGN_1_TO_2(x, y) *(y) = *(x) + /** * @brief For compilers that don't natively support the `_Float16` type, * upcasts contents into a more conventional `float`. 
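To illustrate the two-pointer converter convention shared by `_SIMSIMD_ASSIGN_1_TO_2` and the `simsimd_*_to_*` routines below, here is roughly what `SIMSIMD_MAKE_L2SQ(serial, bf16, f32, simsimd_bf16_to_f32)` from spatial.h is expected to generate. This is a hand-written sketch of the expansion, not the literal preprocessor output.

// Every load goes through the same `converter(source_ptr, &destination)` shape, whether the
// converter is a real function like `simsimd_bf16_to_f32` or the trivial `_SIMSIMD_ASSIGN_1_TO_2`.
SIMSIMD_PUBLIC void simsimd_l2sq_bf16_serial(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_size_t n,
                                             simsimd_distance_t *result) {
    simsimd_f32_t d2 = 0, ai, bi;
    for (simsimd_size_t i = 0; i != n; ++i) {
        simsimd_bf16_to_f32(a + i, &ai); // widen one scalar at a time into the `f32` accumulator type
        simsimd_bf16_to_f32(b + i, &bi);
        d2 += (ai - bi) * (ai - bi);
    }
    *result = d2;
}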
@@ -600,17 +479,21 @@ SIMSIMD_PUBLIC simsimd_f32_t simsimd_approximate_log(simsimd_f32_t number) { * https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233 * https://github.com/OpenCyphal/libcanard/blob/636795f4bc395f56af8d2c61d3757b5e762bb9e5/canard.c#L811-L834 */ -SIMSIMD_PUBLIC simsimd_f32_t simsimd_f16_to_f32(simsimd_f16_t const *x_ptr) { - unsigned short x = *(unsigned short const *)x_ptr; - unsigned int exponent = (x & 0x7C00) >> 10; - unsigned int mantissa = (x & 0x03FF) << 13; +SIMSIMD_PUBLIC void simsimd_f16_to_f32(simsimd_f16_t const *x, simsimd_f32_t *y) { +#if SIMSIMD_NATIVE_F16 + *y = *x; +#else + unsigned short x_short = *(unsigned short const *)x; + unsigned int exponent = (x_short & 0x7C00) >> 10; + unsigned int mantissa = (x_short & 0x03FF) << 13; simsimd_f32i32_t mantissa_conv; mantissa_conv.f = (float)mantissa; unsigned int v = (mantissa_conv.i) >> 23; simsimd_f32i32_t conv; - conv.i = (x & 0x8000) << 16 | (exponent != 0) * ((exponent + 112) << 23 | mantissa) | + conv.i = (x_short & 0x8000) << 16 | (exponent != 0) * ((exponent + 112) << 23 | mantissa) | ((exponent == 0) & (mantissa != 0)) * ((v - 37) << 23 | ((mantissa << (150 - v)) & 0x007FE000)); - return conv.f; + *y = conv.f; +#endif } /** @@ -622,16 +505,20 @@ SIMSIMD_PUBLIC simsimd_f32_t simsimd_f16_to_f32(simsimd_f16_t const *x_ptr) { * https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233 * https://github.com/OpenCyphal/libcanard/blob/636795f4bc395f56af8d2c61d3757b5e762bb9e5/canard.c#L811-L834 */ -SIMSIMD_PUBLIC void simsimd_f32_to_f16(simsimd_f32_t x, simsimd_f16_t *result_ptr) { +SIMSIMD_PUBLIC void simsimd_f32_to_f16(simsimd_f32_t const *x, simsimd_f16_t *y) { +#if SIMSIMD_NATIVE_F16 + *y = (simsimd_f16_t)*x; +#else simsimd_f32i32_t conv; - conv.f = x; + conv.f = *x; unsigned int b = conv.i + 0x00001000; unsigned int e = (b & 0x7F800000) >> 23; unsigned int m = b & 0x007FFFFF; unsigned short result = ((b & 0x80000000) >> 16) | (e > 112) * ((((e - 112) << 10) & 0x7C00) | (m >> 13)) | ((e < 113) & (e > 101)) * ((((0x007FF000 + m) >> (125 - e)) + 1) >> 1) | ((e > 143) * 0x7FFF); - *(unsigned short *)result_ptr = result; + *(unsigned short *)y = result; +#endif } /** @@ -641,11 +528,15 @@ SIMSIMD_PUBLIC void simsimd_f32_to_f16(simsimd_f32_t x, simsimd_f16_t *result_pt * https://stackoverflow.com/questions/55253233/convert-fp32-to-bfloat16-in-c/55254307#55254307 * https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus */ -SIMSIMD_PUBLIC simsimd_f32_t simsimd_bf16_to_f32(simsimd_bf16_t const *x_ptr) { - unsigned short x = *(unsigned short const *)x_ptr; +SIMSIMD_PUBLIC void simsimd_bf16_to_f32(simsimd_bf16_t const *x, simsimd_f32_t *y) { +#if SIMSIMD_NATIVE_BF16 + *y = *x; +#else + unsigned short x_short = *(unsigned short const *)x; simsimd_f32i32_t conv; - conv.i = x << 16; // Zero extends the mantissa - return conv.f; + conv.i = x_short << 16; // Zero extends the mantissa + *y = conv.f; +#endif } /** @@ -654,14 +545,112 @@ SIMSIMD_PUBLIC simsimd_f32_t simsimd_bf16_to_f32(simsimd_bf16_t const *x_ptr) { * https://stackoverflow.com/questions/55253233/convert-fp32-to-bfloat16-in-c/55254307#55254307 * https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus */ -SIMSIMD_PUBLIC void simsimd_f32_to_bf16(simsimd_f32_t x, simsimd_bf16_t *result_ptr) { +SIMSIMD_PUBLIC void simsimd_f32_to_bf16(simsimd_f32_t const *x, simsimd_bf16_t *y) { +#if SIMSIMD_NATIVE_BF16 + *y = (simsimd_bf16_t)*x; 
+#else
     simsimd_f32i32_t conv;
-    conv.f = x;
+    conv.f = *x;
     conv.i += 0x8000; // Rounding is optional
     conv.i >>= 16;    // The top 16 bits will be zeroed out anyways
     // conv.i &= 0xFFFF;
-    *(unsigned short *)result_ptr = (unsigned short)conv.i;
+    *(unsigned short *)y = (unsigned short)conv.i;
+#endif
+}
+
+SIMSIMD_INTERNAL void _simsimd_f16_to_f64(simsimd_f16_t const *x, simsimd_f64_t *y) {
+    simsimd_f32_t f32;
+    simsimd_f16_to_f32(x, &f32);
+    *y = (simsimd_f64_t)f32;
+}
+SIMSIMD_INTERNAL void _simsimd_f64_to_f16(simsimd_f64_t const *x, simsimd_f16_t *y) {
+    simsimd_f32_t f32 = (simsimd_f32_t)*x;
+    simsimd_f32_to_f16(&f32, y);
+}
+SIMSIMD_INTERNAL void _simsimd_bf16_to_f64(simsimd_bf16_t const *x, simsimd_f64_t *y) {
+    simsimd_f32_t f32;
+    simsimd_bf16_to_f32(x, &f32);
+    *y = (simsimd_f64_t)f32;
+}
+SIMSIMD_INTERNAL void _simsimd_f64_to_bf16(simsimd_f64_t const *x, simsimd_bf16_t *y) {
+    simsimd_f32_t f32 = (simsimd_f32_t)*x;
+    simsimd_f32_to_bf16(&f32, y);
+}
+
+/* Convert floating point numbers to integers, clamping them to the range of signed
+ * and unsigned low-resolution integers, and rounding them to the nearest integer.
+ *
+ * In C++ the analogous solution with STL could be: `*y = std::clamp(std::round(*x), -128, 127)`.
+ * In C, using the standard library: `*y = fminf(fmaxf(roundf(*x), -128), 127)`.
+ */
+SIMSIMD_INTERNAL void _simsimd_f32_to_i8(simsimd_f32_t const *x, simsimd_i8_t *y) {
+    *y = (simsimd_i8_t)(*x > 127 ? 127 : (*x < -128 ? -128 : (int)(*x + (*x < 0 ? -0.5f : 0.5f))));
+}
+
+SIMSIMD_INTERNAL void _simsimd_f32_to_u8(simsimd_f32_t const *x, simsimd_u8_t *y) {
+    *y = (simsimd_u8_t)(*x > 255 ? 255 : (*x < 0 ? 0 : (int)(*x + (*x < 0 ? -0.5f : 0.5f))));
+}
+
+SIMSIMD_INTERNAL void _simsimd_f64_to_i8(simsimd_f64_t const *x, simsimd_i8_t *y) {
+    *y = (simsimd_i8_t)(*x > 127 ? 127 : (*x < -128 ? -128 : (int)(*x + (*x < 0 ? -0.5 : 0.5))));
+}
+
+SIMSIMD_INTERNAL void _simsimd_f64_to_u8(simsimd_f64_t const *x, simsimd_u8_t *y) {
+    *y = (simsimd_u8_t)(*x > 255 ? 255 : (*x < 0 ? 0 : (int)(*x + (*x < 0 ? -0.5 : 0.5))));
+}
+
+SIMSIMD_INTERNAL void _simsimd_f64_to_i16(simsimd_f64_t const *x, simsimd_i16_t *y) {
+    *y = (simsimd_i16_t)(*x > 32767 ? 32767 : (*x < -32768 ? -32768 : (int)(*x + (*x < 0 ? -0.5 : 0.5))));
+}
+
+SIMSIMD_INTERNAL void _simsimd_f64_to_u16(simsimd_f64_t const *x, simsimd_u16_t *y) {
+    *y = (simsimd_u16_t)(*x > 65535 ? 65535 : (*x < 0 ? 0 : (int)(*x + (*x < 0 ? -0.5 : 0.5))));
+}
+
+SIMSIMD_INTERNAL void _simsimd_f64_to_i32(simsimd_f64_t const *x, simsimd_i32_t *y) {
+    *y = (simsimd_i32_t)(*x > 2147483647 ? 2147483647
+                                         : (*x < -2147483648 ? -2147483648 : (int)(*x + (*x < 0 ? -0.5 : 0.5))));
+}
+
+SIMSIMD_INTERNAL void _simsimd_f64_to_u32(simsimd_f64_t const *x, simsimd_u32_t *y) {
+    *y = (simsimd_u32_t)(*x > 4294967295 ? 4294967295 : (*x < 0 ? 0 : (unsigned int)(*x + (*x < 0 ? -0.5 : 0.5))));
+}
+
+SIMSIMD_INTERNAL void _simsimd_f64_to_i64(simsimd_f64_t const *x, simsimd_i64_t *y) {
+    *y = (simsimd_i64_t)(*x > 9223372036854775807.0
+                             ? 9223372036854775807ll
+                             : (*x < -9223372036854775808.0 ? (-9223372036854775807ll - 1)
+                                                            : (long long)(*x + (*x < 0 ? -0.5 : 0.5))));
+}
+
+SIMSIMD_INTERNAL void _simsimd_f64_to_u64(simsimd_f64_t const *x, simsimd_u64_t *y) {
+    *y = (simsimd_u64_t)(*x > 18446744073709551615.0 ? 18446744073709551615ull
+                                                     : (*x < 0 ? 0 : (unsigned long long)(*x + (*x < 0 ? -0.5 : 0.5))));
+}
+
+SIMSIMD_INTERNAL void _simsimd_i64_to_i8(simsimd_i64_t const *x, simsimd_i8_t *y) {
+    *y = (simsimd_i8_t)(*x > 127 ? 127 : (*x < -128 ? 
-128 : *x));
+}
+
+SIMSIMD_INTERNAL void _simsimd_i64_to_u8(simsimd_i64_t const *x, simsimd_u8_t *y) {
+    *y = (simsimd_u8_t)(*x > 255 ? 255 : (*x < 0 ? 0 : *x));
+}
+
+SIMSIMD_INTERNAL void _simsimd_i64_to_i16(simsimd_i64_t const *x, simsimd_i16_t *y) {
+    *y = (simsimd_i16_t)(*x > 32767 ? 32767 : (*x < -32768 ? -32768 : *x));
+}
+
+SIMSIMD_INTERNAL void _simsimd_i64_to_u16(simsimd_i64_t const *x, simsimd_u16_t *y) {
+    *y = (simsimd_u16_t)(*x > 65535 ? 65535 : (*x < 0 ? 0 : *x));
+}
+
+SIMSIMD_INTERNAL void _simsimd_i64_to_i32(simsimd_i64_t const *x, simsimd_i32_t *y) {
+    *y = (simsimd_i32_t)(*x > 2147483647 ? 2147483647 : (*x < -2147483648 ? -2147483648 : *x));
+}
+
+SIMSIMD_INTERNAL void _simsimd_i64_to_u32(simsimd_i64_t const *x, simsimd_u32_t *y) {
+    *y = (simsimd_u32_t)(*x > 4294967295 ? 4294967295 : (*x < 0 ? 0 : *x));
 }
 
 /**
@@ -678,13 +667,6 @@ SIMSIMD_INTERNAL simsimd_size_t _simsimd_divide_ceil(simsimd_size_t dividend, si
     return (dividend + divisor - 1) / divisor;
 }
 
-SIMSIMD_PUBLIC simsimd_u32_t simsimd_u32_rol(simsimd_u32_t x, int n) { return (x << n) | (x >> (32 - n)); }
-SIMSIMD_PUBLIC simsimd_u16_t simsimd_u16_rol(simsimd_u16_t x, int n) { return (x << n) | (x >> (16 - n)); }
-SIMSIMD_PUBLIC simsimd_u8_t simsimd_u8_rol(simsimd_u8_t x, int n) { return (x << n) | (x >> (8 - n)); }
-SIMSIMD_PUBLIC simsimd_u32_t simsimd_u32_ror(simsimd_u32_t x, int n) { return (x >> n) | (x << (32 - n)); }
-SIMSIMD_PUBLIC simsimd_u16_t simsimd_u16_ror(simsimd_u16_t x, int n) { return (x >> n) | (x << (16 - n)); }
-SIMSIMD_PUBLIC simsimd_u8_t simsimd_u8_ror(simsimd_u8_t x, int n) { return (x >> n) | (x << (8 - n)); }
-
 /**
  * @brief A @b beefy structure to keep track of the N-Dimensional array index.
  *        Occupies 512 + 16 = 528 bytes on a 64-bit machine, or 9 cache-lines, by default.
@@ -744,6 +726,63 @@ SIMSIMD_PUBLIC int simsimd_ndindex_advance_to(simsimd_ndindex_t *ndindex, simsim
     return 0; // End of iteration
 }
 
+SIMSIMD_INTERNAL simsimd_u32_t _simsimd_u32_rol(simsimd_u32_t *x, int n) { return (*x << n) | (*x >> (32 - n)); }
+SIMSIMD_INTERNAL simsimd_u16_t _simsimd_u16_rol(simsimd_u16_t *x, int n) { return (*x << n) | (*x >> (16 - n)); }
+SIMSIMD_INTERNAL simsimd_u8_t _simsimd_u8_rol(simsimd_u8_t *x, int n) { return (*x << n) | (*x >> (8 - n)); }
+SIMSIMD_INTERNAL simsimd_u32_t _simsimd_u32_ror(simsimd_u32_t *x, int n) { return (*x >> n) | (*x << (32 - n)); }
+SIMSIMD_INTERNAL simsimd_u16_t _simsimd_u16_ror(simsimd_u16_t *x, int n) { return (*x >> n) | (*x << (16 - n)); }
+SIMSIMD_INTERNAL simsimd_u8_t _simsimd_u8_ror(simsimd_u8_t *x, int n) { return (*x >> n) | (*x << (8 - n)); }
+
+SIMSIMD_INTERNAL void _simsimd_u8_sadd(simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_u8_t *r) {
+    *r = (*a + *b > 255) ? 255 : (simsimd_u8_t)(*a + *b);
+}
+SIMSIMD_INTERNAL void _simsimd_u16_sadd(simsimd_u16_t const *a, simsimd_u16_t const *b, simsimd_u16_t *r) {
+    *r = (*a + *b > 65535) ? 65535 : (simsimd_u16_t)(*a + *b);
+}
+SIMSIMD_INTERNAL void _simsimd_u32_sadd(simsimd_u32_t const *a, simsimd_u32_t const *b, simsimd_u32_t *r) {
+    *r = (*a + *b < *a) ? 4294967295u : (*a + *b);
+}
+SIMSIMD_INTERNAL void _simsimd_u64_sadd(simsimd_u64_t const *a, simsimd_u64_t const *b, simsimd_u64_t *r) {
+    *r = (*a + *b < *a) ? 18446744073709551615ull : (*a + *b);
+}
+SIMSIMD_INTERNAL void _simsimd_i8_sadd(simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_i8_t *r) {
+    simsimd_i16_t result = (simsimd_i16_t)*a + (simsimd_i16_t)*b;
+    *r = (result > 127) ? 127 : (result < -128 ? 
-128 : result);
+}
+SIMSIMD_INTERNAL void _simsimd_i16_sadd(simsimd_i16_t const *a, simsimd_i16_t const *b, simsimd_i16_t *r) {
+    simsimd_i32_t result = (simsimd_i32_t)*a + (simsimd_i32_t)*b;
+    *r = (result > 32767) ? 32767 : (result < -32768 ? -32768 : result);
+}
+SIMSIMD_INTERNAL void _simsimd_i32_sadd(simsimd_i32_t const *a, simsimd_i32_t const *b, simsimd_i32_t *r) {
+    simsimd_i64_t result = (simsimd_i64_t)*a + (simsimd_i64_t)*b;
+    *r = (result > 2147483647) ? 2147483647 : (result < -2147483648 ? -2147483648 : (simsimd_i32_t)result);
+}
+SIMSIMD_INTERNAL void _simsimd_i64_sadd(simsimd_i64_t const *a, simsimd_i64_t const *b, simsimd_i64_t *r) {
+    if ((*b > 0) && (*a > (9223372036854775807ll) - *b)) { *r = 9223372036854775807ll; }                // Overflow
+    else if ((*b < 0) && (*a < (-9223372036854775807ll - 1) - *b)) { *r = -9223372036854775807ll - 1; } // Underflow
+    else { *r = *a + *b; }
+}
+SIMSIMD_INTERNAL void _simsimd_f32_sadd(simsimd_f32_t const *a, simsimd_f32_t const *b, simsimd_f32_t *r) {
+    *r = *a + *b;
+}
+SIMSIMD_INTERNAL void _simsimd_f64_sadd(simsimd_f64_t const *a, simsimd_f64_t const *b, simsimd_f64_t *r) {
+    *r = *a + *b;
+}
+SIMSIMD_INTERNAL void _simsimd_f16_sadd(simsimd_f16_t const *a, simsimd_f16_t const *b, simsimd_f16_t *r) {
+    simsimd_f32_t a_f32, b_f32, r_f32;
+    simsimd_f16_to_f32(a, &a_f32);
+    simsimd_f16_to_f32(b, &b_f32);
+    r_f32 = a_f32 + b_f32;
+    simsimd_f32_to_f16(&r_f32, r);
+}
+SIMSIMD_INTERNAL void _simsimd_bf16_sadd(simsimd_bf16_t const *a, simsimd_bf16_t const *b, simsimd_bf16_t *r) {
+    simsimd_f32_t a_f32, b_f32, r_f32;
+    simsimd_bf16_to_f32(a, &a_f32);
+    simsimd_bf16_to_f32(b, &b_f32);
+    r_f32 = a_f32 + b_f32;
+    simsimd_f32_to_bf16(&r_f32, r);
+}
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/python/lib.c b/python/lib.c
index ba195065..bab6c93c 100644
--- a/python/lib.c
+++ b/python/lib.c
@@ -393,40 +393,6 @@ size_t bytes_per_datatype(simsimd_datatype_t dtype) {
     }
 }
 
-typedef enum DatatypeKind {
-    FloatKind = 0,
-    ComplexKind = 1,
-    IntegerKind = 2,
-    UnsignedKind = 3,
-    BooleanKind = 4,
-} DatatypeKind;
-
-/// @brief Check if the datatype has a sign.
-/// @param dtype Logical datatype, can be complex.
-/// @return Zero if the datatype is an unsigned ingteger, positive for signed integers and floats
-DatatypeKind datatype_kind(simsimd_datatype_t dtype) {
-    switch (dtype) {
-    case simsimd_datatype_f64_k: return FloatKind;
-    case simsimd_datatype_f32_k: return FloatKind;
-    case simsimd_datatype_f16_k: return FloatKind;
-    case simsimd_datatype_bf16_k: return FloatKind;
-    case simsimd_datatype_f64c_k: return ComplexKind;
-    case simsimd_datatype_f32c_k: return ComplexKind;
-    case simsimd_datatype_f16c_k: return ComplexKind;
-    case simsimd_datatype_bf16c_k: return ComplexKind;
-    case simsimd_datatype_b8_k: return BooleanKind;
-    case simsimd_datatype_u8_k: return UnsignedKind;
-    case simsimd_datatype_u16_k: return UnsignedKind;
-    case simsimd_datatype_u32_k: return UnsignedKind;
-    case simsimd_datatype_u64_k: return UnsignedKind;
-    case simsimd_datatype_i8_k: return IntegerKind;
-    case simsimd_datatype_i16_k: return IntegerKind;
-    case simsimd_datatype_i32_k: return IntegerKind;
-    case simsimd_datatype_i64_k: return IntegerKind;
-    default: return 0;
-    }
-}
-
 /// @brief Copy a distance to a target datatype, downcasting if necessary.
 /// @return 1 if the cast was successful, 0 if the target datatype is not supported. 
int cast_distance(simsimd_distance_t distance, simsimd_datatype_t target_dtype, void *target_ptr, size_t offset) { @@ -2554,43 +2520,6 @@ static PyObject *api_fma(PyObject *self, PyObject *const *args, Py_ssize_t const return return_obj; } -void _plus_u64(void const *a, void const *b, void *o) { - *(simsimd_u64_t *)o = *(simsimd_u64_t const *)a + *(simsimd_u64_t const *)b; -} -void _plus_u32(void const *a, void const *b, void *o) { - *(simsimd_u32_t *)o = *(simsimd_u32_t const *)a + *(simsimd_u32_t const *)b; -} -void _plus_u16(void const *a, void const *b, void *o) { - *(simsimd_u16_t *)o = *(simsimd_u16_t const *)a + *(simsimd_u16_t const *)b; -} -void _plus_u8(void const *a, void const *b, void *o) { - *(simsimd_u8_t *)o = *(simsimd_u8_t const *)a + *(simsimd_u8_t const *)b; -} -void _plus_i64(void const *a, void const *b, void *o) { - *(simsimd_i64_t *)o = *(simsimd_i64_t const *)a + *(simsimd_i64_t const *)b; -} -void _plus_i32(void const *a, void const *b, void *o) { - *(simsimd_i32_t *)o = *(simsimd_i32_t const *)a + *(simsimd_i32_t const *)b; -} -void _plus_i16(void const *a, void const *b, void *o) { - *(simsimd_i16_t *)o = *(simsimd_i16_t const *)a + *(simsimd_i16_t const *)b; -} -void _plus_i8(void const *a, void const *b, void *o) { - *(simsimd_i8_t *)o = *(simsimd_i8_t const *)a + *(simsimd_i8_t const *)b; -} -void _plus_f64(void const *a, void const *b, void *o) { - *(simsimd_f64_t *)o = *(simsimd_f64_t const *)a + *(simsimd_f64_t const *)b; -} -void _plus_f32(void const *a, void const *b, void *o) { - *(simsimd_f32_t *)o = *(simsimd_f32_t const *)a + *(simsimd_f32_t const *)b; -} -void _plus_f16(void const *a, void const *b, void *o) { - simsimd_f32_to_f16(simsimd_f16_to_f32(a) + simsimd_f16_to_f32(b), (simsimd_f16_t *)o); -} -void _plus_bf16(void const *a, void const *b, void *o) { - simsimd_f32_to_bf16(simsimd_bf16_to_f32(a) + simsimd_bf16_to_f32(b), (simsimd_bf16_t *)o); -} - void implementation_elementwise_binary_tensor_operation( // BufferOrScalarArgument const *a_parsed, BufferOrScalarArgument const *b_parsed, BufferOrScalarArgument const *out_parsed, // @@ -2685,6 +2614,27 @@ void implementation_vectorized_binary_tensor_operation( // } } +typedef void (*elementwise_scalar_kernel_t)(void const *, void const *, void *); + +static elementwise_scalar_kernel_t elementwise_sadd(simsimd_datatype_t dtype) { + void (*scalar_kernel)(void const *, void const *, void *) = NULL; + switch (dtype) { + case simsimd_datatype_u64_k: return (elementwise_scalar_kernel_t)&_simsimd_u64_sadd; + case simsimd_datatype_u32_k: return (elementwise_scalar_kernel_t)&_simsimd_u32_sadd; + case simsimd_datatype_u16_k: return (elementwise_scalar_kernel_t)&_simsimd_u16_sadd; + case simsimd_datatype_u8_k: return (elementwise_scalar_kernel_t)&_simsimd_u8_sadd; + case simsimd_datatype_i64_k: return (elementwise_scalar_kernel_t)&_simsimd_i64_sadd; + case simsimd_datatype_i32_k: return (elementwise_scalar_kernel_t)&_simsimd_i32_sadd; + case simsimd_datatype_i16_k: return (elementwise_scalar_kernel_t)&_simsimd_i16_sadd; + case simsimd_datatype_i8_k: return (elementwise_scalar_kernel_t)&_simsimd_i8_sadd; + case simsimd_datatype_f64_k: return (elementwise_scalar_kernel_t)&_simsimd_f64_sadd; + case simsimd_datatype_f32_k: return (elementwise_scalar_kernel_t)&_simsimd_f32_sadd; + case simsimd_datatype_f16_k: return (elementwise_scalar_kernel_t)&_simsimd_f16_sadd; + case simsimd_datatype_bf16_k: return (elementwise_scalar_kernel_t)&_simsimd_bf16_sadd; + default: return NULL; + } +} + static char const doc_add[] = 
// "Tensor-Tensor or Tensor-Scalar element-wise addition.\n" "\n" @@ -2707,14 +2657,16 @@ static char const doc_add[] = // "Performance recommendations:\n" " - Provide an output tensor to avoid memory allocations.\n" " - Use the same datatype for both inputs and outputs, if supplied.\n" - " - Ideally keep operands in continuous memory. Otherwise, maximize the number of last continuous dimensions.\n" + " - Ideally keep operands in continuous memory. Otherwise, maximize the number of last continuous " + "dimensions.\n" " - On tiny inputs you may want to avoid passing arguments by name.\n" "In most cases, conforming to these recommendations is easy and will result in the best performance.\n" "\n" "Broadcasting rules:\n" " - If both inputs are scalars, the output will be a scalar.\n" " - If one input is a scalar, the output will be a tensor of the same shape as the other input.\n" - " - If both inputs are tensors, in every dimension, the dimension sizes must match or one of them must be 1.\n" + " - If both inputs are tensors, in every dimension, the dimension sizes must match or one of them must be " + "1.\n" "Broadcasting examples for different shapes:\n" " - (3) + (1) -> (3)\n" " - (3, 1) + (1) -> (3, 1)\n" @@ -2963,11 +2915,11 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const // Infer the output tensor shape if it wasn't provided else { // For addition and multiplication, treat complex numbers as floats - DatatypeKind a_kind = a_parsed.kind; - DatatypeKind b_kind = b_parsed.kind; - if (a_kind == ComplexKind) a_kind = FloatKind; - if (b_kind == ComplexKind) b_kind = FloatKind; - if (a_kind == BooleanKind || b_kind == BooleanKind) { + simsimd_datatype_family_k a_kind = simsimd_datatype_family(a_parsed.datatype); + simsimd_datatype_family_k b_kind = simsimd_datatype_family(b_parsed.datatype); + if (a_kind == simsimd_datatype_complex_float_family_k) a_kind = simsimd_datatype_float_family_k; + if (b_kind == simsimd_datatype_complex_float_family_k) b_kind = simsimd_datatype_float_family_k; + if (a_kind == simsimd_datatype_binary_famiily_k || b_kind == simsimd_datatype_binary_famiily_k) { PyErr_SetString(PyExc_ValueError, "Boolean tensors are not supported in element-wise operations"); goto cleanup; } @@ -2982,16 +2934,16 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const } // If only one of the operands is a float, the output should be a float, of the next size... // Sum of `float16` and `int32` is a `float64`. Sum of `float16` and `int16` is a `float32`. - else if (a_kind == FloatKind || b_kind == FloatKind) { + else if (a_kind == simsimd_datatype_float_family_k || b_kind == simsimd_datatype_float_family_k) { //? No 128-bit float on most platforms if (max_itemsize == 8) { out_parsed.datatype = simsimd_datatype_f64_k; } else if (max_itemsize == 4) { out_parsed.datatype = simsimd_datatype_f64_k; } else if (max_itemsize == 2) { out_parsed.datatype = simsimd_datatype_f32_k; } else if (max_itemsize == 1) { out_parsed.datatype = simsimd_datatype_f16_k; } } - // If only one of the operands is a signed integer, the output should be a signed integer, of the next size... - // Sum of `int16` and `uint32` is a `int64`. Sum of `int16` and `uint16` is a `int32`. - else if (a_kind == IntegerKind || b_kind == IntegerKind) { + // If only one of the operands is a signed integer, the output should be a signed integer, of the next + // size... Sum of `int16` and `uint32` is a `int64`. Sum of `int16` and `uint16` is a `int32`. 
+        else if (a_kind == simsimd_datatype_int_family_k || b_kind == simsimd_datatype_int_family_k) {
             //? No 128-bit integer on most platforms
             if (max_itemsize == 8) { out_parsed.datatype = simsimd_datatype_i64_k; }
             else if (max_itemsize == 4) { out_parsed.datatype = simsimd_datatype_i64_k; }
@@ -3098,10 +3050,9 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const
     }
 
     // If the output has no continuous dimensions at all, our situation sucks!
-    // If the type of outputs and inputs doesn't match, it also sucks!
-    // We can't use SIMD effectively and need to fall back to the scalar operation.
+    // We can't use SIMD effectively and need to fall back to the scalar operation,
+    // but if the input/output types match, at least we don't need to cast the data back and forth.
     void (*scalar_kernel)(void const *, void const *, void *) = NULL;
-    // clang-format off
     switch (dtype) {
     case simsimd_datatype_u64_k: scalar_kernel = _plus_u64; break;
     case simsimd_datatype_u32_k: scalar_kernel = _plus_u32; break;
@@ -3115,7 +3066,6 @@ static PyObject *api_add(PyObject *self, PyObject *const *args, Py_ssize_t const
     case simsimd_datatype_f32_k: scalar_kernel = _plus_f32; break;
     case simsimd_datatype_f16_k: scalar_kernel = _plus_f16; break;
     case simsimd_datatype_bf16_k: scalar_kernel = _plus_bf16; break;
-    // clang-format on
     default:
        PyErr_SetString(PyExc_ValueError, "Unsupported datatype");
        return_obj = NULL;