Merge pull request #216 from ashvardanian/main-dev
Faster FMA on Haswell
ashvardanian authored Oct 27, 2024
2 parents f5d7636 + 6c4e595 commit ebd0537
Showing 5 changed files with 279 additions and 93 deletions.
1 change: 1 addition & 0 deletions .git-blame-ignore-revs
@@ -1 +1,2 @@
a4022a988287e527757ecc9bc16a4f2e7dc4770e
750c59f5116a2000507a0cec09db009fd7d31232
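
The `.git-blame-ignore-revs` file lists commit hashes that `git blame` should skip when attributing lines, which is typically used for bulk-formatting commits like the one appended here. Assuming Git 2.23 or newer, it can be honored locally with a one-line config:

git config blame.ignoreRevsFile .git-blame-ignore-revs
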
199 changes: 195 additions & 4 deletions include/simsimd/elementwise.h
@@ -497,22 +497,213 @@ SIMSIMD_PUBLIC void simsimd_fma_bf16_haswell( //
SIMSIMD_PUBLIC void simsimd_wsum_i8_haswell(                         //
    simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_size_t n,  //
    simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result) {
    simsimd_wsum_i8_serial(a, b, n, alpha, beta, result); // TODO

    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
    simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
    __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
    __m256 beta_vec = _mm256_set1_ps(beta_f32);
    int sum_i32s[8], a_i32s[8], b_i32s[8];

    // The main loop:
    simsimd_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        //? Handling loads and stores with SIMD is tricky. Not because of the upcasting, but because of
        //? the downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
        a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], //
            a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7];
        b_i32s[0] = b[i + 0], b_i32s[1] = b[i + 1], b_i32s[2] = b[i + 2], b_i32s[3] = b[i + 3], //
            b_i32s[4] = b[i + 4], b_i32s[5] = b[i + 5], b_i32s[6] = b[i + 6], b_i32s[7] = b[i + 7];
        //! This can be done at least 50% faster if we convert 8-bit integers to floats instead
        //! of relying on the slow `_mm256_cvtepi32_ps` instruction.
        __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
        __m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s));
        // The normal part.
        __m256 a_scaled = _mm256_mul_ps(a_vec, alpha_vec);
        __m256 b_scaled = _mm256_mul_ps(b_vec, beta_vec);
        __m256 sum_vec = _mm256_add_ps(a_scaled, b_scaled);
        // Instead of serial calls to the expensive `SIMSIMD_F32_TO_I8`, convert and clip with SIMD.
        __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
        sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-128));
        sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(127));
        // Export into a serial buffer.
        _mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
        result[i + 0] = (simsimd_i8_t)sum_i32s[0];
        result[i + 1] = (simsimd_i8_t)sum_i32s[1];
        result[i + 2] = (simsimd_i8_t)sum_i32s[2];
        result[i + 3] = (simsimd_i8_t)sum_i32s[3];
        result[i + 4] = (simsimd_i8_t)sum_i32s[4];
        result[i + 5] = (simsimd_i8_t)sum_i32s[5];
        result[i + 6] = (simsimd_i8_t)sum_i32s[6];
        result[i + 7] = (simsimd_i8_t)sum_i32s[7];
    }

    // The tail:
    for (; i < n; ++i) {
        simsimd_f32_t ai = a[i], bi = b[i];
        simsimd_f32_t sum = alpha_f32 * ai + beta_f32 * bi;
        SIMSIMD_F32_TO_I8(sum, result + i);
    }
}
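
The `//!` note above hints at a faster load path. A minimal sketch of that idea for the signed case (an assumption, not the shipped kernel), reusing `a`, `b`, `i`, `alpha_vec`, and `beta_vec` from the loop above and leaving the store side unchanged, widens the eight bytes in registers and skips the scalar round-trip through `a_i32s` and `b_i32s`, even though the final `_mm256_cvtepi32_ps` conversion remains:

__m128i a_i8_vec = _mm_loadl_epi64((__m128i const *)(a + i)); // low 64 bits hold 8 x i8
__m128i b_i8_vec = _mm_loadl_epi64((__m128i const *)(b + i));
__m256i a_i32_vec = _mm256_cvtepi8_epi32(a_i8_vec);           // sign-extend i8 -> i32
__m256i b_i32_vec = _mm256_cvtepi8_epi32(b_i8_vec);
__m256 a_vec = _mm256_cvtepi32_ps(a_i32_vec);                 // i32 -> f32
__m256 b_vec = _mm256_cvtepi32_ps(b_i32_vec);
__m256 sum_vec = _mm256_add_ps(_mm256_mul_ps(a_vec, alpha_vec), _mm256_mul_ps(b_vec, beta_vec));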

SIMSIMD_PUBLIC void simsimd_wsum_u8_haswell(                         //
    simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_size_t n,  //
    simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result) {
    simsimd_wsum_u8_serial(a, b, n, alpha, beta, result); // TODO

    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
    simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
    __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
    __m256 beta_vec = _mm256_set1_ps(beta_f32);
    int sum_i32s[8], a_i32s[8], b_i32s[8];

    // The main loop:
    simsimd_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        //? Handling loads and stores with SIMD is tricky. Not because of the upcasting, but because of
        //? the downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
        a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], //
            a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7];
        b_i32s[0] = b[i + 0], b_i32s[1] = b[i + 1], b_i32s[2] = b[i + 2], b_i32s[3] = b[i + 3], //
            b_i32s[4] = b[i + 4], b_i32s[5] = b[i + 5], b_i32s[6] = b[i + 6], b_i32s[7] = b[i + 7];
        //! This can be done at least 50% faster if we convert 8-bit integers to floats instead
        //! of relying on the slow `_mm256_cvtepi32_ps` instruction.
        __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
        __m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s));
        // The normal part.
        __m256 a_scaled = _mm256_mul_ps(a_vec, alpha_vec);
        __m256 b_scaled = _mm256_mul_ps(b_vec, beta_vec);
        __m256 sum_vec = _mm256_add_ps(a_scaled, b_scaled);
        // Instead of serial calls to the expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD.
        __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
        sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(0));
        sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(255));
        // Export into a serial buffer.
        _mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
        result[i + 0] = (simsimd_u8_t)sum_i32s[0];
        result[i + 1] = (simsimd_u8_t)sum_i32s[1];
        result[i + 2] = (simsimd_u8_t)sum_i32s[2];
        result[i + 3] = (simsimd_u8_t)sum_i32s[3];
        result[i + 4] = (simsimd_u8_t)sum_i32s[4];
        result[i + 5] = (simsimd_u8_t)sum_i32s[5];
        result[i + 6] = (simsimd_u8_t)sum_i32s[6];
        result[i + 7] = (simsimd_u8_t)sum_i32s[7];
    }

    // The tail:
    for (; i < n; ++i) {
        simsimd_f32_t ai = a[i], bi = b[i];
        simsimd_f32_t sum = alpha_f32 * ai + beta_f32 * bi;
        SIMSIMD_F32_TO_U8(sum, result + i);
    }
}
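
The `//?` note about downcasting refers to the store side of the loop. One possible sketch for the unsigned case (an assumption, not the shipped kernel, and it needs `<string.h>` for `memcpy`) packs the clamped 32-bit sums down to bytes in registers, replacing the scalar export through `sum_i32s`:

__m256i packed_u16 = _mm256_packus_epi32(sum_i32_vec, sum_i32_vec); // i32 -> u16 within each 128-bit lane
__m256i packed_u8 = _mm256_packus_epi16(packed_u16, packed_u16);    // u16 -> u8 within each 128-bit lane
int first_four = _mm_cvtsi128_si32(_mm256_castsi256_si128(packed_u8));     // results 0..3
int last_four = _mm_cvtsi128_si32(_mm256_extracti128_si256(packed_u8, 1)); // results 4..7
memcpy(result + i, &first_four, 4); // two unaligned-safe 4-byte stores
memcpy(result + i + 4, &last_four, 4);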

SIMSIMD_PUBLIC void simsimd_fma_i8_haswell(                                                //
    simsimd_i8_t const *a, simsimd_i8_t const *b, simsimd_i8_t const *c, simsimd_size_t n, //
    simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_i8_t *result) {
    simsimd_fma_i8_serial(a, b, c, n, alpha, beta, result); // TODO

    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
    simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
    __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
    __m256 beta_vec = _mm256_set1_ps(beta_f32);
    int sum_i32s[8], a_i32s[8], b_i32s[8], c_i32s[8];

    // The main loop:
    simsimd_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        //? Handling loads and stores with SIMD is tricky. Not because of the upcasting, but because of
        //? the downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
        a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], //
            a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7];
        b_i32s[0] = b[i + 0], b_i32s[1] = b[i + 1], b_i32s[2] = b[i + 2], b_i32s[3] = b[i + 3], //
            b_i32s[4] = b[i + 4], b_i32s[5] = b[i + 5], b_i32s[6] = b[i + 6], b_i32s[7] = b[i + 7];
        c_i32s[0] = c[i + 0], c_i32s[1] = c[i + 1], c_i32s[2] = c[i + 2], c_i32s[3] = c[i + 3], //
            c_i32s[4] = c[i + 4], c_i32s[5] = c[i + 5], c_i32s[6] = c[i + 6], c_i32s[7] = c[i + 7];
        //! This can be done at least 50% faster if we convert 8-bit integers to floats instead
        //! of relying on the slow `_mm256_cvtepi32_ps` instruction.
        __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
        __m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s));
        __m256 c_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)c_i32s));
        // The normal part.
        __m256 ab_vec = _mm256_mul_ps(a_vec, b_vec);
        __m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec);
        __m256 c_scaled_vec = _mm256_mul_ps(c_vec, beta_vec);
        __m256 sum_vec = _mm256_add_ps(ab_scaled_vec, c_scaled_vec);
        // Instead of serial calls to the expensive `SIMSIMD_F32_TO_I8`, convert and clip with SIMD.
        __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
        sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(-128));
        sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(127));
        // Export into a serial buffer.
        _mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
        result[i + 0] = (simsimd_i8_t)sum_i32s[0];
        result[i + 1] = (simsimd_i8_t)sum_i32s[1];
        result[i + 2] = (simsimd_i8_t)sum_i32s[2];
        result[i + 3] = (simsimd_i8_t)sum_i32s[3];
        result[i + 4] = (simsimd_i8_t)sum_i32s[4];
        result[i + 5] = (simsimd_i8_t)sum_i32s[5];
        result[i + 6] = (simsimd_i8_t)sum_i32s[6];
        result[i + 7] = (simsimd_i8_t)sum_i32s[7];
    }

    // The tail:
    for (; i < n; ++i) {
        simsimd_f32_t ai = a[i], bi = b[i], ci = c[i];
        simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci;
        SIMSIMD_F32_TO_I8(sum, result + i);
    }
}
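
Haswell supports FMA3, so assuming the target attributes for this translation unit enable it, the separate multiply and add in the "normal part" could in principle be fused. A minimal sketch under that assumption, reusing `a_vec`, `b_vec`, `c_vec`, `alpha_vec`, and `beta_vec` from the loop above:

__m256 ab_scaled_vec = _mm256_mul_ps(_mm256_mul_ps(a_vec, b_vec), alpha_vec);
__m256 sum_vec = _mm256_fmadd_ps(c_vec, beta_vec, ab_scaled_vec); // beta * c + alpha * a * b in one instruction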

SIMSIMD_PUBLIC void simsimd_fma_u8_haswell(                                                //
    simsimd_u8_t const *a, simsimd_u8_t const *b, simsimd_u8_t const *c, simsimd_size_t n, //
    simsimd_distance_t alpha, simsimd_distance_t beta, simsimd_u8_t *result) {
    simsimd_fma_u8_serial(a, b, c, n, alpha, beta, result); // TODO

    simsimd_f32_t alpha_f32 = (simsimd_f32_t)alpha;
    simsimd_f32_t beta_f32 = (simsimd_f32_t)beta;
    __m256 alpha_vec = _mm256_set1_ps(alpha_f32);
    __m256 beta_vec = _mm256_set1_ps(beta_f32);
    int sum_i32s[8], a_i32s[8], b_i32s[8], c_i32s[8];

    // The main loop:
    simsimd_size_t i = 0;
    for (; i + 8 <= n; i += 8) {
        //? Handling loads and stores with SIMD is tricky. Not because of the upcasting, but because of
        //? the downcasting at the end of the loop. In AVX2 it's a drag! Keep it for another day.
        a_i32s[0] = a[i + 0], a_i32s[1] = a[i + 1], a_i32s[2] = a[i + 2], a_i32s[3] = a[i + 3], //
            a_i32s[4] = a[i + 4], a_i32s[5] = a[i + 5], a_i32s[6] = a[i + 6], a_i32s[7] = a[i + 7];
        b_i32s[0] = b[i + 0], b_i32s[1] = b[i + 1], b_i32s[2] = b[i + 2], b_i32s[3] = b[i + 3], //
            b_i32s[4] = b[i + 4], b_i32s[5] = b[i + 5], b_i32s[6] = b[i + 6], b_i32s[7] = b[i + 7];
        c_i32s[0] = c[i + 0], c_i32s[1] = c[i + 1], c_i32s[2] = c[i + 2], c_i32s[3] = c[i + 3], //
            c_i32s[4] = c[i + 4], c_i32s[5] = c[i + 5], c_i32s[6] = c[i + 6], c_i32s[7] = c[i + 7];
        //! This can be done at least 50% faster if we convert 8-bit integers to floats instead
        //! of relying on the slow `_mm256_cvtepi32_ps` instruction.
        __m256 a_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)a_i32s));
        __m256 b_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)b_i32s));
        __m256 c_vec = _mm256_cvtepi32_ps(_mm256_lddqu_si256((__m256i *)c_i32s));
        // The normal part.
        __m256 ab_vec = _mm256_mul_ps(a_vec, b_vec);
        __m256 ab_scaled_vec = _mm256_mul_ps(ab_vec, alpha_vec);
        __m256 c_scaled_vec = _mm256_mul_ps(c_vec, beta_vec);
        __m256 sum_vec = _mm256_add_ps(ab_scaled_vec, c_scaled_vec);
        // Instead of serial calls to the expensive `SIMSIMD_F32_TO_U8`, convert and clip with SIMD.
        __m256i sum_i32_vec = _mm256_cvtps_epi32(sum_vec);
        sum_i32_vec = _mm256_max_epi32(sum_i32_vec, _mm256_set1_epi32(0));
        sum_i32_vec = _mm256_min_epi32(sum_i32_vec, _mm256_set1_epi32(255));
        // Export into a serial buffer.
        _mm256_storeu_si256((__m256i *)sum_i32s, sum_i32_vec);
        result[i + 0] = (simsimd_u8_t)sum_i32s[0];
        result[i + 1] = (simsimd_u8_t)sum_i32s[1];
        result[i + 2] = (simsimd_u8_t)sum_i32s[2];
        result[i + 3] = (simsimd_u8_t)sum_i32s[3];
        result[i + 4] = (simsimd_u8_t)sum_i32s[4];
        result[i + 5] = (simsimd_u8_t)sum_i32s[5];
        result[i + 6] = (simsimd_u8_t)sum_i32s[6];
        result[i + 7] = (simsimd_u8_t)sum_i32s[7];
    }

    // The tail:
    for (; i < n; ++i) {
        simsimd_f32_t ai = a[i], bi = b[i], ci = c[i];
        simsimd_f32_t sum = alpha_f32 * ai * bi + beta_f32 * ci;
        SIMSIMD_F32_TO_U8(sum, result + i);
    }
}
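
A hypothetical call site, assuming only the signature shown in this diff: blend one u8 vector into another with element-wise u8 weights, saturating the result back to the u8 range.

simsimd_u8_t pixels[256], weights[256], bias[256], blended[256];
// ... fill pixels, weights, and bias ...
// blended[j] saturates alpha * pixels[j] * weights[j] + beta * bias[j]
simsimd_fma_u8_haswell(pixels, weights, bias, 256, 1.0 / 255.0, 0.5, blended);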

#pragma clang attribute pop
2 changes: 0 additions & 2 deletions include/simsimd/sparse.h
@@ -1079,8 +1079,6 @@ SIMSIMD_PUBLIC void simsimd_intersect_u32_neon( //
* https://developer.arm.com/documentation/ddi0602/2021-06/SVE-Instructions/UCLAMP--Unsigned-clamp-to-minimum-maximum-vector-?lang=en
* - `TBLQ`: Table lookup quadword
* https://developer.arm.com/documentation/ddi0602/2022-12/SVE-Instructions/TBLQ--Programmable-table-lookup-within-each-quadword-vector-segment--zeroing--?lang=en
* -
* https://developer.arm.com/documentation/ddi0602/2021-06/SVE-Instructions/INCB--INCD--INCH--INCW--scalar---Increment-scalar-by-multiple-of-predicate-constraint-element-count-
*
* Great resources for SVE2 intrinsics:
*