Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Matrix Multiplications #218

Open
wants to merge 9 commits into
base: main-dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 31 additions & 20 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,20 @@ set(CMAKE_CXX_EXTENSIONS NO)
# Determine if SimSIMD is built as a subproject (using `add_subdirectory`) or if it is the main project
set(SIMSIMD_IS_MAIN_PROJECT OFF)

if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
set(SIMSIMD_IS_MAIN_PROJECT ON)
endif ()
endif()

option(SIMSIMD_BUILD_SHARED "Compile a dynamic library" ${SIMSIMD_IS_MAIN_PROJECT})
option(SIMSIMD_BUILD_TESTS "Small compilation tests compile-time and run-time dispatch" OFF)
option(SIMSIMD_BUILD_BENCHMARKS "Compile micro-benchmarks for current ISA" OFF)
option(SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS "Include BLAS in micro-kernel benchmarks" OFF)
option(SIMSIMD_BUILD_WITH_OPENMP "Enable OpenMP support" OFF)

# Default to Release build type if not set
if (NOT CMAKE_BUILD_TYPE)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif ()
endif()

# Global compiler flags for debug and release
set(CMAKE_CXX_FLAGS_DEBUG "-g -fsanitize=address")
Expand All @@ -40,17 +41,18 @@ set(CMAKE_C_FLAGS_DEBUG "-g -fsanitize=address")
set(CMAKE_C_FLAGS_RELEASE "-O3")

# Compiler-specific flags
if (CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
if (NOT APPLE)
if(CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
if(NOT APPLE)
add_compile_options(-march=native)
endif ()
endif()

add_compile_options(-pedantic -ferror-limit=1)
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
add_compile_options(-march=native -pedantic -fmax-errors=1 -Wno-tautological-constant-compare)
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
add_compile_options(-mavx512fp16 -mavx512bf16 -mamx-int8 -mamx-bf16)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
add_compile_options(-w -ferror-limit=1)
endif ()
endif()

# Define the header-only library
file(GLOB SIMSIMD_SOURCES include/simsimd/*.h)
Expand All @@ -59,11 +61,9 @@ target_sources(simsimd INTERFACE ${SIMSIMD_SOURCES})
target_include_directories(simsimd INTERFACE "${PROJECT_SOURCE_DIR}/include")

# Build benchmarks if required
if (SIMSIMD_BUILD_BENCHMARKS)
# Fetch external dependencies
if(SIMSIMD_BUILD_BENCHMARKS)
include(FetchContent)

# Suppress building tests of Google Benchmark
set(BENCHMARK_ENABLE_TESTING OFF)
set(BENCHMARK_ENABLE_INSTALL OFF)
set(BENCHMARK_ENABLE_DOXYGEN OFF)
Expand All @@ -79,15 +79,16 @@ if (SIMSIMD_BUILD_BENCHMARKS)
)
FetchContent_MakeAvailable(benchmark)

# Remove the Google Benchmark's "built in debug warning"
if (CMAKE_BUILD_TYPE STREQUAL "Release")
# Remove the "google benchmark built in debug" warning
if(CMAKE_BUILD_TYPE STREQUAL "Release")
target_compile_definitions(benchmark PRIVATE NDEBUG)
endif ()
endif()

find_package(Threads REQUIRED)
add_executable(simsimd_bench scripts/bench.cxx)
target_link_libraries(simsimd_bench simsimd Threads::Threads benchmark)

# BLAS support
if (SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS)
find_package(BLAS REQUIRED)
if (BLAS_FOUND)
Expand All @@ -101,7 +102,17 @@ if (SIMSIMD_BUILD_BENCHMARKS)
endif ()
endif ()

endif ()
# OpenMP support
if(SIMSIMD_BUILD_WITH_OPENMP)
find_package(OpenMP REQUIRED)

if(OpenMP_CXX_FOUND)
target_compile_definitions(simsimd INTERFACE SIMSIMD_BUILD_WITH_OPENMP)
target_compile_options(simsimd INTERFACE ${OpenMP_CXX_FLAGS})
target_link_libraries(simsimd INTERFACE OpenMP::OpenMP_CXX)
endif()
endif()
endif()

if (SIMSIMD_BUILD_TESTS)
add_executable(simsimd_test_compile_time scripts/test.c)
Expand All @@ -110,11 +121,11 @@ if (SIMSIMD_BUILD_TESTS)
add_executable(simsimd_test_run_time scripts/test.c c/lib.c)
target_compile_definitions(simsimd_test_run_time PRIVATE SIMSIMD_DYNAMIC_DISPATCH=1)
target_link_libraries(simsimd_test_run_time simsimd m)
endif ()
endif()

if (SIMSIMD_BUILD_SHARED)
if(SIMSIMD_BUILD_SHARED)
set(SIMSIMD_SOURCES ${SIMSIMD_SOURCES} c/lib.c)
add_library(simsimd_shared SHARED ${SIMSIMD_SOURCES})
target_include_directories(simsimd_shared PUBLIC "${PROJECT_SOURCE_DIR}/include")
set_target_properties(simsimd_shared PROPERTIES OUTPUT_NAME simsimd)
endif ()
endif()
23 changes: 22 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ To rerun experiments utilize the following command:

```sh
sudo apt install libopenblas-dev # BLAS installation is optional, but recommended for benchmarks
cmake -D CMAKE_BUILD_TYPE=Release -D SIMSIMD_BUILD_TESTS=1 -D SIMSIMD_BUILD_BENCHMARKS=1 -D SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS=1 -B build_release
cmake -D CMAKE_BUILD_TYPE=Release \
-D SIMSIMD_BUILD_TESTS=1 -D SIMSIMD_BUILD_BENCHMARKS=1 \
-D SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS=1 -D SIMSIMD_BUILD_WITH_OPENMP=1 \
-B build_release
cmake --build build_release --config Release
build_release/simsimd_bench
build_release/simsimd_bench --benchmark_filter=js
Expand Down Expand Up @@ -63,6 +66,24 @@ cmake -D CMAKE_BUILD_TYPE=Release \
cmake --build build_release --config Release
```

Similarly, using Clang on Linux:

```sh
sudo apt install clang
cmake -D CMAKE_BUILD_TYPE=Release \
-D SIMSIMD_BUILD_TESTS=1 -D SIMSIMD_BUILD_BENCHMARKS=1 \
-D SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS=1 -D SIMSIMD_BUILD_WITH_OPENMP=0 \
-D CMAKE_C_COMPILER=clang -D CMAKE_CXX_COMPILER=clang++ \
-B build_release
cmake --build build_release --config Release
```

I'd recommend setting the following breakpoints:

- `__asan::ReportGenericError` - to detect illegal memory accesses.
- `__GI_exit` - to stop at exit points - the end of running any executable.
- `__builtin_unreachable` - to catch all the places where the code is expected to be unreachable.

## Python

Testing:
Expand Down
79 changes: 73 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -821,7 +821,9 @@ Possibly, in the future:
Last, but not the least - don't build unless there is a demand for it.
So if you have a specific use-case, please open an issue or a pull request, and ideally, bring in more users with similar needs.

### Cosine Similarity, Reciprocal Square Root, and Newton-Raphson Iteration
### Spatial Affinity

> On cosine similarity, reciprocal square root approximations, and the Newton-Raphson iteration.

The cosine similarity is the most common and straightforward metric used in machine learning and information retrieval.
Interestingly, there are multiple ways to shoot yourself in the foot when computing it.
Expand Down Expand Up @@ -876,7 +878,10 @@ On 1536-dimensional inputs on Intel Sapphire Rapids CPU a single such iteration
| `float32` | 2.21e-08 ± 1.65e-08 | 3.47e-07 ± 3.49e-07 | 3.77e-09 ± 2.84e-09 |
| `float64` | 0.00e+00 ± 0.00e+00 | 3.80e-07 ± 4.50e-07 | 1.35e-11 ± 1.85e-11 |

### Curved Spaces, Mahalanobis Distance, and Bilinear Quadratic Forms

### Curved Spaces

> On non-Euclidean geometry, Mahalanobis distance, and Bilinear Quadratic Forms.

The Mahalanobis distance is a generalization of the Euclidean distance, which takes into account the covariance of the data.
It's very similar in its form to the bilinear form, which is a generalization of the dot product.
Expand Down Expand Up @@ -906,7 +911,9 @@ A $vector * matrix * vector$ product is a scalar, whereas its constituent parts

SimSIMD doesn't produce intermediate vector results, like `a @ M @ b`, but computes the bilinear form directly.

### Set Intersection, Galloping, and Binary Search
### Sparse Vectors

> On sorted set intersections, "Galloping", and Binary Search.

The set intersection operation is generally defined as the number of elements that are common between two sets, represented as sorted arrays of integers.
The most common way to compute it is a linear scan:
Expand Down Expand Up @@ -934,7 +941,9 @@ Third approach is to use the SIMD instructions to compare multiple elements at o

After benchmarking, the last approach was chosen, as it's the most flexible and often the fastest.

### Complex Dot Products, Conjugate Dot Products, and Complex Numbers
### Dot Products

> On complex numbers and complex conjugates.

Complex dot products are a generalization of the dot product to complex numbers.
They are supported by most BLAS packages, but almost never in mixed precision.
Expand Down Expand Up @@ -971,7 +980,12 @@ def vdot(a: List[number], b: List[number]) -> number:
return ab_real, ab_imaginary
```

### Logarithms in Kullback-Leibler & Jensen–Shannon Divergences
### Binary Representations


### Probability Distributions

> On fast logarithm approximations in Kullback-Leibler & Jensen–Shannon divergences.

The Kullback-Leibler divergence is a measure of how one probability distribution diverges from a second, expected probability distribution.
Jensen-Shannon divergence is a symmetrized and smoothed version of the Kullback-Leibler divergence, which can be used as a distance metric between probability distributions.
Expand All @@ -986,7 +1000,60 @@ Jensen-Shannon divergence is a symmetrized and smoothed version of the Kullback-

Both functions are defined for non-negative numbers, and the logarithm is a key part of their computation.

### Mixed Precision in Fused-Multiply-Add and Weighted Sums
### Dense Matrix Multiplications

> On dense matrix multiplication, and Normalized Cross-Correlation.

Unlike most dense matrix multiplication libraries, SimSIMD:

- Focuses on mixed-precision computation.
- Focuses on row-by-row multiplication, where the second matrix is transposed.
- Outputs into the same precision as the input matrices, normalizing the output, like in cosine similarity calculations.
- Doesn't implement parallelism, making it compatible with arbitrary concurrency models, third-party thread pools and task schedulers, like OpenMP, TBB, or Rayon.

These decisions make the algorithm much easier to tile, and make it more robust to noise and precision loss.
It also makes the multiplication more suitable to integration into multi-step Machine Learning and AI inference pipelines, where the output of one step is the input of the next, and uniform representation is required.

```math
C_{ij} = \frac{A_i \cdot B_j}{\|A_i\| \|B_j\|} = \frac{\sum_k A_{ik} B_{jk}}{\sqrt{\sum_k A_{ik}^2} \sqrt{\sum_k B_{jk}^2}}
```

The conventional matrix multiplication lacks the denominator and indexes the second matrix differently:

```math
C_{ij} = \sum_k A_{ik} B_{kj}
```

Importantly, if you are going to use the Normalized Cross-Correlation in training pipelines, the normalization changes the gradient flow relative to a plain dot product, and the gradients are defined differently:

Gradient with Respect to $A_i$:

```math
\frac{\partial C_{ij}}{\partial A_i} = \frac{1}{\|A_i\| \|B_j\|} \left( B_j - \frac{A_i \cdot B_j}{\|A_i\|^2} A_i \right)
```

Similarly, the gradient with respect to $B_j$ is:

```math
\frac{\partial C_{ij}}{\partial B_j} = \frac{1}{\|A_i\| \|B_j\|} \left( A_i - \frac{A_i \cdot B_j}{\|B_j\|^2} B_j \right)
```

#### Optimizing for Intel AMX

Intel Advanced Matrix Extensions (AMX) is an instruction set extension available in Sapphire Rapids CPUs and newer.
It provides 8x special registers, each __1 kilobyte__ in size, enough to store 16 by 16 matrix tile of 4-byte words.

### Sparse-Sparse Matrix Multiplications

> On sparse matrix multiplication, the Compressed Sparse Row (CSR) format, and the Compressed Sparse Column (CSC) format.

Sparse-Sparse matrix multiplication is just "matrix multiplication" when both matrices are sparse.
Hence their values are stored in non-regularly addressable arrays, and their indices are often stored in separate arrays.
Those representations are very effective when $>99\%$ of the values are close to zero, and the matrices are large.
With a good algorithm, a 100x improvement in performance can be achieved.

### Elementwise Operations

> On mixed precision in Fused-Multiply-Add and Weighted Sums.

The Fused-Multiply-Add (FMA) operation is a single operation that combines element-wise multiplication and addition with different scaling factors.
The Weighted Sum is its simplified variant without element-wise multiplication.
Expand Down
17 changes: 10 additions & 7 deletions include/simsimd/binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* @brief SIMD-accelerated Binary Similarity Measures.
* @author Ash Vardanian
* @date July 1, 2023
* @see https://github.com/ashvardanian/simsimd?tab=readme-ov-file#binary-representations
*
* Contains:
* - Hamming distance
Expand Down Expand Up @@ -49,14 +50,16 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c

SIMSIMD_PUBLIC unsigned char simsimd_popcount_b8(simsimd_b8_t x) {
    // Returns the number of set bits in a single byte via a 256-entry lookup table,
    // indexed directly by the byte's value.
    //
    // The diff rendering had duplicated table rows, producing an invalid ~480-entry
    // initializer; the table below is the correct 8 rows x 32 entries = 256 values,
    // following the recurrence popcount(i) = popcount(i >> 1) + (i & 1).
    static unsigned char lookup_table[] = {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, //
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, //
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, //
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, //
    };
    return lookup_table[x];
}

Expand Down
Loading
Loading