Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Matrix Multiplications #218

Open
wants to merge 9 commits into
base: main-dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 31 additions & 20 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,20 @@ set(CMAKE_CXX_EXTENSIONS NO)
# Determine if SimSIMD is built as a subproject (using `add_subdirectory`) or if it is the main project
set(SIMSIMD_IS_MAIN_PROJECT OFF)

if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
set(SIMSIMD_IS_MAIN_PROJECT ON)
endif ()
endif()

option(SIMSIMD_BUILD_SHARED "Compile a dynamic library" ${SIMSIMD_IS_MAIN_PROJECT})
option(SIMSIMD_BUILD_TESTS "Small compilation tests compile-time and run-time dispatch" OFF)
option(SIMSIMD_BUILD_BENCHMARKS "Compile micro-benchmarks for current ISA" OFF)
option(SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS "Include BLAS in micro-kernel benchmarks" OFF)
option(SIMSIMD_BUILD_WITH_OPENMP "Enable OpenMP support" OFF)

# Default to Release build type if not set
if (NOT CMAKE_BUILD_TYPE)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif ()
endif()

# Global compiler flags for debug and release
set(CMAKE_CXX_FLAGS_DEBUG "-g -fsanitize=address")
Expand All @@ -40,17 +41,18 @@ set(CMAKE_C_FLAGS_DEBUG "-g -fsanitize=address")
set(CMAKE_C_FLAGS_RELEASE "-O3")

# Compiler-specific flags
if (CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
if (NOT APPLE)
if(CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
if(NOT APPLE)
add_compile_options(-march=native)
endif ()
endif()

add_compile_options(-pedantic -ferror-limit=1)
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
add_compile_options(-march=native -pedantic -fmax-errors=1 -Wno-tautological-constant-compare)
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
add_compile_options(-mavx512fp16 -mavx512bf16 -mamx-int8 -mamx-bf16)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
add_compile_options(-w -ferror-limit=1)
endif ()
endif()

# Define the header-only library
file(GLOB SIMSIMD_SOURCES include/simsimd/*.h)
Expand All @@ -59,11 +61,9 @@ target_sources(simsimd INTERFACE ${SIMSIMD_SOURCES})
target_include_directories(simsimd INTERFACE "${PROJECT_SOURCE_DIR}/include")

# Build benchmarks if required
if (SIMSIMD_BUILD_BENCHMARKS)
# Fetch external dependencies
if(SIMSIMD_BUILD_BENCHMARKS)
include(FetchContent)

# Suppress building tests of Google Benchmark
set(BENCHMARK_ENABLE_TESTING OFF)
set(BENCHMARK_ENABLE_INSTALL OFF)
set(BENCHMARK_ENABLE_DOXYGEN OFF)
Expand All @@ -79,15 +79,16 @@ if (SIMSIMD_BUILD_BENCHMARKS)
)
FetchContent_MakeAvailable(benchmark)

# Remove the Google Benchmark's "built in debug warning"
if (CMAKE_BUILD_TYPE STREQUAL "Release")
# Remove the "google benchmark built in debug" warning
if(CMAKE_BUILD_TYPE STREQUAL "Release")
target_compile_definitions(benchmark PRIVATE NDEBUG)
endif ()
endif()

find_package(Threads REQUIRED)
add_executable(simsimd_bench scripts/bench.cxx)
target_link_libraries(simsimd_bench simsimd Threads::Threads benchmark)

# BLAS support
if (SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS)
find_package(BLAS REQUIRED)
if (BLAS_FOUND)
Expand All @@ -101,7 +102,17 @@ if (SIMSIMD_BUILD_BENCHMARKS)
endif ()
endif ()

endif ()
# OpenMP support
if(SIMSIMD_BUILD_WITH_OPENMP)
find_package(OpenMP REQUIRED)

if(OpenMP_CXX_FOUND)
target_compile_definitions(simsimd INTERFACE SIMSIMD_BUILD_WITH_OPENMP)
target_compile_options(simsimd INTERFACE ${OpenMP_CXX_FLAGS})
target_link_libraries(simsimd INTERFACE OpenMP::OpenMP_CXX)
endif()
endif()
endif()

if (SIMSIMD_BUILD_TESTS)
add_executable(simsimd_test_compile_time scripts/test.c)
Expand All @@ -110,11 +121,11 @@ if (SIMSIMD_BUILD_TESTS)
add_executable(simsimd_test_run_time scripts/test.c c/lib.c)
target_compile_definitions(simsimd_test_run_time PRIVATE SIMSIMD_DYNAMIC_DISPATCH=1)
target_link_libraries(simsimd_test_run_time simsimd m)
endif ()
endif()

if (SIMSIMD_BUILD_SHARED)
if(SIMSIMD_BUILD_SHARED)
set(SIMSIMD_SOURCES ${SIMSIMD_SOURCES} c/lib.c)
add_library(simsimd_shared SHARED ${SIMSIMD_SOURCES})
target_include_directories(simsimd_shared PUBLIC "${PROJECT_SOURCE_DIR}/include")
set_target_properties(simsimd_shared PROPERTIES OUTPUT_NAME simsimd)
endif ()
endif()
23 changes: 22 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ To rerun experiments utilize the following command:

```sh
sudo apt install libopenblas-dev # BLAS installation is optional, but recommended for benchmarks
cmake -D CMAKE_BUILD_TYPE=Release -D SIMSIMD_BUILD_TESTS=1 -D SIMSIMD_BUILD_BENCHMARKS=1 -D SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS=1 -B build_release
cmake -D CMAKE_BUILD_TYPE=Release \
-D SIMSIMD_BUILD_TESTS=1 -D SIMSIMD_BUILD_BENCHMARKS=1 \
-D SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS=1 -D SIMSIMD_BUILD_WITH_OPENMP=1 \
-B build_release
cmake --build build_release --config Release
build_release/simsimd_bench
build_release/simsimd_bench --benchmark_filter=js
Expand Down Expand Up @@ -63,6 +66,24 @@ cmake -D CMAKE_BUILD_TYPE=Release \
cmake --build build_release --config Release
```

Similarly, using Clang on Linux:

```sh
sudo apt install clang
cmake -D CMAKE_BUILD_TYPE=Release \
-D SIMSIMD_BUILD_TESTS=1 -D SIMSIMD_BUILD_BENCHMARKS=1 \
-D SIMSIMD_BUILD_BENCHMARKS_WITH_CBLAS=1 -D SIMSIMD_BUILD_WITH_OPENMP=0 \
-D CMAKE_C_COMPILER=clang -D CMAKE_CXX_COMPILER=clang++ \
-B build_release
cmake --build build_release --config Release
```

I'd recommend setting the following breakpoints:

- `__asan::ReportGenericError` - to detect illegal memory accesses.
- `__GI_exit` - to stop at exit points - the end of running any executable.
- `__builtin_unreachable` - to catch all the places where the code is expected to be unreachable.

## Python

Testing:
Expand Down
79 changes: 73 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -821,7 +821,9 @@ Possibly, in the future:
Last, but not the least - don't build unless there is a demand for it.
So if you have a specific use-case, please open an issue or a pull request, and ideally, bring in more users with similar needs.

### Cosine Similarity, Reciprocal Square Root, and Newton-Raphson Iteration
### Spatial Affinity

> On cosine similarity, reciprocal square root approximations, and the Newton-Raphson iteration.

The cosine similarity is the most common and straightforward metric used in machine learning and information retrieval.
Interestingly, there are multiple ways to shoot yourself in the foot when computing it.
Expand Down Expand Up @@ -876,7 +878,10 @@ On 1536-dimensional inputs on Intel Sapphire Rapids CPU a single such iteration
| `float32` | 2.21e-08 ± 1.65e-08 | 3.47e-07 ± 3.49e-07 | 3.77e-09 ± 2.84e-09 |
| `float64` | 0.00e+00 ± 0.00e+00 | 3.80e-07 ± 4.50e-07 | 1.35e-11 ± 1.85e-11 |

### Curved Spaces, Mahalanobis Distance, and Bilinear Quadratic Forms

### Curved Spaces

> On non-Euclidean geometry, Mahalanobis distance, and Bilinear Quadratic Forms.

The Mahalanobis distance is a generalization of the Euclidean distance, which takes into account the covariance of the data.
It's very similar in its form to the bilinear form, which is a generalization of the dot product.
Expand Down Expand Up @@ -906,7 +911,9 @@ A $vector * matrix * vector$ product is a scalar, whereas its constituent parts

SimSIMD doesn't produce intermediate vector results, like `a @ M @ b`, but computes the bilinear form directly.

### Set Intersection, Galloping, and Binary Search
### Sparse Vectors

> On sorted set intersections, "Galloping", and Binary Search.

The set intersection operation is generally defined as the number of elements that are common between two sets, represented as sorted arrays of integers.
The most common way to compute it is a linear scan:
Expand Down Expand Up @@ -934,7 +941,9 @@ Third approach is to use the SIMD instructions to compare multiple elements at o

After benchmarking, the last approach was chosen, as it's the most flexible and often the fastest.

### Complex Dot Products, Conjugate Dot Products, and Complex Numbers
### Dot Products

> On complex numbers and complex conjugates.

Complex dot products are a generalization of the dot product to complex numbers.
They are supported by most BLAS packages, but almost never in mixed precision.
Expand Down Expand Up @@ -971,7 +980,12 @@ def vdot(a: List[number], b: List[number]) -> number:
return ab_real, ab_imaginary
```

### Logarithms in Kullback-Leibler & Jensen–Shannon Divergences
### Binary Representations


### Probability Distributions

> On fast logarithm approximations in Kullback-Leibler & Jensen–Shannon divergences.

The Kullback-Leibler divergence is a measure of how one probability distribution diverges from a second, expected probability distribution.
Jensen-Shannon divergence is a symmetrized and smoothed version of the Kullback-Leibler divergence, which can be used as a distance metric between probability distributions.
Expand All @@ -986,7 +1000,60 @@ Jensen-Shannon divergence is a symmetrized and smoothed version of the Kullback-

Both functions are defined for non-negative numbers, and the logarithm is a key part of their computation.

### Mixed Precision in Fused-Multiply-Add and Weighted Sums
### Dense Matrix Multiplications

> On dense matrix multiplication, and Normalized Cross-Correlation.

Unlike most dense matrix multiplication libraries, SimSIMD:

- Focuses on mixed-precision computation.
- Focuses on row-by-row multiplication, where the second matrix is transposed.
- Outputs into the same precision as the input matrices, normalizing the output, like in cosine similarity calculations.
- Doesn't implement parallelism, making it compatible with arbitrary concurrency models, third-party thread pools and task schedulers, like OpenMP, TBB, or Rayon.

These decisions make the algorithm much easier to tile, and make it more robust to noise and precision loss.
It also makes the multiplication more suitable to integration into multi-step Machine Learning and AI inference pipelines, where the output of one step is the input of the next, and uniform representation is required.

```math
C_{ij} = \frac{A_i \cdot B_j}{\|A_i\| \|B_j\|} = \frac{\sum_k A_{ik} B_{jk}}{\sqrt{\sum_k A_{ik}^2} \sqrt{\sum_k B_{jk}^2}}
```

The conventional matrix multiplication lacks the denominator and indexes the second matrix differently:

```math
C_{ij} = \sum_k A_{ik} B_{kj}
```

Importantly, if you are going to use the Normalized Cross-Correlation in training pipelines, the normalization changes the gradient flow relative to a plain dot product, and the gradients are defined differently:

Gradient with Respect to $A_i$:

```math
\frac{\partial C_{ij}}{\partial A_i} = \frac{1}{\|A_i\| \|B_j\|} \left( B_j - \frac{A_i \cdot B_j}{\|A_i\|^2} A_i \right)
```

Similarly, the gradient with respect to $B_j$ is:

```math
\frac{\partial C_{ij}}{\partial B_j} = \frac{1}{\|A_i\| \|B_j\|} \left( A_i - \frac{A_i \cdot B_j}{\|B_j\|^2} B_j \right)
```

#### Optimizing for Intel AMX

Intel Advanced Matrix Extensions (AMX) is an instruction set extension available in Sapphire Rapids CPUs and newer.
It provides 8x special registers, each __1 kilobyte__ in size, enough to store 16 by 16 matrix tile of 4-byte words.

### Sparse-Sparse Matrix Multiplications

> On sparse matrix multiplication, the Compressed Sparse Row (CSR) format, and the Compressed Sparse Column (CSC) format.

Sparse-Sparse matrix multiplication is just "matrix multiplication" when both matrices are sparse.
Hence their values are stored in non-regularly addressable arrays, and their indices are often stored in separate arrays.
Those representations are very effective when $>99\%$ of the values are close to zero, and the matrices are large.
With a good algorithm, a 100x improvement in performance can be achieved.

### Elementwise Operations

> On mixed precision in Fused-Multiply-Add and Weighted Sums.

The Fused-Multiply-Add (FMA) operation is a single operation that combines element-wise multiplication and addition with different scaling factors.
The Weighted Sum is its simplified variant without element-wise multiplication.
Expand Down
17 changes: 10 additions & 7 deletions include/simsimd/binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* @brief SIMD-accelerated Binary Similarity Measures.
* @author Ash Vardanian
* @date July 1, 2023
* @see https://github.com/ashvardanian/simsimd?tab=readme-ov-file#binary-representations
*
* Contains:
* - Hamming distance
Expand Down Expand Up @@ -49,14 +50,16 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c

SIMSIMD_PUBLIC unsigned char simsimd_popcount_b8(simsimd_b8_t x) {
    // Returns the number of set bits in a single byte via a 256-entry lookup table,
    // indexed directly by the byte's value.
    //
    // The diff rendering had duplicated table rows, producing an invalid ~480-entry
    // initializer; the table below is the correct 8 rows x 32 entries = 256 values,
    // following the recurrence popcount(i) = popcount(i >> 1) + (i & 1).
    static unsigned char lookup_table[] = {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, //
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, //
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, //
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, //
    };
    return lookup_table[x];
}

Expand Down
Loading
Loading