Skip to content

Commit

Permalink
separate makefiles for gtests and add makefile for the examples.
Browse files Browse the repository at this point in the history
  • Loading branch information
bassoy committed Mar 5, 2024
1 parent cc5ab6a commit 6f3d42d
Show file tree
Hide file tree
Showing 9 changed files with 101 additions and 43 deletions.
19 changes: 16 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,22 @@ jobs:
cd ${GITHUB_WORKSPACE}
git clone https://github.com/bassoy/ttm.git
- name: Build and run tests
- name: Build and run tests (Intel MKL)
run: |
cd ttm/test
make -j
./bin/main
make clean && make -j BLAS_FLAG=MKL
./bin/main
- name: Build and run tests (OpenBLAS)
run: |
cd ttm/test
make clean && make -j BLAS_FLAG=OPENBLAS
./bin/main
- name: Build and run examples (Intel MKL)
run: |
cd ttm/example
make clean && make -j BLAS_FLAG=MKL
./interface1
./interface2
./interface3
14 changes: 6 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ The order-$2$ tensor $\mathbf{B}$ is a matrix with shape $\mathbf{n}\_b = (m,n\_
A simple example of the tensor-matrix multiplication is the matrix-matrix multiplication with $\mathbf{C} = \mathbf{B} \cdot \mathbf{A}$ with $q=1$.
The number of dimensions (order) $p$ and the dimensions $n_r$ as well as the linear tensor layout $\mathbf{\pi}$ of the tensors $\underline{\mathbf{A}}$ and $\underline{\mathbf{C}}$ can be chosen at runtime.

All function implementations are based on the Loops-Over-GEMM (LOG) approach and utilize high-performance `GEMM` or `GEMV` routines of `BLAS` such as OpenBLAS or Intel MKL without transposing tensors.
All function implementations are based on the Loops-Over-GEMM (LOG) approach and utilize high-performance `gemm` or `gemv` routines of `BLAS` such as OpenBLAS or Intel MKL without transposing tensors.
The library is an extension of the [boost/ublas](https://github.com/boostorg/ublas) tensor library containing the sequential version.

Please have a look at the [wiki](https://github.com/bassoy/ttm/wiki) page for more information about the **usage**, **function interfaces** and the **setting parameters**.
Expand All @@ -38,7 +38,7 @@ Please have a look at the [wiki](https://github.com/bassoy/ttm/wiki) page for mo

### Performance
* Multi-threading support with OpenMP v4.5 or higher
* Can be used with and without a BLAS implementation
* Currently must be used with a BLAS implementation
* Performs in-place operations without transposing the tensor - no extra memory needed
* For large tensors reaches peak matrix-times-matrix performance

Expand All @@ -48,10 +48,10 @@ Please have a look at the [wiki](https://github.com/bassoy/ttm/wiki) page for mo

## Experiments

The experiments were carried out on a Core i9-7900X Intel Xeon processor with 10 cores and 20 hardware threads running at 3.3 GHz.
The source code has been compiled with GCC v7.3 using the highest optimization level `-Ofast` and `-march=native`, `-pthread` and `-fopenmp`.
Parallel execution has been accomplished using GCC ’s implementation of the OpenMP v4.5 specification.
We have used the `gemv` and `gemm` implementation of the MLK library v2024.
The experiments were carried out on an Intel Xeon Gold 6248R processor with 24 cores running at a base frequency of 3 GHz.
The source code has been compiled with GCC v10.2 using the highest optimization level `-O3` and `-march=native`, `-pthread` and `-fopenmp`.
Parallel execution has been accomplished using GCC’s implementation of the OpenMP v4.5 specification.
We have used the `gemv` and `gemm` implementations of the Intel MKL library v2024.
The benchmark results of each of the following functions are the average of 10 runs.

The comparison includes three state-of-the-art libraries that implement three different approaches.
Expand Down Expand Up @@ -147,5 +147,3 @@ int main()
*/
}
```
Compile with `g++ -I${TLIB_INC} ${BLAS_INC} -std=c++17 -O3 -fopenmp main.cpp ${BLAS_LIB} ${BLAS_FLAGS} -DUSE_MKLBLAS -o main`
where `${TLIB_INC}` is the header location of `TLIB` and `${BLAS_INC}` is the header location of the desired `BLAS` library.
47 changes: 47 additions & 0 deletions example/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Builds the three TLIB interface examples against a selected BLAS backend.
#
# Usage:
#   make BLAS_FLAG=MKL        # settings taken from ../mkl.mk
#   make BLAS_FLAG=OPENBLAS   # settings taken from ../openblas.mk
#   make clean                # BLAS_FLAG is not needed for cleaning

CXX :=g++
#CXX :=clang++

CXX_FLAGS:=-Wextra -Wall -Wpedantic -O3 -std=c++20 -pthread -fopenmp

# Select the BLAS backend. The included file provides BLAS_INC, BLAS_LIB and
# BLAS_FLAGS. $(strip ...) tolerates stray whitespace in the user's value.
ifeq ($(strip $(BLAS_FLAG)),OPENBLAS)
include ../openblas.mk
else ifeq ($(strip $(BLAS_FLAG)),MKL)
include ../mkl.mk
else ifneq ($(MAKECMDGOALS),clean)
# Without a backend no BLAS include path, library or USE_* define is set, so
# stop with a clear message instead of failing later in the compiler/linker.
# A bare `make clean` is exempt because it does not need a backend.
$(error BLAS_FLAG must be MKL or OPENBLAS but is '$(BLAS_FLAG)')
endif

TLIB_INC :=-I../include

CXX_FLAGS += $(BLAS_FLAGS)
INCS += $(TLIB_INC) $(BLAS_INC)
LIBS += $(BLAS_LIB) -lgomp -lpthread

# Example programs; each one is built from the single source file <name>.cpp.
EXAMPLES := interface1 interface2 interface3
OBJS     := $(addsuffix .o,$(EXAMPLES))

# `all` and `clean` are commands, not files: keep them working even if a file
# with one of these names appears in the directory.
.PHONY: all clean

# Targets
all: $(EXAMPLES)

# Link each example from its own object file.
$(EXAMPLES): %: %.o
	$(CXX) $(CXX_FLAGS) -o $@ $^ $(LIBS)

# Generic rule to build object files
%.o: %.cpp
	$(CXX) $(CXX_FLAGS) $(INCS) -c $< -o $@

# Clean rule: removes only the files this Makefile creates.
clean:
	$(RM) $(EXAMPLES) $(OBJS)

2 changes: 1 addition & 1 deletion include/tlib/detail/ttm.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ inline void set_blas_threads(size_t num)
static inline unsigned get_blas_threads()
{
#ifdef USE_OPENBLAS
return openblas_get_max_threads();
return openblas_get_num_threads();
#elif defined USE_MKLBLAS
return mkl_get_max_threads();
#endif
Expand Down
14 changes: 14 additions & 0 deletions mkl.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Intel MKL backend settings. Included by the test and example Makefiles when
# BLAS_FLAG=MKL; defines BLAS_INC, BLAS_LIB and BLAS_FLAGS.
#
# Alternative configuration for an installation under /opt/intel/oneapi, kept
# for reference only (it is disabled and uses different variable names):
#MKL_ROOT_DIR=/opt/intel/oneapi
#MKL_BLAS_DIR=${MKL_ROOT_DIR}/mkl/latest
#MKL_COMP_DIR="${MKL_ROOT_DIR}/compiler/2023.2.0/linux/compiler"
##MKL_COMP_DIR=${MKL_ROOT_DIR}/compiler/2024.0
#MKL_BLAS_INC=-I${MKL_BLAS_DIR}/include
#MKL_BLAS_LIB=-Wl,--start-group ${MKL_BLAS_DIR}/lib/libmkl_intel_ilp64.a ${MKL_BLAS_DIR}/lib/libmkl_intel_thread.a ${MKL_BLAS_DIR}/lib/libmkl_core.a ${MKL_COMP_DIR}/lib/intel64_lin/libiomp5.a -Wl,--end-group
#MKL_BLAS_LIB+=-lpthread -lm -ldl -m64 #-L${MKL_COMP_DIR}/lib -liomp5
#MKL_BLAS_FLAGS=-DMKL_ILP64 -m64


# Directory holding the static MKL archives.
# NOTE(review): presumably the distribution package location - confirm on other systems.
MKL_BLAS_DIR := /usr/lib/x86_64-linux-gnu

# Header search path for MKL.
BLAS_INC := -I/usr/include/mkl

# Link line: the three static MKL archives are wrapped in a linker group so
# that references between them are resolved regardless of order. -lpthread
# lives here (not in BLAS_FLAGS) because it is a linker input.
BLAS_LIB := -Wl,--start-group ${MKL_BLAS_DIR}/libmkl_intel_ilp64.a ${MKL_BLAS_DIR}/libmkl_intel_thread.a ${MKL_BLAS_DIR}/libmkl_core.a -Wl,--end-group -liomp5 -lpthread -lm -ldl -m64

# Compile-time flags only: these are appended to CXX_FLAGS and therefore also
# appear on `-c` command lines. MKL_ILP64 matches the *_ilp64 archive above;
# USE_MKLBLAS selects the MKL code path in the TLIB headers.
BLAS_FLAGS := -DMKL_ILP64 -m64 -DUSE_MKLBLAS
3 changes: 3 additions & 0 deletions openblas.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# OpenBLAS backend settings. Included by the test and example Makefiles when
# BLAS_FLAG=OPENBLAS; defines BLAS_FLAGS, BLAS_INC and BLAS_LIB.

# Preprocessor define that selects the OpenBLAS code path in the TLIB headers.
BLAS_FLAGS := -DUSE_OPENBLAS

# Header search path.
# NOTE(review): this is an `openblas64` directory while the link line below
# names plain `-lopenblas` - confirm both refer to the same integer-width build.
BLAS_INC   := -I/usr/include/x86_64-linux-gnu/openblas64-pthread

# Library to link against.
BLAS_LIB   := -lopenblas
42 changes: 11 additions & 31 deletions test/Makefile
Original file line number Diff line number Diff line change
@@ -1,48 +1,28 @@
CXX :=g++
#CXX :=clang++

# -DUSE_OPENBLAS
# -DUSE_MKLBLAS
CXX_FLAGS +=-Wextra -Wall -Wpedantic -O3 -std=c++20 -pthread -fopenmp -DUSE_MKLBLAS
# -g -O0 -fprofile-arcs -ftest-coverage -fno-inline
CXX_FLAGS +=-Wextra -Wall -Wpedantic -O3 -std=c++20 -pthread -fopenmp

ifeq ($(BLAS_FLAG), OPENBLAS)
include ../openblas.mk
else ifeq ($(BLAS_FLAG), MKL)
include ../mkl.mk
endif

#MKL_ROOT_DIR=/opt/intel/oneapi
#MKL_BLAS_DIR=${MKL_ROOT_DIR}/mkl/latest
#MKL_COMP_DIR="${MKL_ROOT_DIR}/compiler/2023.2.0/linux/compiler"
##MKL_COMP_DIR=${MKL_ROOT_DIR}/compiler/2024.0
#MKL_BLAS_INC=-I${MKL_BLAS_DIR}/include
#MKL_BLAS_LIB=-Wl,--start-group ${MKL_BLAS_DIR}/lib/libmkl_intel_ilp64.a ${MKL_BLAS_DIR}/lib/libmkl_intel_thread.a ${MKL_BLAS_DIR}/lib/libmkl_core.a ${MKL_COMP_DIR}/lib/intel64_lin/libiomp5.a -Wl,--end-group
#MKL_BLAS_LIB+=-lpthread -lm -ldl -m64 #-L${MKL_COMP_DIR}/lib -liomp5
#MKL_BLAS_FLAGS=-DMKL_ILP64 -m64

include gtest.mk

MKL_BLAS_DIR=/usr/lib/x86_64-linux-gnu
MKL_BLAS_INC=-I/usr/include/mkl
MKL_BLAS_LIB=-Wl,--start-group ${MKL_BLAS_DIR}/libmkl_intel_ilp64.a ${MKL_BLAS_DIR}/libmkl_intel_thread.a ${MKL_BLAS_DIR}/libmkl_core.a -Wl,--end-group -liomp5
MKL_BLAS_FLAGS=-DMKL_ILP64 -m64
TLIB_INC :=-I../include -Iinclude


OPENBLAS_INC :=-I/usr/include/x86_64-linux-gnu/openblas64-pthread
OPENBLAS_LIB :=-lopenblas

GTEST_LIB :=-lgtest -lpthread

TLIB_INC :=-I../include

#$(OPENBLAS_INC)
#$(OPENBLAS_LIB)

INCS +=$(TLIB_INC) -Iinclude $(MKL_BLAS_INC)
LIBS +=$(GTEST_LIB) $(MKL_BLAS_LIB) -lgomp -lpthread -lm -ldl -m64
CXX_FLAGS += $(BLAS_FLAGS)
INCS += $(TLIB_INC) $(BLAS_INC)
LIBS += $(GTEST_LIB) $(BLAS_LIB) -lgomp -lpthread

CFILES2 = $(wildcard src/*.cpp)
OBJS5 = $(notdir $(CFILES2))
OBJS6 = $(patsubst %.cpp,%.o,$(OBJS5))
OBJSC = $(addprefix build/,$(OBJS6))

CXX_FLAGS += $(MKL_BLAS_FLAGS)

TARGET := bin/main
$(TARGET): $(OBJSC)
$(CXX) $(CXX_FLAGS) $(OBJSC) $(LIBS) -o $(TARGET)
Expand Down
1 change: 1 addition & 0 deletions test/gtest.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Linker inputs for GoogleTest; included by the test Makefile and appended to LIBS there.
GTEST_LIB := -lgtest -lpthread
2 changes: 2 additions & 0 deletions test/src/gtest_tlib_ttm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -371,8 +371,10 @@ TEST(TensorTimesMatrix, BatchedGemmSubtensorOuterFusion)
using slicing_policy = tlib::slicing_policy::subtensor_t;
using fusion_policy = tlib::fusion_policy::outer_t;

#ifdef USE_MKLBLAS
check_tensor_times_matrix<value_type,size_type,execution_policy,slicing_policy,fusion_policy,2u>(2u,3);
check_tensor_times_matrix<value_type,size_type,execution_policy,slicing_policy,fusion_policy,3u>(2u,3);
check_tensor_times_matrix<value_type,size_type,execution_policy,slicing_policy,fusion_policy,4u>(2u,3);
// check_tensor_times_matrix<value_type,size_type,execution_policy,slicing_policy,fusion_policy,5u>(2u,3);
#endif
}

0 comments on commit 6f3d42d

Please sign in to comment.