separate makefiles for gtests and add makefile for the examples.

bassoy · Mar 5, 2024 · 6f3d42d · 6f3d42d
1 parent cc5ab6a
commit 6f3d42d
Show file tree

Hide file tree

Showing 9 changed files with 101 additions and 43 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -33,9 +33,22 @@ jobs:
         cd ${GITHUB_WORKSPACE}
         git clone https://github.com/bassoy/ttm.git
 
-    - name: Build and run tests
+    - name: Build and run tests (Intel MKL)
       run: |
         cd ttm/test
-        make -j
-        ./bin/main      
+        make clean && make -j BLAS_FLAG=MKL
+        ./bin/main
+
+    - name: Build and run tests (OpenBLAS)
+      run: |
+        cd ttm/test
+        make clean && make -j BLAS_FLAG=OPENBLAS
+        ./bin/main
         
+    - name: Build and run examples (Intel MKL)
+      run: |
+        cd ttm/example
+        make clean && make -j BLAS_FLAG=MKL
+        ./interface1
+        ./interface2
+        ./interface3
diff --git a/README.md b/README.md
@@ -22,7 +22,7 @@ The order-$2$ tensor $\mathbf{B}$ is a matrix with shape $\mathbf{n}\_b = (m,n\_
 A simple example of the tensor-matrix multiplication is the matrix-matrix multiplication with $\mathbf{C} = \mathbf{B} \cdot \mathbf{A}$ with $q=1$.
 The number of dimensions (order) $p$ and the dimensions $n_r$ as well as the linear tensor layout $\mathbf{\pi}$ of the tensors $\underline{\mathbf{A}}$ and $\underline{\mathbf{C}}$ can be chosen at runtime.
 
-All function implementations are based on the Loops-Over-GEMM (LOG) approach and utilize high-performance `GEMM` or `GEMV` routines of `BLAS` such as OpenBLAS or Intel MKL without transposing tensors.
+All function implementations are based on the Loops-Over-GEMM (LOG) approach and utilize high-performance `gemm` or `gemv` routines of `BLAS` such as OpenBLAS or Intel MKL without transposing tensors.
 The library is an extension of the [boost/ublas](https://github.com/boostorg/ublas) tensor library containing the sequential version. 
 
 Please have a look at the [wiki](https://github.com/bassoy/ttm/wiki) page for more informations about the **usage**, **function interfaces** and the **setting parameters**.
@@ -38,7 +38,7 @@ Please have a look at the [wiki](https://github.com/bassoy/ttm/wiki) page for mo
 
 ### Performance
 * Multi-threading support with OpenMP v4.5 or higher
-* Can be used with and without a BLAS implementation
+* Currently must be used with a BLAS implementation
 * Performs in-place operations without transposing the tensor - no extra memory needed
 * For large tensors reaches peak matrix-times-matrix performance
 
@@ -48,10 +48,10 @@ Please have a look at the [wiki](https://github.com/bassoy/ttm/wiki) page for mo
 
 ## Experiments
 
-The experiments were carried out on a Core i9-7900X Intel Xeon processor with 10 cores and 20 hardware threads running at 3.3 GHz.
-The source code has been compiled with GCC v7.3 using the highest optimization level `-Ofast` and `-march=native`, `-pthread` and `-fopenmp`. 
-Parallel execution has been accomplished using GCC ’s implementation of the OpenMP v4.5 specification. 
-We have used the `gemv` and `gemm` implementation of the MLK library v2024. 
+The experiments were carried out on an Intel Xeon Gold 6248R processor with 24 cores running at a base frequency of 3 GHz.
+The source code has been compiled with GCC v10.2 using the highest optimization level `-O3` and `-march=native`, `-pthread` and `-fopenmp`. 
+Parallel execution has been accomplished using GCC’s implementation of the OpenMP v4.5 specification. 
+We have used the `gemv` and `gemm` implementation of the Intel MKL library v2024. 
 The benchmark results of each of the following functions are the average of 10 runs.
 
 The comparison includes three state-of-the-art libraries that implement three different approaches. 
@@ -147,5 +147,3 @@ int main()
 */
 }
 ```
-Compile with `g++ -I${TLIB_INC} ${BLAS_INC} -std=c++17 -O3 -fopenmp main.cpp ${BLAS_LIB} ${BLAS_FLAGS} -DUSE_MKLBLAS -o main`
-where `${TLIB_INC}` is the header location of `TLIB` and `${BLAS_INC}` is the header location of the desired `BLAS` library.
diff --git a/example/Makefile b/example/Makefile
@@ -0,0 +1,47 @@
+CXX :=g++
+#CXX :=clang++
+
+CXX_FLAGS:=-Wextra -Wall -Wpedantic -O3 -std=c++20 -pthread -fopenmp
+
+ifeq ($(BLAS_FLAG), OPENBLAS)
+  include ../openblas.mk
+else ifeq ($(BLAS_FLAG), MKL)
+  include ../mkl.mk
+endif
+
+TLIB_INC :=-I../include
+
+CXX_FLAGS += $(BLAS_FLAGS)
+INCS += $(TLIB_INC) $(BLAS_INC)
+LIBS += $(BLAS_LIB) -lgomp -lpthread
+
+# Source files
+SRC1 := interface1.cpp
+SRC2 := interface2.cpp
+SRC3 := interface3.cpp
+
+# Object files
+OBJ1 := $(SRC1:.cpp=.o)
+OBJ2 := $(SRC2:.cpp=.o)
+OBJ3 := $(SRC3:.cpp=.o)
+
+# Targets
+all: interface1 interface2 interface3
+
+interface1: $(OBJ1)
+	$(CXX) $(CXX_FLAGS) -o $@ $^ $(LIBS)
+
+interface2: $(OBJ2)
+	$(CXX) $(CXX_FLAGS) -o $@ $^ $(LIBS)
+
+interface3: $(OBJ3)
+	$(CXX) $(CXX_FLAGS) -o $@ $^ $(LIBS) 
+
+# Generic rule to build object files
+%.o: %.cpp
+	$(CXX) $(CXX_FLAGS) $(INCS) -c $< -o $@
+
+# Clean rule
+clean:
+	rm -f interface1 interface2 interface3 $(OBJ1) $(OBJ2) $(OBJ3)
+
diff --git a/include/tlib/detail/ttm.h b/include/tlib/detail/ttm.h
@@ -64,7 +64,7 @@ inline void set_blas_threads(size_t num)
 static inline unsigned get_blas_threads()
 {
 #ifdef USE_OPENBLAS
-    return openblas_get_max_threads();
+    return openblas_get_num_threads();
 #elif defined USE_MKLBLAS
     return mkl_get_max_threads();
 #endif

diff --git a/mkl.mk b/mkl.mk
@@ -0,0 +1,14 @@
+#MKL_ROOT_DIR=/opt/intel/oneapi
+#MKL_BLAS_DIR=${MKL_ROOT_DIR}/mkl/latest
+#MKL_COMP_DIR="${MKL_ROOT_DIR}/compiler/2023.2.0/linux/compiler"
+##MKL_COMP_DIR=${MKL_ROOT_DIR}/compiler/2024.0
+#MKL_BLAS_INC=-I${MKL_BLAS_DIR}/include
+#MKL_BLAS_LIB=-Wl,--start-group ${MKL_BLAS_DIR}/lib/libmkl_intel_ilp64.a ${MKL_BLAS_DIR}/lib/libmkl_intel_thread.a ${MKL_BLAS_DIR}/lib/libmkl_core.a ${MKL_COMP_DIR}/lib/intel64_lin/libiomp5.a -Wl,--end-group
+#MKL_BLAS_LIB+=-lpthread -lm -ldl -m64 #-L${MKL_COMP_DIR}/lib -liomp5
+#MKL_BLAS_FLAGS=-DMKL_ILP64 -m64
+
+
+MKL_BLAS_DIR=/usr/lib/x86_64-linux-gnu
+BLAS_INC=-I/usr/include/mkl
+BLAS_LIB=-Wl,--start-group ${MKL_BLAS_DIR}/libmkl_intel_ilp64.a ${MKL_BLAS_DIR}/libmkl_intel_thread.a ${MKL_BLAS_DIR}/libmkl_core.a -Wl,--end-group -liomp5 -lm -ldl -m64
+BLAS_FLAGS=-DMKL_ILP64 -m64 -lpthread -DUSE_MKLBLAS
diff --git a/openblas.mk b/openblas.mk
@@ -0,0 +1,3 @@
+BLAS_INC:=-I/usr/include/x86_64-linux-gnu/openblas64-pthread
+BLAS_LIB:=-lopenblas
+BLAS_FLAGS:=-DUSE_OPENBLAS
diff --git a/test/Makefile b/test/Makefile
@@ -1,48 +1,28 @@
 CXX :=g++
 #CXX :=clang++
 
-# -DUSE_OPENBLAS 
-# -DUSE_MLKBLAS
-CXX_FLAGS +=-Wextra -Wall -Wpedantic -O3 -std=c++20 -pthread -fopenmp -DUSE_MKLBLAS
-# -g -O0 -fprofile-arcs -ftest-coverage -fno-inline 
+CXX_FLAGS +=-Wextra -Wall -Wpedantic -O3 -std=c++20 -pthread -fopenmp
 
+ifeq ($(BLAS_FLAG), OPENBLAS)
+  include ../openblas.mk
+else ifeq ($(BLAS_FLAG), MKL)
+  include ../mkl.mk
+endif
 
-#MKL_ROOT_DIR=/opt/intel/oneapi
-#MKL_BLAS_DIR=${MKL_ROOT_DIR}/mkl/latest
-#MKL_COMP_DIR="${MKL_ROOT_DIR}/compiler/2023.2.0/linux/compiler"
-##MKL_COMP_DIR=${MKL_ROOT_DIR}/compiler/2024.0
-#MKL_BLAS_INC=-I${MKL_BLAS_DIR}/include
-#MKL_BLAS_LIB=-Wl,--start-group ${MKL_BLAS_DIR}/lib/libmkl_intel_ilp64.a ${MKL_BLAS_DIR}/lib/libmkl_intel_thread.a ${MKL_BLAS_DIR}/lib/libmkl_core.a ${MKL_COMP_DIR}/lib/intel64_lin/libiomp5.a -Wl,--end-group
-#MKL_BLAS_LIB+=-lpthread -lm -ldl -m64 #-L${MKL_COMP_DIR}/lib -liomp5
-#MKL_BLAS_FLAGS=-DMKL_ILP64 -m64
 
+include gtest.mk
 
-MKL_BLAS_DIR=/usr/lib/x86_64-linux-gnu
-MKL_BLAS_INC=-I/usr/include/mkl
-MKL_BLAS_LIB=-Wl,--start-group ${MKL_BLAS_DIR}/libmkl_intel_ilp64.a ${MKL_BLAS_DIR}/libmkl_intel_thread.a ${MKL_BLAS_DIR}/libmkl_core.a -Wl,--end-group -liomp5
-MKL_BLAS_FLAGS=-DMKL_ILP64 -m64
+TLIB_INC :=-I../include -Iinclude
 
-
-OPENBLAS_INC :=-I/usr/include/x86_64-linux-gnu/openblas64-pthread
-OPENBLAS_LIB :=-lopenblas
-
-GTEST_LIB :=-lgtest -lpthread
-
-TLIB_INC :=-I../include
-
-#$(OPENBLAS_INC) 
-#$(OPENBLAS_LIB) 
-
-INCS +=$(TLIB_INC) -Iinclude $(MKL_BLAS_INC)
-LIBS +=$(GTEST_LIB) $(MKL_BLAS_LIB) -lgomp -lpthread -lm -ldl -m64
+CXX_FLAGS += $(BLAS_FLAGS)
+INCS      += $(TLIB_INC)  $(BLAS_INC)
+LIBS      += $(GTEST_LIB) $(BLAS_LIB) -lgomp -lpthread
 
 CFILES2 = $(wildcard src/*.cpp)
 OBJS5   = $(notdir $(CFILES2))
 OBJS6   = $(patsubst %.cpp,%.o,$(OBJS5))
 OBJSC   = $(addprefix build/,$(OBJS6))
 
-CXX_FLAGS += $(MKL_BLAS_FLAGS)
-
 TARGET := bin/main
 $(TARGET): $(OBJSC)
 	$(CXX) $(CXX_FLAGS) $(OBJSC) $(LIBS) -o $(TARGET)

diff --git a/test/gtest.mk b/test/gtest.mk
@@ -0,0 +1 @@
+GTEST_LIB :=-lgtest -lpthread
diff --git a/test/src/gtest_tlib_ttm.cpp b/test/src/gtest_tlib_ttm.cpp
@@ -371,8 +371,10 @@ TEST(TensorTimesMatrix, BatchedGemmSubtensorOuterFusion)
     using slicing_policy   = tlib::slicing_policy::subtensor_t;
     using fusion_policy    = tlib::fusion_policy::outer_t;
 
+#ifdef USE_MKLBLAS
     check_tensor_times_matrix<value_type,size_type,execution_policy,slicing_policy,fusion_policy,2u>(2u,3);
     check_tensor_times_matrix<value_type,size_type,execution_policy,slicing_policy,fusion_policy,3u>(2u,3);
     check_tensor_times_matrix<value_type,size_type,execution_policy,slicing_policy,fusion_policy,4u>(2u,3);
 //    check_tensor_times_matrix<value_type,size_type,execution_policy,slicing_policy,fusion_policy,5u>(2u,3);
+#endif
 }