From a030a12798476af0fcb57427f01b14bf8f78cd4a Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 7 Apr 2022 10:51:47 +0100
Subject: [PATCH 001/318] Add ACCL as external dependency

---
 extern/CMakeLists.txt | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt
index 75025b7c..77d5e3ac 100644
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -28,7 +28,7 @@ FetchContent_Declare(
     # unfortunately they do not use releases, so the latest commit was used
     GIT_REPOSITORY  https://github.com/definelicht/hlslib.git
-    GIT_TAG v1.2.1)
+    GIT_TAG v1.4.3)
 
 FetchContent_GetProperties(extern_hlslib)
 if(NOT extern_hlslib_POPULATED)
@@ -54,3 +54,18 @@ if(NOT extern_cxxopts_POPULATED)
                     ${extern_cxxopts_BINARY_DIR} EXCLUDE_FROM_ALL)
 endif()
+
+# -------------------------------------------------------------------------------
+# ACCL Library
+FetchContent_Declare(
+    extern_accl
+
+    GIT_REPOSITORY  https://github.com/Mellich/ACCL.git
+    GIT_TAG dev)
+
+FetchContent_GetProperties(extern_accl)
+if(NOT extern_accl_POPULATED)
+    message(STATUS "Fetching mandatory build dependency ACCL")
+    FetchContent_Populate(extern_accl)
+    set(extern_accl_SOURCE_DIR ${extern_accl_SOURCE_DIR} PARENT_SCOPE)
+endif()

From 37eca9d1ae611038a2f71ba116e4c7c601049af8 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 10:28:10 +0100
Subject: [PATCH 002/318] Make base class more generic

---
 PTRANS/src/host/transpose_benchmark.hpp |  8 ++-
 b_eff/src/host/network_benchmark.hpp    |  6 ++-
 b_eff/tests/CMakeLists.txt              |  4 +-
 cmake/unitTestTargets.cmake             |  4 +-
 shared/include/hpcc_benchmark.hpp       | 27 ++++++----
 shared/include/setup/fpga_setup_xrt.hpp | 66 +++++++++++++++++++++++++
 6 files changed, 100 insertions(+), 15 deletions(-)
 create mode 100644 shared/include/setup/fpga_setup_xrt.hpp

diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp
index 5de333ca..0136c22c 100644
--- a/PTRANS/src/host/transpose_benchmark.hpp
+++ b/PTRANS/src/host/transpose_benchmark.hpp
@@ -46,8 +46,12 @@ namespace transpose {
 * @brief Implementation of the transpose benchmark
 *
 */
-class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark {
-
+class TransposeBenchmark :
+#ifndef USE_XRT_BINDINGS
+public hpcc_base::HpccFpgaBenchmark {
+#else
+// TODO initialize benchmark with XRT bindings
+#endif
 
 protected:

diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index 0fdf8064..df445649 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -213,8 +213,12 @@ class NetworkExecutionTimings {
 * @brief Implementation of the Network benchmark
 *
 */
-class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark {
-
+class NetworkBenchmark :
+#ifndef USE_XRT_BINDINGS
+public hpcc_base::HpccFpgaBenchmark {
+#else
+
+#endif
 
 protected:

diff --git a/b_eff/tests/CMakeLists.txt b/b_eff/tests/CMakeLists.txt
index 2a00ea83..be73f519 100755
--- a/b_eff/tests/CMakeLists.txt
+++ b/b_eff/tests/CMakeLists.txt
@@ -6,4 +6,6 @@ set(TEST_SOURCES test_kernel_functionality_and_host_integration.cpp)
 
 include(${CMAKE_SOURCE_DIR}/../cmake/unitTestTargets.cmake)
 
-target_link_libraries(${LIB_NAME}_intel ${MPI_LIBRARIES})
+if (INTELFPGAOPENCL_FOUND)
+    target_link_libraries(${LIB_NAME}_intel ${MPI_LIBRARIES})
+endif()

diff --git a/cmake/unitTestTargets.cmake b/cmake/unitTestTargets.cmake
index 2597017b..0f36d3da 100644
--- a/cmake/unitTestTargets.cmake
+++ b/cmake/unitTestTargets.cmake
@@ -24,7 +24,9 @@ if (Vitis_FOUND)
     add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES})
     target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}")
     target_link_libraries(${HOST_EXE_NAME}_test_xilinx hpcc_fpga_base_test)
-    add_dependencies(${HOST_EXE_NAME}_test_xilinx ${kernel_emulation_targets_xilinx})
+    if (NOT "${kernel_emulation_targets_xilinx}" STREQUAL "")
+        add_dependencies(${HOST_EXE_NAME}_test_xilinx "${kernel_emulation_targets_xilinx}")
+    endif()
     target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA)
     target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")
     foreach (kernel_target ${kernel_emulation_targets_xilinx})

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 17e17bb9..e85579d2 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -35,6 +35,9 @@ SOFTWARE.
 #endif
 
 /* Project's headers */
+#ifdef USE_XRT_BINDINGS
+#include "setup/fpga_setup_xrt.hpp"
+#endif
 #include "setup/fpga_setup.hpp"
 #include "cxxopts.hpp"
 #include "parameters.h"
@@ -176,7 +179,7 @@ class BaseSettings {
 *
 * @tparam TSettings The program settings class that should be used (Must derive from BaseSettings)
 */
-template
+template
 class ExecutionSettings {
 public:
@@ -190,19 +193,19 @@ class ExecutionSettings {
     * @brief The OpenCL device that should be used for execution
     *
     */
-    std::unique_ptr device;
+    std::unique_ptr device;
 
     /**
     * @brief The OpenCL context that should be used for execution
     *
     */
-    std::unique_ptr context;
+    std::unique_ptr context;
 
     /**
     * @brief The OpenCL program that contains the benchmark kernel
     *
     */
-    std::unique_ptr program;
+    std::unique_ptr program;
 
     /**
     * @brief Construct a new Execution Settings object
@@ -238,7 +241,7 @@
 * @tparam TData Class used to represent the benchmark input and output data
 * @tparam TOutput Class representing the measurements like timings etc
 */
-template
+template
 class HpccFpgaBenchmark {
 
 private:
@@ -258,7 +261,7 @@ class HpccFpgaBenchmark {
     * It should be also used by all other methods to read the current benchmark settings.
     *
     */
-    std::unique_ptr> executionSettings;
+    std::unique_ptr> executionSettings;
 
     /**
     * @brief Add additional options to the program parameter parser
@@ -472,20 +475,24 @@ class HpccFpgaBenchmark {
         std::unique_ptr programSettings = parseProgramParameters(tmp_argc, tmp_argv);
 
-        std::unique_ptr context;
-        std::unique_ptr program;
-        std::unique_ptr usedDevice;
+        std::unique_ptr context;
+        std::unique_ptr program;
+        std::unique_ptr usedDevice;
 
         if (!programSettings->testOnly) {
+#ifndef USE_XRT_BINDINGS
             usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform,
                                                       programSettings->defaultDevice);
 
             context = std::unique_ptr(new cl::Context(*usedDevice));
             program = fpga_setup::fpgaSetup(context.get(), {*usedDevice},
                                             &programSettings->kernelFileName);
+#else
+            // TODO: Select XRT device and program here!
+#endif
         }
 
-        executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice),
+        executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice),
                                                                     std::move(context), std::move(program)));
         if (mpi_comm_rank == 0) {
             if (!checkInputParameters()) {

diff --git a/shared/include/setup/fpga_setup_xrt.hpp b/shared/include/setup/fpga_setup_xrt.hpp
new file mode 100644
index 00000000..73f5c56f
--- /dev/null
+++ b/shared/include/setup/fpga_setup_xrt.hpp
@@ -0,0 +1,66 @@
+/*
+Copyright (c) 2022 Marius Meyer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+#ifndef SRC_HOST_FPGA_SETUP_XRT_H_
+#define SRC_HOST_FPGA_SETUP_XRT_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* External libraries */
+#include "xrt/xrt_device.h"
+
+
+namespace fpga_setup {
+
+/**
+Sets up the given FPGA with the kernel in the provided file.
+
+@param device The device used for the program
+@param usedKernelFile The path to the kernel file
+@return The program that is used to create the benchmark kernels
+*/
+    std::unique_ptr
+    fpgaSetupXRT(xrt::device &device,
+                 const std::string *usedKernelFile);
+
+
+/**
+Searches and selects an FPGA device using the CL library functions.
+If multiple platforms or devices are given, the user will be prompted to
+choose a device.
+
+@param defaultDevice The index of the device that has to be used. If a
+                     value < 0 is given, the device can be chosen
+                     interactively
+
+@return the selected device
+*/
+    std::unique_ptr
+    selectFPGADeviceXRT(int defaultDevice);
+
+} // namespace fpga_setup
+#endif // SRC_HOST_FPGA_SETUP_XRT_H_
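Patch 002 introduces the XRT-specific setup interface that the now-generic base class will use. A minimal sketch of how a host could combine the two declarations from fpga_setup_xrt.hpp (the return types were lost to the angle-bracket stripping above; this sketch assumes std::unique_ptr<xrt::device> for the device and a program handle for the setup call, and the xclbin file name is made up):

#include <memory>
#include <string>

#include "setup/fpga_setup_xrt.hpp"
#include "xrt/xrt_device.h"

int main() {
    // Pick the first XRT device; a value < 0 would trigger interactive selection.
    std::unique_ptr<xrt::device> device = fpga_setup::selectFPGADeviceXRT(0);

    // Program the FPGA with the given xclbin before any kernels are created.
    std::string kernelFile = "communication_PCIE.xclbin"; // illustrative name
    auto program = fpga_setup::fpgaSetupXRT(*device, &kernelFile);
    return program != nullptr ? 0 : 1;
}
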
#include "execution_types/execution_cpu.hpp" #include "execution_types/execution_pcie.hpp" -#include "execution_types/execution_iec.hpp" \ No newline at end of file +#ifdef INTEL_FPGA +#include "execution_types/execution_iec.hpp" +#endif diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp index 73156b7e..9e266cf8 100644 --- a/b_eff/src/host/execution_types/execution_pcie.hpp +++ b/b_eff/src/host/execution_types/execution_pcie.hpp @@ -45,6 +45,9 @@ namespace network::execution_types::pcie { int err; std::vector sendQueues; std::vector dummyBuffers; +#ifdef XILINX_FPGA + std::vector accesskernel; +#endif std::vector> dummyBufferContents; cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); @@ -59,6 +62,9 @@ namespace network::execution_types::pcie { for (uint r =0; r < config.programSettings->numRepetitions; r++) { sendQueues.clear(); dummyBuffers.clear(); +#ifdef XILINX_FPGA + accesskernel.clear(); +#endif dummyBufferContents.clear(); // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { @@ -66,6 +72,23 @@ namespace network::execution_types::pcie { dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); ASSERT_CL(err) +#ifdef XILINX_FPGA + accesskernel.push_back(cl::Kernel(*config.program, + ("accessMemory_0:{accessMemory_0_" + std::to_string(r + 1) + "}").c_str(), &err)); + + err = accesskernel[r].setArg(0, dummyBuffers[r]); + ASSERT_CL(err); + err = accesskernel[r].setArg(1, dummyBuffers[r]); + ASSERT_CL(err); + err = accesskernel[r].setArg(2, static_cast(0)); + ASSERT_CL(err); + err = accesskernel[r].setArg(3, static_cast(0)); + ASSERT_CL(err); + err = accesskernel[r].setArg(4,(1)); + ASSERT_CL(err); + err = accesskernel[r].setArg(5, cl_uint(0)); + ASSERT_CL(err); +#endif dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 7bf728a2..e2b8b830 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -109,8 +109,10 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { switch (executionSettings->programSettings->communicationType) { case hpcc_base::CommunicationType::cpu_only: timing = execution_types::cpu::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; case hpcc_base::CommunicationType::pcie_mpi: timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; - case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; - default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType)); +#ifdef INTEL_FPGA + case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; +#endif + default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType)); 
} timing_results.push_back(timing); } diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index df445649..b6d348d0 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -31,6 +31,33 @@ SOFTWARE. #include "hpcc_benchmark.hpp" #include "parameters.h" +#ifdef XILINX_FPGA +template +struct aligned_allocator { + + // typedefs + typedef T value_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + + pointer allocate(size_t pCount, const_pointer = 0){ + T* mem = 0; + if (posix_memalign(reinterpret_cast(&mem), 1024 , sizeof(T) * pCount) != 0) { + throw std::bad_alloc(); + } + return mem; + } + + void deallocate(pointer pPtr, size_t pCount) { + free(pPtr); + } +}; + +namespace cl { + template using vector = std::vector>; +} +#endif + /** * @brief Contains all classes and methods needed by the Network benchmark * From b96935a9a41a3f75e2ca190e44e7d819b166aa3d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 8 Apr 2022 13:46:26 +0100 Subject: [PATCH 004/318] Fix calculate functions for new templating --- .../host/execution_types/execution_cpu.hpp | 22 +++---------------- .../host/execution_types/execution_pcie.hpp | 3 ++- 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp index 778dc2f1..f70cedf1 100644 --- a/b_eff/src/host/execution_types/execution_cpu.hpp +++ b/b_eff/src/host/execution_types/execution_cpu.hpp @@ -38,13 +38,12 @@ namespace network::execution_types::cpu { Implementation for the single kernel. @copydoc bm_execution::calculate() */ + template std::shared_ptr - calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { int err; - std::vector sendQueues; - std::vector dummyBuffers; std::vector> dummyBufferContents; cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); @@ -57,24 +56,10 @@ namespace network::execution_types::cpu { std::vector calculationTimings; for (uint r =0; r < config.programSettings->numRepetitions; r++) { - sendQueues.clear(); - dummyBuffers.clear(); dummyBufferContents.clear(); // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { - - dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); - ASSERT_CL(err) - dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); - - cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); - ASSERT_CL(err) - - sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data()); - - sendQueues.push_back(sendQueue); - } double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { @@ -102,8 +87,7 @@ namespace network::execution_types::cpu { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! 
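The aligned_allocator added to network_benchmark.hpp exists so host buffers meet XRT's alignment expectations; its template syntax was stripped above along with the other angle brackets. A self-contained version with the parameters spelled out, plus a quick alignment check (posix_memalign is POSIX; the 1024-byte constant mirrors the patch):

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <new>
#include <vector>

// Same idea as the allocator above, template syntax restored for illustration.
template <typename T>
struct aligned_allocator {
    typedef T value_type;

    T* allocate(std::size_t count) {
        void* mem = nullptr;
        // 1024-byte alignment so the runtime can DMA the buffer without copies.
        if (posix_memalign(&mem, 1024, sizeof(T) * count) != 0) {
            throw std::bad_alloc();
        }
        return static_cast<T*>(mem);
    }

    void deallocate(T* ptr, std::size_t) { free(ptr); }

    bool operator==(const aligned_allocator&) const { return true; }
    bool operator!=(const aligned_allocator&) const { return false; }
};

int main() {
    std::vector<char, aligned_allocator<char>> buffer(1 << 20);
    std::cout << "aligned: "
              << (reinterpret_cast<std::uintptr_t>(buffer.data()) % 1024 == 0)
              << std::endl;
}
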
From b96935a9a41a3f75e2ca190e44e7d819b166aa3d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 13:46:26 +0100
Subject: [PATCH 004/318] Fix calculate functions for new templating

---
 .../host/execution_types/execution_cpu.hpp   | 22 +++----------------
 .../host/execution_types/execution_pcie.hpp  |  3 ++-
 2 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp
index 778dc2f1..f70cedf1 100644
--- a/b_eff/src/host/execution_types/execution_cpu.hpp
+++ b/b_eff/src/host/execution_types/execution_cpu.hpp
@@ -38,13 +38,12 @@ namespace network::execution_types::cpu {
 Implementation for the single kernel.
 @copydoc bm_execution::calculate()
 */
+    template
    std::shared_ptr
-    calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength,
+    calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength,
              cl::vector &validationData) {
 
        int err;
-        std::vector sendQueues;
-        std::vector dummyBuffers;
        std::vector> dummyBufferContents;
 
        cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize));
@@ -57,24 +56,10 @@ namespace network::execution_types::cpu {
        std::vector calculationTimings;
        for (uint r =0; r < config.programSettings->numRepetitions; r++) {
-            sendQueues.clear();
-            dummyBuffers.clear();
            dummyBufferContents.clear();
            // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels
            for (int r = 0; r < config.programSettings->kernelReplications; r++) {
-
-                dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err));
-                ASSERT_CL(err)
                dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255)));
-
-                cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err);
-                ASSERT_CL(err)
-
-                sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data());
-
-                sendQueues.push_back(sendQueue);
-
            }
            double calculationTime = 0.0;
            for (int i = 0; i < config.programSettings->kernelReplications; i++) {
@@ -102,8 +87,7 @@ namespace network::execution_types::cpu {
        // Read validation data from FPGA will be placed sequentially in buffer for all replications
        // The data order should not matter, because every byte should have the same value!
        for (int r = 0; r < config.programSettings->kernelReplications; r++) {
-            err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]);
-            ASSERT_CL(err);
+            std::copy(dummyBufferContents[r].begin(), dummyBufferContents[r].end(),validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r);
        }
        std::shared_ptr result(new network::ExecutionTimings{
                looplength,

diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp
index 9e266cf8..274e1c1d 100644
--- a/b_eff/src/host/execution_types/execution_pcie.hpp
+++ b/b_eff/src/host/execution_types/execution_pcie.hpp
@@ -38,8 +38,9 @@ namespace network::execution_types::pcie {
 Implementation for the single kernel.
 @copydoc bm_execution::calculate()
 */
+    template
    std::shared_ptr
-    calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength,
+    calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength,
              cl::vector &validationData) {
 
    int err;

From 7bd92e5d856b5c0488eff9e3bf9ca98111516af4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 14:05:45 +0100
Subject: [PATCH 005/318] Add ACCL to b_eff host build

---
 b_eff/CMakeLists.txt          | 2 ++
 b_eff/src/host/CMakeLists.txt | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt
index 13d93b1b..4b4fbb41 100755
--- a/b_eff/CMakeLists.txt
+++ b/b_eff/CMakeLists.txt
@@ -24,3 +24,5 @@ include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake)
 unset(DATA_TYPE CACHE)
 
 find_package(MPI REQUIRED)
+include(${extern_accl_SOURCE_DIR}/driver/xrt/CMakeLists.txt)
+

diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt
index 9809208c..b8c44859 100755
--- a/b_eff/src/host/CMakeLists.txt
+++ b/b_eff/src/host/CMakeLists.txt
@@ -24,6 +24,8 @@ if (Vitis_FOUND)
     add_executable(${HOST_EXE_NAME}_xilinx main.cpp)
     target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}")
     target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base)
+    target_link_libraries(${LIB_NAME}_xilinx accl)
+    target_include_directories(${LIB_NAME}_xilinx PRIVATE ${ACCL_INCLUDE_PATH})
     target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx)
     target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA)
     target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")
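Patch 004 makes the calculate() functions templates over the device, context, and program types, and the CPU path now fills the validation buffer with a plain std::copy instead of an OpenCL read: each kernel replication owns one equal slice of validationData. A small illustration of that slicing (sizes and names invented for the example):

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

int main() {
    const int replications = 2; // stand-in for kernelReplications
    std::vector<std::vector<char>> bufferContents(replications,
                                                  std::vector<char>(8, 0));
    bufferContents[0].assign(8, 'a');
    bufferContents[1].assign(8, 'b');

    // validationData receives one equal slice per replication, in order.
    std::vector<char> validationData(replications * 8);
    const std::size_t slice = validationData.size() / replications;
    for (int r = 0; r < replications; r++) {
        std::copy(bufferContents[r].begin(), bufferContents[r].begin() + slice,
                  validationData.begin() + slice * r);
    }
    std::cout << std::string(validationData.begin(), validationData.end())
              << std::endl; // prints aaaaaaaabbbbbbbb
}
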
From bccab2d9a01f9afad023ef0ec7cb8175e7d72086 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 14:16:54 +0100
Subject: [PATCH 006/318] Temporarily remove OCL device setup to work with
 ACCL emulator

---
 b_eff/src/host/network_benchmark.hpp |  3 +++
 shared/include/hpcc_benchmark.hpp    | 11 +++++++----
 shared/setup/fpga_setup.cpp          |  8 ++++++--
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index b6d348d0..232bfd56 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -31,6 +31,8 @@ SOFTWARE.
 #include "hpcc_benchmark.hpp"
 #include "parameters.h"
 
+// TODO: remove this custom allocator since cl2.hpp is available here?
+#if 0
 #ifdef XILINX_FPGA
 template
 struct aligned_allocator {
@@ -57,6 +59,7 @@ namespace cl {
     template using vector = std::vector>;
 }
 #endif
+#endif

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index e85579d2..f4f7c080 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -480,6 +480,8 @@ class HpccFpgaBenchmark {
         std::unique_ptr usedDevice;
 
         if (!programSettings->testOnly) {
+// TODO: This is temporarily excluded to only use the ACCL emulator!
+#if 0
 #ifndef USE_XRT_BINDINGS
             usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform,
                                                       programSettings->defaultDevice);
@@ -490,6 +492,7 @@ class HpccFpgaBenchmark {
 #else
             // TODO: Select XRT device and program here!
 #endif
+#endif
         }
@@ -609,7 +612,7 @@ class HpccFpgaBenchmark {
     *
     * @return ExecutionSettings& The execution settings object
     */
-    ExecutionSettings& getExecutionSettings() {
+    ExecutionSettings& getExecutionSettings() {
         return *executionSettings;
     }
@@ -664,12 +667,12 @@
 * @param printedExecutionSettings The execution settings that have to be printed to the stream
 * @return std::ostream& The output stream after the execution settings are piped in
 */
-template
-std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){
+template
+std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){
     std::string device_name;
     os << std::left;
     if (!printedExecutionSettings.programSettings->testOnly) {
-        printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name);
+//        printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name);
     }
     else {
         device_name = "TEST RUN: Not selected!";

diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp
index dd1ddd28..e923545a 100644
--- a/shared/setup/fpga_setup.cpp
+++ b/shared/setup/fpga_setup.cpp
@@ -135,7 +135,8 @@ Sets up the given FPGA with the kernel in the provided file.
 #ifdef _USE_MPI_
     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 #endif
-
+// TODO: This is temporarily excluded to work with ACCL emulator without emulation bitstream!
+#if 0
     if (world_rank == 0) {
         std::cout << HLINE;
         std::cout << "FPGA Setup:" << usedKernelFile->c_str() << std::endl;
@@ -176,6 +177,9 @@
         std::cout << HLINE;
     }
     return std::unique_ptr(new cl::Program(program));
+#else
+    return std::unique_ptr(nullptr);
+#endif
 }

 /**
@@ -322,4 +326,4 @@ choose a device.
     return std::unique_ptr(new cl::Device(deviceList[chosenDeviceId]));
 }
 
-} // namespace fpga_setup
\ No newline at end of file
+} // namespace fpga_setup

From 16c5b8af003eda0765b238225eded858d50eebb0 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 17:17:50 +0100
Subject: [PATCH 007/318] First minimal version working with Simulator

---
 b_eff/src/host/execution_types/execution.hpp |   1 +
 .../host/execution_types/execution_accl.hpp  | 130 ++++++++++++++++++
 b_eff/src/host/network_benchmark.cpp         |   1 +
 shared/include/communication_types.hpp       |  10 +-
 4 files changed, 140 insertions(+), 2 deletions(-)
 create mode 100644 b_eff/src/host/execution_types/execution_accl.hpp

diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp
index 22a8a12e..c36459a4 100644
--- a/b_eff/src/host/execution_types/execution.hpp
+++ b/b_eff/src/host/execution_types/execution.hpp
@@ -25,3 +25,4 @@ SOFTWARE.
 #ifdef INTEL_FPGA
 #include "execution_types/execution_iec.hpp"
 #endif
+#include "execution_types/execution_accl.hpp"

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
new file mode 100644
index 00000000..67d8d03a
--- /dev/null
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -0,0 +1,130 @@
+/*
+Copyright (c) 2022 Marius Meyer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_HPP
+#define SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_HPP
+
+/* C++ standard library headers */
+#include
+#include
+#include
+
+/* External library headers */
+#include "mpi.h"
+#include "accl.hpp"
+
+/* Project's headers */
+
+namespace network::execution_types::accl {
+
+    /*
+    Implementation for the single kernel.
+    @copydoc bm_execution::calculate()
+    */
+    template
+    std::shared_ptr
+    calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength,
+              cl::vector &validationData) {
+
+        int err;
+        std::vector> dummyBufferContents;
+        std::vector> recvBufferContents;
+        std::vector>> acclSendBuffers;
+        std::vector>> acclRecvBuffers;
+        cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize));
+
+        int current_rank;
+        MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
+
+        int current_size;
+        MPI_Comm_size(MPI_COMM_WORLD, & current_size);
+
+        std::cout << "Setup ACCL..." << std::endl;
+
+        std::vector ranks = {};
+        for (int i = 0; i < current_size; ++i) {
+            ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i,
+                                     1024};
+            ranks.emplace_back(new_rank);
+        }
+        // TODO: Add start port here. Currently hardcoded!
+        ACCL::ACCL accl(ranks, current_rank,
+                        "tcp://localhost:" +
+                        std::to_string(5500 + current_rank));
+        std::cout << "Start sending..." << std::endl;
+        std::vector calculationTimings;
+        for (uint r =0; r < config.programSettings->numRepetitions; r++) {
+            dummyBufferContents.clear();
+            recvBufferContents.clear();
+            acclSendBuffers.clear();
+            acclRecvBuffers.clear();
+            // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels
+            for (int r = 0; r < config.programSettings->kernelReplications; r++) {
+                dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255)));
+                recvBufferContents.emplace_back(size_in_bytes, static_cast(0));
+                acclSendBuffers.push_back(accl.create_buffer(dummyBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclRecvBuffers.push_back(accl.create_buffer(recvBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclSendBuffers.back()->sync_to_device();
+                acclRecvBuffers.back()->sync_to_device();
+            }
+            std::cout << "Buffers prepared" << std::endl;
+            double calculationTime = 0.0;
+            for (int i = 0; i < config.programSettings->kernelReplications; i++) {
+                MPI_Barrier(MPI_COMM_WORLD);
+                auto startCalculation = std::chrono::high_resolution_clock::now();
+                for (int l = 0; l < looplength; l++) {
+                    std::cout << "Send from " << current_rank << " to " << (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size << std::endl;
+                    accl.send(0, *acclSendBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                    accl.recv(0, *acclRecvBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+//                    MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0,
+//                                 dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                }
+                auto endCalculation = std::chrono::high_resolution_clock::now();
+                calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count();
+                #ifndef NDEBUG
+                int current_rank;
+                MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
+                std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl;
+                #endif
+            }
+            calculationTimings.push_back(calculationTime);
+#ifndef NDEBUG
+            int current_rank;
+            MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
+            std::cout << "Rank " << current_rank << ": Done " << r << std::endl;
+#endif
+        }
+        // Read validation data from FPGA will be placed sequentially in buffer for all replications
+        // The data order should not matter, because every byte should have the same value!
+        for (int r = 0; r < config.programSettings->kernelReplications; r++) {
+            std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(),validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r);
+        }
+        std::shared_ptr result(new network::ExecutionTimings{
+                looplength,
+                messageSize,
+                calculationTimings
+        });
+        return result;
+    }
+
+} // namespace bm_execution
+
+#endif

diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp
index e2b8b830..40332b85 100644
--- a/b_eff/src/host/network_benchmark.cpp
+++ b/b_eff/src/host/network_benchmark.cpp
@@ -112,6 +112,7 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) {
 #ifdef INTEL_FPGA
             case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
 #endif
+            case hpcc_base::CommunicationType::accl: timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
             default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType));
         }
         timing_results.push_back(timing);
     }

diff --git a/shared/include/communication_types.hpp b/shared/include/communication_types.hpp
index bb46bb8d..1d74aa6f 100644
--- a/shared/include/communication_types.hpp
+++ b/shared/include/communication_types.hpp
@@ -52,6 +52,11 @@ typedef enum _CommunicationType {
     */
    smi,
 
+    /**
+     * @brief Communication using ACCL
+     */
+    accl,
+
    /**
     * @brief Calculate the benchmark on CPU instead of FPGA
     *
@@ -75,7 +80,8 @@
 static const std::map comm_to_str_map{
    {"IEC", CommunicationType::intel_external_channels},
    {"PCIE", CommunicationType::pcie_mpi},
-    {"SMI", CommunicationType::smi},
+    {"SMI", CommunicationType::smi},
+    {"ACCL", CommunicationType::accl},
    {"CPU", CommunicationType::cpu_only},
    {"UNSUPPORTED", CommunicationType::unsupported},
    {"AUTO", CommunicationType::automatic}
@@ -121,4 +127,4 @@ static CommunicationType retrieveCommunicationType(std::string comm_name, std::s
    }
 }
 
-#endif
\ No newline at end of file
+#endif
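The partner expression used in the send/recv calls above pairs each rank with one of its ring neighbours and flips the pairing with the replication index i, so partners are always mutual and every send has a matching receive. A tiny program that prints the resulting pairs (world size chosen for the example):

#include <iostream>

int main() {
    const int size = 4; // illustrative world size
    for (int i = 0; i < 2; i++) {
        for (int rank = 0; rank < size; rank++) {
            // Same formula as in the benchmark: even/odd ranks alternate
            // between their lower and upper ring neighbour per replication.
            int partner = (rank - 1 + 2 * ((rank + i) % 2) + size) % size;
            std::cout << "i=" << i << ": " << rank << " <-> " << partner << "\n";
        }
    }
    // i=0 pairs (0,3) and (1,2); i=1 pairs (0,1) and (2,3).
}
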
From 137e27b84fd2ee7d35e077305280d02956298302 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 17:41:28 +0100
Subject: [PATCH 008/318] Move ACCL setup to fpga setup

---
 b_eff/src/host/network_benchmark.hpp          |  4 +-
 cmake/general_benchmark_build_setup.cmake     |  4 ++
 shared/CMakeLists.txt                         |  4 ++
 shared/include/hpcc_benchmark.hpp             | 18 +++----
 ...fpga_setup_xrt.hpp => fpga_setup_accl.hpp} |  7 +--
 shared/setup/fpga_setup.cpp                   |  5 --
 shared/setup/fpga_setup_accl.cpp              | 50 +++++++++++++++++++
 7 files changed, 72 insertions(+), 20 deletions(-)
 rename shared/include/setup/{fpga_setup_xrt.hpp => fpga_setup_accl.hpp} (93%)
 create mode 100644 shared/setup/fpga_setup_accl.cpp

diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index 232bfd56..89ff9fe0 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -244,10 +244,10 @@ class NetworkExecutionTimings {
 *
 */
 class NetworkBenchmark :
-#ifndef USE_XRT_BINDINGS
+#ifndef USE_ACCL
 public hpcc_base::HpccFpgaBenchmark {
 #else
-
+public hpcc_base::HpccFpgaBenchmark {
 #endif
 
 protected:

diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake
index 64aa8d0a..66153e5f 100644
--- a/cmake/general_benchmark_build_setup.cmake
+++ b/cmake/general_benchmark_build_setup.cmake
@@ -30,6 +30,7 @@ set(USE_OPENMP ${USE_OPENMP} CACHE BOOL "Use OpenMP in the host code")
 set(USE_MPI ${USE_MPI} CACHE BOOL "Compile the host code with MPI support. This has to be supported by the host code.")
 set(USE_SVM No CACHE BOOL "Use SVM pointers instead of creating buffers on the board and transferring the data there before execution.")
 set(USE_HBM No CACHE BOOL "Use host code specific to HBM FPGAs")
+set(USE_ACCL No CACHE BOOL "Use ACCL for communication")
 set(USE_CUSTOM_KERNEL_TARGETS No CACHE BOOL "Enable build targets for custom kernels")
 set(USE_DEPRECATED_HPP_HEADER ${header_default} CACHE BOOL "Flag that indicates if the old C++ wrapper header should be used (cl.hpp) or the newer version (cl2.hpp or opencl.hpp)")
 set(HPCC_FPGA_CONFIG ${HPCC_FPGA_CONFIG} CACHE FILEPATH "Configuration file that is used to overwrite the default configuration")
@@ -86,6 +87,9 @@ if (USE_MPI)
     include_directories(${MPI_CXX_INCLUDE_PATH})
     link_libraries(${MPI_LIBRARIES})
 endif()
+if (USE_ACCL)
+    add_definitions(-DUSE_ACCL)
+endif()
 
 # Add configuration time to build
 string(TIMESTAMP CONFIG_TIME "%a %b %d %H:%M:%S UTC %Y" UTC)

diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt
index 89a18117..a7e8390b 100644
--- a/shared/CMakeLists.txt
+++ b/shared/CMakeLists.txt
@@ -2,6 +2,10 @@ project(HPCCBaseLibrary VERSION 1.0.1)
 
 add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp)
 
+if (defined USE_ACCL)
+add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp)
+endif()
+
 find_package(OpenCL QUIET)
 
 if (INTELFPGAOPENCL_FOUND)

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index f4f7c080..dd9b022a 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -35,8 +35,8 @@ SOFTWARE.
 #endif
 
 /* Project's headers */
-#ifdef USE_XRT_BINDINGS
-#include "setup/fpga_setup_xrt.hpp"
+#ifdef USE_ACCL
+#include "setup/fpga_setup_accl.hpp"
 #endif
 #include "setup/fpga_setup.hpp"
 #include "cxxopts.hpp"
 #include "parameters.h"
@@ -480,9 +480,7 @@ class HpccFpgaBenchmark {
         std::unique_ptr usedDevice;
 
         if (!programSettings->testOnly) {
-// TODO: This is temporarily excluded to only use the ACCL emulator!
-#if 0
-#ifndef USE_XRT_BINDINGS
+#ifndef USE_ACCL
             usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform,
                                                       programSettings->defaultDevice);
@@ -490,9 +488,9 @@ class HpccFpgaBenchmark {
             program = fpga_setup::fpgaSetup(context.get(), {*usedDevice},
                                             &programSettings->kernelFileName);
 #else
-            // TODO: Select XRT device and program here!
+            program = fpga_setup::fpgaSetupACCL(*usedDevice,
+                                                &programSettings->kernelFileName);
 #endif
-#endif
         }

diff --git a/shared/include/setup/fpga_setup_xrt.hpp b/shared/include/setup/fpga_setup_accl.hpp
similarity index 93%
rename from shared/include/setup/fpga_setup_xrt.hpp
rename to shared/include/setup/fpga_setup_accl.hpp
index 73f5c56f..cfc1abe4 100644
--- a/shared/include/setup/fpga_setup_xrt.hpp
+++ b/shared/include/setup/fpga_setup_accl.hpp
@@ -32,6 +32,7 @@ SOFTWARE.
 
 /* External libraries */
 #include "xrt/xrt_device.h"
+#include "accl.hpp"
 
 
 namespace fpga_setup {
@@ -41,10 +42,10 @@ Sets up the given FPGA with the kernel in the provided file.
 
 @param device The device used for the program
 @param usedKernelFile The path to the kernel file
-@return The program that is used to create the benchmark kernels
+@return The ACCL instance used for communication
*/
-    std::unique_ptr
-    fpgaSetupXRT(xrt::device &device,
+    std::unique_ptr
+    fpgaSetupACCL(xrt::device &device,
                  const std::string *usedKernelFile);

diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp
index e923545a..aba9b8b2 100644
--- a/shared/setup/fpga_setup.cpp
+++ b/shared/setup/fpga_setup.cpp
@@ -135,8 +135,6 @@ Sets up the given FPGA with the kernel in the provided file.
 #ifdef _USE_MPI_
     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 #endif
-// TODO: This is temporarily excluded to work with ACCL emulator without emulation bitstream!
-#if 0
     if (world_rank == 0) {
         std::cout << HLINE;
         std::cout << "FPGA Setup:" << usedKernelFile->c_str() << std::endl;
@@ -177,9 +175,6 @@
         std::cout << HLINE;
     }
     return std::unique_ptr(new cl::Program(program));
-#else
-    return std::unique_ptr(nullptr);
-#endif
 }

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
new file mode 100644
index 00000000..e0cf3723
--- /dev/null
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -0,0 +1,50 @@
+//
+// Created by Marius Meyer on 04.12.19.
+//
+
+#include "setup/fpga_setup_accl.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* External libraries */
+#include "parameters.h"
+
+#ifdef _USE_MPI_
+#include "mpi.h"
+#endif
+
+namespace fpga_setup {
+
+    std::unique_ptr
+    fpgaSetup(xrt::device &context,
+              const std::string *usedKernelFile) {
+        int current_rank;
+        MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
+
+        int current_size;
+        MPI_Comm_size(MPI_COMM_WORLD, & current_size);
+
+        std::vector ranks = {};
+        for (int i = 0; i < current_size; ++i) {
+            ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i,
+                                     1024};
+            ranks.emplace_back(new_rank);
+        }
+        // TODO: Add start port here. Currently hardcoded!
+        return std::unique_ptr(new ACCL::ACCL(ranks, current_rank,
+                                              "tcp://localhost:" +
+                                              std::to_string(5500 + current_rank)));
+    }
+
+
+    std::unique_ptr
+    selectFPGADevice(int defaultDevice) {
+        return std::unique_ptr(nullptr);
+    }
+
+} // namespace fpga_setup
\ No newline at end of file

From 364db3c1bfde3ffabc0bd78fc521d4b1207b247d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 17:44:08 +0100
Subject: [PATCH 009/318] Use global ACCL instance

---
 .../host/execution_types/execution_accl.hpp | 26 ++++---------------
 1 file changed, 5 insertions(+), 21 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
index 67d8d03a..d4822e3d 100644
--- a/b_eff/src/host/execution_types/execution_accl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -57,19 +57,6 @@ namespace network::execution_types::accl {
         int current_size;
         MPI_Comm_size(MPI_COMM_WORLD, & current_size);
 
-        std::cout << "Setup ACCL..." << std::endl;
-
-        std::vector ranks = {};
-        for (int i = 0; i < current_size; ++i) {
-            ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i,
-                                     1024};
-            ranks.emplace_back(new_rank);
-        }
-        // TODO: Add start port here. Currently hardcoded!
-        ACCL::ACCL accl(ranks, current_rank,
-                        "tcp://localhost:" +
-                        std::to_string(5500 + current_rank));
-        std::cout << "Start sending..." << std::endl;
         std::vector calculationTimings;
         for (uint r =0; r < config.programSettings->numRepetitions; r++) {
@@ -80,8 +67,8 @@ namespace network::execution_types::accl {
             for (int r = 0; r < config.programSettings->kernelReplications; r++) {
                 dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255)));
                 recvBufferContents.emplace_back(size_in_bytes, static_cast(0));
-                acclSendBuffers.push_back(accl.create_buffer(dummyBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
-                acclRecvBuffers.push_back(accl.create_buffer(recvBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclSendBuffers.push_back(config.program->create_buffer(dummyBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclRecvBuffers.push_back(config.program->create_buffer(recvBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
                 acclSendBuffers.back()->sync_to_device();
                 acclRecvBuffers.back()->sync_to_device();
             }
@@ -90,17 +77,10 @@ namespace network::execution_types::accl {
-            std::cout << "Buffers prepared" << std::endl;
 
             double calculationTime = 0.0;
             for (int i = 0; i < config.programSettings->kernelReplications; i++) {
                 MPI_Barrier(MPI_COMM_WORLD);
                 auto startCalculation = std::chrono::high_resolution_clock::now();
                 for (int l = 0; l < looplength; l++) {
-                    std::cout << "Send from " << current_rank << " to " << (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size << std::endl;
-                    accl.send(0, *acclSendBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
-                    accl.recv(0, *acclRecvBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
-//                    MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0,
-//                                 dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    config.program->send(0, *acclSendBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                    config.program->recv(0, *acclRecvBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
                 }
                 auto endCalculation = std::chrono::high_resolution_clock::now();
                 calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count();
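In the emulator setup that patch 008 moves into fpga_setup_accl.cpp, every rank is a local process reachable on a localhost port counted up from 5500. A self-contained sketch of that rank-table construction (the struct is a stand-in for ACCL::rank_t; its field names are assumptions inferred from the initializer order used above):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for ACCL::rank_t with the four fields used in the patch above:
// IP address, port, rank/session id, and a buffer size.
struct rank_t {
    std::string ip;
    int port;
    int rank_id;
    uint64_t buffer_size;
};

int main() {
    const int world_size = 4; // normally taken from MPI_Comm_size
    std::vector<rank_t> ranks;
    for (int i = 0; i < world_size; ++i) {
        // One emulator endpoint per rank, ports counted up from 5500.
        ranks.push_back({"127.0.0.1", 5500 + i, i, 1024});
    }
    for (const auto &r : ranks) {
        std::cout << r.ip << ":" << r.port << " (rank " << r.rank_id << ")\n";
    }
}
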
From 62afe38615dfaa7c9badf4fcd4de3706b6350a04 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 18:59:23 +0100
Subject: [PATCH 010/318] Complete move ACCL setup to FPGA setup

---
 b_eff/src/host/execution_types/execution.hpp     |  2 ++
 .../src/host/execution_types/execution_accl.hpp  |  4 ++--
 b_eff/src/host/network_benchmark.cpp             |  4 +++-
 b_eff/src/host/network_benchmark.hpp             |  2 +-
 shared/CMakeLists.txt                            |  9 ++++++---
 shared/include/hpcc_benchmark.hpp                | 16 +++++++++-------
 shared/setup/fpga_setup_accl.cpp                 |  4 ++--
 7 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp
index c36459a4..33d3b0a6 100644
--- a/b_eff/src/host/execution_types/execution.hpp
+++ b/b_eff/src/host/execution_types/execution.hpp
@@ -21,8 +21,10 @@ SOFTWARE.
 */
 #include "execution_types/execution_cpu.hpp"
+#ifndef USE_ACCL
 #include "execution_types/execution_pcie.hpp"
 #ifdef INTEL_FPGA
 #include "execution_types/execution_iec.hpp"
 #endif
+#endif
 #include "execution_types/execution_accl.hpp"

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
index d4822e3d..f1f5736d 100644
--- a/b_eff/src/host/execution_types/execution_accl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -67,8 +67,8 @@ namespace network::execution_types::accl {
                 dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255)));
                 recvBufferContents.emplace_back(size_in_bytes, static_cast(0));
-                acclSendBuffers.push_back(config.program->create_buffer(dummyBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
-                acclRecvBuffers.push_back(config.program->create_buffer(recvBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclSendBuffers.push_back(config.program->create_buffer(dummyBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclRecvBuffers.push_back(config.program->create_buffer(recvBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
                 acclSendBuffers.back()->sync_to_device();
                 acclRecvBuffers.back()->sync_to_device();

diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp
index 40332b85..09872106 100644
--- a/b_eff/src/host/network_benchmark.cpp
+++ b/b_eff/src/host/network_benchmark.cpp
@@ -108,9 +108,11 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) {
         std::shared_ptr timing;
         switch (executionSettings->programSettings->communicationType) {
             case hpcc_base::CommunicationType::cpu_only: timing = execution_types::cpu::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
+#ifndef USE_ACCL
             case hpcc_base::CommunicationType::pcie_mpi: timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
 #ifdef INTEL_FPGA
             case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
+#endif
 #endif
             case hpcc_base::CommunicationType::accl: timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
             default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType));

diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index 89ff9fe0..cfe9a25e 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -247,7 +247,7 @@ class NetworkBenchmark :
 #ifndef USE_ACCL
 public hpcc_base::HpccFpgaBenchmark {
 #else
-public hpcc_base::HpccFpgaBenchmark {
+public hpcc_base::HpccFpgaBenchmark {
 #endif
 
 protected:

diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt
index a7e8390b..d2fba9a2 100644
--- a/shared/CMakeLists.txt
+++ b/shared/CMakeLists.txt
@@ -1,9 +1,12 @@
 project(HPCCBaseLibrary VERSION 1.0.1)
 
-add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp)
 
-if (defined USE_ACCL)
-add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp)
+if (USE_ACCL)
+add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp)
+target_include_directories(hpcc_fpga_base PRIVATE ${ACCL_INCLUDE_PATH})
+target_link_libraries(hpcc_fpga_base accl)
+else()
+add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp)
 endif()
 
 find_package(OpenCL QUIET)

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index dd9b022a..5d451ff4 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -215,8 +215,8 @@ class ExecutionSettings {
     * @param context_ Used OpenCL context
     * @param program_ Used OpenCL program
     */
-    ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_,
-                      std::unique_ptr context_, std::unique_ptr program_):
+    ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_,
+                      std::unique_ptr context_, std::unique_ptr program_):
                                     programSettings(std::move(programSettings_)), device(std::move(device_)),
                                     context(std::move(context_)), program(std::move(program_)) {}
@@ -488,7 +488,7 @@ class HpccFpgaBenchmark {
             program = fpga_setup::fpgaSetup(context.get(), {*usedDevice},
                                             &programSettings->kernelFileName);
 #else
-            program = fpga_setup::fpgaSetupACCL(*usedDevice,
+            program = fpga_setup::fpgaSetupACCL(*usedDevice,
                                                 &programSettings->kernelFileName);
 #endif
         }
@@ -665,13 +665,15 @@
 * @param printedExecutionSettings The execution settings that have to be printed to the stream
 * @return std::ostream& The output stream after the execution settings are piped in
 */
-template
-std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){
+template
+std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){
     std::string device_name;
     os << std::left;
     if (!printedExecutionSettings.programSettings->testOnly) {
-//        printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name);
-    }
+#ifndef USE_ACCL
+        printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name);
+#endif
+    }
     else {
         device_name = "TEST RUN: Not selected!";
     }

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index e0cf3723..14eddc18 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -21,7 +21,7 @@ namespace fpga_setup {
 
     std::unique_ptr
-    fpgaSetup(xrt::device &context,
+    fpgaSetupACCL(xrt::device &context,
               const std::string *usedKernelFile) {
         int current_rank;
         MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
@@ -47,4 +47,4 @@ namespace fpga_setup {
         return std::unique_ptr(nullptr);
     }
 
-} // namespace fpga_setup
\ No newline at end of file
+} // namespace fpga_setup

From 82dd098ce7dc8ddac8c91f02a08458fc918aa1f4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 11 Apr 2022 14:33:01 +0100
Subject: [PATCH 011/318] Fix ACCL bugs in b_eff

---
 b_eff/src/host/execution_types/execution_accl.hpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
index f1f5736d..5db9093c 100644
--- a/b_eff/src/host/execution_types/execution_accl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -63,12 +63,13 @@ namespace network::execution_types::accl {
             recvBufferContents.clear();
             acclSendBuffers.clear();
             acclRecvBuffers.clear();
+            int size_in_values = (size_in_bytes + 3) / 4;
             // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels
             for (int r = 0; r < config.programSettings->kernelReplications; r++) {
                 dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255)));
                 recvBufferContents.emplace_back(size_in_bytes, static_cast(0));
-                acclSendBuffers.push_back(config.program->create_buffer(dummyBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
-                acclRecvBuffers.push_back(config.program->create_buffer(recvBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclSendBuffers.push_back(config.program->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32));
+                acclRecvBuffers.push_back(config.program->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32));
                 acclSendBuffers.back()->sync_to_device();
                 acclRecvBuffers.back()->sync_to_device();
             }
@@ -78,8 +79,8 @@ namespace network::execution_types::accl {
                 MPI_Barrier(MPI_COMM_WORLD);
                 auto startCalculation = std::chrono::high_resolution_clock::now();
                 for (int l = 0; l < looplength; l++) {
-                    config.program->send(0, *acclSendBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
-                    config.program->recv(0, *acclRecvBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                    config.program->send(0, *acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                    config.program->recv(0, *acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
                 }
                 auto endCalculation = std::chrono::high_resolution_clock::now();
                 calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count();
@@ -100,7 +100,7 @@ namespace network::execution_types::accl {
         // Read validation data from FPGA will be placed sequentially in buffer for all replications
         // The data order should not matter, because every byte should have the same value!
         for (int r = 0; r < config.programSettings->kernelReplications; r++) {
-            std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(),validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r);
+            std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r);
         }
         std::shared_ptr result(new network::ExecutionTimings{
                 looplength,
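The buffer-size fix in patch 011 is worth spelling out: in the old expression size_in_bytes + 1 / 2, division binds tighter than addition and 1 / 2 is integer zero, so the expression was just size_in_bytes while the buffers were typed as float16. The patch switches to an explicit round-up to whole 4-byte float32 values:

#include <iostream>

int main() {
    unsigned size_in_bytes = 7;
    // The old expression: '/' binds tighter than '+', and 1 / 2 == 0 in
    // integer arithmetic, so this is just size_in_bytes again.
    unsigned old_size = size_in_bytes + 1 / 2;
    // The fix: round the byte count up to whole 4-byte (float32) values.
    unsigned size_in_values = (size_in_bytes + 3) / 4;
    std::cout << old_size << " " << size_in_values * 4 << std::endl; // 7 8
}
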
From 2e5b0e326ac95cd74f30e6f9a24636365bccdb59 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 12 Apr 2022 15:00:37 +0100
Subject: [PATCH 012/318] Partial restructuring of code for XRT

---
 PTRANS/src/host/data_handlers/diagonal.hpp    |  5 +-
 PTRANS/src/host/data_handlers/handler.hpp     |  3 +-
 PTRANS/src/host/data_handlers/pq.hpp          |  7 +-
 .../host/execution_types/execution_cpu.hpp    |  3 +-
 .../host/execution_types/execution_intel.hpp  |  4 +-
 .../execution_types/execution_intel_pq.hpp    |  4 +-
 .../host/execution_types/execution_pcie.hpp   |  2 +-
 .../execution_types/execution_pcie_pq.hpp     |  4 +-
 PTRANS/src/host/main.cpp                      |  2 +-
 PTRANS/src/host/transpose_benchmark.cpp       |  9 ---
 PTRANS/src/host/transpose_benchmark.hpp       |  9 +--
 b_eff/CMakeLists.txt                          |  2 -
 b_eff/src/host/execution_types/execution.hpp  |  3 +-
 .../host/execution_types/execution_accl.hpp   |  8 +--
 b_eff/src/host/network_benchmark.hpp          | 13 ++--
 cmake/general_benchmark_build_setup.cmake     |  7 +-
 cmake/unitTestTargets.cmake                   |  2 +-
 shared/CMakeLists.txt                         | 19 ++++--
 shared/include/hpcc_benchmark.hpp             | 51 +++++++++-----
 shared/include/setup/fpga_setup_accl.hpp      | 21 +-----
 shared/include/setup/fpga_setup_xrt.hpp       | 66 +++++++++++++++++++
 shared/setup/fpga_setup_accl.cpp              |  8 +--
 shared/setup/fpga_setup_xrt.cpp               | 39 +++++++++++
 23 files changed, 201 insertions(+), 90 deletions(-)
 create mode 100644 shared/include/setup/fpga_setup_xrt.hpp
 create mode 100644 shared/setup/fpga_setup_xrt.cpp

diff --git a/PTRANS/src/host/data_handlers/diagonal.hpp b/PTRANS/src/host/data_handlers/diagonal.hpp
index e1d72f3b..2edae91a 100644
--- a/PTRANS/src/host/data_handlers/diagonal.hpp
+++ b/PTRANS/src/host/data_handlers/diagonal.hpp
@@ -44,7 +44,8 @@ namespace transpose {
 * the missing data. e.g. for N ranks, the pairs will be (0, N/2), (1, N/2 + 1), ...
 *
 */
-class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler {
+template
+class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler {
 
 private:
@@ -69,7 +70,7 @@
     * @return std::unique_ptr The generated data
     */
    std::unique_ptr
-    generateData(hpcc_base::ExecutionSettings& settings) override {
+    generateData(hpcc_base::ExecutionSettings& settings) override {
        MPI_Type_contiguous(settings.programSettings->blockSize * settings.programSettings->blockSize, MPI_FLOAT, &data_block);
        MPI_Type_commit(&data_block);

diff --git a/PTRANS/src/host/data_handlers/handler.hpp b/PTRANS/src/host/data_handlers/handler.hpp
index fe1293fe..b71597bd 100644
--- a/PTRANS/src/host/data_handlers/handler.hpp
+++ b/PTRANS/src/host/data_handlers/handler.hpp
@@ -43,6 +43,7 @@
 * calculate the overall validation error.
 *
 */
+template
 class TransposeDataHandler {
 
 protected:
@@ -68,7 +69,7 @@
     * @return std::unique_ptr The generated data
     */
    virtual std::unique_ptr
-    generateData(hpcc_base::ExecutionSettings& settings) = 0;
+    generateData(hpcc_base::ExecutionSettings& settings) = 0;
 
    /**
     * @brief Exchange the data blocks for verification

diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp
index 01f2261d..388c83d9 100644
--- a/PTRANS/src/host/data_handlers/pq.hpp
+++ b/PTRANS/src/host/data_handlers/pq.hpp
@@ -52,7 +52,8 @@ static T mod(T number, T op) {
    return (result < 0 || result >= op) ? op + result : result;
 }
 
-class DistributedPQTransposeDataHandler : public TransposeDataHandler {
+template
+class DistributedPQTransposeDataHandler : public TransposeDataHandler {
 
 private:
@@ -135,7 +136,7 @@
     * @return std::unique_ptr The generated data
     */
    std::unique_ptr
-    generateData(hpcc_base::ExecutionSettings& settings) override {
+    generateData(hpcc_base::ExecutionSettings& settings) override {
        int width_in_blocks = settings.programSettings->matrixSize / settings.programSettings->blockSize;
 
        global_width = width_in_blocks;
@@ -384,7 +385,7 @@
     * @param mpi_size Size of the communication world
     * @param p Width of the PQ grid the FPGAs are arranged in
     */
-    DistributedPQTransposeDataHandler(int mpi_rank, int mpi_size, int p) : TransposeDataHandler(mpi_rank, mpi_size) {
+    DistributedPQTransposeDataHandler(int mpi_rank, int mpi_size, int p) : TransposeDataHandler(mpi_rank, mpi_size) {
        if (mpi_size % p != 0) {
            throw std::runtime_error("Number of MPI ranks must be multiple of P! P=" + std::to_string(p));
        }

diff --git a/PTRANS/src/host/execution_types/execution_cpu.hpp b/PTRANS/src/host/execution_types/execution_cpu.hpp
index ab74fdc9..130b016e 100644
--- a/PTRANS/src/host/execution_types/execution_cpu.hpp
+++ b/PTRANS/src/host/execution_types/execution_cpu.hpp
@@ -50,8 +50,9 @@ namespace transpose
     * @param data data object that contains all required data for the execution
     * @return std::unique_ptr The measured execution times
     */
+    template
    static std::unique_ptr
-    calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler)
+    calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler)
    {
        int err;

diff --git a/PTRANS/src/host/execution_types/execution_intel.hpp b/PTRANS/src/host/execution_types/execution_intel.hpp
index d95bf578..58f5a73f 100644
--- a/PTRANS/src/host/execution_types/execution_intel.hpp
+++ b/PTRANS/src/host/execution_types/execution_intel.hpp
@@ -43,7 +43,7 @@ namespace intel {
     * @return std::unique_ptr The measured execution times
     */
 static std::unique_ptr
-    calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) {
+    calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) {
    int err;
 
    if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::diagonal) {
@@ -275,4 +275,4 @@
 }  // namespace fpga_execution
 }  // namespace intel
 
-#endif
\ No newline at end of file
+#endif

diff --git a/PTRANS/src/host/execution_types/execution_intel_pq.hpp b/PTRANS/src/host/execution_types/execution_intel_pq.hpp
index 431ff40d..85e596a7 100644
--- a/PTRANS/src/host/execution_types/execution_intel_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_intel_pq.hpp
@@ -44,7 +44,7 @@ namespace intel_pq {
     * @return std::unique_ptr The measured execution times
     */
 static std::unique_ptr
-    calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) {
+    calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) {
    int err;
 
    if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) {
@@ -354,4 +354,4 @@
 }  // namespace fpga_execution
 }  // namespace intel
 
-#endif
\ No newline at end of file
+#endif

diff --git a/PTRANS/src/host/execution_types/execution_pcie.hpp b/PTRANS/src/host/execution_types/execution_pcie.hpp
index 5e29ad2e..a08888de 100644
--- a/PTRANS/src/host/execution_types/execution_pcie.hpp
+++ b/PTRANS/src/host/execution_types/execution_pcie.hpp
@@ -49,7 +49,7 @@ namespace transpose
     * @return std::unique_ptr The measured execution times
     */
    static std::unique_ptr
-    calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler)
+    calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler)

diff --git a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp
index d2cfae7e..db1d9bee 100644
--- a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp
@@ -45,7
namespace pcie_pq { * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { @@ -378,4 +378,4 @@ static std::unique_ptr } // namespace fpga_execution } // namespace intel -#endif \ No newline at end of file +#endif diff --git a/PTRANS/src/host/main.cpp b/PTRANS/src/host/main.cpp index a054f7dd..f65d06ce 100644 --- a/PTRANS/src/host/main.cpp +++ b/PTRANS/src/host/main.cpp @@ -8,7 +8,7 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark - TransposeBenchmark bm(argc, argv); + TransposeBenchmark bm(argc, argv); bool success = bm.executeBenchmark(); if (success) { return 0; diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 755b11a0..e66b3a36 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -173,13 +173,4 @@ transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeD return static_cast(global_max_error) < 100 * std::numeric_limits::epsilon(); } -void -transpose::TransposeBenchmark::setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier) { - switch (dataHandlerIdentifier) { - case transpose::data_handler::DataHandlerType::diagonal: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; - case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; - default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); - } - - } diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 0136c22c..74ada897 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -46,12 +46,9 @@ namespace transpose { * @brief Implementation of the transpose benchmark * */ +template class TransposeBenchmark : -#ifndef USE_XRT_BINDINGS -public hpcc_base::HpccFpgaBenchmark { -#else -// TODO initialize benchmark wth XRT bindings -#endif +public hpcc_base::HpccFpgaBenchmark { protected: /** @@ -62,7 +59,7 @@ public hpcc_base::HpccFpgaBenchmark dataHandler; + std::unique_ptr> dataHandler; public: diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt index 4b4fbb41..13d93b1b 100755 --- a/b_eff/CMakeLists.txt +++ b/b_eff/CMakeLists.txt @@ -24,5 +24,3 @@ include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake) unset(DATA_TYPE CACHE) find_package(MPI REQUIRED) -include(${extern_accl_SOURCE_DIR}/driver/xrt/CMakeLists.txt) - diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp index 33d3b0a6..118f0ebc 100644 --- a/b_eff/src/host/execution_types/execution.hpp +++ b/b_eff/src/host/execution_types/execution.hpp @@ -26,5 +26,6 @@ SOFTWARE. 
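The common thread of the hunks above is one host-code template: the device, context, and program handle types become template parameters of the benchmark base class, the data handlers, and the free calculate() functions, so OpenCL- and XRT-based hosts can share a single implementation. A reduced, self-contained sketch of the pattern with stand-in types (toy names, not the project's real classes); the b_eff hunk that follows gates the ACCL execution header behind the same build switches:

```cpp
#include <iostream>
#include <string>

// Toy stand-ins for cl::Device and xrt::device; illustrative only.
struct OclDevice { static std::string name() { return "OpenCL device"; } };
struct XrtDevice { static std::string name() { return "XRT device"; } };

// The settings type carries the handle type as a template parameter...
template <class TDevice>
struct ExecutionSettings { TDevice device; };

// ...so a single calculate() template serves every host binding.
template <class TDevice>
void calculate(const ExecutionSettings<TDevice> &config) {
    std::cout << "running on " << TDevice::name() << "\n";
}

int main() {
    calculate(ExecutionSettings<OclDevice>{});
    calculate(ExecutionSettings<XrtDevice>{});
    return 0;
}
```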
#ifdef INTEL_FPGA #include "execution_types/execution_iec.hpp" #endif -#endif +#else #include "execution_types/execution_accl.hpp" +#endif diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp index 5db9093c..81673835 100644 --- a/b_eff/src/host/execution_types/execution_accl.hpp +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -68,8 +68,8 @@ namespace network::execution_types::accl { for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.program->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); - acclRecvBuffers.push_back(config.program->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclSendBuffers.push_back(config.accl->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclRecvBuffers.push_back(config.accl->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } @@ -79,8 +79,8 @@ namespace network::execution_types::accl { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); for (int l = 0; l < looplength; l++) { - config.program->send(0, *acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); - config.program->recv(0, *acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); + config.accl->send(0, *acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); + config.accl->recv(0, *acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); } auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index cfe9a25e..efffe1bf 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -244,12 +244,15 @@ class NetworkExecutionTimings { * */ class NetworkBenchmark : -#ifndef USE_ACCL -public hpcc_base::HpccFpgaBenchmark { -#else -public hpcc_base::HpccFpgaBenchmark { +#ifdef USE_OCL_HOST + public hpcc_base::HpccFpgaBenchmark #endif -protected: +#ifdef USE_XRT_HOST + public hpcc_base::HpccFpgaBenchmark + +#endif + { + protected: /** * @brief Additional input parameters of the Network benchmark diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 66153e5f..441b6f41 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -1,7 +1,7 @@ cmake_policy(VERSION 3.13) INCLUDE (CheckTypeSize) -set (CMAKE_CXX_STANDARD 11) +set (CMAKE_CXX_STANDARD 14) # Download build dependencies add_subdirectory(${CMAKE_SOURCE_DIR}/../extern ${CMAKE_BINARY_DIR}/extern) @@ -31,6 +31,7 @@ set(USE_MPI ${USE_MPI} CACHE BOOL "Compile the host code with MPI support. 
This set(USE_SVM No CACHE BOOL "Use SVM pointers instead of creating buffers on the board and transferring the data there before execution.") set(USE_HBM No CACHE BOOL "Use host code specific to HBM FPGAs") set(USE_ACCL No CACHE BOOL "Use ACCL for communication") +set(USE_OCL_HOST Yes CACHE BOOL "Use OpenCL host code implementation") set(USE_CUSTOM_KERNEL_TARGETS No CACHE BOOL "Enable build targets for custom kernels") set(USE_DEPRECATED_HPP_HEADER ${header_default} CACHE BOOL "Flag that indicates if the old C++ wrapper header should be used (cl.hpp) or the newer version (cl2.hpp or opencl.hpp)") set(HPCC_FPGA_CONFIG ${HPCC_FPGA_CONFIG} CACHE FILEPATH "Configuration file that is used to overwrite the default configuration") @@ -91,6 +92,10 @@ if (USE_ACCL) add_definitions(-DUSE_ACCL) endif() +if (USE_OCL_HOST) + add_definitions(-DUSE_OCL_HOST) +endif() + # Add configuration time to build string(TIMESTAMP CONFIG_TIME "%a %b %d %H:%M:%S UTC %Y" UTC) add_definitions(-DCONFIG_TIME="${CONFIG_TIME}") diff --git a/cmake/unitTestTargets.cmake b/cmake/unitTestTargets.cmake index 0f36d3da..776269e7 100644 --- a/cmake/unitTestTargets.cmake +++ b/cmake/unitTestTargets.cmake @@ -25,7 +25,7 @@ if (Vitis_FOUND) target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(${HOST_EXE_NAME}_test_xilinx hpcc_fpga_base_test) if (NOT "${kernel_emulation_targets_xilinx}" STREQUAL "") - add_dependencies(${HOST_EXE_NAME}_test_xilinx "${kernel_emulation_targets_xilinx}") + add_dependencies(${HOST_EXE_NAME}_test_xilinx ${kernel_emulation_targets_xilinx}) endif() target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index d2fba9a2..3f3ada79 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -1,12 +1,21 @@ project(HPCCBaseLibrary VERSION 1.0.1) +set(HPCC_BASE_SOURCES "") if (USE_ACCL) -add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp) -target_include_directories(hpcc_fpga_base PRIVATE ${ACCL_INCLUDE_PATH}) -target_link_libraries(hpcc_fpga_base accl) -else() -add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) + include(${extern_accl_SOURCE_DIR}/driver/xrt/CMakeLists.txt) + list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp) +endif() +if (USE_XRT_HOST) + list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp) +endif() +if (USE_OCL_HOST) + list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) +endif() +add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES}) +if (USE_ACCL) + target_include_directories(hpcc_fpga_base PRIVATE ${ACCL_INCLUDE_PATH}) + target_link_libraries(hpcc_fpga_base accl) endif() find_package(OpenCL QUIET) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 5d451ff4..f135fc30 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -172,7 +172,6 @@ class BaseSettings { }; - /** * @brief Settings class that is containing the program settings together with * additional information about the OpenCL runtime @@ -207,6 +206,14 @@ class ExecutionSettings { */ std::unique_ptr program; +#ifdef USE_ACCL + /** + * @brief Pointer to ACCL instance + * + */ + std::unique_ptr accl; 
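The member above is the guarded-member pattern this restructuring relies on: ACCL state exists inside ExecutionSettings only when USE_ACCL is defined (the #endif directly below closes the guard), so OpenCL-only builds never pull in the ACCL headers. A reduced sketch of the idea with a stand-in type (assumed names, not the real class):

```cpp
#include <memory>

struct AcclStub {}; // stand-in for ACCL::ACCL; illustrative only

template <class TDevice, class TContext, class TProgram>
struct Settings {
    std::unique_ptr<TDevice> device;
    std::unique_ptr<TContext> context;
    std::unique_ptr<TProgram> program;
#ifdef USE_ACCL
    std::unique_ptr<AcclStub> accl; // present only in ACCL builds
#endif
};

int main() {
    Settings<int, int, int> s{}; // compiles with or without USE_ACCL
    return 0;
}
```

The price of the pattern is that every construction site needs a matching #ifdef, which is why the constructor further below grows a conditional parameter.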
+#endif + /** * @brief Construct a new Execution Settings object * @@ -216,9 +223,18 @@ class ExecutionSettings { * @param program_ Used OpenCL program */ ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_, - std::unique_ptr context_, std::unique_ptr program_): + std::unique_ptr context_, std::unique_ptr program_ +#ifdef USE_ACCL + , std::unique_ptr accl_ +#endif + + ): programSettings(std::move(programSettings_)), device(std::move(device_)), - context(std::move(context_)), program(std::move(program_)) {} + context(std::move(context_)), program(std::move(program_)) +#ifdef USE_ACCL + , accl(std::move(accl_)) +#endif + {} /** * @brief Destroy the Execution Settings object. Used to specify the order the contained objects are destroyed @@ -478,23 +494,26 @@ class HpccFpgaBenchmark { std::unique_ptr context; std::unique_ptr program; std::unique_ptr usedDevice; - +#ifdef USE_ACCL + std::unique_ptr accl; +#endif if (!programSettings->testOnly) { -#ifndef USE_ACCL - usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); - - context = std::unique_ptr(new cl::Context(*usedDevice)); - program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, - &programSettings->kernelFileName); - #else - program = fpga_setup::fpgaSetupACCL(*usedDevice, - &programSettings->kernelFileName); - #endif +// usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, +// programSettings->defaultDevice); +#ifdef USE_OCL_HOST +// context = std::unique_ptr(new cl::Context(*usedDevice)); +// program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, +// &programSettings->kernelFileName); +#endif +#ifdef USE_ACCL + xrt::device dev; + xrt::uuid *program; + accl = fpga_setup::fpgaSetupACCL(dev, *program); +#endif } executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice), - std::move(context), std::move(program))); + std::move(context), std::move(program), std::move(accl))); if (mpi_comm_rank == 0) { if (!checkInputParameters()) { std::cerr << "ERROR: Input parameter check failed!" << std::endl; diff --git a/shared/include/setup/fpga_setup_accl.hpp b/shared/include/setup/fpga_setup_accl.hpp index cfc1abe4..7158a81b 100644 --- a/shared/include/setup/fpga_setup_accl.hpp +++ b/shared/include/setup/fpga_setup_accl.hpp @@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef SRC_HOST_FPGA_SETUP_XRT_H_ -#define SRC_HOST_FPGA_SETUP_XRT_H_ +#ifndef SRC_HOST_FPGA_SETUP_ACCL_H_ +#define SRC_HOST_FPGA_SETUP_ACCL_H_ #include #include @@ -46,22 +46,7 @@ Sets up the given FPGA with the kernel in the provided file. */ std::unique_ptr fpgaSetupACCL(xrt::device &device, - const std::string *usedKernelFile); - - -/** -Searches an selects an FPGA device using the CL library functions. -If multiple platforms or devices are given, the user will be prompted to -choose a device. - -@param defaultDevice The index of the device that has to be used. 
If a
- value < 0 is given, the device can be chosen
- interactively
-
-@return the selected device
-*/
- std::unique_ptr<xrt::device>
- selectFPGADeviceXRT(int defaultDevice);
+ xrt::uuid &program);

} // namespace fpga_setup
#endif // SRC_HOST_FPGA_SETUP_H_
diff --git a/shared/include/setup/fpga_setup_xrt.hpp b/shared/include/setup/fpga_setup_xrt.hpp
new file mode 100644
index 00000000..61c74f72
--- /dev/null
+++ b/shared/include/setup/fpga_setup_xrt.hpp
@@ -0,0 +1,66 @@
+/*
+Copyright (c) 2022 Marius Meyer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+#ifndef SRC_HOST_FPGA_SETUP_XRT_H_
+#define SRC_HOST_FPGA_SETUP_XRT_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* External libraries */
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+namespace fpga_setup {
+
+/**
+Sets up the given FPGA with the kernel in the provided file.
+
+@param device The device used for the program
+@param usedKernelFile The path to the kernel file
+@return The UUID of the loaded xclbin
+*/
+ std::unique_ptr<xrt::uuid>
+ fpgaSetup(xrt::device &device,
+ const std::string &usedKernelFile);
+
+
+/**
+Searches and selects an FPGA device using the XRT library functions.
+If multiple platforms or devices are given, the user will be prompted to
+choose a device.
+
+@param defaultDevice The index of the device that has to be used. If a
+ value < 0 is given, the device can be chosen
+ interactively
+
+@return the selected device
+*/
+ std::unique_ptr<xrt::device>
+ selectFPGADevice(int defaultDevice);
+
+} // namespace fpga_setup
+#endif // SRC_HOST_FPGA_SETUP_XRT_H_
diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index 14eddc18..01d012e3 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -22,7 +22,7 @@ namespace fpga_setup {

 std::unique_ptr<ACCL::ACCL>
 fpgaSetupACCL(xrt::device &context,
- const std::string *usedKernelFile) {
+ xrt::uuid &program) {
 int current_rank;
 MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);

@@ -41,10 +41,4 @@ namespace fpga_setup {
 std::to_string(5500 + current_rank)));
 }
-
- std::unique_ptr<xrt::device>
- selectFPGADevice(int defaultDevice) {
- return std::unique_ptr<xrt::device>(nullptr);
- }
-
 } // namespace fpga_setup
diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp
new file mode 100644
index 00000000..1a9135bb
--- /dev/null
+++ b/shared/setup/fpga_setup_xrt.cpp
@@ -0,0 +1,39 @@
+//
+// Created by Marius Meyer on 04.12.19.
+// + +#include "setup/fpga_setup_xrt.hpp" + +#include +#include +#include +#include +#include +#include + +/* External libraries */ +#include "parameters.h" + +#ifdef _USE_MPI_ +#include "mpi.h" +#endif + +namespace fpga_setup { + + std::unique_ptr + fpgaSetup(xrt::device &device, + std::string &kernelFileName) { + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + return std::unique_ptr(new device.load_xclbin(kernelFileName)); + } + + std::unique_ptr + selectFPGADevice(int defaultDevice) { + return std::unique_ptr(new xrt::device(defaultDevice)); + } +} // namespace fpga_setup From e052b1ef295da976d1839b8187daa7ca46074088 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 Apr 2022 15:15:48 +0100 Subject: [PATCH 013/318] Attempt fix generic PTRANS impl --- PTRANS/src/host/CMakeLists.txt | 2 +- PTRANS/src/host/transpose_benchmark.hpp | 136 ++++++++++++++++++++++-- 2 files changed, 126 insertions(+), 12 deletions(-) diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 89b45ff8..647ac6ee 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) -set(HOST_SOURCE transpose_benchmark.cpp transpose_data.cpp) +set(HOST_SOURCE transpose_data.cpp) set(HOST_EXE_NAME Transpose) set(LIB_NAME trans) diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 74ada897..4b27ecc4 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -57,7 +57,18 @@ public hpcc_base::HpccFpgaBenchmark()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) + ("b", "Block size in number of values in one dimension", + cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))) + ("p", "Value of P that equals the width of the PQ grid of FPGAs. Q is determined by the world size.", + cxxopts::value()->default_value(std::to_string(DEFAULT_P_VALUE))) + ("distribute-buffers", "Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. 
This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs.") + ("handler", "Specify the used data handler that distributes the data over devices and memory banks", + cxxopts::value()->default_value(DEFAULT_DIST_TYPE)); + } std::unique_ptr> dataHandler; @@ -69,14 +80,22 @@ public hpcc_base::HpccFpgaBenchmark The input and output data of the benchmark */ std::unique_ptr - generateInputData() override; + generateInputData() override { + return dataHandler->generateData(*executionSettings); + } /** * @brief Set the data handler object by calling the function with the matching template argument * */ void - setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier); + setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier) { + switch (dataHandlerIdentifier) { + case transpose::data_handler::DataHandlerType::diagonal: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; + case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; + default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); + } + } /** * @brief Transpose specific implementation of the kernel execution @@ -85,7 +104,28 @@ public hpcc_base::HpccFpgaBenchmark Measured runtimes of the kernel execution */ std::unique_ptr - executeKernel(TransposeData &data) override; + executeKernel(TransposeData &data) override { + switch (executionSettings->programSettings->communicationType) { + case hpcc_base::CommunicationType::intel_external_channels: + if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + return transpose::fpga_execution::intel::calculate(*executionSettings, data); + } + else { + return transpose::fpga_execution::intel_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); + } break; + case hpcc_base::CommunicationType::pcie_mpi : + if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + return transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler); + } + else { + return transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); + } break; +#ifdef MKL_FOUND + case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break; +#endif + default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType)); + } + } /** * @brief Transpose specific implementation of the execution validation @@ -95,7 +135,28 @@ public hpcc_base::HpccFpgaBenchmarkexchangeData(data); + + dataHandler->reference_transpose(data); + + double max_error = 0.0; + for (size_t i = 0; i < executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize * data.numBlocks; i++) { + max_error = std::max(fabs(data.A[i]), max_error); + } + + double global_max_error = 0; + MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + + if (mpi_comm_rank == 0) { + std::cout << "Maximum error: " << 
global_max_error << " < " << 100 * std::numeric_limits::epsilon() << std::endl; + std::cout << "Mach. Epsilon: " << std::numeric_limits::epsilon() << std::endl; + } + + return static_cast(global_max_error) < 100 * std::numeric_limits::epsilon(); + } /** * @brief Transpose specific implementation of printing the execution results @@ -103,7 +164,56 @@ public hpcc_base::HpccFpgaBenchmark(executionSettings->programSettings->matrixSize) * executionSettings->programSettings->matrixSize; + + // Number of experiment repetitions + uint number_measurements = output.calculationTimings.size(); + std::vector max_measures(number_measurements); + std::vector max_transfers(number_measurements); +#ifdef _USE_MPI_ + // Copy the object variable to a local variable to make it accessible to the lambda function + int mpi_size = mpi_comm_size; + MPI_Reduce(output.calculationTimings.data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(output.transferTimings.data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); +#else + std::copy(output.calculationTimings.begin(), output.calculationTimings.end(), max_measures.begin()); + std::copy(output.transferTimings.begin(), output.transferTimings.end(), max_transfers.begin()); +#endif + + double avgCalculationTime = accumulate(max_measures.begin(), max_measures.end(), 0.0) + / max_measures.size(); + double minCalculationTime = *min_element(max_measures.begin(), max_measures.end()); + + double avgTransferTime = accumulate(max_transfers.begin(), max_transfers.end(), 0.0) + / max_transfers.size(); + double minTransferTime = *min_element(max_transfers.begin(), max_transfers.end()); + + double avgCalcFLOPS = flops / avgCalculationTime; + double maxCalcFLOPS = flops / minCalculationTime; + double avgMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime; + double maxMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime; + double avgTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime; + double maxTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime; + + if (mpi_comm_rank == 0) { + std::cout << " total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s]" << std::endl; + std::cout << "avg: " << (avgTransferTime + avgCalculationTime) + << " " << avgTransferTime + << " " << avgCalculationTime + << " " << avgCalcFLOPS + << " " << avgMemBandwidth + << " " << avgTransferBandwidth + << std::endl; + std::cout << "best: " << (minTransferTime + minCalculationTime) + << " " << minTransferTime + << " " << minCalculationTime + << " " << maxCalcFLOPS + << " " << maxMemBandwidth + << " " << maxTransferBandwidth + << std::endl; + } + } /** * @brief Construct a new Transpose Benchmark object @@ -111,16 +221,20 @@ public hpcc_base::HpccFpgaBenchmarkprogramSettings->dataHandlerIdentifier); + } + } - /** + /** * @brief Construct a new Transpose Benchmark object */ - TransposeBenchmark(); + TransposeBenchmark() : HpccFpgaBenchmark(argc, argv) {} }; -} // namespace stream +} // namespace transpose -#endif // SRC_HOST_STREAM_BENCHMARK_H_ +#endif // SRC_HOST_TRANSPOSE_BENCHMARK_H_ From aa26b97f94b5f7ba2a7f39489e97006c6332c414 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 Apr 2022 15:29:43 +0100 Subject: [PATCH 014/318] Add template to data handler calls --- PTRANS/src/host/transpose_benchmark.hpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.hpp 
b/PTRANS/src/host/transpose_benchmark.hpp index 4b27ecc4..9b515b73 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -31,8 +31,16 @@ SOFTWARE. #include "hpcc_benchmark.hpp" #include "transpose_data.hpp" +#include "execution_types/execution_intel.hpp" +#include "execution_types/execution_intel_pq.hpp" +#include "execution_types/execution_pcie.hpp" +#include "execution_types/execution_pcie_pq.hpp" +#include "execution_types/execution_cpu.hpp" +#include "communication_types.hpp" + #include "data_handlers/data_handler_types.h" -#include "data_handlers/handler.hpp" +#include "data_handlers/diagonal.hpp" +#include "data_handlers/pq.hpp" #include "parameters.h" @@ -91,8 +99,8 @@ public hpcc_base::HpccFpgaBenchmark(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; - case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; + case transpose::data_handler::DataHandlerType::diagonal: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; + case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); } } From 35b218bcda49771e1b5fe025a5b1dc182cddba65 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 Apr 2022 15:36:45 +0100 Subject: [PATCH 015/318] Add explicit this to transpose benchmark --- PTRANS/src/host/transpose_benchmark.hpp | 32 ++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 9b515b73..2138de6c 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -89,7 +89,7 @@ public hpcc_base::HpccFpgaBenchmark generateInputData() override { - return dataHandler->generateData(*executionSettings); + return this->dataHandler->generateData(*(this->executionSettings)); } /** @@ -99,8 +99,8 @@ public hpcc_base::HpccFpgaBenchmark(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; - case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; + case transpose::data_handler::DataHandlerType::diagonal: this->dataHandler = std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; + case transpose::data_handler::DataHandlerType::pq: this->dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); } } @@ -113,25 +113,25 @@ public hpcc_base::HpccFpgaBenchmark executeKernel(TransposeData &data) override { - switch 
(executionSettings->programSettings->communicationType) { + switch (this->executionSettings->programSettings->communicationType) { case hpcc_base::CommunicationType::intel_external_channels: - if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { - return transpose::fpga_execution::intel::calculate(*executionSettings, data); + if (this->executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + return transpose::fpga_execution::intel::calculate(*(this->executionSettings), data); } else { - return transpose::fpga_execution::intel_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); + return transpose::fpga_execution::intel_pq::calculate(*(this->executionSettings), data, reinterpret_cast(*this->dataHandler)); } break; case hpcc_base::CommunicationType::pcie_mpi : if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { - return transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler); + return transpose::fpga_execution::pcie::calculate(*(this->executionSettings), data, *dataHandler); } else { - return transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); + return transpose::fpga_execution::pcie_pq::calculate(*(this->executionSettings), data, reinterpret_cast(*this->dataHandler)); } break; #ifdef MKL_FOUND - case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break; + case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*(this->executionSettings), data, *dataHandler); break; #endif - default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType)); + default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(this->executionSettings->programSettings->communicationType)); } } @@ -146,9 +146,9 @@ public hpcc_base::HpccFpgaBenchmarkexchangeData(data); + this->dataHandler->exchangeData(data); - dataHandler->reference_transpose(data); + this->dataHandler->reference_transpose(data); double max_error = 0.0; for (size_t i = 0; i < executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize * data.numBlocks; i++) { @@ -230,15 +230,15 @@ public hpcc_base::HpccFpgaBenchmarkprogramSettings->dataHandlerIdentifier); + if (this->setupBenchmark(argc, argv)) { + this->setTransposeDataHandler(this->executionSettings->programSettings->dataHandlerIdentifier); } } /** * @brief Construct a new Transpose Benchmark object */ - TransposeBenchmark() : HpccFpgaBenchmark(argc, argv) {} + TransposeBenchmark() : HpccFpgaBenchmark() {} }; From c78306a996fcfa34a48b832e7b21296fcad7e1c1 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 Apr 2022 17:01:38 +0100 Subject: [PATCH 016/318] Compilable generic PTRANS --- PTRANS/src/host/CMakeLists.txt | 2 ++ PTRANS/src/host/data_handlers/diagonal.hpp | 32 +++++++++---------- PTRANS/src/host/data_handlers/pq.hpp | 9 +++--- .../execution_types/execution_intel_pq.hpp | 2 +- .../host/execution_types/execution_pcie.hpp | 2 +- .../execution_types/execution_pcie_pq.hpp | 2 +- PTRANS/src/host/transpose_benchmark.hpp | 27 ++++++++-------- 7 files changed, 39 insertions(+), 37 deletions(-) diff --git 
a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 647ac6ee..2404394f 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -27,6 +27,7 @@ if (INTELFPGAOPENCL_FOUND) target_include_directories(${LIB_NAME}_intel PRIVATE "$ENV{MKL_ROOT}/include") endif() target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA) + target_compile_definitions(${HOST_EXE_NAME}_intel PRIVATE -DINTEL_FPGA) target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_intel_host_executable COMMAND $ -h) endif() @@ -40,6 +41,7 @@ if (Vitis_FOUND) target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_xilinx_host_executable COMMAND $ -h) endif() diff --git a/PTRANS/src/host/data_handlers/diagonal.hpp b/PTRANS/src/host/data_handlers/diagonal.hpp index 2edae91a..a2c702c0 100644 --- a/PTRANS/src/host/data_handlers/diagonal.hpp +++ b/PTRANS/src/host/data_handlers/diagonal.hpp @@ -76,37 +76,37 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandlermatrixSize / settings.programSettings->blockSize; - int avg_blocks_per_rank = (width_in_blocks * width_in_blocks) / mpi_comm_size; + int avg_blocks_per_rank = (width_in_blocks * width_in_blocks) / this->mpi_comm_size; int avg_diagonal_blocks = width_in_blocks; if (avg_blocks_per_rank > 0) { avg_diagonal_blocks = (width_in_blocks / avg_blocks_per_rank); } num_diagonal_ranks = std::max(avg_diagonal_blocks, 1); - if (num_diagonal_ranks % 2 != mpi_comm_size % 2) { + if (num_diagonal_ranks % 2 != this->mpi_comm_size % 2) { #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": Fail 1!" << std::endl; + std::cout << "Rank " << this->mpi_comm_rank << ": Fail 1!" << std::endl; #endif // Abort if there is a too high difference in the number of matrix blocks between the MPI ranks throw std::runtime_error("Matrix size and MPI ranks to not allow fair distribution of blocks! Increase or reduce the number of MPI ranks by 1."); } - if ((mpi_comm_size - num_diagonal_ranks) % 2 != 0 || (mpi_comm_size - num_diagonal_ranks) == 0 && width_in_blocks > 1) { + if ((this->mpi_comm_size - num_diagonal_ranks) % 2 != 0 || (this->mpi_comm_size - num_diagonal_ranks) == 0 && width_in_blocks > 1) { #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": Fail 2!" << std::endl; + std::cout << "Rank " << this->mpi_comm_rank << ": Fail 2!" << std::endl; #endif throw std::runtime_error("Not possible to create pairs of MPI ranks for lower and upper half of matrix. Increase number of MPI ranks!."); } - bool this_rank_is_diagonal = mpi_comm_rank >= (mpi_comm_size - num_diagonal_ranks); - int blocks_if_diagonal = width_in_blocks / num_diagonal_ranks + ( (mpi_comm_rank - (mpi_comm_size - num_diagonal_ranks)) < (width_in_blocks % num_diagonal_ranks) ? 1 : 0); + bool this_rank_is_diagonal = this->mpi_comm_rank >= (this->mpi_comm_size - num_diagonal_ranks); + int blocks_if_diagonal = width_in_blocks / num_diagonal_ranks + ( (this->mpi_comm_rank - (this->mpi_comm_size - num_diagonal_ranks)) < (width_in_blocks % num_diagonal_ranks) ? 
1 : 0); int blocks_if_not_diagonal = 0; - if ((mpi_comm_size - num_diagonal_ranks) > 0 ) { - blocks_if_not_diagonal = (width_in_blocks * (width_in_blocks - 1)) / (mpi_comm_size - num_diagonal_ranks) + (mpi_comm_rank < ((width_in_blocks * (width_in_blocks - 1)) % (mpi_comm_size - num_diagonal_ranks)) ? 1 : 0); + if ((this->mpi_comm_size - num_diagonal_ranks) > 0 ) { + blocks_if_not_diagonal = (width_in_blocks * (width_in_blocks - 1)) / (this->mpi_comm_size - num_diagonal_ranks) + (this->mpi_comm_rank < ((width_in_blocks * (width_in_blocks - 1)) % (this->mpi_comm_size - num_diagonal_ranks)) ? 1 : 0); } int blocks_per_rank = (this_rank_is_diagonal) ? blocks_if_diagonal : blocks_if_not_diagonal; - if (mpi_comm_rank == 0) { + if (this->mpi_comm_rank == 0) { std::cout << "Diag. blocks per rank: " << blocks_if_diagonal << std::endl; std::cout << "Blocks per rank: " << blocks_if_not_diagonal << std::endl; std::cout << "Loopback ranks for diagonal blocks: " << num_diagonal_ranks << std::endl; @@ -115,14 +115,14 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandlerblockSize; #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": NumBlocks = " << blocks_per_rank << std::endl; + std::cout << "Rank " << this->mpi_comm_rank << ": NumBlocks = " << blocks_per_rank << std::endl; #endif // Allocate memory for a single device and all its memory banks auto d = std::unique_ptr(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); // Fill the allocated memory with pseudo random values - std::mt19937 gen(mpi_comm_rank); + std::mt19937 gen(this->mpi_comm_rank); std::uniform_real_distribution<> dis(-100.0, 100.0); for (size_t i = 0; i < data_height_per_rank; i++) { for (size_t j = 0; j < settings.programSettings->blockSize; j++) { @@ -148,10 +148,10 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandlermpi_comm_rank < this->mpi_comm_size - num_diagonal_ranks) { - int first_upper_half_rank = (mpi_comm_size - num_diagonal_ranks)/2; - int pair_rank = (mpi_comm_rank >= first_upper_half_rank) ? mpi_comm_rank - first_upper_half_rank : mpi_comm_rank + first_upper_half_rank; + int first_upper_half_rank = (this->mpi_comm_size - num_diagonal_ranks)/2; + int pair_rank = (this->mpi_comm_rank >= first_upper_half_rank) ? this->mpi_comm_rank - first_upper_half_rank : this->mpi_comm_rank + first_upper_half_rank; // To re-calculate the matrix transposition locally on this host, we need to // exchange matrix A for every kernel replication @@ -197,7 +197,7 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler(mpi_rank, mpi_size) { if (mpi_rank >= mpi_size) { throw std::runtime_error("MPI rank must be smaller the MPI world size!"); } diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp index 388c83d9..d065150b 100644 --- a/PTRANS/src/host/data_handlers/pq.hpp +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -26,6 +26,7 @@ SOFTWARE. 
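The wave of `this->` qualifications above is not stylistic. Once the data handlers derive from a base class template that depends on `TDevice`, `TContext`, and `TProgram`, two-phase name lookup no longer finds inherited members such as `mpi_comm_rank` by unqualified name, so every use inside the templates must go through `this->` (the pq.hpp hunk below continues the same mechanical change). A minimal, standalone illustration of the rule:

```cpp
// Why the patch adds "this->": names inherited from a dependent base
// are not considered during the template's first lookup phase.
template <class T>
struct Base {
    int mpi_comm_rank = 0;
};

template <class T>
struct Derived : Base<T> {
    int rank() {
        // return mpi_comm_rank;    // error: not found in dependent base
        return this->mpi_comm_rank; // OK: lookup deferred to instantiation
    }
};

int main() {
    Derived<float> d;
    return d.rank();
}
```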
/* C++ standard library headers */ #include #include +#include /* Project's headers */ #include "handler.hpp" @@ -142,8 +143,8 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandlermpi_comm_rank / pq_width; + pq_col = this->mpi_comm_rank % pq_width; // If the torus width is not a divisor of the matrix size, // distribute remaining blocks to the ranks @@ -167,7 +168,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); // Fill the allocated memory with pseudo random values - std::mt19937 gen(mpi_comm_rank); + std::mt19937 gen(this->mpi_comm_rank); std::uniform_real_distribution<> dis(-100.0, 100.0); for (size_t i = 0; i < blocks_per_rank * settings.programSettings->blockSize; i++) { for (size_t j = 0; j < settings.programSettings->blockSize; j++) { @@ -308,7 +309,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandlermpi_comm_rank << ": blocks (" << sending_size / (data.blockSize * data.blockSize) << "," << receiving_size / (data.blockSize * data.blockSize) << ") send " << send_rank << ", recv " << recv_rank << std::endl << std::flush; #endif MPI_Isend(send_buffers[current_parallel_execution].data(), sending_size, MPI_FLOAT, send_rank, 0, MPI_COMM_WORLD, &mpi_requests[current_parallel_execution]); MPI_Irecv(recv_buffers[current_parallel_execution].data(), receiving_size, MPI_FLOAT, recv_rank, 0, MPI_COMM_WORLD, &mpi_requests[gcd + current_parallel_execution]); diff --git a/PTRANS/src/host/execution_types/execution_intel_pq.hpp b/PTRANS/src/host/execution_types/execution_intel_pq.hpp index 85e596a7..f1f4add4 100644 --- a/PTRANS/src/host/execution_types/execution_intel_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_intel_pq.hpp @@ -44,7 +44,7 @@ namespace intel_pq { * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { diff --git a/PTRANS/src/host/execution_types/execution_pcie.hpp b/PTRANS/src/host/execution_types/execution_pcie.hpp index a08888de..97bd910f 100644 --- a/PTRANS/src/host/execution_types/execution_pcie.hpp +++ b/PTRANS/src/host/execution_types/execution_pcie.hpp @@ -49,7 +49,7 @@ namespace transpose * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) + calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) { int err; diff --git a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp index db1d9bee..c369e9cb 100644 --- a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp @@ -45,7 +45,7 @@ namespace pcie_pq { * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, 
transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 2138de6c..a31bf9d5 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -28,6 +28,7 @@ SOFTWARE. #include /* Project's headers */ +#include "parameters.h" #include "hpcc_benchmark.hpp" #include "transpose_data.hpp" @@ -42,8 +43,6 @@ SOFTWARE. #include "data_handlers/diagonal.hpp" #include "data_handlers/pq.hpp" -#include "parameters.h" - /** * @brief Contains all classes and methods needed by the Transpose benchmark * @@ -99,8 +98,8 @@ public hpcc_base::HpccFpgaBenchmarkdataHandler = std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; - case transpose::data_handler::DataHandlerType::pq: this->dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; + case transpose::data_handler::DataHandlerType::diagonal: this->dataHandler = std::unique_ptr>(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(this->mpi_comm_rank, this->mpi_comm_size)); break; + case transpose::data_handler::DataHandlerType::pq: this->dataHandler = std::unique_ptr>(new transpose::data_handler::DistributedPQTransposeDataHandler(this->mpi_comm_rank, this->mpi_comm_size, this->executionSettings->programSettings->p)); break; default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); } } @@ -119,14 +118,14 @@ public hpcc_base::HpccFpgaBenchmarkexecutionSettings), data); } else { - return transpose::fpga_execution::intel_pq::calculate(*(this->executionSettings), data, reinterpret_cast(*this->dataHandler)); + return transpose::fpga_execution::intel_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); } break; case hpcc_base::CommunicationType::pcie_mpi : - if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + if (this->executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { return transpose::fpga_execution::pcie::calculate(*(this->executionSettings), data, *dataHandler); } else { - return transpose::fpga_execution::pcie_pq::calculate(*(this->executionSettings), data, reinterpret_cast(*this->dataHandler)); + return transpose::fpga_execution::pcie_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); } break; #ifdef MKL_FOUND case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*(this->executionSettings), data, *dataHandler); break; @@ -151,14 +150,14 @@ public hpcc_base::HpccFpgaBenchmarkdataHandler->reference_transpose(data); double max_error = 0.0; - for (size_t i = 0; i < executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize * data.numBlocks; i++) { + for (size_t i = 0; i < this->executionSettings->programSettings->blockSize * this->executionSettings->programSettings->blockSize * 
data.numBlocks; i++) { max_error = std::max(fabs(data.A[i]), max_error); } double global_max_error = 0; MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - if (mpi_comm_rank == 0) { + if (this->mpi_comm_rank == 0) { std::cout << "Maximum error: " << global_max_error << " < " << 100 * std::numeric_limits::epsilon() << std::endl; std::cout << "Mach. Epsilon: " << std::numeric_limits::epsilon() << std::endl; } @@ -173,7 +172,7 @@ public hpcc_base::HpccFpgaBenchmark(executionSettings->programSettings->matrixSize) * executionSettings->programSettings->matrixSize; + double flops = static_cast(this->executionSettings->programSettings->matrixSize) * this->executionSettings->programSettings->matrixSize; // Number of experiment repetitions uint number_measurements = output.calculationTimings.size(); @@ -181,7 +180,7 @@ public hpcc_base::HpccFpgaBenchmark max_transfers(number_measurements); #ifdef _USE_MPI_ // Copy the object variable to a local variable to make it accessible to the lambda function - int mpi_size = mpi_comm_size; + int mpi_size = this->mpi_comm_size; MPI_Reduce(output.calculationTimings.data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(output.transferTimings.data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); #else @@ -204,7 +203,7 @@ public hpcc_base::HpccFpgaBenchmarkmpi_comm_rank == 0) { std::cout << " total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s]" << std::endl; std::cout << "avg: " << (avgTransferTime + avgCalculationTime) << " " << avgTransferTime @@ -229,7 +228,7 @@ public hpcc_base::HpccFpgaBenchmark(argc, argv) { if (this->setupBenchmark(argc, argv)) { this->setTransposeDataHandler(this->executionSettings->programSettings->dataHandlerIdentifier); } @@ -238,7 +237,7 @@ public hpcc_base::HpccFpgaBenchmark() {} }; From fa61522c57e78d94b5c9806ce652a9f0a492733f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 13 Apr 2022 18:19:21 +0100 Subject: [PATCH 017/318] Fix working with OCL bindings --- PTRANS/src/host/main.cpp | 4 ++++ PTRANS/src/host/transpose_benchmark.hpp | 3 ++- shared/CMakeLists.txt | 4 +--- shared/include/hpcc_benchmark.hpp | 25 ++++++++++++++++--------- shared/setup/fpga_setup.cpp | 2 +- shared/setup/fpga_setup_accl.cpp | 22 +++++++++++++++------- shared/setup/fpga_setup_xrt.cpp | 2 +- 7 files changed, 40 insertions(+), 22 deletions(-) diff --git a/PTRANS/src/host/main.cpp b/PTRANS/src/host/main.cpp index f65d06ce..d4db9803 100644 --- a/PTRANS/src/host/main.cpp +++ b/PTRANS/src/host/main.cpp @@ -8,7 +8,11 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark +#ifdef USE_OCL_HOST TransposeBenchmark bm(argc, argv); +#else + TransposeBenchmark bm(argc, argv); +#endif bool success = bm.executeBenchmark(); if (success) { return 0; diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index a31bf9d5..148adc7f 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -31,7 +31,6 @@ SOFTWARE. 
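The main.cpp hunk above now selects the benchmark's handle types at configure time instead of at run time. The concrete template arguments are stripped in this rendering of the patch, so the following is only a toy model of that dispatch, with stand-in device types rather than the real cl::/xrt:: classes:

```cpp
// Toy model of the compile-time dispatch in main(): one templated
// benchmark, two possible handle-type bindings selected via a macro.
struct OclDevice {};
struct XrtDevice {};

template <class TDevice>
struct Benchmark {
    Benchmark(int, char **) {}
    bool executeBenchmark() { return true; }
};

#ifdef USE_OCL_HOST
using SelectedBenchmark = Benchmark<OclDevice>;
#else
using SelectedBenchmark = Benchmark<XrtDevice>;
#endif

int main(int argc, char *argv[]) {
    SelectedBenchmark bm(argc, argv);
    return bm.executeBenchmark() ? 0 : 1;
}
```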
#include "parameters.h" #include "hpcc_benchmark.hpp" #include "transpose_data.hpp" - #include "execution_types/execution_intel.hpp" #include "execution_types/execution_intel_pq.hpp" #include "execution_types/execution_pcie.hpp" @@ -113,6 +112,7 @@ public hpcc_base::HpccFpgaBenchmark executeKernel(TransposeData &data) override { switch (this->executionSettings->programSettings->communicationType) { +#ifdef USE_OCL_HOST case hpcc_base::CommunicationType::intel_external_channels: if (this->executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { return transpose::fpga_execution::intel::calculate(*(this->executionSettings), data); @@ -127,6 +127,7 @@ public hpcc_base::HpccFpgaBenchmarkexecutionSettings), data, reinterpret_cast&>(*this->dataHandler)); } break; +#endif #ifdef MKL_FOUND case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*(this->executionSettings), data, *dataHandler); break; #endif diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 3f3ada79..fdb8ca2f 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -9,9 +9,7 @@ endif() if (USE_XRT_HOST) list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp) endif() -if (USE_OCL_HOST) - list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) -endif() +list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES}) if (USE_ACCL) target_include_directories(hpcc_fpga_base PRIVATE ${ACCL_INCLUDE_PATH}) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index f135fc30..b16994f2 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -498,22 +498,29 @@ class HpccFpgaBenchmark { std::unique_ptr accl; #endif if (!programSettings->testOnly) { -// usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, -// programSettings->defaultDevice); +#ifdef USE_XRT_HOST + usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultDevice); + context = false; + program = fpga_setup::fpgaSetup(usedDevice); +#endif #ifdef USE_OCL_HOST -// context = std::unique_ptr(new cl::Context(*usedDevice)); -// program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, -// &programSettings->kernelFileName); + usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, + programSettings->defaultDevice); + context = std::unique_ptr(new cl::Context(*usedDevice)); + program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, + &programSettings->kernelFileName); #endif #ifdef USE_ACCL - xrt::device dev; - xrt::uuid *program; - accl = fpga_setup::fpgaSetupACCL(dev, *program); + accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); #endif } executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice), - std::move(context), std::move(program), std::move(accl))); + std::move(context), std::move(program) +#ifdef USE_ACCL + , std::move(accl) +#endif + )); if (mpi_comm_rank == 0) { if (!checkInputParameters()) { std::cerr << "ERROR: Input parameter check failed!" 
<< std::endl; diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index aba9b8b2..6d08a26f 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -101,7 +101,7 @@ Converts the received OpenCL error to a string CL_ERR_TO_STR(CL_INVALID_DEVICE_PARTITION_COUNT); default: - return "UNKNOWN ERROR CODE"; + return "UNKNOWN ERROR CODE: " + std::to_string(err); } } diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 01d012e3..4abb8533 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -13,7 +13,7 @@ /* External libraries */ #include "parameters.h" - +#include "xrt.h" #ifdef _USE_MPI_ #include "mpi.h" #endif @@ -21,24 +21,32 @@ namespace fpga_setup { std::unique_ptr - fpgaSetupACCL(xrt::device &context, + fpgaSetupACCL(xrt::device &device, xrt::uuid &program) { int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); int current_size; MPI_Comm_size(MPI_COMM_WORLD, & current_size); - - std::vector ranks = {}; + + std::vector ranks = {}; for (int i = 0; i < current_size; ++i) { - ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, + // TODO: Replace the IP addresses and ports here for execution on real hardware? + ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, 1024}; ranks.emplace_back(new_rank); } - // TODO: Add start port here. Currently hardcoded! +#ifdef ACCL_HARDWARE_SUPPORT + auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}"); + auto hostctl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", + xrt::kernel::cu_access_mode::exclusive); + return std::unique_ptr(new ACCL::ACCL(ranks, rank, device, cclo_ip, hostctrl_ip, 0, {0}, 0); +#else + // TODO: Add start port here. Currently hardcoded!
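+ // Hedged illustration (example values assumed): with current_size = 3 the
+ // loop above yields ranks on 127.0.0.1 with session ports 5500, 5501 and 5502,
+ // so rank 1 connects to its emulator instance below via "tcp://localhost:5501".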
return std::unique_ptr(new ACCL::ACCL(ranks, current_rank, "tcp://localhost:" + std::to_string(5500 + current_rank))); - } +#endif + } } // namespace fpga_setup diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp index 1a9135bb..f04e90aa 100644 --- a/shared/setup/fpga_setup_xrt.cpp +++ b/shared/setup/fpga_setup_xrt.cpp @@ -29,7 +29,7 @@ namespace fpga_setup { int current_size; MPI_Comm_size(MPI_COMM_WORLD, & current_size); - return std::unique_ptr(new device.load_xclbin(kernelFileName)); + return std::make_unique(std::move(device.load_xclbin(kernelFileName))); } std::unique_ptr From 5525b086c0810d85df743a05fe095d3ec1e3f0bc Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 14 Apr 2022 16:57:30 +0100 Subject: [PATCH 018/318] Update PTRANS for XRT only execution --- PTRANS/src/host/data_handlers/diagonal.hpp | 8 +- PTRANS/src/host/data_handlers/handler.hpp | 6 +- PTRANS/src/host/data_handlers/pq.hpp | 12 +- .../host/execution_types/execution_cpu.hpp | 2 +- .../host/execution_types/execution_intel.hpp | 2 +- .../execution_types/execution_intel_pq.hpp | 2 +- .../host/execution_types/execution_pcie.hpp | 2 +- .../execution_types/execution_pcie_pq.hpp | 2 +- .../execution_types/execution_xrt_accl_pq.hpp | 230 +++++++++++++++++ .../execution_types/execution_xrt_pcie_pq.hpp | 238 ++++++++++++++++++ PTRANS/src/host/transpose_benchmark.cpp | 176 ------------- PTRANS/src/host/transpose_benchmark.hpp | 30 ++- PTRANS/src/host/transpose_data.cpp | 44 ---- PTRANS/src/host/transpose_data.hpp | 48 +++- 14 files changed, 554 insertions(+), 248 deletions(-) create mode 100644 PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp create mode 100644 PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp delete mode 100644 PTRANS/src/host/transpose_benchmark.cpp diff --git a/PTRANS/src/host/data_handlers/diagonal.hpp b/PTRANS/src/host/data_handlers/diagonal.hpp index a2c702c0..9f601105 100644 --- a/PTRANS/src/host/data_handlers/diagonal.hpp +++ b/PTRANS/src/host/data_handlers/diagonal.hpp @@ -69,7 +69,7 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler The generated data */ - std::unique_ptr + std::unique_ptr> generateData(hpcc_base::ExecutionSettings& settings) override { MPI_Type_contiguous(settings.programSettings->blockSize * settings.programSettings->blockSize, MPI_FLOAT, &data_block); MPI_Type_commit(&data_block); @@ -119,7 +119,7 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); + auto d = std::unique_ptr>(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); // Fill the allocated memory with pseudo random values std::mt19937 gen(this->mpi_comm_rank); @@ -142,7 +142,7 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler& data) override { #ifndef NDEBUG // std::cout << "Start data exchange " << mpi_comm_rank << std::endl; @@ -185,7 +185,7 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler& data) { size_t block_offset = data.blockSize * data.blockSize; for (size_t b = 0; b < data.numBlocks; b++) { for (size_t i = 0; i < data.blockSize; i++) { diff --git a/PTRANS/src/host/data_handlers/handler.hpp b/PTRANS/src/host/data_handlers/handler.hpp index b71597bd..646fcdbf 100644 --- a/PTRANS/src/host/data_handlers/handler.hpp +++ b/PTRANS/src/host/data_handlers/handler.hpp @@ -68,7 +68,7 @@ class 
TransposeDataHandler { * @param settings The execution settings that contain information about the data size * @return std::unique_ptr The generated data */ - virtual std::unique_ptr + virtual std::unique_ptr> generateData(hpcc_base::ExecutionSettings& settings) = 0; /** @@ -78,10 +78,10 @@ class TransposeDataHandler { * Exchanged data will be stored in the same object. */ virtual void - exchangeData(TransposeData& data) = 0; + exchangeData(TransposeData& data) = 0; virtual void - reference_transpose(TransposeData& data) = 0; + reference_transpose(TransposeData& data) = 0; /** * @brief Construct a new Transpose Data Handler object and initialize the MPI rank and MPI size variables if MPI is used diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp index d065150b..0e28c109 100644 --- a/PTRANS/src/host/data_handlers/pq.hpp +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -136,7 +136,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler The generated data */ - std::unique_ptr + std::unique_ptr> generateData(hpcc_base::ExecutionSettings& settings) override { int width_in_blocks = settings.programSettings->matrixSize / settings.programSettings->blockSize; global_width = width_in_blocks; @@ -165,15 +165,15 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); + auto d = std::unique_ptr>(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); // Fill the allocated memory with pseudo random values std::mt19937 gen(this->mpi_comm_rank); std::uniform_real_distribution<> dis(-100.0, 100.0); for (size_t i = 0; i < blocks_per_rank * settings.programSettings->blockSize; i++) { for (size_t j = 0; j < settings.programSettings->blockSize; j++) { - d->A[i * settings.programSettings->blockSize + j] = dis(gen); - d->B[i * settings.programSettings->blockSize + j] = dis(gen); + d->A[i * settings.programSettings->blockSize + j] = i * settings.programSettings->blockSize + j;//dis(gen); + d->B[i * settings.programSettings->blockSize + j] = 0.0; //dis(gen); d->result[i * settings.programSettings->blockSize + j] = 0.0; } } @@ -188,7 +188,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler& data) override { MPI_Status status; @@ -371,7 +371,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler& data) { for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { data.A[i * height_per_rank * data.blockSize + j] -= (data.result[j * width_per_rank * data.blockSize + i] - data.B[j * width_per_rank * data.blockSize + i]); diff --git a/PTRANS/src/host/execution_types/execution_cpu.hpp b/PTRANS/src/host/execution_types/execution_cpu.hpp index 130b016e..bc148d98 100644 --- a/PTRANS/src/host/execution_types/execution_cpu.hpp +++ b/PTRANS/src/host/execution_types/execution_cpu.hpp @@ -52,7 +52,7 @@ namespace transpose */ template static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) + calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) { int err; diff --git a/PTRANS/src/host/execution_types/execution_intel.hpp b/PTRANS/src/host/execution_types/execution_intel.hpp index 58f5a73f..64c996e0 100644 --- 
a/PTRANS/src/host/execution_types/execution_intel.hpp +++ b/PTRANS/src/host/execution_types/execution_intel.hpp @@ -43,7 +43,7 @@ namespace intel { * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) { + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::diagonal) { diff --git a/PTRANS/src/host/execution_types/execution_intel_pq.hpp b/PTRANS/src/host/execution_types/execution_intel_pq.hpp index f1f4add4..9c8bf557 100644 --- a/PTRANS/src/host/execution_types/execution_intel_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_intel_pq.hpp @@ -44,7 +44,7 @@ namespace intel_pq { * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { diff --git a/PTRANS/src/host/execution_types/execution_pcie.hpp b/PTRANS/src/host/execution_types/execution_pcie.hpp index 97bd910f..2e607a97 100644 --- a/PTRANS/src/host/execution_types/execution_pcie.hpp +++ b/PTRANS/src/host/execution_types/execution_pcie.hpp @@ -49,7 +49,7 @@ namespace transpose * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) + calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) { int err; diff --git a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp index c369e9cb..9d7d0b45 100644 --- a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp @@ -45,7 +45,7 @@ namespace pcie_pq { * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp new file mode 100644 index 00000000..ce42dd6f --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -0,0 +1,230 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is 
furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_ACCL_PQ_EXECUTION_H_ +#define SRC_HOST_ACCL_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "transpose_benchmark.hpp" +#include "data_handlers/data_handler_types.h" +#include "data_handlers/pq.hpp" + +namespace transpose { +namespace fpga_execution { +namespace accl_pq { + + /** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and PCIe+MPI over the host for communication + * + * @param config The program configuration + * @param data data object that contains all required data for the execution on the FPGA + * @param handler data handler instance that should be used to exchange data between hosts + * @return std::unique_ptr The measured execution times + */ +static std::unique_ptr + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation of this communication method"); +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + throw new std::runtime_error("Using the Write Rect method is not supported in this host implementation of this communication method"); +#endif + + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to process.
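+ // Hedged worked example (counts assumed): with 10 local blocks and 4 kernel
+ // replications, the integer division below gives 2 blocks per replication and
+ // a remainder of 2, so replications r = 0 and r = 1 process 3 blocks each
+ // while r = 2 and r = 3 process 2 blocks each.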
+ size_t blocks_per_replication = (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width * local_matrix_width * data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width * local_matrix_width; + + int memory_bank_info_a = 0; + int memory_bank_info_b = 0; + int memory_bank_info_out = 0; + + // create the kernels + xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); + + + xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); + xrt::bo bufferB(*config.device, data.B + bufferStartList[r] * data.blockSize * data.blockSize, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); + + auto run = transposeKernel(bufferA, bufferB, bufferA_out, static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), + static_cast(blocks_per_replication), static_cast(handler.getWidthforRank()), + static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize))); + + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + + auto startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast> + (endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) + { + bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } + + + // Exchange A data via PCIe and MPI + handler.exchangeData(data); + + for (int r = 0; r < transposeKernelList.size(); r++) + { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) + { + runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), + static_cast((bufferSizeList[r]) / (local_matrix_width * 
data.blockSize * data.blockSize)))); + } + for (int r = 0; r < transposeKernelList.size(); r++) + { + runs[r].wait(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " << std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() + << "s (" << ((config.programSettings->matrixSize * config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * 3) + / std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() * 1.0e-9) << " GB/s)" << std::endl; +#endif + + // Transfer back data for next repetition! + handler.exchangeData(data); + + std::chrono::duration calculationTime = + std::chrono::duration_cast> + (endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * data.blockSize * data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + col]; + } + } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + local_matrix_width * data.blockSize * data.blockSize, tmp_write_buffer.begin() + bufferSizeList[r],&data.result[(bufferStartList[r] + local_matrix_width) * data.blockSize * data.blockSize]); + } + else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(data.result + bufferStartList[r] * data.blockSize * data.blockSize); + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast> + (endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result(new transpose::TransposeExecutionTimings{ + transferTimings, + calculationTimings + }); + + return result; + } + +} // namespace transpose +} // namespace fpga_execution +} // namespace intel + +#endif diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp new file mode 100644 index 00000000..8629af01 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -0,0 +1,238 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_XRT_PCIE_PQ_EXECUTION_H_ +#define SRC_HOST_XRT_PCIE_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "transpose_benchmark.hpp" +#include "data_handlers/data_handler_types.h" +#include "data_handlers/pq.hpp" + +namespace transpose { +namespace fpga_execution { +namespace pcie_pq { + + /** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and PCIe+MPI over the host for communication + * + * @param config The program configuration + * @param data data object that contains all required data for the execution on the FPGA + * @param handler data handler instance that should be used to exchange data between hosts + * @return std::unique_ptr The measured execution times + */ +static std::unique_ptr + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation of this communication method"); +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + throw new std::runtime_error("Using the Write Rect method is not supported in this host implementation of this communication method"); +#endif + + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to process.
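+ // Hedged worked example for the buffer sizing a few lines below (numbers
+ // assumed): with blocks_per_replication = 5, local_matrix_width = 4 and
+ // blockSize = 8, buffer_size rounds the blocks up to full block rows,
+ // (5 + 4 - 1) / 4 * 4 = 8 blocks, i.e. 8 * 8 * 8 = 512 values per replication.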
+ size_t blocks_per_replication = (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width * local_matrix_width * data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width * local_matrix_width; + + int memory_bank_info_a = 0; + int memory_bank_info_b = 0; + int memory_bank_info_out = 0; + + // create the kernels + xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); + + + xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); + xrt::bo bufferB(*config.device, data.B + bufferStartList[r] * data.blockSize * data.blockSize, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); + + auto run = transposeKernel(bufferA, bufferB, bufferA_out, static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), + static_cast(blocks_per_replication), static_cast(handler.getWidthforRank()), + static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize))); + + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + + auto startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast> + (endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) + { + bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } + + + // Exchange A data via PCIe and MPI + handler.exchangeData(data); + + for (int r = 0; r < transposeKernelList.size(); r++) + { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) + { + runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), + static_cast((bufferSizeList[r]) / (local_matrix_width * 
data.blockSize * data.blockSize)))); + } + for (int r = 0; r < transposeKernelList.size(); r++) + { + runs[r].wait(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " << std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() + << "s (" << ((config.programSettings->matrixSize * config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * 3) + / std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() * 1.0e-9) << " GB/s)" << std::endl; +#endif + + // Transfer back data for next repetition! + handler.exchangeData(data); + + std::chrono::duration calculationTime = + std::chrono::duration_cast> + (endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * data.blockSize * data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + col]; + } + } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + local_matrix_width * data.blockSize * data.blockSize, tmp_write_buffer.begin() + bufferSizeList[r],&data.result[(bufferStartList[r] + local_matrix_width) * data.blockSize * data.blockSize]); + } + else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(data.result + bufferStartList[r] * data.blockSize * data.blockSize); + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast> + (endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result(new transpose::TransposeExecutionTimings{ + transferTimings, + calculationTimings + }); + + for (int i=0; i < local_matrix_height; i++) { + for (int j=0; j < local_matrix_width; j++) { + std::cout << data.result[i * local_matrix_width + j] << ","; + } + std::cout << std::endl; + } + std::cout << std::endl; + + return result; + } + +} // namespace transpose +} // namespace fpga_execution +} // namespace intel + +#endif diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp deleted file mode 100644 index e66b3a36..00000000 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// Created by Marius Meyer on 04.12.19. 
-// - -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#include "transpose_benchmark.hpp" - -/* C++ standard library headers */ -#include -#include - -/* Project's headers */ -#include "execution_types/execution_intel.hpp" -#include "execution_types/execution_intel_pq.hpp" -#include "execution_types/execution_pcie.hpp" -#include "execution_types/execution_pcie_pq.hpp" -#include "execution_types/execution_cpu.hpp" -#include "communication_types.hpp" - -#include "data_handlers/data_handler_types.h" -#include "data_handlers/diagonal.hpp" -#include "data_handlers/pq.hpp" - -#include "parameters.h" - - -transpose::TransposeBenchmark::TransposeBenchmark(int argc, char* argv[]) : HpccFpgaBenchmark(argc, argv) { - if (setupBenchmark(argc, argv)) { - setTransposeDataHandler(executionSettings->programSettings->dataHandlerIdentifier); - } -} - -void -transpose::TransposeBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { - options.add_options() - ("m", "Matrix size in number of blocks in one dimension", - cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) - ("b", "Block size in number of values in one dimension", - cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))) - ("p", "Value of P that equals the width of the PQ grid of FPGAs. Q is determined by the world size.", - cxxopts::value()->default_value(std::to_string(DEFAULT_P_VALUE))) - ("distribute-buffers", "Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. 
This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs.") - ("handler", "Specify the used data handler that distributes the data over devices and memory banks", - cxxopts::value()->default_value(DEFAULT_DIST_TYPE)); -} - -std::unique_ptr -transpose::TransposeBenchmark::executeKernel(TransposeData &data) { - switch (executionSettings->programSettings->communicationType) { - case hpcc_base::CommunicationType::intel_external_channels: - if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { - return transpose::fpga_execution::intel::calculate(*executionSettings, data); - } - else { - return transpose::fpga_execution::intel_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); - } break; - case hpcc_base::CommunicationType::pcie_mpi : - if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { - return transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler); - } - else { - return transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); - } break; -#ifdef MKL_FOUND - case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break; -#endif - default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType)); - } -} - -void -transpose::TransposeBenchmark::collectAndPrintResults(const transpose::TransposeExecutionTimings &output) { - double flops = static_cast(executionSettings->programSettings->matrixSize) * executionSettings->programSettings->matrixSize; - - // Number of experiment repetitions - uint number_measurements = output.calculationTimings.size(); - std::vector max_measures(number_measurements); - std::vector max_transfers(number_measurements); -#ifdef _USE_MPI_ - // Copy the object variable to a local variable to make it accessible to the lambda function - int mpi_size = mpi_comm_size; - MPI_Reduce(output.calculationTimings.data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - MPI_Reduce(output.transferTimings.data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); -#else - std::copy(output.calculationTimings.begin(), output.calculationTimings.end(), max_measures.begin()); - std::copy(output.transferTimings.begin(), output.transferTimings.end(), max_transfers.begin()); -#endif - - double avgCalculationTime = accumulate(max_measures.begin(), max_measures.end(), 0.0) - / max_measures.size(); - double minCalculationTime = *min_element(max_measures.begin(), max_measures.end()); - - double avgTransferTime = accumulate(max_transfers.begin(), max_transfers.end(), 0.0) - / max_transfers.size(); - double minTransferTime = *min_element(max_transfers.begin(), max_transfers.end()); - - double avgCalcFLOPS = flops / avgCalculationTime; - double maxCalcFLOPS = flops / minCalculationTime; - double avgMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime; - double maxMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime; - double avgTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime; - double maxTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime; - - - - - if (mpi_comm_rank == 0) { - std::cout << " total [s] transfer [s] 
calc [s] calc FLOPS Mem [B/s] PCIe [B/s]" << std::endl; - std::cout << "avg: " << (avgTransferTime + avgCalculationTime) - << " " << avgTransferTime - << " " << avgCalculationTime - << " " << avgCalcFLOPS - << " " << avgMemBandwidth - << " " << avgTransferBandwidth - << std::endl; - std::cout << "best: " << (minTransferTime + minCalculationTime) - << " " << minTransferTime - << " " << minCalculationTime - << " " << maxCalcFLOPS - << " " << maxMemBandwidth - << " " << maxTransferBandwidth - << std::endl; - } -} - -std::unique_ptr -transpose::TransposeBenchmark::generateInputData() { - return dataHandler->generateData(*executionSettings); -} - -bool -transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeData &data) { - - // exchange the data using MPI depending on the chosen distribution scheme - dataHandler->exchangeData(data); - - dataHandler->reference_transpose(data); - - double max_error = 0.0; - for (size_t i = 0; i < executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize * data.numBlocks; i++) { - max_error = std::max(fabs(data.A[i]), max_error); - } - - double global_max_error = 0; - MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - - if (mpi_comm_rank == 0) { - std::cout << "Maximum error: " << global_max_error << " < " << 100 * std::numeric_limits::epsilon() << std::endl; - std::cout << "Mach. Epsilon: " << std::numeric_limits::epsilon() << std::endl; - } - - return static_cast(global_max_error) < 100 * std::numeric_limits::epsilon(); -} - -} diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 148adc7f..d1ab4340 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -31,10 +31,18 @@ SOFTWARE. 
#include "parameters.h" #include "hpcc_benchmark.hpp" #include "transpose_data.hpp" +#ifdef USE_OCL_HOST #include "execution_types/execution_intel.hpp" #include "execution_types/execution_intel_pq.hpp" #include "execution_types/execution_pcie.hpp" #include "execution_types/execution_pcie_pq.hpp" +#endif +#ifdef USE_XRT_HOST +#include "execution_types/execution_xrt_pcie_pq.hpp" +#ifdef USE_ACCL +#include "execution_types/execution_xrt_accl_pq.hpp" +#endif +#endif #include "execution_types/execution_cpu.hpp" #include "communication_types.hpp" @@ -54,7 +62,7 @@ namespace transpose { */ template class TransposeBenchmark : -public hpcc_base::HpccFpgaBenchmark { +public hpcc_base::HpccFpgaBenchmark, TransposeExecutionTimings> { protected: /** @@ -85,7 +93,7 @@ public hpcc_base::HpccFpgaBenchmark The input and output data of the benchmark */ - std::unique_ptr + std::unique_ptr> generateInputData() override { return this->dataHandler->generateData(*(this->executionSettings)); } @@ -110,7 +118,7 @@ public hpcc_base::HpccFpgaBenchmark Measured runtimes of the kernel execution */ std::unique_ptr - executeKernel(TransposeData &data) override { + executeKernel(TransposeData &data) override { switch (this->executionSettings->programSettings->communicationType) { #ifdef USE_OCL_HOST case hpcc_base::CommunicationType::intel_external_channels: @@ -128,6 +136,14 @@ public hpcc_base::HpccFpgaBenchmarkexecutionSettings), data, reinterpret_cast&>(*this->dataHandler)); } break; #endif +#ifdef USE_XRT_HOST + case hpcc_base::CommunicationType::pcie_mpi: + return transpose::fpga_execution::pcie_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; +#ifdef USE_ACCL + case hpcc_base::CommunicationType::accl: + return transpose::fpga_execution::accl_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; +#endif +#endif #ifdef MKL_FOUND case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*(this->executionSettings), data, *dataHandler); break; #endif @@ -143,7 +159,7 @@ public hpcc_base::HpccFpgaBenchmark &data) override { // exchange the data using MPI depending on the chosen distribution scheme this->dataHandler->exchangeData(data); @@ -152,7 +168,7 @@ public hpcc_base::HpccFpgaBenchmarkexecutionSettings->programSettings->blockSize * this->executionSettings->programSettings->blockSize * data.numBlocks; i++) { - max_error = std::max(fabs(data.A[i]), max_error); + max_error = std::max(std::abs(data.A[i]), max_error); } double global_max_error = 0; @@ -229,7 +245,7 @@ public hpcc_base::HpccFpgaBenchmark(argc, argv) { + TransposeBenchmark(int argc, char* argv[]) : hpcc_base::HpccFpgaBenchmark, transpose::TransposeExecutionTimings>(argc, argv) { if (this->setupBenchmark(argc, argv)) { this->setTransposeDataHandler(this->executionSettings->programSettings->dataHandlerIdentifier); } @@ -238,7 +254,7 @@ public hpcc_base::HpccFpgaBenchmark() {} + TransposeBenchmark() : hpcc_base::HpccFpgaBenchmark, transpose::TransposeExecutionTimings>() {} }; diff --git a/PTRANS/src/host/transpose_data.cpp b/PTRANS/src/host/transpose_data.cpp index af794f30..20d6560f 100644 --- a/PTRANS/src/host/transpose_data.cpp +++ b/PTRANS/src/host/transpose_data.cpp @@ -37,47 +37,3 @@ transpose::TransposeProgramSettings::getSettingsMap() { return map; } -transpose::TransposeData::TransposeData(cl::Context context, uint block_size, uint y_size) : context(context), - numBlocks(y_size), blockSize(block_size) { - if (numBlocks * 
blockSize > 0) { -#ifdef USE_SVM - A = reinterpret_cast( - clSVMAlloc(context(), 0 , - block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); - B = reinterpret_cast( - clSVMAlloc(context(), 0 , - block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); - result = reinterpret_cast( - clSVMAlloc(context(), 0 , - block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); - exchange = reinterpret_cast( - clSVMAlloc(context(), 0 , - block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); -#else - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); - posix_memalign(reinterpret_cast(&B), 64, - sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); - posix_memalign(reinterpret_cast(&result), 64, - sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); - posix_memalign(reinterpret_cast(&exchange), 64, - sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); -#endif - } -} - -transpose::TransposeData::~TransposeData() { - if (numBlocks * blockSize > 0) { -#ifdef USE_SVM - clSVMFree(context(), reinterpret_cast(A));}); - clSVMFree(context(), reinterpret_cast(B));}); - clSVMFree(context(), reinterpret_cast(result));}); - clSVMFree(context(), reinterpret_cast(exchange));}); -#else - free(A); - free(B); - free(result); - free(exchange); -#endif - } -} diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index a223353f..c73a9959 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -94,6 +94,7 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { * @brief Data class containing the data the kernel is executed with * */ +template class TransposeData { public: @@ -138,7 +139,7 @@ class TransposeData { * @brief The context that is used to allocate memory in SVM mode * */ - cl::Context context; + TContext context; /** * @brief Construct a new Transpose Data object * @@ -147,13 +148,54 @@ class TransposeData { * @param block_size size of the quadratic blocks that are stored within this object * @param y_size number of blocks that are stored within this object per replication */ - TransposeData(cl::Context context, uint block_size, uint size_y); + TransposeData(TContext context, uint block_size, uint y_size): context(context), + numBlocks(y_size), blockSize(block_size) { + if (numBlocks * blockSize > 0) { +#ifdef USE_SVM + A = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 4096)); + B = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 4096)); + result = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 4096)); + exchange = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 4096)); +#else + posix_memalign(reinterpret_cast(&A), 4096, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); + posix_memalign(reinterpret_cast(&B), 4096, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); + posix_memalign(reinterpret_cast(&result), 4096, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); + posix_memalign(reinterpret_cast(&exchange), 4096, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); +#endif + } + } /** * @brief Destroy the Transpose Data object.
Free the allocated memory * */ - ~TransposeData(); + ~TransposeData() { + if (numBlocks * blockSize > 0) { +#ifdef USE_SVM + clSVMFree(context(), reinterpret_cast(A));}); + clSVMFree(context(), reinterpret_cast(B));}); + clSVMFree(context(), reinterpret_cast(result));}); + clSVMFree(context(), reinterpret_cast(exchange));}); +#else + free(A); + free(B); + free(result); + free(exchange); +#endif + } + } }; From 80a6e5721c108ffcb825dd7d2dc3233f3c945360 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 14 Apr 2022 16:58:38 +0100 Subject: [PATCH 019/318] Support for XRT w/o ACCL in base code --- cmake/general_benchmark_build_setup.cmake | 4 +- extern/CMakeLists.txt | 6 ++- shared/CMakeLists.txt | 4 ++ shared/include/hpcc_benchmark.hpp | 15 ++++-- shared/include/setup/fpga_setup.hpp | 20 ++++--- shared/setup/fpga_setup.cpp | 66 ++++++++++++----------- shared/setup/fpga_setup_accl.cpp | 8 +-- shared/setup/fpga_setup_xrt.cpp | 5 +- 8 files changed, 77 insertions(+), 51 deletions(-) diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 441b6f41..1537b092 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -91,7 +91,9 @@ endif() if (USE_ACCL) add_definitions(-DUSE_ACCL) endif() - +if (USE_XRT_HOST) + add_definitions(-DUSE_XRT_HOST) +endif() if (USE_OCL_HOST) add_definitions(-DUSE_OCL_HOST) endif() diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 77d5e3ac..18f03f37 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -55,13 +55,14 @@ if(NOT extern_cxxopts_POPULATED) EXCLUDE_FROM_ALL) endif() +if (DEFINED USE_ACCL) # ------------------------------------------------------------------------------- # ACCL Library FetchContent_Declare( extern_accl - GIT_REPOSITORY https://github.com/Mellich/ACCL.git - GIT_TAG dev) + GIT_REPOSITORY https://github.com/TristanLaan/ACCL.git + GIT_TAG xrt_hardware_support) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) @@ -69,3 +70,4 @@ if(NOT extern_accl_POPULATED) FetchContent_Populate(extern_accl) set(extern_accl_SOURCE_DIR ${extern_accl_SOURCE_DIR} PARENT_SCOPE) endif() +endif() diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index fdb8ca2f..43749c0a 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -15,6 +15,10 @@ if (USE_ACCL) target_include_directories(hpcc_fpga_base PRIVATE ${ACCL_INCLUDE_PATH}) target_link_libraries(hpcc_fpga_base accl) endif() +if (USE_XRT_HOST) + target_link_directories(hpcc_fpga_base PUBLIC ${XRT_SEARCH_PATH}) + target_link_libraries(hpcc_fpga_base xrt_coreutil xrt_core) +endif() find_package(OpenCL QUIET) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index b16994f2..0bf160f6 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -38,6 +38,9 @@ SOFTWARE. 
#ifdef USE_ACCL #include "setup/fpga_setup_accl.hpp" #endif +#ifdef USE_XRT_HOST +#include "setup/fpga_setup_xrt.hpp" +#endif #include "setup/fpga_setup.hpp" #include "cxxopts.hpp" #include "parameters.h" @@ -500,8 +503,8 @@ class HpccFpgaBenchmark { if (!programSettings->testOnly) { #ifdef USE_XRT_HOST usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultDevice); - context = false; - program = fpga_setup::fpgaSetup(usedDevice); + context = std::unique_ptr(new bool(false)); + program = fpga_setup::fpgaSetup(*usedDevice, programSettings->kernelFileName); #endif #ifdef USE_OCL_HOST usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, @@ -511,7 +514,7 @@ class HpccFpgaBenchmark { &programSettings->kernelFileName); #endif #ifdef USE_ACCL - accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); + //accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); #endif } @@ -696,9 +699,13 @@ std::ostream& operator<<(std::ostream& os, ExecutionSettingstestOnly) { -#ifndef USE_ACCL +#ifdef USE_OCL_HOST printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name); #endif +#ifdef USE_XRT_HOST + device_name = printedExecutionSettings.device->template get_info(); +#endif + } else { device_name = "TEST RUN: Not selected!"; diff --git a/shared/include/setup/fpga_setup.hpp b/shared/include/setup/fpga_setup.hpp index 0799900c..1aa77117 100644 --- a/shared/include/setup/fpga_setup.hpp +++ b/shared/include/setup/fpga_setup.hpp @@ -30,13 +30,14 @@ SOFTWARE. #include #include +#ifdef USE_OCL_HOST /* External libraries */ #ifdef USE_DEPRECATED_HPP_HEADER #include "CL/cl.hpp" #else #include OPENCL_HPP_HEADER #endif - +#endif /** Macro to convert the error integer representation to its string representation @@ -74,6 +75,7 @@ class FpgaSetupException : public std::exception std::string error_message; }; +#ifdef USE_OCL_HOST /** * @brief Exception that is thrown if the ASSERT_CL failed * @@ -134,13 +136,6 @@ Sets up the given FPGA with the kernel in the provided file. fpgaSetup(const cl::Context *context, std::vector deviceList, const std::string *usedKernelFile); -/** -Sets up the C++ environment by configuring std::cout and checking the clock -granularity using bm_helper::checktick() -*/ - void - setupEnvironmentAndClocks(); - /** Searches and selects an FPGA device using the CL library functions. @@ -159,5 +154,14 @@ choose a device. std::unique_ptr selectFPGADevice(int defaultPlatform, int defaultDevice); + +#endif +/** +Sets up the C++ environment by configuring std::cout and checking the clock +granularity using bm_helper::checktick() +*/ + void + setupEnvironmentAndClocks(); + } // namespace fpga_setup #endif // SRC_HOST_FPGA_SETUP_H_ diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index 6d08a26f..53ce4f55 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -28,6 +28,9 @@ FpgaSetupException::what() const noexcept return error_message.c_str(); } + +#ifdef USE_OCL_HOST + OpenClException::OpenClException(std::string error_name) : FpgaSetupException("An OpenCL error occurred: " + error_name) {} @@ -177,37 +180,6 @@ Sets up the given FPGA with the kernel in the provided file.
return std::unique_ptr(new cl::Program(program)); } -/** -Sets up the C++ environment by configuring std::cout and checking the clock -granularity using bm_helper::checktick() -*/ - void - setupEnvironmentAndClocks() { - std::cout << std::setprecision(5) << std::scientific; - - int world_rank = 0; - -#ifdef _USE_MPI_ - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); -#endif - - if (world_rank == 0) { - std::cout << HLINE; - std::cout << "General setup:" << std::endl; - - // Check clock granularity and output result - std::cout << "C++ high resolution clock is used." << std::endl; - std::cout << "The clock precision seems to be " - << static_cast - (std::chrono::high_resolution_clock::period::num) / - std::chrono::high_resolution_clock::period::den * 10e9 - << "ns" << std::endl; - - std::cout << HLINE; - } - } - - /** Searches an selects an FPGA device using the CL library functions. If multiple platforms or devices are given, the user will be prompted to @@ -321,4 +293,36 @@ choose a device. return std::unique_ptr(new cl::Device(deviceList[chosenDeviceId])); } + +#endif +/** +Sets up the C++ environment by configuring std::cout and checking the clock +granularity using bm_helper::checktick() +*/ + void + setupEnvironmentAndClocks() { + std::cout << std::setprecision(5) << std::scientific; + + int world_rank = 0; + +#ifdef _USE_MPI_ + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); +#endif + + if (world_rank == 0) { + std::cout << HLINE; + std::cout << "General setup:" << std::endl; + + // Check clock granularity and output result + std::cout << "C++ high resolution clock is used." << std::endl; + std::cout << "The clock precision seems to be " + << static_cast + (std::chrono::high_resolution_clock::period::num) / + std::chrono::high_resolution_clock::period::den * 10e9 + << "ns" << std::endl; + + std::cout << HLINE; + } + } + } // namespace fpga_setup diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 4abb8533..cbd98ede 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -13,7 +13,8 @@ /* External libraries */ #include "parameters.h" -#include "xrt.h" +#include "experimental/xrt_ip.h" +#include "xrt/xrt_kernel.h" #ifdef _USE_MPI_ #include "mpi.h" #endif @@ -38,9 +39,10 @@ namespace fpga_setup { } #ifdef ACCL_HARDWARE_SUPPORT auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}"); - auto hostctl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", + auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", xrt::kernel::cu_access_mode::exclusive); - return std::unique_ptr(new ACCL::ACCL(ranks, rank, device, cclo_ip, hostctrl_ip, 0, {0}, 0); + std::vector mem(1,0); + return std::unique_ptr(new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, 0)); #else // TODO: Add start port here. Currenty hardcoded! 
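// Hedged usage sketch of the overall XRT/ACCL setup chain, assembled from the
// calls shown in hpcc_benchmark.hpp (the composition is an assumption, not
// verbatim repository code):
//   auto device  = fpga_setup::selectFPGADevice(settings->defaultDevice);
//   auto program = fpga_setup::fpgaSetup(*device, settings->kernelFileName);
//   auto accl    = fpga_setup::fpgaSetupACCL(*device, *program);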
return std::unique_ptr(new ACCL::ACCL(ranks, current_rank, diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp index f04e90aa..0410fd1b 100644 --- a/shared/setup/fpga_setup_xrt.cpp +++ b/shared/setup/fpga_setup_xrt.cpp @@ -14,6 +14,7 @@ /* External libraries */ #include "parameters.h" +#include "xrt.h" #ifdef _USE_MPI_ #include "mpi.h" #endif @@ -22,14 +23,14 @@ namespace fpga_setup { std::unique_ptr fpgaSetup(xrt::device &device, - std::string &kernelFileName) { + const std::string &kernelFileName) { int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); int current_size; MPI_Comm_size(MPI_COMM_WORLD, & current_size); - return std::make_unique(std::move(device.load_xclbin(kernelFileName))); + return std::unique_ptr(new xrt::uuid(device.load_xclbin(kernelFileName))); } std::unique_ptr From 06a7150c9bca3782fcd3ecd305ad28e6b1b6781a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 14 Apr 2022 17:43:30 +0100 Subject: [PATCH 020/318] Fix kernel arguments for XRT execution --- .../execution_types/execution_xrt_pcie_pq.hpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index 8629af01..2223858e 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -108,6 +108,9 @@ static std::unique_ptr xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); xrt::bo bufferB(*config.device, data.B + bufferStartList[r] * data.blockSize * data.blockSize, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); + // TODO For small matrices, the 4KB alignment might fail for buffer B. 
Temporary fix seen in lines below (requires extra copying) + //xrt::bo bufferB(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); + //bufferB.write(data.B + bufferStartList[r] * data.blockSize * data.blockSize); xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); auto run = transposeKernel(bufferA, bufferB, bufferA_out, static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), @@ -159,9 +162,9 @@ static std::unique_ptr auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { - runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), + runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferStartList[r] + bufferOffsetList[r]),static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), - static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); + static_cast(handler.getHeightforRank()))); } for (int r = 0; r < transposeKernelList.size(); r++) { @@ -220,14 +223,6 @@ static std::unique_ptr calculationTimings }); - for (int i=0; i < local_matrix_height; i++) { - for (int j=0; j < local_matrix_width; j++) { - std::cout << data.result[i * local_matrix_width + j] << ","; - } - std::cout << std::endl; - } - std::cout << std::endl; - return result; } From ff7c70e5f882696753b9cd9d225965a30a7edd0f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 Apr 2022 10:16:34 +0100 Subject: [PATCH 021/318] Add transpose ACCL implementation --- PTRANS/src/host/data_handlers/pq.hpp | 8 + .../execution_types/execution_xrt_accl_pq.hpp | 247 +++++++++++++++--- .../execution_types/execution_xrt_pcie_pq.hpp | 4 - shared/include/hpcc_benchmark.hpp | 2 +- 4 files changed, 220 insertions(+), 41 deletions(-) diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp index 0e28c109..7fb08b6c 100644 --- a/PTRANS/src/host/data_handlers/pq.hpp +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -130,6 +130,14 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler /* Project's headers */ +#include "buffer.hpp" +#include "cclo.hpp" +#include "constants.hpp" +#include "fpgabuffer.hpp" #include "transpose_benchmark.hpp" #include "data_handlers/data_handler_types.h" #include "data_handlers/pq.hpp" +#include "transpose_data.hpp" namespace transpose { namespace fpga_execution { namespace accl_pq { + void accl_exchangeData(ACCL::ACCL &accl, transpose::data_handler::DistributedPQTransposeDataHandler &handler, + transpose::TransposeData & data, xrt::bo bufferAXrt, int global_width) { + + int pq_width = handler.getP(); + int pq_height = handler.getQ(); + int width_per_rank = handler.getWidthforRank(); + int height_per_rank = handler.getHeightforRank(); + MPI_Datatype data_block; + MPI_Type_vector(data.blockSize,data.blockSize,(handler.getWidthforRank() - 1)*data.blockSize, MPI_FLOAT, &data_block); + MPI_Type_commit(&data_block); + + int mpi_comm_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); + int pq_row = mpi_comm_rank / pq_width; + int pq_col = mpi_comm_rank % pq_width; + + auto AcclBufferA = ACCL::FPGABuffer(bufferAXrt, data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32, true, data.A); + + if (pq_width == pq_height) { + if (pq_col != 
pq_row) {
+
+ int pair_rank = pq_width * pq_col + pq_row;
+
+ // To re-calculate the matrix transposition locally on this host, we need to
+ // exchange matrix A for every kernel replication
+ // The order of the matrix blocks does not change during the exchange, because they are distributed diagonally
+ // and will be handled in the order below:
+ //
+ // . . 1 3
+ // . . . 2
+ // 1 . . .
+ // 3 2 . .
+ auto AcclBufferA_recv = accl.create_buffer(data.exchange, data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32);
+
+ // Send and receive matrix A using ACCL directly on FPGA
+ auto send = accl.send(0, AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0,true,ACCL::streamFlags::NO_STREAM, true);
+ accl.recv(0, *AcclBufferA_recv, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, true, ACCL::streamFlags::NO_STREAM);
+ send->wait();
+ // Copy received matrix from receiving buffer to A buffer completely on FPGA
+ accl.copy(*AcclBufferA_recv, AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, true, true);
+ }
+ }
+ else {
+ // Taken from "Parallel matrix transpose algorithms on distributed memory concurrent computers" by J. Choi, J. J. Dongarra, D. W. Walker
+ // and translated to C++
+ // This will do a diagonal exchange of matrix blocks.
+
+ // Determine LCM using GCD from standard library using the C++14 call
+ // In C++17 this changes to std::gcd in numeric, also std::lcm is directly available in numeric
+ int gcd = std::__gcd(pq_height, pq_width);
+ int least_common_multiple = pq_height * pq_width / gcd;
+
+ // If the global matrix size is not a multiple of the LCM block size, the numbers of send and received blocks
+ // may be wrongly calculated. Throw exception to prevent this and make aware of this issue!
+ if (global_width % least_common_multiple > 0) {
+ throw std::runtime_error("Implementation does not support matrix sizes that are not multiple of LCM blocks! Results may be wrong!");
+ }
+
+ // MPI requests for non-blocking communication
+ // First half of vector is for Isend, second half for Irecv!
+ std::vector accl_requests(2 * gcd);
+
+ // Begin algorithm from Figure 14 for general case
+ int g = transpose::data_handler::mod(pq_row - pq_col, gcd);
+ int p = transpose::data_handler::mod(pq_col + g, pq_width);
+ int q = transpose::data_handler::mod(pq_row - g, pq_height);
+
+ // Pre-calculate target ranks in LCM block
+ // The vector list variable can be interpreted as 2D matrix. Every entry represents the target rank of the sub-block
+ // Since the LCM block will repeat, we only need to store this small amount of data!
+ std::vector<int> target_list(least_common_multiple/pq_height * least_common_multiple/pq_width);
+ for (int row = 0; row < least_common_multiple/pq_height; row++) {
+ for (int col = 0; col < least_common_multiple/pq_width; col++) {
+ int global_block_col = pq_col + col * pq_width;
+ int global_block_row = pq_row + row * pq_height;
+ int destination_rank = (global_block_col % pq_height) * pq_width + (global_block_row % pq_width);
+ target_list[row * least_common_multiple/pq_width + col] = destination_rank;
+ }
+ }
+
+ // Create some ACCL buffers to send and receive from other FPGAs
+ // They can reside completely on FPGA
+ std::vector<std::unique_ptr<ACCL::Buffer<float>>> send_buffers;
+ std::vector<std::unique_ptr<ACCL::Buffer<float>>> recv_buffers;
+ for (int i = 0; i < gcd; i++) {
+ // TODO Is there a way to initialize buffer only in FPGA memory with ACCL?
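+ // Worked example for the target_list above: on a 2x4 rank grid
+ // (pq_height=2, pq_width=4), gcd=2 and least_common_multiple=4, so the list
+ // holds (4/2)*(4/4) = 2 entries per rank. For rank 1 (pq_row=0, pq_col=1)
+ // the formula yields {4, 6}: its two LCM-pattern sub-blocks belong to
+ // ranks 4 and 6 after the transposition.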
+ send_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32));
+ recv_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32));
+ }
+ int current_parallel_execution = 0;
+ for (int j = 0; j < least_common_multiple/pq_width; j++) {
+ for (int i = 0; i < least_common_multiple/pq_height; i++) {
+ // Determine sender and receiver rank of current rank for current communication step
+ int send_rank = transpose::data_handler::mod(p + i * gcd, pq_width) + transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width;
+ int recv_rank = transpose::data_handler::mod(p - i * gcd, pq_width) + transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width;
+
+ // Also count receiving buffer size because sending and receiving buffer size may differ in certain scenarios!
+ int receiving_size = 0;
+ int sending_size = 0;
+
+ std::vector<int> send_rows;
+ std::vector<int> send_cols;
+ // Look up which blocks are affected by the current rank
+ for (int row = 0; row < least_common_multiple/pq_height; row++) {
+ for (int col = 0; col < least_common_multiple/pq_width; col++) {
+ if (target_list[row * least_common_multiple/pq_width + col] == send_rank) {
+ send_rows.push_back(row);
+ send_cols.push_back(col);
+ sending_size += data.blockSize * data.blockSize;
+ }
+ if (target_list[row * least_common_multiple/pq_width + col] == recv_rank) {
+ receiving_size += data.blockSize * data.blockSize;
+ }
+ }
+ }
+ receiving_size *= (height_per_rank)/(least_common_multiple/pq_height) * ((width_per_rank)/(least_common_multiple/pq_width));
+ sending_size *= (height_per_rank)/(least_common_multiple/pq_height) * ((width_per_rank)/(least_common_multiple/pq_width));
+
+ // Copy the required data for this communication step to the send buffer!
+ for (int t=0; t < send_rows.size(); t++) {
+ for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) {
+ for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) {
+ size_t sending_buffer_offset = lcm_row * data.blockSize * data.blockSize * ((width_per_rank)/(least_common_multiple/pq_width)) + lcm_col * data.blockSize * data.blockSize;
+ size_t matrix_buffer_offset = (send_cols[t] + lcm_col * least_common_multiple/pq_width) * data.blockSize + (send_rows[t] + lcm_row * least_common_multiple/pq_height) * width_per_rank * data.blockSize * data.blockSize;
+ for (int block_row = 0; block_row < data.blockSize; block_row++) {
+ // TODO May be more efficient when done async!
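+ // Gather step: every affected block is copied row by row from matrix A into
+ // the contiguous send buffer, so the payload for one target rank leaves the
+ // FPGA as a single dense message.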
+ accl.copy(*AcclBufferA.slice(matrix_buffer_offset + block_row * width_per_rank * data.blockSize, matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize),*send_buffers[current_parallel_execution]->slice(sending_buffer_offset, sending_buffer_offset + data.blockSize),data.blockSize, true, true); + } + } + } + } + + // Do actual MPI communication +#ifndef NDEBUG + std::cout << "Rank " << mpi_comm_rank << ": blocks (" << sending_size / (data.blockSize * data.blockSize) << "," << receiving_size / (data.blockSize * data.blockSize) << ") send " << send_rank << ", recv " << recv_rank << std::endl << std::flush; +#endif + accl_requests[current_parallel_execution] = (accl.send(0, *send_buffers[current_parallel_execution], sending_size, send_rank, 0, false, ACCL::streamFlags::NO_STREAM, true)); + accl_requests[current_parallel_execution] = (accl.recv(0, *recv_buffers[current_parallel_execution], sending_size, send_rank, 0, false, ACCL::streamFlags::NO_STREAM, true)); + // Increase the counter for parallel executions + current_parallel_execution = (current_parallel_execution + 1) % gcd; + + // Wait for MPI requests if GCD MPI calls are scheduled in parallel + if ((current_parallel_execution) % gcd == 0) { + + + for (auto& req :accl_requests) { + + MPI_Status status; + int index; + + // Wait for all send and recv events to complete + // TODO do the CCLO pointers need to be freed? + accl.nop(false, accl_requests); + // For each message that was received in parallel + if (index >= gcd) { + std::vector recv_rows; + std::vector recv_cols; + // Look up which blocks are affected by the current rank + for (int row = 0; row < least_common_multiple/pq_height; row++) { + for (int col = 0; col < least_common_multiple/pq_width; col++) { + if (target_list[row * least_common_multiple/pq_width + col] == status.MPI_SOURCE) { + recv_rows.push_back(row); + recv_cols.push_back(col); + } + } + } + // Copy received data to matrix A buffer + for (int t=0; t < recv_rows.size(); t++) { + for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { + for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) { + size_t receiving_buffer_offset = lcm_row * data.blockSize * data.blockSize * ((width_per_rank)/(least_common_multiple/pq_width)) + lcm_col * data.blockSize * data.blockSize; + size_t matrix_buffer_offset = (recv_cols[t] + lcm_col * least_common_multiple/pq_width) * data.blockSize + (recv_rows[t] + lcm_row * least_common_multiple/pq_height) * width_per_rank * data.blockSize * data.blockSize; + for (int block_row = 0; block_row < data.blockSize; block_row++) { + // TODO May be more efficient when done async! 
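+ // Scatter step: received rows are written back into matrix A at the block
+ // positions assigned to the sending rank, the inverse of the gather above.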
+ accl.copy(*recv_buffers[current_parallel_execution]->slice(receiving_buffer_offset, receiving_buffer_offset + data.blockSize),*AcclBufferA.slice(matrix_buffer_offset + block_row * width_per_rank * data.blockSize, matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize), data.blockSize, true, true);
+
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+
 /**
 * @brief Transpose and add the matrices using the XRT kernels with a PQ distribution and ACCL for the communication between FPGAs
 *
 * @param config The program configuration
 * @param data data object that contains all required data for the execution on the FPGA
 * @param handler data handler instance that should be used to exchange data between hosts
 * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
 */
static std::unique_ptr<transpose::TransposeExecutionTimings>
 calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) {
 int err;

 if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) {
 throw std::runtime_error("Used data handler not supported by execution handler!");
 }
#ifdef USE_SVM
 throw new std::runtime_error("SVM not supported in the host implementation of this communication method");
#endif
#ifdef USE_BUFFER_WRITE_RECT_FOR_A
 throw new std::runtime_error("Using the Write Rect method is not supported in this host implementation of this communication method");
#endif

 std::vector<size_t> bufferSizeList;
 std::vector<size_t> bufferStartList;
 std::vector<size_t> bufferOffsetList;
 std::vector<xrt::bo> bufferListA;
 std::vector<xrt::bo> bufferListB;
 std::vector<xrt::bo> bufferListA_out;
 std::vector<xrt::kernel> transposeKernelList;
 std::vector<size_t> blocksPerReplication;

 size_t local_matrix_width = handler.getWidthforRank();
 size_t local_matrix_height = handler.getHeightforRank();
 size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE);

 size_t total_offset = 0;
 size_t row_offset = 0;
@@ -97,23 +291,14 @@
 total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width * local_matrix_width;

- int memory_bank_info_a = 0;
- int memory_bank_info_b = 0;
- int memory_bank_info_out = 0;
-
 // create the kernels
 xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str());

 xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0));
 xrt::bo bufferB(*config.device, data.B + bufferStartList[r] * data.blockSize * data.blockSize, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1));
 xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2));

- auto run = transposeKernel(bufferA, bufferB, bufferA_out, static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]),
- static_cast(blocks_per_replication), static_cast(handler.getWidthforRank()),
- static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)));
-
 bufferListA.push_back(bufferA);
 bufferListB.push_back(bufferB);
 bufferListA_out.push_back(bufferA_out);
@@ -141,33 +326,26 @@
 auto startCalculation = std::chrono::high_resolution_clock::now();
+
+ // Exchange A data via ACCL
+ if (bufferListA.size() > 1) {
+ std::cerr << "WARNING: Only the matrix A of the first kernel replication will be exchanged via ACCL!"
<< std::endl; + } + accl_exchangeData(*config.accl, handler, data, bufferListA[0], config.programSettings->matrixSize / data.blockSize); + + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), + static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); } - - - // Exchange A data via PCIe and MPI - handler.exchangeData(data); - - for (int r = 0; r < transposeKernelList.size(); r++) - { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); - } - - std::vector runs; - auto startKernelCalculation = std::chrono::high_resolution_clock::now(); - for (int r = 0; r < transposeKernelList.size(); r++) - { - runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), - static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), - static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); - } - for (int r = 0; r < transposeKernelList.size(); r++) - { - runs[r].wait(); - } - auto endCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) + { + runs[r].wait(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); #ifndef NDEBUG int mpi_rank; MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); @@ -177,9 +355,6 @@ static std::unique_ptr / std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() * 1.0e-9) << " GB/s)" << std::endl; #endif - // Transfer back data for next repetition! 
- handler.exchangeData(data); - std::chrono::duration calculationTime = std::chrono::duration_cast> (endCalculation - startCalculation); diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index 2223858e..fd3618c9 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -113,10 +113,6 @@ static std::unique_ptr //bufferB.write(data.B + bufferStartList[r] * data.blockSize * data.blockSize); xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); - auto run = transposeKernel(bufferA, bufferB, bufferA_out, static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), - static_cast(blocks_per_replication), static_cast(handler.getWidthforRank()), - static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize))); - bufferListA.push_back(bufferA); bufferListB.push_back(bufferB); bufferListA_out.push_back(bufferA_out); diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 0bf160f6..c3ec4b4c 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -514,7 +514,7 @@ class HpccFpgaBenchmark { &programSettings->kernelFileName); #endif #ifdef USE_ACCL - //accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); + accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); #endif } From 9b3d8769799b441a7522c5853fd1a2439a394632 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 Apr 2022 10:43:09 +0100 Subject: [PATCH 022/318] Change ACCL dependency to dev branch --- extern/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 18f03f37..341f73cd 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -61,8 +61,8 @@ if (DEFINED USE_ACCL) FetchContent_Declare( extern_accl - GIT_REPOSITORY https://github.com/TristanLaan/ACCL.git - GIT_TAG xrt_hardware_support) + GIT_REPOSITORY https://github.com/Xilinx/ACCL.git + GIT_TAG dev) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) From 9fe0d145318316b7581f74182ff9809e1549bf99 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 Apr 2022 10:43:44 +0100 Subject: [PATCH 023/318] Only initialize ACCL when ACCL implementation is used --- shared/include/hpcc_benchmark.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index c3ec4b4c..ab4d092d 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -514,7 +514,12 @@ class HpccFpgaBenchmark { &programSettings->kernelFileName); #endif #ifdef USE_ACCL - accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); + if (programSettings->communicationType == CommunicationType::accl) { + accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); + } + else { + accl = std::unique_ptr(nullptr); + } #endif } From f5c291aa02dc24a7d6393b73c2ce32221f0a98fb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 Apr 2022 13:34:13 +0100 Subject: [PATCH 024/318] Add YCM config to gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 17305538..e6b8e632 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ # Python virtual environments .venv +#YCM config +.ycm_extra_conf.py + # CMake build directories should be created in the following 
folder
*._*
build/*
From 72ed4cdbae552a795bcf56fe9d3fa9ea1a9b748a Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 20 Apr 2022 18:15:03 +0100
Subject: [PATCH 025/318] Update extern deps to XRT simulation branch
---
 extern/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt
index 341f73cd..7845280d 100644
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -61,8 +61,8 @@ if (DEFINED USE_ACCL)
 FetchContent_Declare(
 extern_accl

- GIT_REPOSITORY https://github.com/Xilinx/ACCL.git
- GIT_TAG dev)
+ GIT_REPOSITORY https://github.com/TristanLaan/ACCL.git
+ GIT_TAG simbuffer_bo_constructor)

 FetchContent_GetProperties(extern_accl)
 if(NOT extern_accl_POPULATED)
From 44ff497697f9be0e107c0efcd02ae03766d91f9d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 20 Apr 2022 18:15:35 +0100
Subject: [PATCH 026/318] Adapt constructor to new signature
---
 shared/setup/fpga_setup_accl.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index cbd98ede..d521264e 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -46,8 +46,7 @@ namespace fpga_setup {
 #else
 // TODO: Add start port here. Currently hardcoded!
 return std::unique_ptr<ACCL::ACCL>(new ACCL::ACCL(ranks, current_rank,
- "tcp://localhost:" +
- std::to_string(5500 + current_rank)));
+ 5500));
 #endif
 }
From 1764ff39648457f867db460f52bfdc92a0b31d9e Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 20 Apr 2022 18:16:20 +0100
Subject: [PATCH 027/318] Add debug logging and fixes of ACCL execution
---
 .../execution_types/execution_xrt_accl_pq.hpp | 52 ++++++++++++++-----
 1 file changed, 38 insertions(+), 14 deletions(-)
diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
index 832a7d37..10fb36e1 100644
--- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
@@ -48,17 +48,14 @@ namespace accl_pq {
 int pq_height = handler.getQ();
 int width_per_rank = handler.getWidthforRank();
 int height_per_rank = handler.getHeightforRank();
- MPI_Datatype data_block;
- MPI_Type_vector(data.blockSize,data.blockSize,(handler.getWidthforRank() - 1)*data.blockSize, MPI_FLOAT, &data_block);
- MPI_Type_commit(&data_block);

 int mpi_comm_rank;
 MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank);
 int pq_row = mpi_comm_rank / pq_width;
 int pq_col = mpi_comm_rank % pq_width;

- auto AcclBufferA = ACCL::FPGABuffer(bufferAXrt, data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32, true, data.A);
+ auto AcclBufferA = accl.create_buffer(bufferAXrt, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32);

 if (pq_width == pq_height) {
 if (pq_col != pq_row) {
@@ -73,14 +70,14 @@
 // . . . 2
 // 1 . . .
 // 3 2 . .
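 // In the diagram, equal numbers mark ranks that exchange blocks with each
 // other: rank (row r, col c) is paired with rank (row c, col r), i.e.
 // pair_rank = pq_width * pq_col + pq_row. In this 4x4 example, rank 2
 // (row 0, col 2) swaps its blocks with rank 8 (row 2, col 0).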
- auto AcclBufferA_recv = accl.create_buffer(data.exchange, data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32); + auto AcclBufferA_recv = accl.create_buffer(data.exchange, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); // Send and receive matrix A using ACCL directly on FPGA - auto send = accl.send(0, AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0,true,ACCL::streamFlags::NO_STREAM, true); + auto send = accl.send(0, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0,true,ACCL::streamFlags::NO_STREAM, true); accl.recv(0, *AcclBufferA_recv, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, true, ACCL::streamFlags::NO_STREAM); send->wait(); // Copy received matrix from receiving buffer to A buffer completely on FPGA - accl.copy(*AcclBufferA_recv, AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, true, true); + accl.copy(*AcclBufferA_recv, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, true, true); } } else { @@ -127,8 +124,8 @@ namespace accl_pq { std::vector> recv_buffers; for (int i = 0; i < gcd; i++) { // TODO Is there a way to initialize buffer only in FPGA memory with ACCL? - send_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32)); - recv_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32)); + send_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); + recv_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); } int current_parallel_execution = 0; for (int j = 0; j < least_common_multiple/pq_width; j++) { @@ -159,6 +156,9 @@ namespace accl_pq { receiving_size *= (height_per_rank)/(least_common_multiple/pq_height) * ((width_per_rank)/(least_common_multiple/pq_width)); sending_size *= (height_per_rank)/(least_common_multiple/pq_height) * ((width_per_rank)/(least_common_multiple/pq_width)); +#ifndef NDEBUG + std::cout << "Copy data to send buffers" << std::endl; +#endif // Copy the required date for this communication step to the send buffer! for (int t=0; t < send_rows.size(); t++) { for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { @@ -167,7 +167,17 @@ namespace accl_pq { size_t matrix_buffer_offset = (send_cols[t] + lcm_col * least_common_multiple/pq_width) * data.blockSize + (send_rows[t] + lcm_row * least_common_multiple/pq_height) * width_per_rank * data.blockSize * data.blockSize; for (int block_row = 0; block_row < data.blockSize; block_row++) { // TODO May be more efficient when done async! 
- accl.copy(*AcclBufferA.slice(matrix_buffer_offset + block_row * width_per_rank * data.blockSize, matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize),*send_buffers[current_parallel_execution]->slice(sending_buffer_offset, sending_buffer_offset + data.blockSize),data.blockSize, true, true); + std::cout << "A(" << matrix_buffer_offset + block_row * width_per_rank * data.blockSize + << "," << matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize + << ") send(" << sending_buffer_offset + << "," << sending_buffer_offset + data.blockSize << ")" << std::endl; + accl.copy(*AcclBufferA->slice( + matrix_buffer_offset + block_row * width_per_rank * data.blockSize, + matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize), + *send_buffers[current_parallel_execution]->slice( + sending_buffer_offset, + sending_buffer_offset + data.blockSize), + data.blockSize, true, true); } } } @@ -215,7 +225,7 @@ namespace accl_pq { size_t matrix_buffer_offset = (recv_cols[t] + lcm_col * least_common_multiple/pq_width) * data.blockSize + (recv_rows[t] + lcm_row * least_common_multiple/pq_height) * width_per_rank * data.blockSize * data.blockSize; for (int block_row = 0; block_row < data.blockSize; block_row++) { // TODO May be more efficient when done async! - accl.copy(*recv_buffers[current_parallel_execution]->slice(receiving_buffer_offset, receiving_buffer_offset + data.blockSize),*AcclBufferA.slice(matrix_buffer_offset + block_row * width_per_rank * data.blockSize, matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize), data.blockSize, true, true); + accl.copy(*recv_buffers[current_parallel_execution]->slice(receiving_buffer_offset, receiving_buffer_offset + data.blockSize),*AcclBufferA->slice(matrix_buffer_offset + block_row * width_per_rank * data.blockSize, matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize), data.blockSize, true, true); } } @@ -268,6 +278,9 @@ static std::unique_ptr size_t total_offset = 0; size_t row_offset = 0; +#ifndef NDEBUG + std::cout << "Start kernel creation" << std::endl; +#endif // Setup the kernels depending on the number of kernel replications for (int r = 0; r < config.programSettings->kernelReplications; r++) { @@ -307,9 +320,12 @@ static std::unique_ptr std::vector transferTimings; std::vector calculationTimings; - + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { +#ifndef NDEBUG + std::cout << "Start data transfer" << std::endl; +#endif auto startTransfer = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { @@ -331,8 +347,13 @@ static std::unique_ptr if (bufferListA.size() > 1) { std::cerr << "WARNING: Only the matrix A of the first kernel replication will be exchanged via ACCL!" 
<< std::endl; } +#ifndef NDEBUG + std::cout << "Start data exchange with ACCL" << std::endl; +#endif accl_exchangeData(*config.accl, handler, data, bufferListA[0], config.programSettings->matrixSize / data.blockSize); - +#ifndef NDEBUG + std::cout << "End data exchange with ACCL" << std::endl; +#endif std::vector runs; auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) @@ -341,6 +362,9 @@ static std::unique_ptr static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); } +#ifndef NDEBUG + std::cout << "Wait for kernels to complete" << std::endl; +#endif for (int r = 0; r < transposeKernelList.size(); r++) { runs[r].wait(); From 536eb3f8264a495aa5e2400f6c0fbb49d08942ff Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 25 Apr 2022 15:13:43 +0100 Subject: [PATCH 028/318] Refactoring and cleanup of ACCL host code --- .../execution_types/execution_xrt_accl_pq.hpp | 758 ++++++++++-------- .../execution_types/execution_xrt_pcie_pq.hpp | 365 +++++---- 2 files changed, 620 insertions(+), 503 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp index 10fb36e1..dab92c96 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -23,407 +23,495 @@ SOFTWARE. #define SRC_HOST_ACCL_PQ_EXECUTION_H_ /* C++ standard library headers */ +#include #include #include -#include /* Project's headers */ #include "buffer.hpp" #include "cclo.hpp" #include "constants.hpp" -#include "fpgabuffer.hpp" -#include "transpose_benchmark.hpp" #include "data_handlers/data_handler_types.h" #include "data_handlers/pq.hpp" +#include "fpgabuffer.hpp" #include "transpose_data.hpp" namespace transpose { namespace fpga_execution { namespace accl_pq { - void accl_exchangeData(ACCL::ACCL &accl, transpose::data_handler::DistributedPQTransposeDataHandler &handler, - transpose::TransposeData & data, xrt::bo bufferAXrt, int global_width) { - - int pq_width = handler.getP(); - int pq_height = handler.getQ(); - int width_per_rank = handler.getWidthforRank(); - int height_per_rank = handler.getHeightforRank(); - - int mpi_comm_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); - int pq_row = mpi_comm_rank / pq_width; - int pq_col = mpi_comm_rank % pq_width; - - auto AcclBufferA = accl.create_buffer(bufferAXrt, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); - - if (pq_width == pq_height) { - if (pq_col != pq_row) { - - int pair_rank = pq_width * pq_col + pq_row; - - // To re-calculate the matrix transposition locally on this host, we need to - // exchange matrix A for every kernel replication - // The order of the matrix blocks does not change during the exchange, because they are distributed diagonally - // and will be handled in the order below: - // - // . . 1 3 - // . . . 2 - // 1 . . . - // 3 2 . . 
- auto AcclBufferA_recv = accl.create_buffer(data.exchange, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); - - // Send and receive matrix A using ACCL directly on FPGA - auto send = accl.send(0, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0,true,ACCL::streamFlags::NO_STREAM, true); - accl.recv(0, *AcclBufferA_recv, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, true, ACCL::streamFlags::NO_STREAM); - send->wait(); - // Copy received matrix from receiving buffer to A buffer completely on FPGA - accl.copy(*AcclBufferA_recv, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, true, true); - } - } - else { - // Taken from "Parallel matrix transpose algorithms on distributed memory concurrent computers" by J. Choi, J. J. Dongarra, D. W. Walker - // and translated to C++ - // This will do a diagonal exchange of matrix blocks. - - // Determine LCM using GCD from standard library using the C++14 call - // In C++17 this changes to std::gcd in numeric, also std::lcm is directly available in numeric - int gcd = std::__gcd(pq_height, pq_width); - int least_common_multiple = pq_height * pq_width / gcd; - - // If the global matrix size is not a multiple of the LCM block size, the numbers of send and received blocks - // may be wrongly calculated. Throw exception to prevent this and make aware of this issue! - if (global_width % least_common_multiple > 0) { - throw std::runtime_error("Implementation does not support matrix sizes that are not multiple of LCM blocks! Results may be wrong!"); - } +void accl_exchangeData( + ACCL::ACCL &accl, + transpose::data_handler::DistributedPQTransposeDataHandler + &handler, + transpose::TransposeData &data, xrt::bo &bufferAXrt, int global_width) { + + int pq_width = handler.getP(); + int pq_height = handler.getQ(); + int width_per_rank = handler.getWidthforRank(); + int height_per_rank = handler.getHeightforRank(); + + int mpi_comm_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); + int pq_row = mpi_comm_rank / pq_width; + int pq_col = mpi_comm_rank % pq_width; + + auto AcclBufferA = accl.create_buffer( + bufferAXrt, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); + if (pq_width == pq_height) { + if (pq_col != pq_row) { + + int pair_rank = pq_width * pq_col + pq_row; + + // To re-calculate the matrix transposition locally on this host, we need to + // exchange matrix A for every kernel replication + // The order of the matrix blocks does not change during the exchange, because they are + // distributed diagonally and will be handled in the order below: + // + // . . 1 3 + // . . . 2 + // 1 . . . + // 3 2 . . + // auto AcclBufferA_recv = accl.create_buffer( + // data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); + // AcclBufferA_recv->sync_to_device(); + // Send and receive matrix A using ACCL directly on FPGA + accl.send(0, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, + true, ACCL::streamFlags::NO_STREAM); + accl.recv(0, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, + true, ACCL::streamFlags::NO_STREAM); + // Copy received matrix from receiving buffer to A buffer completely on FPGA + // accl.copy(*AcclBufferA_recv, *AcclBufferA, data.blockSize * data.blockSize * + // data.numBlocks, + // true, true); + } + } else { + // Taken from "Parallel matrix transpose algorithms on distributed memory concurrent computers" + // by J. Choi, J. J. Dongarra, D. W. 
Walker, translated to C++. This will do a diagonal
+ // exchange of matrix blocks.
+
+ // Determine LCM using GCD from standard library using the C++14 call
+ // In C++17 this changes to std::gcd in numeric, also std::lcm is directly available in numeric
+ int gcd = std::__gcd(pq_height, pq_width);
+ int least_common_multiple = pq_height * pq_width / gcd;
+
+ // If the global matrix size is not a multiple of the LCM block size, the numbers of send and
+ // received blocks may be wrongly calculated. Throw exception to prevent this and make aware of
+ // this issue!
+ if (global_width % least_common_multiple > 0) {
+ throw std::runtime_error("Implementation does not support matrix sizes that are not multiple "
+ "of LCM blocks! Results may be wrong!");
+ }
+
+ // MPI requests for non-blocking communication
+ // First half of vector is for Isend, second half for Irecv!
+ std::vector accl_requests(2 * gcd);
+
+ // Begin algorithm from Figure 14 for general case
+ int g = transpose::data_handler::mod(pq_row - pq_col, gcd);
+ int p = transpose::data_handler::mod(pq_col + g, pq_width);
+ int q = transpose::data_handler::mod(pq_row - g, pq_height);
+
+ // Pre-calculate target ranks in LCM block
+ // The vector list variable can be interpreted as a 2D matrix. Every entry represents the target
+ // rank of the sub-block. Since the LCM block will repeat, we only need to store this small
+ // amount of data!
+ std::vector<int> target_list(least_common_multiple / pq_height * least_common_multiple /
+ pq_width);
+ for (int row = 0; row < least_common_multiple / pq_height; row++) {
+ for (int col = 0; col < least_common_multiple / pq_width; col++) {
+ int global_block_col = pq_col + col * pq_width;
+ int global_block_row = pq_row + row * pq_height;
+ int destination_rank =
+ (global_block_col % pq_height) * pq_width + (global_block_row % pq_width);
+ target_list[row * least_common_multiple / pq_width + col] = destination_rank;
+ }
+ }
+
+ // Create some ACCL buffers to send and receive from other FPGAs
+ // They can reside completely on FPGA
+ std::vector<std::unique_ptr<ACCL::Buffer<float>>> send_buffers;
+ std::vector<std::unique_ptr<ACCL::Buffer<float>>> recv_buffers;
+ for (int i = 0; i < gcd; i++) {
+ // TODO Is there a way to initialize buffer only in FPGA memory with ACCL?
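+ // One buffer pair per parallel transfer: gcd transfers are in flight at a
+ // time while the loops below schedule (lcm/pq_width) * (lcm/pq_height)
+ // communication steps. E.g., on a 2x4 grid (gcd=2, lcm=4) two buffer pairs
+ // serve 1*2 = 2 steps.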
- send_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); - recv_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); + // Create some ACCL buffers to send and receive from other FPGAs + // They can reside completely on FPGA + std::vector> send_buffers; + std::vector> recv_buffers; + for (int i = 0; i < gcd; i++) { + // TODO Is there a way to initialize buffer only in FPGA memory with ACCL? + send_buffers.push_back(accl.create_buffer( + data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); + recv_buffers.push_back(accl.create_buffer( + data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); + send_buffers.back()->sync_to_device(); + recv_buffers.back()->sync_to_device(); + } + int current_parallel_execution = 0; + for (int j = 0; j < least_common_multiple / pq_width; j++) { + for (int i = 0; i < least_common_multiple / pq_height; i++) { + // Determine sender and receiver rank of current rank for current communication step + int send_rank = transpose::data_handler::mod(p + i * gcd, pq_width) + + transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width; + int recv_rank = transpose::data_handler::mod(p - i * gcd, pq_width) + + transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width; + + // Also count receiving buffer size because sending and receiving buffer size may differ in + // certain scenarios! + int receiving_size = 0; + int sending_size = 0; + + std::vector send_rows; + std::vector send_cols; + // Look up which blocks are affected by the current rank + for (int row = 0; row < least_common_multiple / pq_height; row++) { + for (int col = 0; col < least_common_multiple / pq_width; col++) { + if (target_list[row * least_common_multiple / pq_width + col] == send_rank) { + send_rows.push_back(row); + send_cols.push_back(col); + sending_size += data.blockSize * data.blockSize; } - int current_parallel_execution = 0; - for (int j = 0; j < least_common_multiple/pq_width; j++) { - for (int i = 0; i < least_common_multiple/pq_height; i++) { - // Determine sender and receiver rank of current rank for current communication step - int send_rank = transpose::data_handler::mod(p + i * gcd, pq_width) + transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width; - int recv_rank = transpose::data_handler::mod(p - i * gcd, pq_width) + transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width; - - // Also count receiving buffer size because sending and receiving buffer size may differ in certain scenarios! 
- int receiving_size = 0; - int sending_size = 0; - - std::vector send_rows; - std::vector send_cols; - // Look up which blocks are affected by the current rank - for (int row = 0; row < least_common_multiple/pq_height; row++) { - for (int col = 0; col < least_common_multiple/pq_width; col++) { - if (target_list[row * least_common_multiple/pq_width + col] == send_rank) { - send_rows.push_back(row); - send_cols.push_back(col); - sending_size += data.blockSize * data.blockSize; - } - if (target_list[row * least_common_multiple/pq_width + col] == recv_rank) { - receiving_size += data.blockSize * data.blockSize; - } - } - } - receiving_size *= (height_per_rank)/(least_common_multiple/pq_height) * ((width_per_rank)/(least_common_multiple/pq_width)); - sending_size *= (height_per_rank)/(least_common_multiple/pq_height) * ((width_per_rank)/(least_common_multiple/pq_width)); + if (target_list[row * least_common_multiple / pq_width + col] == recv_rank) { + receiving_size += data.blockSize * data.blockSize; + } + } + } + receiving_size *= (height_per_rank) / (least_common_multiple / pq_height) * + ((width_per_rank) / (least_common_multiple / pq_width)); + sending_size *= (height_per_rank) / (least_common_multiple / pq_height) * + ((width_per_rank) / (least_common_multiple / pq_width)); #ifndef NDEBUG - std::cout << "Copy data to send buffers" << std::endl; + std::cout << "Copy data to send buffers" << std::endl; #endif - // Copy the required date for this communication step to the send buffer! - for (int t=0; t < send_rows.size(); t++) { - for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { - for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) { - size_t sending_buffer_offset = lcm_row * data.blockSize * data.blockSize * ((width_per_rank)/(least_common_multiple/pq_width)) + lcm_col * data.blockSize * data.blockSize; - size_t matrix_buffer_offset = (send_cols[t] + lcm_col * least_common_multiple/pq_width) * data.blockSize + (send_rows[t] + lcm_row * least_common_multiple/pq_height) * width_per_rank * data.blockSize * data.blockSize; - for (int block_row = 0; block_row < data.blockSize; block_row++) { - // TODO May be more efficient when done async! - std::cout << "A(" << matrix_buffer_offset + block_row * width_per_rank * data.blockSize - << "," << matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize - << ") send(" << sending_buffer_offset - << "," << sending_buffer_offset + data.blockSize << ")" << std::endl; - accl.copy(*AcclBufferA->slice( - matrix_buffer_offset + block_row * width_per_rank * data.blockSize, - matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize), - *send_buffers[current_parallel_execution]->slice( - sending_buffer_offset, - sending_buffer_offset + data.blockSize), - data.blockSize, true, true); - } - } - } - } + // Copy the required date for this communication step to the send buffer! 
+ for (int t = 0; t < send_rows.size(); t++) { + for (int lcm_row = 0; lcm_row < (height_per_rank) / (least_common_multiple / pq_height); + lcm_row++) { + for (int lcm_col = 0; lcm_col < (width_per_rank) / (least_common_multiple / pq_width); + lcm_col++) { + size_t sending_buffer_offset = + lcm_row * data.blockSize * data.blockSize * + ((width_per_rank) / (least_common_multiple / pq_width)) + + lcm_col * data.blockSize * data.blockSize; + size_t matrix_buffer_offset = + (send_cols[t] + lcm_col * least_common_multiple / pq_width) * data.blockSize + + (send_rows[t] + lcm_row * least_common_multiple / pq_height) * width_per_rank * + data.blockSize * data.blockSize; + for (int block_row = 0; block_row < data.blockSize; block_row++) { + // TODO May be more efficient when done async! + std::cout << "A(" + << matrix_buffer_offset + block_row * width_per_rank * data.blockSize + << "," + << matrix_buffer_offset + block_row * width_per_rank * data.blockSize + + data.blockSize + << ") send(" << sending_buffer_offset << "," + << sending_buffer_offset + data.blockSize << ")" << std::endl; + accl.copy(*AcclBufferA->slice( + matrix_buffer_offset + block_row * width_per_rank * data.blockSize, + matrix_buffer_offset + block_row * width_per_rank * data.blockSize + + data.blockSize), + *send_buffers[current_parallel_execution]->slice( + sending_buffer_offset, sending_buffer_offset + data.blockSize), + data.blockSize, true, true); + std::cout << "Copy done!" << std::endl; + } + } + } + } - // Do actual MPI communication + // Do actual MPI communication +#ifndef NDEBUG + std::cout << "Rank " << mpi_comm_rank << ": blocks (" + << sending_size / (data.blockSize * data.blockSize) << "," + << receiving_size / (data.blockSize * data.blockSize) << ") send " << send_rank + << ", recv " << recv_rank << std::endl + << std::flush; +#endif + accl_requests[current_parallel_execution] = + (accl.send(0, *send_buffers[current_parallel_execution], sending_size, send_rank, 0, + true, ACCL::streamFlags::NO_STREAM, true)); + accl_requests[current_parallel_execution + gcd] = + (accl.recv(0, *recv_buffers[current_parallel_execution], sending_size, send_rank, 0, + true, ACCL::streamFlags::NO_STREAM, true)); + // Increase the counter for parallel executions + current_parallel_execution = (current_parallel_execution + 1) % gcd; + + // Wait for MPI requests if GCD MPI calls are scheduled in parallel + if ((current_parallel_execution) % gcd == 0) { + + for (auto &req : accl_requests) { + + MPI_Status status; + int index; #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": blocks (" << sending_size / (data.blockSize * data.blockSize) << "," << receiving_size / (data.blockSize * data.blockSize) << ") send " << send_rank << ", recv " << recv_rank << std::endl << std::flush; + std::cout << "Wait for all requests to complete" << std::endl; #endif - accl_requests[current_parallel_execution] = (accl.send(0, *send_buffers[current_parallel_execution], sending_size, send_rank, 0, false, ACCL::streamFlags::NO_STREAM, true)); - accl_requests[current_parallel_execution] = (accl.recv(0, *recv_buffers[current_parallel_execution], sending_size, send_rank, 0, false, ACCL::streamFlags::NO_STREAM, true)); - // Increase the counter for parallel executions - current_parallel_execution = (current_parallel_execution + 1) % gcd; - - // Wait for MPI requests if GCD MPI calls are scheduled in parallel - if ((current_parallel_execution) % gcd == 0) { - - - for (auto& req :accl_requests) { - - MPI_Status status; - int index; - - // Wait for all send and 
recv events to complete - // TODO do the CCLO pointers need to be freed? - accl.nop(false, accl_requests); - // For each message that was received in parallel - if (index >= gcd) { - std::vector recv_rows; - std::vector recv_cols; - // Look up which blocks are affected by the current rank - for (int row = 0; row < least_common_multiple/pq_height; row++) { - for (int col = 0; col < least_common_multiple/pq_width; col++) { - if (target_list[row * least_common_multiple/pq_width + col] == status.MPI_SOURCE) { - recv_rows.push_back(row); - recv_cols.push_back(col); - } - } - } - // Copy received data to matrix A buffer - for (int t=0; t < recv_rows.size(); t++) { - for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { - for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) { - size_t receiving_buffer_offset = lcm_row * data.blockSize * data.blockSize * ((width_per_rank)/(least_common_multiple/pq_width)) + lcm_col * data.blockSize * data.blockSize; - size_t matrix_buffer_offset = (recv_cols[t] + lcm_col * least_common_multiple/pq_width) * data.blockSize + (recv_rows[t] + lcm_row * least_common_multiple/pq_height) * width_per_rank * data.blockSize * data.blockSize; - for (int block_row = 0; block_row < data.blockSize; block_row++) { - // TODO May be more efficient when done async! - accl.copy(*recv_buffers[current_parallel_execution]->slice(receiving_buffer_offset, receiving_buffer_offset + data.blockSize),*AcclBufferA->slice(matrix_buffer_offset + block_row * width_per_rank * data.blockSize, matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize), data.blockSize, true, true); - - } - } - } - } - } - } - } + // Wait for all send and recv events to complete + // TODO do the CCLO pointers need to be freed? + accl.nop(false, accl_requests); + // For each message that was received in parallel + if (index >= gcd) { + std::vector recv_rows; + std::vector recv_cols; + // Look up which blocks are affected by the current rank + for (int row = 0; row < least_common_multiple / pq_height; row++) { + for (int col = 0; col < least_common_multiple / pq_width; col++) { + if (target_list[row * least_common_multiple / pq_width + col] == + status.MPI_SOURCE) { + recv_rows.push_back(row); + recv_cols.push_back(col); + } } + } + // Copy received data to matrix A buffer + for (int t = 0; t < recv_rows.size(); t++) { + for (int lcm_row = 0; + lcm_row < (height_per_rank) / (least_common_multiple / pq_height); lcm_row++) { + for (int lcm_col = 0; + lcm_col < (width_per_rank) / (least_common_multiple / pq_width); lcm_col++) { + size_t receiving_buffer_offset = + lcm_row * data.blockSize * data.blockSize * + ((width_per_rank) / (least_common_multiple / pq_width)) + + lcm_col * data.blockSize * data.blockSize; + size_t matrix_buffer_offset = + (recv_cols[t] + lcm_col * least_common_multiple / pq_width) * + data.blockSize + + (recv_rows[t] + lcm_row * least_common_multiple / pq_height) * + width_per_rank * data.blockSize * data.blockSize; + for (int block_row = 0; block_row < data.blockSize; block_row++) { + // TODO May be more efficient when done async! 
+ accl.copy(
+ *recv_buffers[current_parallel_execution]->slice(
+ receiving_buffer_offset, receiving_buffer_offset + data.blockSize),
+ *AcclBufferA->slice(
+ matrix_buffer_offset + block_row * width_per_rank * data.blockSize,
+ matrix_buffer_offset + block_row * width_per_rank * data.blockSize +
+ data.blockSize),
+ data.blockSize, true, true);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+/**
+ * @brief Transpose and add the matrices using the XRT kernels with a PQ distribution and
+ * ACCL for the communication between FPGAs
+ *
+ * @param config The program configuration
+ * @param data data object that contains all required data for the execution on the FPGA
+ * @param handler data handler instance that should be used to exchange data between hosts
+ * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
+ */
+static std::unique_ptr<transpose::TransposeExecutionTimings>
+calculate(const hpcc_base::ExecutionSettings &config,
+ transpose::TransposeData &data,
+ transpose::data_handler::DistributedPQTransposeDataHandler
+ &handler) {
+ int err;
+
+ if (config.programSettings->dataHandlerIdentifier !=
+ transpose::data_handler::DataHandlerType::pq) {
+ throw std::runtime_error("Used data handler not supported by execution handler!");
+ }
+#ifdef USE_SVM
+ throw new std::runtime_error(
+ "SVM not supported in the host implementation of this communication method");
+#endif
+#ifdef USE_BUFFER_WRITE_RECT_FOR_A
+ throw new std::runtime_error("Using the Write Rect method is not supported in this host "
+ "implementation of this communication method");
+#endif
+ std::vector<size_t> bufferSizeList;
+ std::vector<size_t> bufferStartList;
+ std::vector<size_t> bufferOffsetList;
+ std::vector<xrt::bo> bufferListA;
+ std::vector<xrt::bo> bufferListB;
+ std::vector<xrt::bo> bufferListA_out;
+ std::vector<xrt::kernel> transposeKernelList;
+ std::vector<size_t> blocksPerReplication;
+
+ size_t local_matrix_width = handler.getWidthforRank();
+ size_t local_matrix_height = handler.getHeightforRank();
+ size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE);
+
+ size_t total_offset = 0;
+ size_t row_offset = 0;
#ifndef NDEBUG
+ std::cout << "Start kernel creation" << std::endl;
#endif
+ // Setup the
kernels depending on the number of kernel replications
+ for (int r = 0; r < config.programSettings->kernelReplications; r++) {
+
+ // Calculate how many blocks the current kernel replication will need to process.
+ size_t blocks_per_replication =
+ (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications);
+ size_t blocks_remainder =
+ (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications;
+ if (blocks_remainder > r) {
+ // Catch the case that the number of blocks is not divisible by the number of kernel
+ // replications
+ blocks_per_replication += 1;
+ }
+ if (blocks_per_replication < 1) {
+ continue;
+ }
+ blocksPerReplication.push_back(blocks_per_replication);
+ size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width *
+ local_matrix_width * data.blockSize * data.blockSize;
+ bufferSizeList.push_back(buffer_size);
+ bufferStartList.push_back(total_offset);
+ bufferOffsetList.push_back(row_offset);

+ row_offset = (row_offset + blocks_per_replication) % local_matrix_width;

+ total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width *
+ local_matrix_width;

+ // create the kernels
+ xrt::kernel transposeKernel(*config.device, *config.program,
+ ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str());

+ xrt::bo bufferA(*config.device, data.A,
+ data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE),
+ transposeKernel.group_id(0));
+ xrt::bo bufferB(*config.device, &data.B[bufferStartList[r] * data.blockSize * data.blockSize],
+ buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1));
+ xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE),
transposeKernel.group_id(2)); - bufferListA.push_back(bufferA); - bufferListB.push_back(bufferB); - bufferListA_out.push_back(bufferA_out); - transposeKernelList.push_back(transposeKernel); - } + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; - std::vector transferTimings; - std::vector calculationTimings; - - for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { #ifndef NDEBUG - std::cout << "Start data transfer" << std::endl; + std::cout << "Start data transfer" << std::endl; #endif - auto startTransfer = std::chrono::high_resolution_clock::now(); + auto startTransfer = std::chrono::high_resolution_clock::now(); - for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); - bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); - } - auto endTransfer = std::chrono::high_resolution_clock::now(); - - std::chrono::duration transferTime = - std::chrono::duration_cast> - (endTransfer - startTransfer); + for (int r = 0; r < transposeKernelList.size(); r++) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); - MPI_Barrier(MPI_COMM_WORLD); + std::chrono::duration transferTime = + std::chrono::duration_cast>(endTransfer - startTransfer); - auto startCalculation = std::chrono::high_resolution_clock::now(); + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); - // Exchange A data via ACCL - if (bufferListA.size() > 1) { - std::cerr << "WARNING: Only the matrix A of the first kernel replication will be exchanged via ACCL!" << std::endl; - } + // Exchange A data via ACCL + if (bufferListA.size() > 1) { + std::cerr << "WARNING: Only the matrix A of the first kernel replication will be exchanged " + "via ACCL!" 
+ << std::endl; + } #ifndef NDEBUG - std::cout << "Start data exchange with ACCL" << std::endl; + std::cout << "Start data exchange with ACCL" << std::endl; #endif - accl_exchangeData(*config.accl, handler, data, bufferListA[0], config.programSettings->matrixSize / data.blockSize); + accl_exchangeData(*config.accl, handler, data, bufferListA[0], + config.programSettings->matrixSize / data.blockSize); #ifndef NDEBUG - std::cout << "End data exchange with ACCL" << std::endl; + std::cout << "End data exchange with ACCL" << std::endl; #endif - std::vector runs; - auto startKernelCalculation = std::chrono::high_resolution_clock::now(); - for (int r = 0; r < transposeKernelList.size(); r++) - { - runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), - static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), - static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); - } + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) { + runs.push_back(transposeKernelList[r]( + bufferListA[r], bufferListB[r], bufferListA_out[r], + static_cast(bufferOffsetList[r]), static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast((bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + } #ifndef NDEBUG - std::cout << "Wait for kernels to complete" << std::endl; + std::cout << "Wait for kernels to complete" << std::endl; #endif - for (int r = 0; r < transposeKernelList.size(); r++) - { - runs[r].wait(); - } - auto endCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) { + runs[r].wait(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); #ifndef NDEBUG - int mpi_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - std::cout << "Rank " << mpi_rank << ": " << "Done i=" << repetition << std::endl; - std::cout << "Kernel execution time: " << std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() - << "s (" << ((config.programSettings->matrixSize * config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * 3) - / std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() * 1.0e-9) << " GB/s)" << std::endl; + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " + << std::chrono::duration_cast>(endCalculation - + startKernelCalculation) + .count() + << "s (" + << ((config.programSettings->matrixSize * config.programSettings->matrixSize * + sizeof(HOST_DATA_TYPE) * 3) / + std::chrono::duration_cast>(endCalculation - + startKernelCalculation) + .count() * + 1.0e-9) + << " GB/s)" << std::endl; #endif - std::chrono::duration calculationTime = - std::chrono::duration_cast> - (endCalculation - startCalculation); - calculationTimings.push_back(calculationTime.count()); - - std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * data.blockSize * data.blockSize); - - startTransfer = std::chrono::high_resolution_clock::now(); - - for (int r = 0; r < transposeKernelList.size(); r++) { - // Copy possibly incomplete first block row - if (bufferOffsetList[r] != 0) { - bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); - 
bufferListA_out[r].read(tmp_write_buffer.data()); - for (int row = 0; row < data.blockSize; row++) { - for (int col = bufferOffsetList[r] * data.blockSize; col < local_matrix_width * data.blockSize; col++) { - data.result[bufferStartList[r] * data.blockSize * data.blockSize + row * local_matrix_width * data.blockSize + col] = - tmp_write_buffer[row * local_matrix_width * data.blockSize + col]; - } - } - // Copy remaining buffer - std::copy(tmp_write_buffer.begin() + local_matrix_width * data.blockSize * data.blockSize, tmp_write_buffer.begin() + bufferSizeList[r],&data.result[(bufferStartList[r] + local_matrix_width) * data.blockSize * data.blockSize]); - } - else { - bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); - bufferListA_out[r].read(data.result + bufferStartList[r] * data.blockSize * data.blockSize); - } - } - endTransfer = std::chrono::high_resolution_clock::now(); - transferTime += - std::chrono::duration_cast> - (endTransfer - startTransfer); - transferTimings.push_back(transferTime.count()); + std::chrono::duration calculationTime = + std::chrono::duration_cast>(endCalculation - + startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * + data.blockSize * data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; + col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + col]; + } } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + local_matrix_width * data.blockSize * data.blockSize, + tmp_write_buffer.begin() + bufferSizeList[r], + &data.result[(bufferStartList[r] + local_matrix_width) * data.blockSize * + data.blockSize]); + } else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(data.result, bufferSizeList[r] * sizeof(HOST_DATA_TYPE), + bufferStartList[r] * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE)); + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast>(endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } - std::unique_ptr result(new transpose::TransposeExecutionTimings{ - transferTimings, - calculationTimings - }); + std::unique_ptr result( + new transpose::TransposeExecutionTimings{transferTimings, calculationTimings}); - return result; - } + return result; +} -} // namespace transpose -} // namespace fpga_execution -} // namespace intel +} // namespace accl_pq +} // namespace fpga_execution +} // namespace transpose #endif diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index fd3618c9..85481b6f 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -23,207 +23,236 @@ SOFTWARE. 
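Both this PCIe+MPI variant and the ACCL variant above rely on the same XRT buffer idiom: an xrt::bo created from a host pointer stays bound to that (4KB-aligned) host allocation on the memory bank of the given kernel argument, and sync() moves the data in the indicated direction. A minimal sketch of the idiom with hypothetical names, not part of the patch:

    // Wrap an existing, aligned host allocation on the bank of kernel argument 0:
    xrt::bo buffer(*config.device, host_ptr, size_in_bytes, kernel.group_id(0));
    buffer.sync(XCL_BO_SYNC_BO_TO_DEVICE);   // host -> device before the kernel runs
    buffer.sync(XCL_BO_SYNC_BO_FROM_DEVICE); // device -> host after completion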
#define SRC_HOST_XRT_PCIE_PQ_EXECUTION_H_ /* C++ standard library headers */ +#include #include #include -#include /* Project's headers */ -#include "transpose_benchmark.hpp" #include "data_handlers/data_handler_types.h" #include "data_handlers/pq.hpp" +#include "transpose_benchmark.hpp" namespace transpose { namespace fpga_execution { namespace pcie_pq { - /** - * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and PCIe+MPI over the host for communication - * +/** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and + * PCIe+MPI over the host for communication + * * @param config The progrma configuration * @param data data object that contains all required data for the execution on the FPGA * @param handler data handler instance that should be used to exchange data between hosts - * @return std::unique_ptr The measured execution times + * @return std::unique_ptr The measured execution times */ -static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { - int err; - - if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { - throw std::runtime_error("Used data handler not supported by execution handler!"); - } +static std::unique_ptr +calculate(const hpcc_base::ExecutionSettings &config, + transpose::TransposeData &data, + transpose::data_handler::DistributedPQTransposeDataHandler + &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != + transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } #ifdef USE_SVM - throw new std::runtime_error("SVM not supported in the host implementation of this communication method"); + throw new std::runtime_error( + "SVM not supported in the host implementation of this communication method"); #endif #ifdef USE_BUFFER_WRITE_RECT_FOR_A - throw new std::runtime_error("Using the Write Rect method is not supported in this host implementation of this communication method"); + throw new std::runtime_error("Using the Write Rect method is not supported in this host " + "implementation of this communication method"); #endif + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to process. 
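+    // Note on this distribution: every replication processes at least
+    // floor(total_blocks / kernelReplications) blocks and the first
+    // (total_blocks % kernelReplications) replications take one extra block,
+    // e.g. 9 blocks on 2 replications: r = 0 gets 5 blocks, r = 1 gets 4.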
+ size_t blocks_per_replication = + (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications); + size_t blocks_remainder = + (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the number of kernel + // replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width * + local_matrix_width * data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width * + local_matrix_width; + + int memory_bank_info_a = 0; + int memory_bank_info_b = 0; + int memory_bank_info_out = 0; + + // create the kernels + xrt::kernel transposeKernel(*config.device, *config.program, + ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); + + xrt::bo bufferA(*config.device, data.A, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), + transposeKernel.group_id(0)); + xrt::bo bufferB(*config.device,&data.B[bufferStartList[r] * data.blockSize * data.blockSize], + buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); + // TODO For small matrices, the 4KB alignment might fail for buffer B. Temporary fix seen in + // lines below (requires extra copying) + // xrt::bo bufferB(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + // transposeKernel.group_id(1)); bufferB.write(data.B + bufferStartList[r] * data.blockSize * + // data.blockSize); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + transposeKernel.group_id(2)); + + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + + auto startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); - std::vector bufferSizeList; - std::vector bufferStartList; - std::vector bufferOffsetList; - std::vector bufferListA; - std::vector bufferListB; - std::vector bufferListA_out; - std::vector transposeKernelList; - std::vector blocksPerReplication; - - size_t local_matrix_width = handler.getWidthforRank(); - size_t local_matrix_height = handler.getHeightforRank(); - size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); - - size_t total_offset = 0; - size_t row_offset = 0; - // Setup the kernels depending on the number of kernel replications - for (int r = 0; r < config.programSettings->kernelReplications; r++) { - - // Calculate how many blocks the current kernel replication will need to process. 
- size_t blocks_per_replication = (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications); - size_t blocks_remainder = (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications; - if (blocks_remainder > r) { - // Catch the case, that the number of blocks is not divisible by the number of kernel replications - blocks_per_replication += 1; - } - if (blocks_per_replication < 1) { - continue; - } - blocksPerReplication.push_back(blocks_per_replication); - size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width * local_matrix_width * data.blockSize * data.blockSize; - bufferSizeList.push_back(buffer_size); - bufferStartList.push_back(total_offset); - bufferOffsetList.push_back(row_offset); - - row_offset = (row_offset + blocks_per_replication) % local_matrix_width; - - total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width * local_matrix_width; - - int memory_bank_info_a = 0; - int memory_bank_info_b = 0; - int memory_bank_info_out = 0; - - // create the kernels - xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); - - - xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * - sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); - xrt::bo bufferB(*config.device, data.B + bufferStartList[r] * data.blockSize * data.blockSize, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); - // TODO For small matrices, the 4KB alignment might fail for buffer B. Temporary fix seen in lines below (requires extra copying) - //xrt::bo bufferB(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); - //bufferB.write(data.B + bufferStartList[r] * data.blockSize * data.blockSize); - xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); - - bufferListA.push_back(bufferA); - bufferListB.push_back(bufferB); - bufferListA_out.push_back(bufferA_out); - transposeKernelList.push_back(transposeKernel); - } - - std::vector transferTimings; - std::vector calculationTimings; - - for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { - - auto startTransfer = std::chrono::high_resolution_clock::now(); - - for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); - bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); - } - auto endTransfer = std::chrono::high_resolution_clock::now(); - - std::chrono::duration transferTime = - std::chrono::duration_cast> - (endTransfer - startTransfer); - - MPI_Barrier(MPI_COMM_WORLD); + std::chrono::duration transferTime = + std::chrono::duration_cast>(endTransfer - startTransfer); - auto startCalculation = std::chrono::high_resolution_clock::now(); + MPI_Barrier(MPI_COMM_WORLD); - for (int r = 0; r < transposeKernelList.size(); r++) - { - bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); - } + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) { + bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } - // Exchange A data via PCIe and MPI - handler.exchangeData(data); + // Exchange A data via PCIe and MPI + handler.exchangeData(data); - for (int r = 0; r < transposeKernelList.size(); r++) - { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); - } + std::copy(data.A, data.A + data.numBlocks * data.blockSize * 
data.blockSize, data.exchange); + for (int r = 0; r < transposeKernelList.size(); r++) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } - std::vector runs; - auto startKernelCalculation = std::chrono::high_resolution_clock::now(); - for (int r = 0; r < transposeKernelList.size(); r++) - { - runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferStartList[r] + bufferOffsetList[r]),static_cast(bufferOffsetList[r]), - static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), - static_cast(handler.getHeightforRank()))); - } - for (int r = 0; r < transposeKernelList.size(); r++) - { - runs[r].wait(); - } - auto endCalculation = std::chrono::high_resolution_clock::now(); + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) { + runs.push_back(transposeKernelList[r]( + bufferListA[r], bufferListB[r], bufferListA_out[r], + static_cast(bufferStartList[r] + bufferOffsetList[r]), + static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast(handler.getHeightforRank()))); + } + for (int r = 0; r < transposeKernelList.size(); r++) { + runs[r].wait(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); #ifndef NDEBUG - int mpi_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - std::cout << "Rank " << mpi_rank << ": " << "Done i=" << repetition << std::endl; - std::cout << "Kernel execution time: " << std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() - << "s (" << ((config.programSettings->matrixSize * config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * 3) - / std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() * 1.0e-9) << " GB/s)" << std::endl; + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " + << std::chrono::duration_cast>(endCalculation - + startKernelCalculation) + .count() + << "s (" + << ((config.programSettings->matrixSize * config.programSettings->matrixSize * + sizeof(HOST_DATA_TYPE) * 3) / + std::chrono::duration_cast>(endCalculation - + startKernelCalculation) + .count() * + 1.0e-9) + << " GB/s)" << std::endl; #endif - // Transfer back data for next repetition! 
- handler.exchangeData(data); - - std::chrono::duration calculationTime = - std::chrono::duration_cast> - (endCalculation - startCalculation); - calculationTimings.push_back(calculationTime.count()); - - std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * data.blockSize * data.blockSize); - - startTransfer = std::chrono::high_resolution_clock::now(); - - for (int r = 0; r < transposeKernelList.size(); r++) { - // Copy possibly incomplete first block row - if (bufferOffsetList[r] != 0) { - bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); - bufferListA_out[r].read(tmp_write_buffer.data()); - for (int row = 0; row < data.blockSize; row++) { - for (int col = bufferOffsetList[r] * data.blockSize; col < local_matrix_width * data.blockSize; col++) { - data.result[bufferStartList[r] * data.blockSize * data.blockSize + row * local_matrix_width * data.blockSize + col] = - tmp_write_buffer[row * local_matrix_width * data.blockSize + col]; - } - } - // Copy remaining buffer - std::copy(tmp_write_buffer.begin() + local_matrix_width * data.blockSize * data.blockSize, tmp_write_buffer.begin() + bufferSizeList[r],&data.result[(bufferStartList[r] + local_matrix_width) * data.blockSize * data.blockSize]); - } - else { - bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); - bufferListA_out[r].read(data.result + bufferStartList[r] * data.blockSize * data.blockSize); - } - } - endTransfer = std::chrono::high_resolution_clock::now(); - transferTime += - std::chrono::duration_cast> - (endTransfer - startTransfer); - transferTimings.push_back(transferTime.count()); + // Transfer back data for next repetition! + handler.exchangeData(data); + + std::chrono::duration calculationTime = + std::chrono::duration_cast>(endCalculation - + startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * + data.blockSize * data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; + col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + col]; + } } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + local_matrix_width * data.blockSize * data.blockSize, + tmp_write_buffer.begin() + bufferSizeList[r], + &data.result[(bufferStartList[r] + local_matrix_width) * data.blockSize * + data.blockSize]); + } else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(data.result, bufferSizeList[r] * sizeof(HOST_DATA_TYPE), + bufferStartList[r] * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE)); + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast>(endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } - std::unique_ptr result(new transpose::TransposeExecutionTimings{ - transferTimings, - calculationTimings - }); + std::unique_ptr result( + new transpose::TransposeExecutionTimings{transferTimings, calculationTimings}); - return result; - } 
+ return result; +} -} // namespace transpose -} // namespace fpga_execution -} // namespace intel +} // namespace pcie_pq +} // namespace fpga_execution +} // namespace transpose #endif From b4800f29ebbf6e43d6195b7c2703a1e36eb3c2d7 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 25 Apr 2022 15:16:52 +0100 Subject: [PATCH 029/318] Fix faulty derived data type --- PTRANS/src/host/data_handlers/pq.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp index 7fb08b6c..afa11575 100644 --- a/PTRANS/src/host/data_handlers/pq.hpp +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -180,8 +180,8 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler dis(-100.0, 100.0); for (size_t i = 0; i < blocks_per_rank * settings.programSettings->blockSize; i++) { for (size_t j = 0; j < settings.programSettings->blockSize; j++) { - d->A[i * settings.programSettings->blockSize + j] = i * settings.programSettings->blockSize + j;//dis(gen); - d->B[i * settings.programSettings->blockSize + j] = 0.0; //dis(gen); + d->A[i * settings.programSettings->blockSize + j] = dis(gen); + d->B[i * settings.programSettings->blockSize + j] = dis(gen); d->result[i * settings.programSettings->blockSize + j] = 0.0; } } @@ -216,11 +216,11 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler 0) { int next_chunk = (remaining_data_size > std::numeric_limits::max()) ? std::numeric_limits::max(): remaining_data_size; - MPI_Sendrecv(&data.A[offset], next_chunk, data_block, pair_rank, 0, &data.exchange[offset], next_chunk, data_block, pair_rank, 0, MPI_COMM_WORLD, &status); + MPI_Sendrecv(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, &data.exchange[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD, &status); remaining_data_size -= next_chunk; offset += static_cast(next_chunk) * static_cast(data.blockSize * data.blockSize); @@ -379,7 +379,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler& data) { + reference_transpose(TransposeData& data) override { for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { data.A[i * height_per_rank * data.blockSize + j] -= (data.result[j * width_per_rank * data.blockSize + i] - data.B[j * width_per_rank * data.blockSize + i]); From c90daab4cca01b097f8761f830365e3dd13aa34a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 25 Apr 2022 15:34:58 +0100 Subject: [PATCH 030/318] Add emulation switch for ACCL to benchmarks --- shared/include/hpcc_benchmark.hpp | 15 +++++- shared/include/setup/fpga_setup_accl.hpp | 23 ++++---- shared/setup/fpga_setup_accl.cpp | 67 ++++++++++++------------ 3 files changed, 58 insertions(+), 47 deletions(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index ab4d092d..bece837c 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -126,6 +126,11 @@ class BaseSettings { */ CommunicationType communicationType; + /** + * @brief Use ACCL emulation constructor instead of hardware execution + */ + bool useAcclEmulation; + /** * @brief Construct a new Base Settings object * @@ -146,6 +151,11 @@ class BaseSettings { #else kernelReplications(results.count("r") > 0 ? 
results["r"].as() : 1), #endif +#ifdef USE_ACCL + useAcclEmulation(static_cast(results.count("accl-emulation"))), +#else + useAcclEmulation(false), +#endif #ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())), #else @@ -393,6 +403,9 @@ class HpccFpgaBenchmark { cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) #ifdef INTEL_FPGA ("i", "Use memory Interleaving") +#endif +#ifdef USE_ACCL + ("accl-emulation", "Use the accl emulation instead of hardware execution") #endif ("skip-validation", "Skip the validation of the output data. This will speed up execution and helps when working with special data types.") ("device", "Index of the device that has to be used. If not given you "\ @@ -515,7 +528,7 @@ class HpccFpgaBenchmark { #endif #ifdef USE_ACCL if (programSettings->communicationType == CommunicationType::accl) { - accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); + accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program, programSettings->useAcclEmulation); } else { accl = std::unique_ptr(nullptr); diff --git a/shared/include/setup/fpga_setup_accl.hpp b/shared/include/setup/fpga_setup_accl.hpp index 7158a81b..dcf2a530 100644 --- a/shared/include/setup/fpga_setup_accl.hpp +++ b/shared/include/setup/fpga_setup_accl.hpp @@ -22,18 +22,17 @@ SOFTWARE. #ifndef SRC_HOST_FPGA_SETUP_ACCL_H_ #define SRC_HOST_FPGA_SETUP_ACCL_H_ -#include -#include -#include -#include #include #include +#include +#include #include +#include +#include /* External libraries */ -#include "xrt/xrt_device.h" #include "accl.hpp" - +#include "xrt/xrt_device.h" namespace fpga_setup { @@ -41,12 +40,12 @@ namespace fpga_setup { Sets up the given FPGA with the kernel in the provided file. @param device The device used for the program -@param usedKernelFile The path to the kernel file +@param program The program used to find the ACCL kernels for hardware execution +@param useAcclEmulation Construct an ACCL emulation instance instead of hardware execution @return The ACCL instance used for communication */ - std::unique_ptr - fpgaSetupACCL(xrt::device &device, - xrt::uuid &program); +std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &program, + bool useAcclEmulation); -} // namespace fpga_setup -#endif // SRC_HOST_FPGA_SETUP_H_ +} // namespace fpga_setup +#endif // SRC_HOST_FPGA_SETUP_H_ diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index d521264e..b4753430 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -4,16 +4,16 @@ #include "setup/fpga_setup_accl.hpp" -#include -#include -#include -#include #include #include +#include +#include +#include +#include /* External libraries */ -#include "parameters.h" #include "experimental/xrt_ip.h" +#include "parameters.h" #include "xrt/xrt_kernel.h" #ifdef _USE_MPI_ #include "mpi.h" @@ -21,33 +21,32 @@ namespace fpga_setup { - std::unique_ptr - fpgaSetupACCL(xrt::device &device, - xrt::uuid &program) { - int current_rank; - MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); - - int current_size; - MPI_Comm_size(MPI_COMM_WORLD, & current_size); - - std::vector ranks = {}; - for (int i = 0; i < current_size; ++i) { - // TODO: Replace the ip addresses and ports here for execution of real hardware? 
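The TODO above, which is kept verbatim in the reformatted function below, marks the open point for multi-node hardware runs: all ranks still use loopback addresses. One conceivable way to build the rank table, sketched under the assumption that every rank has already resolved its own FPGA address into a fixed-width string (this is not what the patch implements), is to distribute the addresses over MPI before constructing the list:

    // Hypothetical replacement for the loopback table (sketch only):
    char my_ip[16] = "192.168.0.1";  // assumed: resolved individually per rank
    std::vector<char> all_ips(16 * current_size);
    MPI_Allgather(my_ip, 16, MPI_CHAR, all_ips.data(), 16, MPI_CHAR, MPI_COMM_WORLD);
    for (int i = 0; i < current_size; ++i) {
      // same fields as the loopback entries: ip, port, and the two values used above
      ranks.emplace_back(ACCL::rank_t{std::string(&all_ips[16 * i]), 5500 + i, i, 1024});
    }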
- ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, - 1024}; - ranks.emplace_back(new_rank); - } -#ifdef ACCL_HARDWARE_SUPPORT - auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}"); - auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", - xrt::kernel::cu_access_mode::exclusive); - std::vector mem(1,0); - return std::unique_ptr(new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, 0)); -#else - // TODO: Add start port here. Currenty hardcoded! - return std::unique_ptr(new ACCL::ACCL(ranks, current_rank, - 5500)); -#endif - } +std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &program, + bool useAcclEmulation) { + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, ¤t_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, ¤t_size); + + std::vector ranks = {}; + for (int i = 0; i < current_size; ++i) { + // TODO: Replace the ip addresses and ports here for execution of real hardware? + ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, 1024}; + ranks.emplace_back(new_rank); + } + if (!useAcclEmulation) { + auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}"); + auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", + xrt::kernel::cu_access_mode::exclusive); + std::vector mem(1, 0); + return std::unique_ptr( + new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, 0)); + } else { + // TODO: Add start port here. Currenty hardcoded! + return std::unique_ptr( + new ACCL::ACCL(ranks, current_rank, 5500, device, ACCL::networkProtocol::TCP, 16, 1024)); + } +} -} // namespace fpga_setup +} // namespace fpga_setup From 0f96d28df1a147ae0592ca3c1701b0d625d11fdf Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 25 Apr 2022 15:37:29 +0100 Subject: [PATCH 031/318] Add ACCL buffers implementation for PTRANS --- PTRANS/src/device/transpose_PQ_ACCL_buffers.cl | 1 + 1 file changed, 1 insertion(+) create mode 120000 PTRANS/src/device/transpose_PQ_ACCL_buffers.cl diff --git a/PTRANS/src/device/transpose_PQ_ACCL_buffers.cl b/PTRANS/src/device/transpose_PQ_ACCL_buffers.cl new file mode 120000 index 00000000..64e94f20 --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_ACCL_buffers.cl @@ -0,0 +1 @@ +transpose_PQ_PCIE.cl \ No newline at end of file From 869dbf23549af4f44d2bf604134aebdab879ca47 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 25 Apr 2022 17:10:28 +0100 Subject: [PATCH 032/318] Include ACCL build to HPCC builds --- PTRANS/src/device/CMakeLists.txt | 2 +- cmake/accl.cmake | 72 ++++++++++++++++++++++++++++++++ cmake/kernelTargets.cmake | 16 ++++++- extern/CMakeLists.txt | 4 +- 4 files changed, 90 insertions(+), 4 deletions(-) create mode 100644 cmake/accl.cmake diff --git a/PTRANS/src/device/CMakeLists.txt b/PTRANS/src/device/CMakeLists.txt index 7542a861..21176719 100644 --- a/PTRANS/src/device/CMakeLists.txt +++ b/PTRANS/src/device/CMakeLists.txt @@ -11,7 +11,7 @@ if (INTELFPGAOPENCL_FOUND) endif() if (VITIS_FOUND) - generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE) + generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_ACCL_buffers) add_test(NAME test_emulation_PQ_PCIE_xilinx COMMAND Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_xilinx -f 
transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) diff --git a/cmake/accl.cmake b/cmake/accl.cmake new file mode 100644 index 00000000..6e7ccb38 --- /dev/null +++ b/cmake/accl.cmake @@ -0,0 +1,72 @@ + +set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL") +set(ACCL_UDP_ETH_IF 0 CACHE STRING "Ethernet interface used. On ETHZ: 0 = switch, 1 = direct") +set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform") + +set(ACCL_CCLO_KERNEL_DIR ${extern_accl_SOURCE_DIR}/kernels/cclo/) +set(ACCL_CCLO_KERNEL_XO cclo_offload.xo) + +set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware) +set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/) +set(ACCL_UDP_MAC_XO ${ACCL_VNX_DIR}/Ethernet/_x.${FPGA_BOARD_NAME}/cmac_${ACCL_UDP_ETH_IF}.xo) +set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo) + +add_custom_command( + OUTPUT ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} + COMMAND make STACK_TYPE=${ACCL_STACK_TYPE} PLATFORM=${FPGA_BOARD_NAME} + WORKING_DIRECTORY ${ACCL_CCLO_KERNEL_DIR}) + +add_custom_command( + OUTPUT ${ACCL_UDP_MAC_XO} + COMMAND make -C ${ACCL_VNX_DIR}/Ethernet DEVICE=${FPGA_BOARD_NAME} INTERFACE=${ACCL_UDP_ETH_IF} all + WORKING_DIRECTORY ${ACCL_HARDWARE_DIR}) + +add_custom_command( + OUTPUT ${ACCL_UDP_NET_XO} + COMMAND make -C ${ACCL_VNX_DIR}/NetLayers DEVICE=${FPGA_BOARD_NAME} all + WORKING_DIRECTORY ${ACCL_HARDWARE_DIR}) + + +set(ACCL_PLUGINS_DIR ${extern_accl_SOURCE_DIR}/kernels/plugins) +set(ACCL_PLUGINS_HOSTCTRL ${ACCL_PLUGINS_DIR}/hostctrl/hostctrl.xo) +set(ACCL_PLUGINS_SUM ${ACCL_PLUGINS_DIR}/reduce_sum/reduce_sum.xo) +set(ACCL_PLUGINS_COMPRESSION ${ACCL_PLUGINS_DIR}/hp_compression/hp_compression.xo) +set(ACCL_PLUGINS_LOOPBACK ${ACCL_PLUGINS_DIR}/loopback/loopback.xo) + +set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} + ${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO} PARENT_SCOPE) + +add_custom_target( + accl_udp_stack + DEPENDS ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO}) + +add_custom_target( + accl_cclo + DEPENDS ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO}) + +add_custom_command( + OUTPUT ${ACCL_PLUGINS_HOSTCTRL} + COMMAND vitis_hls build_hostctrl.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/hostctrl ) +add_custom_command( + OUTPUT ${ACCL_PLUGINS_SUM} + COMMAND vitis_hls build.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/reduce_sum ) +add_custom_command( + OUTPUT ${ACCL_PLUGINS_COMPRESSION} + COMMAND vitis_hls build.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/hp_compression ) +add_custom_command( + OUTPUT ${ACCL_PLUGINS_LOOPBACK} + COMMAND vitis_hls build_loopback.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/loopback ) + +add_custom_target( + accl_plugins + DEPENDS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} + ${ACCL_PLUGINS_COMPRESSION}) + +add_custom_target( + accl_udp) +add_dependencies(accl_udp accl_udp_stack accl_cclo accl_plugins) + diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 1d7e667f..35c128a9 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -9,6 +9,10 @@ else() set(VPP_FLAGS "-O3") endif() +if (USE_ACCL) + include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake) +endif() + ## # This function will create build targets for the kernels for emulationand 
synthesis for xilinx. ## @@ -21,6 +25,10 @@ function(generate_kernel_targets_xilinx) else() set(base_file_part "src/device/${kernel_file_name}") endif() + string(REGEX MATCH ".*_ACCL.*" is_accl_kernel ${kernel_file_name}) + if (is_accl_kernel AND NOT USE_ACCL) + continue() + endif() set(base_file "${CMAKE_SOURCE_DIR}/${base_file_part}.cl") if (KERNEL_REPLICATION_ENABLED) set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_replicated_xilinx.cl") @@ -40,6 +48,9 @@ function(generate_kernel_targets_xilinx) set(gen_xilinx_link_settings ${XILINX_LINK_SETTINGS_FILE}) set(xilinx_link_settings ${CMAKE_BINARY_DIR}/settings/settings.link.xilinx.${kernel_file_name}.ini) endif() + if (USE_ACCL AND is_accl_kernel) + list(APPEND additional_xos ${ACCL_UDP_XOS}) + endif() set(xilinx_report_folder "${EXECUTABLE_OUTPUT_PATH}/xilinx_reports") set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA) list(APPEND local_CLFLAGS --report_dir=${xilinx_report_folder} --log_dir=${xilinx_report_folder}/logs) @@ -95,7 +106,7 @@ function(generate_kernel_targets_xilinx) DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} ) add_custom_command(OUTPUT ${bitstream_f} - COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_f} ${bitstream_compile} + COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_f} ${bitstream_compile} ${additional_xos} MAIN_DEPENDENCY ${bitstream_compile} DEPENDS ${xilinx_link_settings} ) @@ -110,6 +121,9 @@ function(generate_kernel_targets_xilinx) DEPENDS ${bitstream_compile} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) + if(USE_ACCL AND is_accl_kernel) + add_dependencies(${kernel_file_name}_xilinx accl_udp) + endif() list(APPEND kernel_emulation_targets_xilinx ${kernel_file_name}_emulate_xilinx) set(kernel_emulation_targets_xilinx ${kernel_emulation_targets_xilinx} CACHE INTERNAL "Kernel emulation targets used to define dependencies for the tests for Xilinx devices") endforeach () diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 7845280d..341f73cd 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -61,8 +61,8 @@ if (DEFINED USE_ACCL) FetchContent_Declare( extern_accl - GIT_REPOSITORY https://github.com/TristanLaan/ACCL.git - GIT_TAG simbuffer_bo_constructor) + GIT_REPOSITORY https://github.com/Xilinx/ACCL.git + GIT_TAG dev) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) From 9f33a199ebeedd1c6075f1c84424364f3e0a02a2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 25 Apr 2022 18:51:24 +0100 Subject: [PATCH 033/318] First version for ACCL+PTRANS synth --- .../Xilinx_U280_DDR_ACCL_buffers.cmake | 26 +++++++ ...k.xilinx.transpose_pq_accl_buffers.ddr.ini | 76 +++++++++++++++++++ cmake/accl.cmake | 8 +- cmake/kernelTargets.cmake | 6 +- 4 files changed, 112 insertions(+), 4 deletions(-) create mode 100644 PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake new file mode 100644 index 00000000..527f7612 --- /dev/null +++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake @@ -0,0 +1,26 @@ +# This file contains the default 
configuration for the Xilinx Alveo U280 board
+# for use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes)
+set(USE_XRT_HOST Yes)
+set(USE_OCL_HOST No)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE)
+
+# PTRANS specific options
+# Default matrix and block sizes for the transpose kernels
+set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE)
+set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE)
+set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini
new file mode 100644
index 00000000..4809e31c
--- /dev/null
+++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini
@@ -0,0 +1,76 @@
+# /*******************************************************************************
+# Copyright (C) 2021 Xilinx, Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_sum:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+nk=transpose0:2:transpose0.transpose1
+
+# Kernel floorplanning
+slr=compression_0_0:SLR0
+slr=compression_0_1:SLR0
+slr=compression_0_2:SLR0
+slr=lb_user_krnl:SLR0
+slr=arith_0:SLR0
+slr=ccl_offload_0:SLR0
+slr=hostctrl_0:SLR0
+slr=networklayer_0:SLR0
+slr=cmac_0:SLR0
+slr=transpose0:SLR1
+slr=transpose1:SLR2
+
+sp=ccl_offload_0.m_axi_0:DDR[0:1]
+sp=ccl_offload_0.m_axi_1:DDR[0:1]
+sp=transpose0.m_axi_gmem:DDR[0]
+sp=transpose1.m_axi_gmem:DDR[1]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
+
diff --git a/cmake/accl.cmake b/cmake/accl.cmake
index 6e7ccb38..b8f74167 100644
--- a/cmake/accl.cmake
+++ b/cmake/accl.cmake
@@ -4,13 +4,17 @@ set(ACCL_UDP_ETH_IF 0 CACHE STRING "Ethernet interface used.
On ETHZ: 0 = switch set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform") set(ACCL_CCLO_KERNEL_DIR ${extern_accl_SOURCE_DIR}/kernels/cclo/) -set(ACCL_CCLO_KERNEL_XO cclo_offload.xo) +set(ACCL_CCLO_KERNEL_XO ccl_offload.xo) set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware) set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/) +set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core) set(ACCL_UDP_MAC_XO ${ACCL_VNX_DIR}/Ethernet/_x.${FPGA_BOARD_NAME}/cmac_${ACCL_UDP_ETH_IF}.xo) set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo) +set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HMB) +list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_VNX_DIR}/Ethernet/post_sys_link.tcl) +list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_HLS_IP_FOLDER}) add_custom_command( OUTPUT ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} COMMAND make STACK_TYPE=${ACCL_STACK_TYPE} PLATFORM=${FPGA_BOARD_NAME} @@ -34,7 +38,7 @@ set(ACCL_PLUGINS_COMPRESSION ${ACCL_PLUGINS_DIR}/hp_compression/hp_compression.x set(ACCL_PLUGINS_LOOPBACK ${ACCL_PLUGINS_DIR}/loopback/loopback.xo) set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} - ${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO} PARENT_SCOPE) + ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL") add_custom_target( accl_udp_stack diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 35c128a9..fc84248c 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -10,7 +10,7 @@ else() endif() if (USE_ACCL) - include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake) + include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake) endif() ## @@ -54,7 +54,9 @@ function(generate_kernel_targets_xilinx) set(xilinx_report_folder "${EXECUTABLE_OUTPUT_PATH}/xilinx_reports") set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA) list(APPEND local_CLFLAGS --report_dir=${xilinx_report_folder} --log_dir=${xilinx_report_folder}/logs) - + if (is_accl_kernel) + list(APPEND local_CLFLAGS ${ACCL_LINK_CONFIG}) + endif() string(REGEX MATCH "^.+\.tcl" is_tcl_script ${XILINX_COMPILE_SETTINGS_FILE}) if (is_tcl_script) set(CLFLAGS --hls.pre_tcl ${XILINX_COMPILE_SETTINGS_FILE}) From 979d275b6aa9f12c9295da0c5a2cfb201cab08bc Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 26 Apr 2022 16:54:22 +0100 Subject: [PATCH 034/318] Update configurations for first synthesis --- .../Xilinx_U280_DDR_ACCL_buffers.cmake | 7 +- .../Xilinx_U280_DDR_ACCL_buffers_ddr.cmake | 25 +++++++ .../Xilinx_U280_DDR_ACCL_buffers_hbm.cmake | 25 +++++++ .../settings.compile.xilinx.accl_buffers.ini | 0 .../settings.link.xilinx.accl_buffers.ddr.ini | 71 +++++++++++++++++++ .../settings.link.xilinx.accl_buffers.hbm.ini | 71 +++++++++++++++++++ b_eff/src/device/CMakeLists.txt | 24 ++++--- b_eff/src/device/communication_ACCL.cl | 27 +++++++ 8 files changed, 238 insertions(+), 12 deletions(-) create mode 100644 b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_ddr.cmake create mode 100644 b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_hbm.cmake create mode 100644 b_eff/settings/settings.compile.xilinx.accl_buffers.ini create mode 100644 b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini create mode 100644 b_eff/settings/settings.link.xilinx.accl_buffers.hbm.ini create mode 100644 
b_eff/src/device/communication_ACCL.cl
diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake
index 527f7612..21c8ec77 100644
--- a/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake
+++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake
@@ -9,12 +9,13 @@
 set(USE_MPI Yes CACHE BOOL "" FORCE)
 set(USE_SVM No CACHE BOOL "" FORCE)
 set(USE_HBM No CACHE BOOL "" FORCE)
-set(USE_ACCL Yes)
-set(USE_XRT_HOST Yes)
-set(USE_OCL_HOST No)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
 set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
 set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini CACHE FILEPATH "" FORCE)
 set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
 
 # PTRANS specific options
 # Default matrix and block sizes for the transpose kernels
diff --git a/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_ddr.cmake b/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_ddr.cmake
new file mode 100644
index 00000000..523c8761
--- /dev/null
+++ b/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_ddr.cmake
@@ -0,0 +1,25 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.ddr.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# b_eff specific options
+# These values will not be considered by the ACCL communication kernels
+set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
diff --git a/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_hbm.cmake b/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_hbm.cmake
new file mode 100644
index 00000000..f097ebd9
--- /dev/null
+++ b/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_hbm.cmake
@@ -0,0 +1,25 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.hbm.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# b_eff specific options
+# These values will not be considered by the ACCL communication kernels
+set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
diff --git a/b_eff/settings/settings.compile.xilinx.accl_buffers.ini b/b_eff/settings/settings.compile.xilinx.accl_buffers.ini
new file mode 100644
index 00000000..e69de29b
diff --git a/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini b/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini
new file mode 100644
index 00000000..64c67abc
--- /dev/null
+++ b/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini
@@ -0,0 +1,71 @@
+# /*******************************************************************************
+# Copyright (C) 2021 Xilinx, Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_sum:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+
+# Kernel floorplanning
+slr=compression_0_0:SLR0
+slr=compression_0_1:SLR0
+slr=compression_0_2:SLR0
+slr=lb_user_krnl:SLR0
+slr=arith_0:SLR0
+slr=ccl_offload_0:SLR0
+slr=hostctrl_0:SLR0
+slr=networklayer_0:SLR0
+slr=cmac_0:SLR0
+
+sp=ccl_offload_0.m_axi_0:DDR[0:1]
+sp=ccl_offload_0.m_axi_1:DDR[0:1]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
+
diff --git a/b_eff/settings/settings.link.xilinx.accl_buffers.hbm.ini b/b_eff/settings/settings.link.xilinx.accl_buffers.hbm.ini
new file mode 100644
index 00000000..e6352198
--- /dev/null
+++ b/b_eff/settings/settings.link.xilinx.accl_buffers.hbm.ini
@@ -0,0 +1,71 @@
+# /*******************************************************************************
+# Copyright (C) 2021 Xilinx, Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 + +sp=ccl_offload_0.m_axi_0:HBM[0:5] +sp=ccl_offload_0.m_axi_1:HBM[0:5] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index 8316a884..e5939572 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -3,12 +3,18 @@ set(KERNEL_REPLICATION_ENABLED Yes CACHE INTERNAL "Enables kernel replication in set(NUM_REPLICATIONS 2) include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) -generate_kernel_targets_intel(communication_bw520n_IEC) -add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +if (INTELFPGAOPENCL_FOUND) + 
generate_kernel_targets_intel(communication_bw520n_IEC) + add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +endif() + +if (VITIS_FOUND) + generate_kernel_targets_xilinx(communication_ACCL) +endif() diff --git a/b_eff/src/device/communication_ACCL.cl b/b_eff/src/device/communication_ACCL.cl new file mode 100644 index 00000000..80c12a86 --- /dev/null +++ b/b_eff/src/device/communication_ACCL.cl @@ -0,0 +1,27 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +__kernel +void dummy(__global void *nothing) { + // Do nothing. 
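+    // Note (added for clarity; an assumption based on the surrounding build
+    // scripts, not an author comment): the placeholder gives v++ a kernel
+    // source to compile into an .xo for this target, while the functional
+    // kernels are linked in from the pre-built ACCL object files collected
+    // in cmake/accl.cmake.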
+ // Will be exluded during linking process and will not be in final bitstream +} \ No newline at end of file From aaa8583bbc7d16b728d1dabc553c585146a6d0c0 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 26 Apr 2022 16:54:42 +0100 Subject: [PATCH 035/318] Change Kernel ordering on SLRs --- ...k.xilinx.transpose_pq_accl_buffers.ddr.ini | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini index 4809e31c..1cb8cc27 100644 --- a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini @@ -26,17 +26,17 @@ nk=loopback:1:lb_user_krnl nk=transpose0:2:transpose0.transpose1 # Kernels Foorplaning -slr=compression_0_0:SLR0 -slr=compression_0_1:SLR0 -slr=compression_0_2:SLR0 -slr=lb_user_krnl:SLR0 -slr=arith_0:SLR0 -slr=ccl_offload_0:SLR0 -slr=hostctrl_0:SLR0 -slr=networklayer_0:SLR0 -slr=cmac_0:SLR0 -slr=transpose0:SLR1 -slr=transpose1:SLR2 +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 +slr=transpose0:SLR0 +slr=transpose1:SLR1 sp=ccl_offload_0.m_axi_0:DDR[0:1] sp=ccl_offload_0.m_axi_1:DDR[0:1] From 0e9dc1752835e05bbde33f018df8504b70e6ac44 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 26 Apr 2022 18:08:35 +0100 Subject: [PATCH 036/318] Fix placement of kernels in b_eff DDR --- .../settings.link.xilinx.accl_buffers.ddr.ini | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini b/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini index 64c67abc..2ee98436 100644 --- a/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini +++ b/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini @@ -25,15 +25,15 @@ nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 nk=loopback:1:lb_user_krnl # Kernels Foorplaning -slr=compression_0_0:SLR0 -slr=compression_0_1:SLR0 -slr=compression_0_2:SLR0 -slr=lb_user_krnl:SLR0 -slr=arith_0:SLR0 -slr=ccl_offload_0:SLR0 -slr=hostctrl_0:SLR0 -slr=networklayer_0:SLR0 -slr=cmac_0:SLR0 +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 sp=ccl_offload_0.m_axi_0:DDR[0:1] sp=ccl_offload_0.m_axi_1:DDR[0:1] From ddefbac38169e01d2d478809e96843b2c926be07 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 26 Apr 2022 18:09:17 +0100 Subject: [PATCH 037/318] Update ACCL cmake scripts for TCP --- cmake/accl.cmake | 98 +++++++++++++++++++++++++++++++-------- cmake/kernelTargets.cmake | 2 +- 2 files changed, 79 insertions(+), 21 deletions(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index b8f74167..ca7b0fc2 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -1,24 +1,21 @@ +# General definitions set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL") set(ACCL_UDP_ETH_IF 0 CACHE STRING "Ethernet interface used. 
On ETHZ: 0 = switch, 1 = direct") set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform") - -set(ACCL_CCLO_KERNEL_DIR ${extern_accl_SOURCE_DIR}/kernels/cclo/) -set(ACCL_CCLO_KERNEL_XO ccl_offload.xo) - set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware) + +# UDP related definitions set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/) set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core) set(ACCL_UDP_MAC_XO ${ACCL_VNX_DIR}/Ethernet/_x.${FPGA_BOARD_NAME}/cmac_${ACCL_UDP_ETH_IF}.xo) set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo) - set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HMB) -list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_VNX_DIR}/Ethernet/post_sys_link.tcl) -list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_HLS_IP_FOLDER}) -add_custom_command( - OUTPUT ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} - COMMAND make STACK_TYPE=${ACCL_STACK_TYPE} PLATFORM=${FPGA_BOARD_NAME} - WORKING_DIRECTORY ${ACCL_CCLO_KERNEL_DIR}) +if (ACCL_STACK_TYPE STREQUAL "UDP") + list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_VNX_DIR}/Ethernet/post_sys_link.tcl) + list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_HLS_IP_FOLDER}) + set(ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE}) +endif() add_custom_command( OUTPUT ${ACCL_UDP_MAC_XO} @@ -30,24 +27,67 @@ add_custom_command( COMMAND make -C ${ACCL_VNX_DIR}/NetLayers DEVICE=${FPGA_BOARD_NAME} all WORKING_DIRECTORY ${ACCL_HARDWARE_DIR}) +add_custom_target( + accl_udp_stack + DEPENDS ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO}) -set(ACCL_PLUGINS_DIR ${extern_accl_SOURCE_DIR}/kernels/plugins) -set(ACCL_PLUGINS_HOSTCTRL ${ACCL_PLUGINS_DIR}/hostctrl/hostctrl.xo) -set(ACCL_PLUGINS_SUM ${ACCL_PLUGINS_DIR}/reduce_sum/reduce_sum.xo) -set(ACCL_PLUGINS_COMPRESSION ${ACCL_PLUGINS_DIR}/hp_compression/hp_compression.xo) -set(ACCL_PLUGINS_LOOPBACK ${ACCL_PLUGINS_DIR}/loopback/loopback.xo) +# TCP related definitions +set(ACCL_TCP_BASE_DIR ${extern_accl_SOURCE_DIR}/Vitis_with_100Gbps_TCP-IP) +set(ACCL_TCP_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/network_krnl.xo) +set(ACCL_TCP_CMAC_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/cmac_krnl.xo) +if (ACCL_STACK_TYPE STREQUAL "TCP") + list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_TCP_BASE_DIR}/scripts/post_sys_link.tcl) + list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo) + set(ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE} EN_FANIN=1) +endif() -set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} - ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL") +# TODO: This is very sppecific to the Xilinx build system, because +# different Vivado version is required to build these ips +add_custom_command( + OUTPUT ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo + COMMAND mkdir build && cd build && cmake .. 
-DFDEV_NAME=u280 + -DVIVADO_HLS_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 + -DVIVADO_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 + -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=1 -DTCP_STACK_WINDOW_SCALING_EN=0 + WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR}) + +add_custom_command( + OUTPUT ${ACCL_TCP_CMAC_XO} + COMMAND make cmac_krnl DEVICE=${FPGA_BOARD_NAME} XSA=${FPGA_BOARD_NAME} TEMP_DIR=_x.hw.${FPGA_BOARD_NAME}/ + WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR} + DEPENDS ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo) + +add_custom_command( + OUTPUT ${ACCL_TCP_XO} + COMMAND make network_krnl DEVICE=${FPGA_BOARD_NAME} XSA=${FPGA_BOARD_NAME} TEMP_DIR=_x.hw.${FPGA_BOARD_NAME}/ + WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR} + DEPENDS ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo) add_custom_target( - accl_udp_stack - DEPENDS ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO}) + accl_tcp_stack + DEPENDS ${ACCL_TCP_XO} ${ACCL_TCP_CMAC_XO}) + + +# Build CCLO +set(ACCL_CCLO_KERNEL_DIR ${extern_accl_SOURCE_DIR}/kernels/cclo/) +set(ACCL_CCLO_KERNEL_XO ccl_offload.xo) + +add_custom_command( + OUTPUT ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} + COMMAND make ${ACCL_CCLO_BUILD_ARGS} PLATFORM=${FPGA_BOARD_NAME} + WORKING_DIRECTORY ${ACCL_CCLO_KERNEL_DIR}) add_custom_target( accl_cclo DEPENDS ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO}) +# Build the ACCL Plugins +set(ACCL_PLUGINS_DIR ${extern_accl_SOURCE_DIR}/kernels/plugins) +set(ACCL_PLUGINS_HOSTCTRL ${ACCL_PLUGINS_DIR}/hostctrl/hostctrl.xo) +set(ACCL_PLUGINS_SUM ${ACCL_PLUGINS_DIR}/reduce_sum/reduce_sum.xo) +set(ACCL_PLUGINS_COMPRESSION ${ACCL_PLUGINS_DIR}/hp_compression/hp_compression.xo) +set(ACCL_PLUGINS_LOOPBACK ${ACCL_PLUGINS_DIR}/loopback/loopback.xo) + add_custom_command( OUTPUT ${ACCL_PLUGINS_HOSTCTRL} COMMAND vitis_hls build_hostctrl.tcl -tclargs ip ${ACCL_DEVICE_NAME} @@ -70,7 +110,25 @@ add_custom_target( DEPENDS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} ${ACCL_PLUGINS_COMPRESSION}) +set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} + ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL with UDP") + +set(ACCL_TCP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} + ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_TCP_CMAC_XO} ${ACCL_TCP_XO} CACHE INTERNAL "Object files required for ACCL with TCP") + +if (ACCL_STACK_TYPE STREQUAL "UDP") + set(ACCL_XOS ${ACCL_UDP_XOS} CACHE INTERNAL "Object files required for ACCL") +else() + set(ACCL_XOS ${ACCL_TCP_XOS} CACHE INTERNAL "Object files required for ACCL") +endif() + add_custom_target( accl_udp) add_dependencies(accl_udp accl_udp_stack accl_cclo accl_plugins) +add_custom_target( + accl_tcp) +add_dependencies(accl_tcp accl_tcp_stack accl_cclo accl_plugins) + + + diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index fc84248c..7f5a4775 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -49,7 +49,7 @@ function(generate_kernel_targets_xilinx) set(xilinx_link_settings ${CMAKE_BINARY_DIR}/settings/settings.link.xilinx.${kernel_file_name}.ini) endif() if (USE_ACCL AND is_accl_kernel) - list(APPEND additional_xos ${ACCL_UDP_XOS}) + list(APPEND additional_xos ${ACCL_XOS}) endif() set(xilinx_report_folder "${EXECUTABLE_OUTPUT_PATH}/xilinx_reports") set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA) From 
4745d8f9ab55d9619deb506cbc77a6947a517bdd Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 26 Apr 2022 18:24:34 +0100
Subject: [PATCH 038/318] Add ACCL TCP configs for PTRANS

---
 .../Xilinx_U280_DDR_ACCL_TCP_buffers.cmake    | 28 ++++++
 ...linx.transpose_pq_accl_tcp_buffers.ddr.ini | 86 +++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 PTRANS/configs/Xilinx_U280_DDR_ACCL_TCP_buffers.cmake
 create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini

diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_TCP_buffers.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_TCP_buffers.cmake
new file mode 100644
index 00000000..e8e77751
--- /dev/null
+++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_TCP_buffers.cmake
@@ -0,0 +1,28 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+#     cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(ACCL_STACK_TYPE "TCP" CACHE STRING "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# PTRANS specific options
+set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE)
+set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE)
+set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini
new file mode 100644
index 00000000..a1492b0a
--- /dev/null
+++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini
@@ -0,0 +1,86 @@
+# /*******************************************************************************
+#  Copyright (C) 2021 Xilinx, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=network_krnl:1:network_krnl_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_krnl:1:cmac_krnl_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl +nk=transpose0:2:transpose0.transpose1 + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=lb_user_krnl:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=network_krnl_0:SLR1 +slr=cmac_krnl_0:SLR2 +slr=transpose0:SLR0 +slr=transpose1:SLR1 + +sp=network_krnl_0.m00_axi:DDR[0] +sp=network_krnl_0.m01_axi:DDR[0] +sp=ccl_offload_0.m_axi_0:DDR[0:1] +sp=ccl_offload_0.m_axi_1:DDR[0:1] +sp=transpose0.m_axi_gmem:DDR[0] +sp=transpose1.m_axi_gmem:DDR[1] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to TCP Network Kernel +stream_connect=network_krnl_0.m_axis_tcp_port_status:ccl_offload_0.s_axis_eth_port_status:512 +stream_connect=network_krnl_0.m_axis_tcp_open_status:ccl_offload_0.s_axis_eth_open_status:512 +stream_connect=network_krnl_0.m_axis_tcp_notification:ccl_offload_0.s_axis_eth_notification:512 +stream_connect=network_krnl_0.m_axis_tcp_rx_meta:ccl_offload_0.s_axis_eth_rx_meta:512 +stream_connect=network_krnl_0.m_axis_tcp_rx_data:ccl_offload_0.s_axis_eth_rx_data:512 +stream_connect=network_krnl_0.m_axis_tcp_tx_status:ccl_offload_0.s_axis_eth_tx_status:512 +stream_connect=ccl_offload_0.m_axis_eth_listen_port:network_krnl_0.s_axis_tcp_listen_port:512 +stream_connect=ccl_offload_0.m_axis_eth_open_connection:network_krnl_0.s_axis_tcp_open_connection:512 +stream_connect=ccl_offload_0.m_axis_eth_read_pkg:network_krnl_0.s_axis_tcp_read_pkg:512 +stream_connect=ccl_offload_0.m_axis_eth_tx_meta:network_krnl_0.s_axis_tcp_tx_meta:512 +stream_connect=ccl_offload_0.m_axis_eth_tx_data:network_krnl_0.s_axis_tcp_tx_data:512 + +# Connect Network Kernel to CMAC Kernel +stream_connect=cmac_krnl_0.axis_net_rx:network_krnl_0.axis_net_rx +stream_connect=network_krnl_0.axis_net_tx:cmac_krnl_0.axis_net_tx + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl From f089ec171ff2b522b8cb6629031ac9d17e154f95 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 27 Apr 2022 10:11:41 +0100 Subject: [PATCH 039/318] Create unified device target for ACCL --- cmake/accl.cmake | 10 +++++++--- cmake/kernelTargets.cmake | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index ca7b0fc2..d3989e47 100644 --- a/cmake/accl.cmake 
+++ b/cmake/accl.cmake @@ -32,7 +32,7 @@ add_custom_target( DEPENDS ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO}) # TCP related definitions -set(ACCL_TCP_BASE_DIR ${extern_accl_SOURCE_DIR}/Vitis_with_100Gbps_TCP-IP) +set(ACCL_TCP_BASE_DIR ${ACCL_HARDWARE_DIR}/Vitis_with_100Gbps_TCP-IP) set(ACCL_TCP_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/network_krnl.xo) set(ACCL_TCP_CMAC_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/cmac_krnl.xo) if (ACCL_STACK_TYPE STREQUAL "TCP") @@ -130,5 +130,9 @@ add_custom_target( accl_tcp) add_dependencies(accl_tcp accl_tcp_stack accl_cclo accl_plugins) - - +add_custom_target(accl_device) +if (ACCL_STACK_TYPE STREQUAL "UDP") + add_dependencies(accl_device accl_udp) +else() + add_dependencies(accl_device accl_tcp) +endif() diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 7f5a4775..22680a6c 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -123,8 +123,8 @@ function(generate_kernel_targets_xilinx) DEPENDS ${bitstream_compile} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) - if(USE_ACCL AND is_accl_kernel) - add_dependencies(${kernel_file_name}_xilinx accl_udp) + if(USE_ACCL AND is_accl_kernel) + add_dependencies(${kernel_file_name}_xilinx accl_device) endif() list(APPEND kernel_emulation_targets_xilinx ${kernel_file_name}_emulate_xilinx) set(kernel_emulation_targets_xilinx ${kernel_emulation_targets_xilinx} CACHE INTERNAL "Kernel emulation targets used to define dependencies for the tests for Xilinx devices") From 4da920372e28bf65fb85dd675321d7cd411ef122 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 27 Apr 2022 14:52:45 +0100 Subject: [PATCH 040/318] Also call make installip --- cmake/accl.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index d3989e47..88bc2b64 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -48,7 +48,8 @@ add_custom_command( COMMAND mkdir build && cd build && cmake .. 
-DFDEV_NAME=u280 -DVIVADO_HLS_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 -DVIVADO_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 - -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=1 -DTCP_STACK_WINDOW_SCALING_EN=0 + -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=1 -DTCP_STACK_WINDOW_SCALING_EN=0 && + make installip WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR}) add_custom_command( From b45bd26cd33a0bba0d8513e835ddfcb2a0444670 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 27 Apr 2022 14:53:09 +0100 Subject: [PATCH 041/318] Fix b_eff host code build scripts --- b_eff/src/host/CMakeLists.txt | 2 ++ b_eff/src/host/network_benchmark.hpp | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index b8c44859..d0be57ba 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -14,6 +14,7 @@ if (INTELFPGAOPENCL_FOUND) target_link_libraries(${LIB_NAME}_intel hpcc_fpga_base) target_link_libraries(${HOST_EXE_NAME}_intel ${LIB_NAME}_intel) target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA -D_USE_MPI_) + target_compile_definitions(${HOST_EXE_NAME}_intel PRIVATE -DINTEL_FPGA) target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_intel_host_executable COMMAND $ -h) endif() @@ -28,6 +29,7 @@ if (Vitis_FOUND) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${ACCL_INCLUDE_PATH}) target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_xilinx_host_executable COMMAND $ -h) endif() diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index efffe1bf..8e9e2fc1 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -31,8 +31,6 @@ SOFTWARE. #include "hpcc_benchmark.hpp" #include "parameters.h" -//TODO: remove this custom allocator since cl2.hpp is available here? 
-#if 0 #ifdef XILINX_FPGA template struct aligned_allocator { @@ -59,7 +57,6 @@ namespace cl { template using vector = std::vector>; } #endif -#endif /** * @brief Contains all classes and methods needed by the Network benchmark From 64ca17851299b7a0d782a13a8d8f0b7773bcec3e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 4 May 2022 11:45:37 +0100 Subject: [PATCH 042/318] Reformatting and fix data read back --- .../execution_types/execution_xrt_pcie_pq.hpp | 130 ++++++++++-------- 1 file changed, 72 insertions(+), 58 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index 85481b6f..d59ba2e0 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -37,32 +37,37 @@ namespace fpga_execution { namespace pcie_pq { /** - * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and - * PCIe+MPI over the host for communication + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ + * distribution and PCIe+MPI over the host for communication * * @param config The progrma configuration - * @param data data object that contains all required data for the execution on the FPGA - * @param handler data handler instance that should be used to exchange data between hosts - * @return std::unique_ptr The measured execution times + * @param data data object that contains all required data for the execution on + * the FPGA + * @param handler data handler instance that should be used to exchange data + * between hosts + * @return std::unique_ptr The measured + * execution times */ -static std::unique_ptr -calculate(const hpcc_base::ExecutionSettings &config, +static std::unique_ptr calculate( + const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, - transpose::data_handler::DistributedPQTransposeDataHandler - &handler) { + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, bool, xrt::uuid> &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { - throw std::runtime_error("Used data handler not supported by execution handler!"); + throw std::runtime_error( + "Used data handler not supported by execution handler!"); } #ifdef USE_SVM - throw new std::runtime_error( - "SVM not supported in the host implementation of this communication method"); + throw new std::runtime_error("SVM not supported in the host implementation " + "of this communication method"); #endif #ifdef USE_BUFFER_WRITE_RECT_FOR_A - throw new std::runtime_error("Using the Write Rect method is not supported in this host " + throw new std::runtime_error( + "Using the Write Rect method is not supported in this host " "implementation of this communication method"); #endif @@ -77,56 +82,59 @@ calculate(const hpcc_base::ExecutionSettingskernelReplications; r++) { - // Calculate how many blocks the current kernel replication will need to process. + // Calculate how many blocks the current kernel replication will need to + // process. 
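+    // Worked example of the distribution below (illustration, not from the
+    // original source): with 7 blocks and 2 kernel replications the base
+    // share is 7 / 2 = 3 with remainder 7 % 2 = 1, so replication r = 0
+    // processes 3 + 1 = 4 blocks and replication r = 1 processes 3 blocks.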
size_t blocks_per_replication = - (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications); - size_t blocks_remainder = - (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications; + (local_matrix_height * local_matrix_width / + config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % + config.programSettings->kernelReplications; if (blocks_remainder > r) { - // Catch the case, that the number of blocks is not divisible by the number of kernel - // replications + // Catch the case, that the number of blocks is not divisible by the + // number of kernel replications blocks_per_replication += 1; } if (blocks_per_replication < 1) { continue; } blocksPerReplication.push_back(blocks_per_replication); - size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width * - local_matrix_width * data.blockSize * data.blockSize; + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / + local_matrix_width * local_matrix_width * + data.blockSize * data.blockSize; bufferSizeList.push_back(buffer_size); bufferStartList.push_back(total_offset); bufferOffsetList.push_back(row_offset); row_offset = (row_offset + blocks_per_replication) % local_matrix_width; - total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width * - local_matrix_width; - - int memory_bank_info_a = 0; - int memory_bank_info_b = 0; - int memory_bank_info_out = 0; + total_offset += (bufferOffsetList.back() + blocks_per_replication) / + local_matrix_width * local_matrix_width; // create the kernels xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); xrt::bo bufferA(*config.device, data.A, - data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), + data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); - xrt::bo bufferB(*config.device,&data.B[bufferStartList[r] * data.blockSize * data.blockSize], + xrt::bo bufferB( + *config.device, + &data.B[bufferStartList[r] * data.blockSize * data.blockSize], buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); - // TODO For small matrices, the 4KB alignment might fail for buffer B. Temporary fix seen in - // lines below (requires extra copying) - // xrt::bo bufferB(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), - // transposeKernel.group_id(1)); bufferB.write(data.B + bufferStartList[r] * data.blockSize * - // data.blockSize); + // TODO For small matrices, the 4KB alignment might fail for buffer B. 
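+    // Background (added note; an assumption based on general XRT behavior):
+    // wrapping an existing host pointer in an xrt::bo requires 4096-byte
+    // alignment, and the offset &data.B[bufferStartList[r] * data.blockSize *
+    // data.blockSize] only stays aligned when the preceding replication
+    // buffers are multiples of 4 KiB, which small matrices do not guarantee.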
+ // Temporary fix seen in lines below (requires extra copying) xrt::bo + // bufferB(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + // transposeKernel.group_id(1)); bufferB.write(data.B + bufferStartList[r] * + // data.blockSize * data.blockSize); xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); @@ -150,7 +158,8 @@ calculate(const hpcc_base::ExecutionSettings transferTime = - std::chrono::duration_cast>(endTransfer - startTransfer); + std::chrono::duration_cast>( + endTransfer - startTransfer); MPI_Barrier(MPI_COMM_WORLD); @@ -163,7 +172,8 @@ calculate(const hpcc_base::ExecutionSettings>(endCalculation - - startKernelCalculation) + << std::chrono::duration_cast>( + endCalculation - startKernelCalculation) .count() << "s (" - << ((config.programSettings->matrixSize * config.programSettings->matrixSize * - sizeof(HOST_DATA_TYPE) * 3) / - std::chrono::duration_cast>(endCalculation - - startKernelCalculation) + << ((config.programSettings->matrixSize * + config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * + 3) / + std::chrono::duration_cast>( + endCalculation - startKernelCalculation) .count() * 1.0e-9) << " GB/s)" << std::endl; @@ -205,12 +216,13 @@ calculate(const hpcc_base::ExecutionSettings calculationTime = - std::chrono::duration_cast>(endCalculation - - startCalculation); + std::chrono::duration_cast>( + endCalculation - startCalculation); calculationTimings.push_back(calculationTime.count()); - std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * - data.blockSize * data.blockSize); + std::vector tmp_write_buffer( + local_matrix_height * local_matrix_width * data.blockSize * + data.blockSize); startTransfer = std::chrono::high_resolution_clock::now(); @@ -224,29 +236,31 @@ calculate(const hpcc_base::ExecutionSettings>(endTransfer - startTransfer); + transferTime += std::chrono::duration_cast>( + endTransfer - startTransfer); transferTimings.push_back(transferTime.count()); } std::unique_ptr result( - new transpose::TransposeExecutionTimings{transferTimings, calculationTimings}); + new transpose::TransposeExecutionTimings{transferTimings, + calculationTimings}); return result; } From e07261ee2eca88970fe9f7bb52fd9778358a6875 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 5 May 2022 10:12:31 +0100 Subject: [PATCH 043/318] Fix offset calculation of MPI transpose --- PTRANS/src/host/data_handlers/pq.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp index afa11575..87e7d15f 100644 --- a/PTRANS/src/host/data_handlers/pq.hpp +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -223,7 +223,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler(next_chunk) * static_cast(data.blockSize * data.blockSize); + offset += static_cast(next_chunk); } // Exchange window pointers From cc928b203863b2973b9ea94e1667fdcd20ea427d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 5 May 2022 10:12:54 +0100 Subject: [PATCH 044/318] Add number of wrong entries to output --- PTRANS/src/host/transpose_benchmark.hpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index d1ab4340..392789c8 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -167,16 +167,23 @@ public hpcc_base::HpccFpgaBenchmarkdataHandler->reference_transpose(data); double max_error = 
0.0;
+        int error_count = 0;
         for (size_t i = 0; i < this->executionSettings->programSettings->blockSize *
                                    this->executionSettings->programSettings->blockSize *
                                    data.numBlocks;
              i++) {
             max_error = std::max(std::abs(data.A[i]), max_error);
+            if (std::abs(data.A[i]) - 100 * std::numeric_limits<HOST_DATA_TYPE>::epsilon() > 0.0) {
+                error_count++;
+            }
         }

         double global_max_error = 0;
+        int global_error_count = 0;
         MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&error_count, &global_error_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

         if (this->mpi_comm_rank == 0) {
-            std::cout << "Maximum error: " << global_max_error << " < " << 100 * std::numeric_limits<HOST_DATA_TYPE>::epsilon() << std::endl;
-            std::cout << "Mach. Epsilon: " << std::numeric_limits<HOST_DATA_TYPE>::epsilon() << std::endl;
+            std::cout << "Erroneous entries: " << global_error_count << std::endl;
+            std::cout << "Maximum error: " << global_max_error << " < " << 100 * std::numeric_limits<HOST_DATA_TYPE>::epsilon() << std::endl;
+            std::cout << "Mach. Epsilon: " << std::numeric_limits<HOST_DATA_TYPE>::epsilon() << std::endl;
         }

         return static_cast<double>(global_max_error) < 100 * std::numeric_limits<HOST_DATA_TYPE>::epsilon();

From 97414a49f20a6e1d600bda35c01da7e4f7791ea1 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 5 May 2022 16:28:01 +0100
Subject: [PATCH 045/318] Add support for C++ kernel code

---
 cmake/kernelTargets.cmake | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake
index 22680a6c..84ce896f 100644
--- a/cmake/kernelTargets.cmake
+++ b/cmake/kernelTargets.cmake
@@ -13,6 +13,8 @@ if (USE_ACCL)
 include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake)
 endif()
 
+set(file_endings "cpp" "cl")
+
 ##
 # This function will create build targets for the kernels for emulation and synthesis for xilinx.
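 # Usage sketch (added for illustration; the call below appears in the
 # benchmark CMake files): generate_kernel_targets_xilinx(communication_ACCL)
 # picks up communication_ACCL.cpp when it exists and falls back to
 # communication_ACCL.cl, following the file_endings list defined above.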
## @@ -29,11 +31,19 @@ function(generate_kernel_targets_xilinx) if (is_accl_kernel AND NOT USE_ACCL) continue() endif() - set(base_file "${CMAKE_SOURCE_DIR}/${base_file_part}.cl") + set(file_exists No) + foreach (ending ${file_endings}) + set(search_file_name "${CMAKE_SOURCE_DIR}/${base_file_part}.${ending}") + if (NOT file_exists AND EXISTS ${search_file_name}) + set(file_exists Yes) + set(selected_file_ending ${ending}) + set(base_file "${search_file_name}") + endif() + endforeach() if (KERNEL_REPLICATION_ENABLED) - set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_replicated_xilinx.cl") + set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_replicated_xilinx.${selected_file_ending}") else() - set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_copied_xilinx.cl") + set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_copied_xilinx.${selected_file_ending}") endif() set(bitstream_compile xilinx_tmp_compile/${kernel_file_name}.xo) set(bitstream_compile_emulate xilinx_tmp_compile/${kernel_file_name}_emulate.xo) @@ -55,7 +65,7 @@ function(generate_kernel_targets_xilinx) set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA) list(APPEND local_CLFLAGS --report_dir=${xilinx_report_folder} --log_dir=${xilinx_report_folder}/logs) if (is_accl_kernel) - list(APPEND local_CLFLAGS ${ACCL_LINK_CONFIG}) + list(APPEND local_harware_only_flags ${ACCL_LINK_CONFIG}) endif() string(REGEX MATCH "^.+\.tcl" is_tcl_script ${XILINX_COMPILE_SETTINGS_FILE}) if (is_tcl_script) @@ -108,7 +118,7 @@ function(generate_kernel_targets_xilinx) DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} ) add_custom_command(OUTPUT ${bitstream_f} - COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_f} ${bitstream_compile} ${additional_xos} + COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} ${local_harware_only_flags} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_f} ${bitstream_compile} ${additional_xos} MAIN_DEPENDENCY ${bitstream_compile} DEPENDS ${xilinx_link_settings} ) From fa9a2051f1588bc18c6a040186a1789daaab60ab Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 5 May 2022 16:29:03 +0100 Subject: [PATCH 046/318] Add C++ baseline for PTRANS --- .../src/device/transpose_PQ_ACCL_buffers.cl | 1 - .../src/device/transpose_PQ_ACCL_buffers.cpp | 1 + PTRANS/src/device/transpose_PQ_PCIE.cpp | 158 ++++++++++++++++++ 3 files changed, 159 insertions(+), 1 deletion(-) delete mode 120000 PTRANS/src/device/transpose_PQ_ACCL_buffers.cl create mode 120000 PTRANS/src/device/transpose_PQ_ACCL_buffers.cpp create mode 100644 PTRANS/src/device/transpose_PQ_PCIE.cpp diff --git a/PTRANS/src/device/transpose_PQ_ACCL_buffers.cl b/PTRANS/src/device/transpose_PQ_ACCL_buffers.cl deleted file mode 120000 index 64e94f20..00000000 --- a/PTRANS/src/device/transpose_PQ_ACCL_buffers.cl +++ /dev/null @@ -1 +0,0 @@ -transpose_PQ_PCIE.cl \ No newline at end of file diff --git a/PTRANS/src/device/transpose_PQ_ACCL_buffers.cpp b/PTRANS/src/device/transpose_PQ_ACCL_buffers.cpp new file mode 120000 index 00000000..58aeb801 --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_ACCL_buffers.cpp @@ -0,0 +1 @@ +transpose_PQ_PCIE.cpp \ No newline at end of file diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cpp b/PTRANS/src/device/transpose_PQ_PCIE.cpp new file mode 100644 index 
00000000..456c6919
--- /dev/null
+++ b/PTRANS/src/device/transpose_PQ_PCIE.cpp
@@ -0,0 +1,158 @@
+/******************************************************************************
+ *  Author: Arjun Ramaswami
+ *
+ *  Edited by Marius Meyer:
+ *  - Adapt to used kernel signature
+ *  - Change to row-column loop structure
+ *****************************************************************************/
+#include "parameters.h"
+
+const unsigned int block_size = BLOCK_SIZE;
+const unsigned int channel_width = CHANNEL_WIDTH;
+
+
+
+extern "C" {
+
+// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
+
+/**
+ * Read blocks of matrix A, transpose them in local memory, add the matching
+ * blocks of matrix B and write the result back to global memory.
+ *
+ * Will do the following:
+ *
+ * A_out = trans(A) + B
+ *
+ * @param A Buffer for matrix A
+ * @param B Buffer for matrix B
+ * @param A_out Buffer for result matrix
+ * @param offset_a Offset in blocks that is used to read the current block of A. Since A is read column-wise
+            on the block level, the whole matrix A might be written to global memory and the relevant columns
+            need to be picked using this offset.
+ * @param offset_b Offset in blocks that is used to read the current block of B and to write back the result
+ * @param number_of_blocks The number of blocks that will be processed starting from the block offset
+ * @param width_in_blocks The width of matrix A in blocks
+ * @param height_in_blocks The height of matrix A in blocks
+ */
+void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
+            const DEVICE_DATA_TYPE *B,
+            DEVICE_DATA_TYPE *A_out,
+            const unsigned int offset_a,
+            const unsigned int offset_b,
+            const unsigned int number_of_blocks,
+            const unsigned int width_in_blocks,
+            const unsigned int height_in_blocks) {
+
+    // local memory double buffer for a matrix block
+    DEVICE_DATA_TYPE a_block[block_size * block_size / channel_width][channel_width];
+#pragma HLS ARRAY_PARTITION variable = a_block complete dim = 2
+    // local memory double buffer for a matrix block
+    DEVICE_DATA_TYPE a_plus_b_block[block_size * block_size / channel_width][channel_width];
+#pragma HLS ARRAY_PARTITION variable = a_plus_b_block complete dim = 2
+
+    // transpose the matrix block-wise from global memory
+block_loop:
+    for (unsigned int block = 0; block < number_of_blocks; block++) {
+read_A:
+        for (unsigned int row = 0; row < block_size; row++) {
+read_A_line:
+            for (unsigned int col = 0; col < block_size / channel_width; col++) {
+                #pragma HLS unroll region
+                unsigned long block_row_a = (block + offset_a) / width_in_blocks;
+                unsigned long block_col_a = (block + offset_a) % width_in_blocks;
+                unsigned long ls_address_trans = block_col_a * block_size * block_size * height_in_blocks +
+                                                 block_row_a * block_size +
+                                                 row * block_size * height_in_blocks;
+
+                // read in block of A from global memory and store it in a memory efficient manner for transpose
+                DEVICE_DATA_TYPE rotate_in[channel_width];
+#pragma HLS ARRAY_PARTITION variable = rotate_in complete dim = 0
+
+                // Blocks of A will be stored columnwise in global memory
+                for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                    rotate_in[unroll_count] = A[ls_address_trans + col * channel_width + unroll_count];
+                }
+
+                unsigned int chunk = row * (block_size / channel_width) + col;
+
+                unsigned rot = (row) & (channel_width - 1);
+
+                // rotate temporary buffer to store data into local buffer
+                for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                    // every block of (N / channel_width), rotates the index by 1
+                    // store in double buffer
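+                    // Added explanation (our reading of the indexing math,
+                    // not an author comment): rotating each row by `rot`
+                    // stores the block diagonally across the channel_width
+                    // memory partitions, so the column-wise read in read_B
+                    // can fetch channel_width elements per cycle without
+                    // bank conflicts.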
a_block[chunk][unroll_count] = rotate_in[(unroll_count + channel_width - rot) + & (channel_width - 1)]; + } + } + } + + // Read transposed A from local memory and add B +read_B: + for (unsigned int row = 0; row < block_size; row++) { +read_B_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { +#pragma HLS unroll region + unsigned long block_row = (block + offset_b) / width_in_blocks; + unsigned long block_col = (block + offset_b) % width_in_blocks; + unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks + + block_col * block_size + + row * block_size * width_in_blocks; + unsigned int chunk = row * (block_size / channel_width) + col; + + DEVICE_DATA_TYPE data_chunk[channel_width]; +#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 + DEVICE_DATA_TYPE rotate_out[channel_width]; +#pragma HLS ARRAY_PARTITION variable = rotate_out complete dim = 0 + + unsigned int base = col * block_size; + unsigned int offset = row / channel_width; + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + unsigned rot = ((channel_width + unroll_count - row) * (block_size / channel_width)) & + (BLOCK_SIZE - 1); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = a_block[row_rotate][unroll_count]; + } + + unsigned rot_out = row & (channel_width - 1); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) & (channel_width - 1)]; + } + + // load tranposed A from global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] += B[ls_address_row + col * channel_width + unroll_count]; + } + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + a_plus_b_block[chunk][unroll_count] = data_chunk[unroll_count]; + } + } + } + // Write back result +write_result: + for (unsigned int row = 0; row < block_size; row++) { +write_result_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { +#pragma HLS unroll region + unsigned long block_row = (block + offset_b) / width_in_blocks; + unsigned long block_col = (block + offset_b) % width_in_blocks; + unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks + + block_col * block_size + + row * block_size * width_in_blocks; + unsigned int chunk = row * (block_size / channel_width) + col; + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + A_out[ls_address_row + col * channel_width + unroll_count] = a_plus_b_block[chunk][unroll_count]; + } + } + } + } +} + +// PY_CODE_GEN block_end + +} From ffc5648cdf17964548bed3db2c24c71f6881be14 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 5 May 2022 16:29:39 +0100 Subject: [PATCH 047/318] Add copying for other kernel replications --- .../execution_types/execution_xrt_accl_pq.hpp | 353 +++++++++++------- 1 file changed, 212 insertions(+), 141 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp index dab92c96..8400fb76 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -42,9 +42,10 @@ namespace accl_pq { void accl_exchangeData( ACCL::ACCL &accl, - transpose::data_handler::DistributedPQTransposeDataHandler - &handler, - 
transpose::TransposeData &data, xrt::bo &bufferAXrt, int global_width) { + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, bool, xrt::uuid> &handler, + transpose::TransposeData &data, std::vector &bufferAXrt, + int global_width) { int pq_width = handler.getP(); int pq_height = handler.getQ(); @@ -56,16 +57,21 @@ void accl_exchangeData( int pq_row = mpi_comm_rank / pq_width; int pq_col = mpi_comm_rank % pq_width; - auto AcclBufferA = accl.create_buffer( - bufferAXrt, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); + std::vector> acclBuffersA; + for (auto &bo : bufferAXrt) { + acclBuffersA.push_back(accl.create_buffer( + bo, data.blockSize * data.blockSize * data.numBlocks, + ACCL::dataType::float32)); + } + if (pq_width == pq_height) { if (pq_col != pq_row) { int pair_rank = pq_width * pq_col + pq_row; - // To re-calculate the matrix transposition locally on this host, we need to - // exchange matrix A for every kernel replication - // The order of the matrix blocks does not change during the exchange, because they are + // To re-calculate the matrix transposition locally on this host, we need + // to exchange matrix A for every kernel replication The order of the + // matrix blocks does not change during the exchange, because they are // distributed diagonally and will be handled in the order below: // // . . 1 3 @@ -73,34 +79,35 @@ void accl_exchangeData( // 1 . . . // 3 2 . . // auto AcclBufferA_recv = accl.create_buffer( - // data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); + // data.blockSize * data.blockSize * data.numBlocks, + // ACCL::dataType::float32); // AcclBufferA_recv->sync_to_device(); // Send and receive matrix A using ACCL directly on FPGA - accl.send(0, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, + accl.send(0, *acclBuffersA[0], + data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, true, ACCL::streamFlags::NO_STREAM); - accl.recv(0, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, + accl.recv(0, *acclBuffersA[0], + data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, true, ACCL::streamFlags::NO_STREAM); - // Copy received matrix from receiving buffer to A buffer completely on FPGA - // accl.copy(*AcclBufferA_recv, *AcclBufferA, data.blockSize * data.blockSize * - // data.numBlocks, - // true, true); } } else { - // Taken from "Parallel matrix transpose algorithms on distributed memory concurrent computers" - // by J. Choi, J. J. Dongarra, D. W. Walker and translated to C++ This will do a diagonal - // exchange of matrix blocks. + // Taken from "Parallel matrix transpose algorithms on distributed memory + // concurrent computers" by J. Choi, J. J. Dongarra, D. W. Walker and + // translated to C++ This will do a diagonal exchange of matrix blocks. // Determine LCM using GCD from standard library using the C++14 call - // In C++17 this changes to std::gcd in numeric, also std::lcm is directly available in numeric + // In C++17 this changes to std::gcd in numeric, also std::lcm is directly + // available in numeric int gcd = std::__gcd(pq_height, pq_width); int least_common_multiple = pq_height * pq_width / gcd; - // If the global matrix size is not a multiple of the LCM block size, the numbers of send and - // received blocks may be wrongly calculated. Throw exception to prevent this and make aware of - // this issue! 
+  // If the global matrix size is not a multiple of the LCM block size, the
+  // numbers of send and received blocks may be wrongly calculated. Throw
+  // exception to prevent this and make aware of this issue!
   if (global_width % least_common_multiple > 0) {
-    throw std::runtime_error("Implementation does not support matrix sizes that are not multiple "
-                             "of LCM blocks! Results may be wrong!");
+    throw std::runtime_error(
+        "Implementation does not support matrix sizes that are not multiple "
+        "of LCM blocks! Results may be wrong!");
   }

   // MPI requests for non-blocking communication
@@ -113,18 +120,19 @@ void accl_exchangeData(
   int q = transpose::data_handler::mod(pq_row - g, pq_height);

   // Pre-calculate target ranks in LCM block
-  // The vector list variable can be interpreted as 2D matrix. Every entry represents the target
-  // rank of the sub-block Since the LCM block will repeat, we only need to store this small
-  // amount of data!
-  std::vector<int> target_list(least_common_multiple / pq_height * least_common_multiple /
-                               pq_width);
+  // The vector list variable can be interpreted as 2D matrix. Every entry
+  // represents the target rank of the sub-block. Since the LCM block will
+  // repeat, we only need to store this small amount of data!
+  std::vector<int> target_list(least_common_multiple / pq_height *
+                               least_common_multiple / pq_width);
   for (int row = 0; row < least_common_multiple / pq_height; row++) {
     for (int col = 0; col < least_common_multiple / pq_width; col++) {
       int global_block_col = pq_col + col * pq_width;
       int global_block_row = pq_row + row * pq_height;
-      int destination_rank =
-          (global_block_col % pq_height) * pq_width + (global_block_row % pq_width);
-      target_list[row * least_common_multiple / pq_width + col] = destination_rank;
+      int destination_rank = (global_block_col % pq_height) * pq_width +
+                             (global_block_row % pq_width);
+      target_list[row * least_common_multiple / pq_width + col] =
+          destination_rank;
     }
   }

@@ -135,23 +143,28 @@ void accl_exchangeData(
   for (int i = 0; i < gcd; i++) {
     // TODO Is there a way to initialize buffer only in FPGA memory with ACCL?
     send_buffers.push_back(accl.create_buffer<float>(
-        data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32));
+        data.blockSize * data.blockSize * data.numBlocks,
+        ACCL::dataType::float32));
     recv_buffers.push_back(accl.create_buffer<float>(
-        data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32));
+        data.blockSize * data.blockSize * data.numBlocks,
+        ACCL::dataType::float32));
     send_buffers.back()->sync_to_device();
     recv_buffers.back()->sync_to_device();
   }
   int current_parallel_execution = 0;
   for (int j = 0; j < least_common_multiple / pq_width; j++) {
     for (int i = 0; i < least_common_multiple / pq_height; i++) {
-      // Determine sender and receiver rank of current rank for current communication step
-      int send_rank = transpose::data_handler::mod(p + i * gcd, pq_width) +
-                      transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width;
-      int recv_rank = transpose::data_handler::mod(p - i * gcd, pq_width) +
-                      transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width;
-
-      // Also count receiving buffer size because sending and receiving buffer size may differ in
-      // certain scenarios!
+      // Determine sender and receiver rank of current rank for current
+      // communication step
+      int send_rank =
+          transpose::data_handler::mod(p + i * gcd, pq_width) +
+          transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width;
+      int recv_rank =
+          transpose::data_handler::mod(p - i * gcd, pq_width) +
+          transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width;
+
+      // Also count receiving buffer size because sending and receiving buffer
+      // size may differ in certain scenarios!
       int receiving_size = 0;
       int sending_size = 0;

@@ -160,53 +173,69 @@ void accl_exchangeData(
       // Look up which blocks are affected by the current rank
       for (int row = 0; row < least_common_multiple / pq_height; row++) {
         for (int col = 0; col < least_common_multiple / pq_width; col++) {
-          if (target_list[row * least_common_multiple / pq_width + col] == send_rank) {
+          if (target_list[row * least_common_multiple / pq_width + col] ==
+              send_rank) {
             send_rows.push_back(row);
             send_cols.push_back(col);
             sending_size += data.blockSize * data.blockSize;
           }
-          if (target_list[row * least_common_multiple / pq_width + col] == recv_rank) {
+          if (target_list[row * least_common_multiple / pq_width + col] ==
+              recv_rank) {
             receiving_size += data.blockSize * data.blockSize;
           }
         }
       }
-      receiving_size *= (height_per_rank) / (least_common_multiple / pq_height) *
-                        ((width_per_rank) / (least_common_multiple / pq_width));
-      sending_size *= (height_per_rank) / (least_common_multiple / pq_height) *
+      receiving_size *=
+          (height_per_rank) / (least_common_multiple / pq_height) *
+          ((width_per_rank) / (least_common_multiple / pq_width));
+      sending_size *= (height_per_rank) /
+                      (least_common_multiple / pq_height) *
                       ((width_per_rank) / (least_common_multiple / pq_width));

 #ifndef NDEBUG
       std::cout << "Copy data to send buffers" << std::endl;
 #endif

-      // Copy the required date for this communication step to the send buffer!
+      // Copy the required data for this communication step to the send
+      // buffer!
       for (int t = 0; t < send_rows.size(); t++) {
-        for (int lcm_row = 0; lcm_row < (height_per_rank) / (least_common_multiple / pq_height);
+        for (int lcm_row = 0;
+             lcm_row <
+             (height_per_rank) / (least_common_multiple / pq_height);
              lcm_row++) {
-          for (int lcm_col = 0; lcm_col < (width_per_rank) / (least_common_multiple / pq_width);
+          for (int lcm_col = 0;
+               lcm_col <
+               (width_per_rank) / (least_common_multiple / pq_width);
                lcm_col++) {
             size_t sending_buffer_offset =
                 lcm_row * data.blockSize * data.blockSize *
                     ((width_per_rank) / (least_common_multiple / pq_width)) +
                 lcm_col * data.blockSize * data.blockSize;
             size_t matrix_buffer_offset =
-                (send_cols[t] + lcm_col * least_common_multiple / pq_width) * data.blockSize +
-                (send_rows[t] + lcm_row * least_common_multiple / pq_height) * width_per_rank *
-                    data.blockSize * data.blockSize;
+                (send_cols[t] + lcm_col * least_common_multiple / pq_width) *
+                    data.blockSize +
+                (send_rows[t] + lcm_row * least_common_multiple / pq_height) *
+                    width_per_rank * data.blockSize * data.blockSize;
             for (int block_row = 0; block_row < data.blockSize; block_row++) {
               // TODO May be more efficient when done async!
               std::cout << "A("
-                        << matrix_buffer_offset + block_row * width_per_rank * data.blockSize
+                        << matrix_buffer_offset +
+                               block_row * width_per_rank * data.blockSize
                         << ","
-                        << matrix_buffer_offset + block_row * width_per_rank * data.blockSize +
+                        << matrix_buffer_offset +
+                               block_row * width_per_rank * data.blockSize +
                                data.blockSize
                         << ") send(" << sending_buffer_offset << ","
-                        << sending_buffer_offset + data.blockSize << ")" << std::endl;
-              accl.copy(*AcclBufferA->slice(
-                            matrix_buffer_offset + block_row * width_per_rank * data.blockSize,
-                            matrix_buffer_offset + block_row * width_per_rank * data.blockSize +
+                        << sending_buffer_offset + data.blockSize << ")"
+                        << std::endl;
+              accl.copy(*acclBuffersA[0]->slice(
+                            matrix_buffer_offset +
+                                block_row * width_per_rank * data.blockSize,
+                            matrix_buffer_offset +
+                                block_row * width_per_rank * data.blockSize +
                                 data.blockSize),
                         *send_buffers[current_parallel_execution]->slice(
-                            sending_buffer_offset, sending_buffer_offset + data.blockSize),
+                            sending_buffer_offset,
+                            sending_buffer_offset + data.blockSize),
                         data.blockSize, true, true);
               std::cout << "Copy done!" << std::endl;
             }

@@ -218,16 +247,17 @@ void accl_exchangeData(
 #ifndef NDEBUG
       std::cout << "Rank " << mpi_comm_rank << ": blocks ("
                 << sending_size / (data.blockSize * data.blockSize) << ","
-                << receiving_size / (data.blockSize * data.blockSize) << ") send " << send_rank
-                << ", recv " << recv_rank << std::endl
+                << receiving_size / (data.blockSize * data.blockSize)
+                << ") send " << send_rank << ", recv " << recv_rank
+                << std::endl
                 << std::flush;
 #endif
-      accl_requests[current_parallel_execution] =
-          (accl.send(0, *send_buffers[current_parallel_execution], sending_size, send_rank, 0,
-                     true, ACCL::streamFlags::NO_STREAM, true));
-      accl_requests[current_parallel_execution + gcd] =
-          (accl.recv(0, *recv_buffers[current_parallel_execution], sending_size, send_rank, 0,
-                     true, ACCL::streamFlags::NO_STREAM, true));
+      accl_requests[current_parallel_execution] = (accl.send(
+          0, *send_buffers[current_parallel_execution], sending_size,
+          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, true));
+      accl_requests[current_parallel_execution + gcd] = (accl.recv(
+          0, *recv_buffers[current_parallel_execution], sending_size,
+          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, true));

       // Increase the counter for parallel executions
       current_parallel_execution = (current_parallel_execution + 1) % gcd;

@@ -249,10 +279,12 @@ void accl_exchangeData(
           std::vector<int> recv_rows;
           std::vector<int> recv_cols;
           // Look up which blocks are affected by the current rank
-          for (int row = 0; row < least_common_multiple / pq_height; row++) {
-            for (int col = 0; col < least_common_multiple / pq_width; col++) {
-              if (target_list[row * least_common_multiple / pq_width + col] ==
-                  status.MPI_SOURCE) {
+          for (int row = 0; row < least_common_multiple / pq_height;
+               row++) {
+            for (int col = 0; col < least_common_multiple / pq_width;
+                 col++) {
+              if (target_list[row * least_common_multiple / pq_width +
+                              col] == status.MPI_SOURCE) {
                 recv_rows.push_back(row);
                 recv_cols.push_back(col);
               }

@@ -261,26 +293,37 @@ void accl_exchangeData(
           // Copy received data to matrix A buffer
           for (int t = 0; t < recv_rows.size(); t++) {
             for (int lcm_row = 0;
-                 lcm_row < (height_per_rank) / (least_common_multiple / pq_height); lcm_row++) {
+                 lcm_row <
+                 (height_per_rank) / (least_common_multiple / pq_height);
+                 lcm_row++) {
               for (int lcm_col = 0;
-                   lcm_col < (width_per_rank) / (least_common_multiple / pq_width); lcm_col++) {
+                   lcm_col <
+                   (width_per_rank) / (least_common_multiple / pq_width);
+                   lcm_col++) {
                 size_t receiving_buffer_offset =
                     lcm_row * data.blockSize * data.blockSize *
-                        ((width_per_rank) / (least_common_multiple / pq_width)) +
+                        ((width_per_rank) /
+                         (least_common_multiple / pq_width)) +
                     lcm_col * data.blockSize * data.blockSize;
                 size_t matrix_buffer_offset =
-                    (recv_cols[t] + lcm_col * least_common_multiple / pq_width) *
+                    (recv_cols[t] +
+                     lcm_col * least_common_multiple / pq_width) *
                         data.blockSize +
-                    (recv_rows[t] + lcm_row * least_common_multiple / pq_height) *
+                    (recv_rows[t] +
+                     lcm_row * least_common_multiple / pq_height) *
                         width_per_rank * data.blockSize * data.blockSize;
-                for (int block_row = 0; block_row < data.blockSize; block_row++) {
+                for (int block_row = 0; block_row < data.blockSize;
+                     block_row++) {
                   // TODO May be more efficient when done async!
                   accl.copy(
                       *recv_buffers[current_parallel_execution]->slice(
-                          receiving_buffer_offset, receiving_buffer_offset + data.blockSize),
-                      *AcclBufferA->slice(
-                          matrix_buffer_offset + block_row * width_per_rank * data.blockSize,
-                          matrix_buffer_offset + block_row * width_per_rank * data.blockSize +
+                          receiving_buffer_offset,
+                          receiving_buffer_offset + data.blockSize),
+                      *acclBuffersA[0]->slice(
+                          matrix_buffer_offset +
+                              block_row * width_per_rank * data.blockSize,
+                          matrix_buffer_offset +
+                              block_row * width_per_rank * data.blockSize +
                               data.blockSize),
                       data.blockSize, true, true);
                 }

@@ -293,36 +336,47 @@ void accl_exchangeData(
         }
       }
     }
+  // Copy received matrix A to the buffers of other kernel replications that
+  // may be placed on different memory banks
+  for (int b = 1; b < acclBuffersA.size(); b++) {
+    accl.copy(*acclBuffersA[0], *acclBuffersA[b],
+              data.blockSize * data.blockSize * data.numBlocks, true, true);
+  }
 }

 /**
- * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and
- *        PCIe+MPI over the host for communication
+ * @brief Transpose and add the matrices using the OpenCL kernel using a PQ
+ * distribution and PCIe+MPI over the host for communication
  *
  * @param config The program configuration
- * @param data data object that contains all required data for the execution on the FPGA
- * @param handler data handler instance that should be used to exchange data between hosts
- * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
+ * @param data data object that contains all required data for the execution on
+ * the FPGA
+ * @param handler data handler instance that should be used to exchange data
+ * between hosts
+ * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured
+ * execution times
  */
-static std::unique_ptr<transpose::TransposeExecutionTimings>
-calculate(const hpcc_base::ExecutionSettings &config,
-          transpose::TransposeData &data,
-          transpose::data_handler::DistributedPQTransposeDataHandler
-              &handler) {
+static std::unique_ptr<transpose::TransposeExecutionTimings> calculate(
+    const hpcc_base::ExecutionSettings &config,
+    transpose::TransposeData &data,
+    transpose::data_handler::DistributedPQTransposeDataHandler<
+        xrt::device, bool, xrt::uuid> &handler) {
   int err;

   if (config.programSettings->dataHandlerIdentifier !=
       transpose::data_handler::DataHandlerType::pq) {
-    throw std::runtime_error("Used data handler not supported by execution handler!");
+    throw std::runtime_error(
+        "Used data handler not supported by execution handler!");
   }
 #ifdef USE_SVM
-  throw new std::runtime_error(
-      "SVM not supported in the host implementation of this communication method");
+  throw new std::runtime_error("SVM not supported in the host implementation "
+                               "of this communication method");
 #endif
 #ifdef USE_BUFFER_WRITE_RECT_FOR_A
-  throw new std::runtime_error("Using the Write Rect method is not supported in this host "
-                               "implementation of this communication method");
+  throw new std::runtime_error(
+      "Using the Write Rect method is not supported in this host "
+      "implementation of this communication method");
 #endif

   std::vector<size_t> bufferSizeList;

@@ -336,7 +390,8 @@ calculate(const hpcc_base::ExecutionSettings
   for (int r = 0; r < config.programSettings->kernelReplications; r++) {
-    // Calculate how many blocks the current kernel replication will need to process.
+    // Calculate how many blocks the current kernel replication will need to
+    // process.
     size_t blocks_per_replication =
-        (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications);
-    size_t blocks_remainder =
-        (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications;
+        (local_matrix_height * local_matrix_width /
+         config.programSettings->kernelReplications);
+    size_t blocks_remainder = (local_matrix_height * local_matrix_width) %
+                              config.programSettings->kernelReplications;
     if (blocks_remainder > r) {
-      // Catch the case, that the number of blocks is not divisible by the number of kernel
-      // replications
+      // Catch the case, that the number of blocks is not divisible by the
+      // number of kernel replications
       blocks_per_replication += 1;
     }
     if (blocks_per_replication < 1) {
       continue;
     }
     blocksPerReplication.push_back(blocks_per_replication);
-    size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width *
-                         local_matrix_width * data.blockSize * data.blockSize;
+    size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) /
+                         local_matrix_width * local_matrix_width *
+                         data.blockSize * data.blockSize;
     bufferSizeList.push_back(buffer_size);
     bufferStartList.push_back(total_offset);
     bufferOffsetList.push_back(row_offset);

     row_offset = (row_offset + blocks_per_replication) % local_matrix_width;

-    total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width *
-                    local_matrix_width;
+    total_offset += (bufferOffsetList.back() + blocks_per_replication) /
+                    local_matrix_width * local_matrix_width;

     // create the kernels
-    xrt::kernel transposeKernel(*config.device, *config.program,
-                                ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str());
+    xrt::kernel transposeKernel(
+        *config.device, *config.program,
+        ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str());

     xrt::bo bufferA(*config.device, data.A,
-                    data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE),
+                    data.numBlocks * data.blockSize * data.blockSize *
+                        sizeof(HOST_DATA_TYPE),
                     transposeKernel.group_id(0));
-    xrt::bo bufferB(*config.device, &data.B[bufferStartList[r] * data.blockSize * data.blockSize],
-                    buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1));
+    xrt::bo bufferB(
+        *config.device,
+        &data.B[bufferStartList[r] * data.blockSize * data.blockSize],
+        buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1));
     xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE),
                         transposeKernel.group_id(2));

@@ -392,7 +454,8 @@ calculate(const hpcc_base::ExecutionSettings
   std::vector<double> transferTimings;
   std::vector<double> calculationTimings;

-  for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) {
+  for (int repetition = 0; repetition < config.programSettings->numRepetitions;
+       repetition++) {

 #ifndef NDEBUG
     std::cout << "Start data transfer" << std::endl;

@@ -406,7 +469,8 @@ calculate(const hpcc_base::ExecutionSettings
     std::chrono::duration<double> transferTime =
-        std::chrono::duration_cast<std::chrono::duration<double>>(endTransfer - startTransfer);
+        std::chrono::duration_cast<std::chrono::duration<double>>(
+            endTransfer - startTransfer);

     MPI_Barrier(MPI_COMM_WORLD);

@@ -414,14 +478,15 @@ calculate(const hpcc_base::ExecutionSettings
     if (config.programSettings->kernelReplications > 1) {
-      std::cerr << "WARNING: Only the matrix A of the first kernel replication will be exchanged "
+      std::cerr << "WARNING: Only the matrix A of the first kernel replication "
+                   "will be exchanged "
                    "via ACCL!"
                 << std::endl;
     }
 #ifndef NDEBUG
     std::cout << "Start data exchange with ACCL" << std::endl;
 #endif
-    accl_exchangeData(*config.accl, handler, data, bufferListA[0],
+    accl_exchangeData(*config.accl, handler, data, bufferListA,
                       config.programSettings->matrixSize / data.blockSize);
 #ifndef NDEBUG
     std::cout << "End data exchange with ACCL" << std::endl;
 #endif

@@ -431,11 +496,13 @@ calculate(const hpcc_base::ExecutionSettings
-          static_cast(bufferOffsetList[r]), static_cast(bufferOffsetList[r]),
+          static_cast(bufferOffsetList[r]),
+          static_cast(bufferOffsetList[r]),
           static_cast(blocksPerReplication[r]),
           static_cast(handler.getWidthforRank()),
-          static_cast((bufferSizeList[r]) /
-                      (local_matrix_width * data.blockSize * data.blockSize))));
+          static_cast(
+              (bufferSizeList[r]) /
+              (local_matrix_width * data.blockSize * data.blockSize))));
     }
 #ifndef NDEBUG
     std::cout << "Wait for kernels to complete" << std::endl;
 #endif

@@ -450,26 +517,28 @@ calculate(const hpcc_base::ExecutionSettings
-              << std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation -
-                                                                           startKernelCalculation)
+              << std::chrono::duration_cast<std::chrono::duration<double>>(
+                     endCalculation - startKernelCalculation)
                      .count()
               << "s ("
-              << ((config.programSettings->matrixSize * config.programSettings->matrixSize *
-                   sizeof(HOST_DATA_TYPE) * 3) /
-                  std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation -
-                                                                            startKernelCalculation)
+              << ((config.programSettings->matrixSize *
+                   config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) *
+                   3) /
+                  std::chrono::duration_cast<std::chrono::duration<double>>(
+                      endCalculation - startKernelCalculation)
                       .count() *
                   1.0e-9)
               << " GB/s)" << std::endl;
 #endif

     std::chrono::duration<double> calculationTime =
-        std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation -
-                                                                  startCalculation);
+        std::chrono::duration_cast<std::chrono::duration<double>>(
+            endCalculation - startCalculation);
     calculationTimings.push_back(calculationTime.count());

-    std::vector<HOST_DATA_TYPE> tmp_write_buffer(local_matrix_height * local_matrix_width *
-                                                 data.blockSize * data.blockSize);
+    std::vector<HOST_DATA_TYPE> tmp_write_buffer(
+        local_matrix_height * local_matrix_width * data.blockSize *
+        data.blockSize);

     startTransfer = std::chrono::high_resolution_clock::now();

@@ -483,29 +552,31 @@ calculate(const hpcc_base::ExecutionSettings
-    transferTime += std::chrono::duration_cast<std::chrono::duration<double>>(endTransfer - startTransfer);
+    transferTime += std::chrono::duration_cast<std::chrono::duration<double>>(
+        endTransfer - startTransfer);
     transferTimings.push_back(transferTime.count());
   }

   std::unique_ptr<transpose::TransposeExecutionTimings> result(
-      new transpose::TransposeExecutionTimings{transferTimings, calculationTimings});
+      new transpose::TransposeExecutionTimings{transferTimings,
+                                               calculationTimings});
   return result;
 }
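For reference, the pairing scheme used by accl_exchangeData above can be summarized as follows: with gcd = gcd(P, Q) the send and receive partners of a rank at grid position (p, q) are staggered by multiples of gcd in each communication step, so up to gcd transfers can be in flight at once. A minimal standalone sketch of the index arithmetic (everything except the mod() wrapping, which mirrors transpose::data_handler::mod, is hypothetical glue code):

    // Sketch: enumerate communication partners of rank (p, q) in a P x Q grid.
    #include <cstdio>
    #include <numeric> // std::gcd (C++17)

    // wraps negative values into [0, n), like transpose::data_handler::mod
    static int mod(int x, int n) { return ((x % n) + n) % n; }

    void print_partners(int p, int q, int P, int Q) {
      int gcd = std::gcd(P, Q);
      int lcm = (P * Q) / gcd;
      for (int j = 0; j < lcm / P; j++) {
        for (int i = 0; i < lcm / Q; i++) {
          int send_rank = mod(p + i * gcd, P) + mod(q - j * gcd, Q) * P;
          int recv_rank = mod(p - i * gcd, P) + mod(q + j * gcd, Q) * P;
          std::printf("step (%d,%d): send to %d, recv from %d\n", j, i,
                      send_rank, recv_rank);
        }
      }
    }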
From cdc69a11f657f8d1863942c81afc0962e1d327f4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 5 May 2022 16:45:23 +0100
Subject: [PATCH 048/318] Add FORCE_FILE_ENDING flag for convenience

---
 cmake/kernelTargets.cmake | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake
index 84ce896f..66fecfb4 100644
--- a/cmake/kernelTargets.cmake
+++ b/cmake/kernelTargets.cmake
@@ -13,7 +13,7 @@ if (USE_ACCL)
     include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake)
 endif()
 
-set(file_endings "cpp" "cl")
+set(file_endings "cl" "cpp" )
 
 ##
 # This function will create build targets for the kernels for emulation and synthesis for xilinx.
@@ -32,6 +32,9 @@ function(generate_kernel_targets_xilinx)
             continue()
         endif()
         set(file_exists No)
+        if (DEFINED FORCE_FILE_ENDING)
+            set(file_endings ${FORCE_FILE_ENDING})
+        endif()
         foreach (ending ${file_endings})
            set(search_file_name "${CMAKE_SOURCE_DIR}/${base_file_part}.${ending}")
             if (NOT file_exists AND EXISTS ${search_file_name})
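The new flag restricts the kernel source lookup to a single file ending at configure time, for example with a hypothetical out-of-source build invocation such as `cmake -DFORCE_FILE_ENDING=cpp ..`. Without the flag, the endings in file_endings are tried in the listed order, so after the reordering above .cl sources still take precedence over .cpp.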
From cf8f7926614d1d6a70545605a837cc5ad9834d21 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 6 May 2022 13:35:55 +0100
Subject: [PATCH 049/318] Remove need to be power of 2 for PTRANS block size

---
 PTRANS/src/device/transpose_PQ_PCIE.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cpp b/PTRANS/src/device/transpose_PQ_PCIE.cpp
index 456c6919..521d8e1a 100644
--- a/PTRANS/src/device/transpose_PQ_PCIE.cpp
+++ b/PTRANS/src/device/transpose_PQ_PCIE.cpp
@@ -46,9 +46,11 @@ void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
     // local memory double buffer for a matrix block
     DEVICE_DATA_TYPE a_block[block_size * block_size / channel_width][channel_width];
 #pragma HLS ARRAY_PARTITION variable = a_block complete dim = 2
+#pragma HLS BIND_STORAGE variable = a_block type = RAM_1P impl = URAM
     // local memory double buffer for a matrix block
     DEVICE_DATA_TYPE a_plus_b_block[block_size * block_size / channel_width][channel_width];
 #pragma HLS ARRAY_PARTITION variable = a_plus_b_block complete dim = 2
+#pragma HLS BIND_STORAGE variable = a_plus_b_block type = RAM_1P impl = URAM
 
     // transpose the matrix block-wise from global memory
 block_loop:
@@ -75,14 +77,14 @@ void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
 
                 unsigned int chunk = row * (block_size / channel_width) + col;
 
-                unsigned rot = (row) & (channel_width - 1);
+                unsigned rot = (row) % (channel_width);
 
                 // rotate temporary buffer to store data into local buffer
                 for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
                     // every block of (N / channel_width), rotates the index by 1
                     // store in double buffer
                     a_block[chunk][unroll_count] = rotate_in[(unroll_count + channel_width - rot)
-                                                                & (channel_width - 1)];
+                                                                % (channel_width)];
                 }
             }
         }
@@ -109,17 +111,17 @@ void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
                 unsigned int offset = row / channel_width;
                 for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
-                    unsigned rot = ((channel_width + unroll_count - row) * (block_size / channel_width)) &
-                                    (BLOCK_SIZE - 1);
+                    unsigned rot = ((channel_width + unroll_count - row) * (block_size / channel_width)) %
+                                    (block_size);
                     unsigned row_rotate = base + offset + rot;
                     rotate_out[unroll_count] = a_block[row_rotate][unroll_count];
                 }
 
-                unsigned rot_out = row & (channel_width - 1);
+                unsigned rot_out = row % (channel_width);
 
                 // rotate temporary buffer to store data into local buffer
                 for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
-                    data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) & (channel_width - 1)];
+                    data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % (channel_width)];
                 }
 
                 // load transposed A from global memory
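Background for the change above: the bitmask form x & (n - 1) only equals x % n when n is a power of two, which is what previously restricted the block size and channel width. A quick sanity check, assuming a non-power-of-two width of 6:

    unsigned x = 7, n = 6;
    unsigned masked = x & (n - 1); // 7 & 5 = 5: wrong rotation index
    unsigned modulo = x % n;       // 7 % 6 = 1: correct rotation index

The modulo form is correct for any n, and when n is a compile-time power of two the HLS compiler can typically strength-reduce it back to the cheap masked form, so nothing should be lost for the previously supported configurations.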
From 4a55592a296e617c23d7f6f3db19cfb29987c050 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 6 May 2022 13:36:10 +0100
Subject: [PATCH 050/318] Add .cpp as possible ending for custom kernels

---
 cmake/customKernelTargets.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/customKernelTargets.cmake b/cmake/customKernelTargets.cmake
index 82ac811f..4657ba53 100644
--- a/cmake/customKernelTargets.cmake
+++ b/cmake/customKernelTargets.cmake
@@ -9,7 +9,7 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake)
 
 file(GLOB custom_kernel_files
     RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
-    "*.cl"
+    "*.cl" "*.cpp"
 )
 
 set(custom_kernel_targets "")

From c07a29df6c919d56161d320a51e498dd67f751e3 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 6 May 2022 16:08:11 +0100
Subject: [PATCH 051/318] Remove compile settings from linking call

---
 cmake/kernelTargets.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake
index 66fecfb4..4fa28b81 100644
--- a/cmake/kernelTargets.cmake
+++ b/cmake/kernelTargets.cmake
@@ -121,7 +121,7 @@ function(generate_kernel_targets_xilinx)
                 DEPENDS ${XILINX_COMPILE_SETTINGS_FILE}
                 )
         add_custom_command(OUTPUT ${bitstream_f}
-                COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} ${local_harware_only_flags} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_f} ${bitstream_compile} ${additional_xos}
+                COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} ${local_harware_only_flags} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} -o ${bitstream_f} ${additional_xos} ${bitstream_compile}
                 MAIN_DEPENDENCY ${bitstream_compile}
                 DEPENDS ${xilinx_link_settings}
                 )

From ddb88c796c8b1e4b5ea0d3f2eefcbddddf5a1dce Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 12 May 2022 11:23:24 +0100
Subject: [PATCH 052/318] use pipeline pragma instead of unroll region

---
 PTRANS/src/device/transpose_PQ_PCIE.cpp | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cpp b/PTRANS/src/device/transpose_PQ_PCIE.cpp
index 521d8e1a..be7e6828 100644
--- a/PTRANS/src/device/transpose_PQ_PCIE.cpp
+++ b/PTRANS/src/device/transpose_PQ_PCIE.cpp
@@ -43,23 +43,24 @@ void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
                 const unsigned int width_in_blocks,
                 const unsigned int height_in_blocks) {
 
-    // local memory double buffer for a matrix block
-    DEVICE_DATA_TYPE a_block[block_size * block_size / channel_width][channel_width];
-#pragma HLS ARRAY_PARTITION variable = a_block complete dim = 2
-#pragma HLS BIND_STORAGE variable = a_block type = RAM_1P impl = URAM
-    // local memory double buffer for a matrix block
-    DEVICE_DATA_TYPE a_plus_b_block[block_size * block_size / channel_width][channel_width];
-#pragma HLS ARRAY_PARTITION variable = a_plus_b_block complete dim = 2
-#pragma HLS BIND_STORAGE variable = a_plus_b_block type = RAM_1P impl = URAM
-
     // transpose the matrix block-wise from global memory
 block_loop:
     for (unsigned int block = 0; block < number_of_blocks; block++) {
+
+        // local memory double buffer for a matrix block
+        DEVICE_DATA_TYPE a_block[block_size * block_size / channel_width][channel_width];
+#pragma HLS ARRAY_PARTITION variable = a_block complete dim = 2
+// #pragma HLS BIND_STORAGE variable = a_block type = RAM_1P impl = URAM
+        // local memory double buffer for a matrix block
+        DEVICE_DATA_TYPE a_plus_b_block[block_size * block_size / channel_width][channel_width];
+#pragma HLS ARRAY_PARTITION variable = a_plus_b_block complete dim = 2
+// #pragma HLS BIND_STORAGE variable = a_plus_b_block type = RAM_1P impl = URAM
+
 read_A:
         for (unsigned int row = 0; row < block_size; row++) {
 read_A_line:
             for (unsigned int col = 0; col < block_size / channel_width; col++) {
-                #pragma HLS unroll region
+#pragma HLS PIPELINE
                 unsigned long block_row_a = (block + offset_a) / width_in_blocks;
                 unsigned long block_col_a = (block + offset_a) % width_in_blocks;
                 unsigned long ls_address_trans = block_col_a * block_size * block_size * height_in_blocks +
@@ -94,7 +95,7 @@ void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
         for (unsigned int row = 0; row < block_size; row++) {
 read_B_line:
             for (unsigned int col = 0; col < block_size / channel_width; col++) {
-#pragma HLS unroll region
+#pragma HLS PIPELINE
                 unsigned long block_row = (block + offset_b) / width_in_blocks;
                 unsigned long block_col = (block + offset_b) % width_in_blocks;
                 unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks +
@@ -139,7 +140,7 @@ void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
         for (unsigned int row = 0; row < block_size; row++) {
 write_result_line:
             for (unsigned int col = 0; col < block_size / channel_width; col++) {
-#pragma HLS unroll region
+#pragma HLS PIPELINE
                 unsigned long block_row = (block + offset_b) / width_in_blocks;
                 unsigned long block_col = (block + offset_b) % width_in_blocks;
                 unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks +
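A note on the pragma swap above: under Vitis HLS semantics, PIPELINE on the column loop requests one iteration per clock cycle (II=1 by default) and automatically unrolls any loop nested beneath it, which together with the fully partitioned second array dimension still yields channel_width parallel accesses per cycle. A minimal sketch of that pattern, not the benchmark kernel itself, with a hypothetical fixed geometry:

    #define CHANNEL_WIDTH 16
    // copies 64 chunks of CHANNEL_WIDTH values into a partitioned buffer
    void copy_block(const float *in, float out[64][CHANNEL_WIDTH]) {
    #pragma HLS ARRAY_PARTITION variable = out complete dim = 2
      for (int row = 0; row < 64; row++) {
    #pragma HLS PIPELINE II = 1
        for (int w = 0; w < CHANNEL_WIDTH; w++) {
          // inner loop is fully unrolled automatically under PIPELINE;
          // the partitioned dim 2 keeps the writes conflict-free
          out[row][w] = in[row * CHANNEL_WIDTH + w];
        }
      }
    }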
From 06ce1110ee821789052c591c943a6e9509daa17d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 12 May 2022 11:24:29 +0100
Subject: [PATCH 053/318] Only sync data if required for baseline version

---
 PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp
index d59ba2e0..b5788fed 100644
--- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp
@@ -162,9 +162,11 @@ static std::unique_ptr calculate(
             endTransfer - startTransfer);
 
     MPI_Barrier(MPI_COMM_WORLD);
-
+    int mpi_size;
+    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
     auto startCalculation = std::chrono::high_resolution_clock::now();
 
+    if (mpi_size > 1) {
     for (int r = 0; r < transposeKernelList.size(); r++) {
       bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE);
     }
@@ -177,6 +179,7 @@ static std::unique_ptr calculate(
     for (int r = 0; r < transposeKernelList.size(); r++) {
       bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE);
     }
+    }
 
     std::vector<xrt::run> runs;
     auto startKernelCalculation = std::chrono::high_resolution_clock::now();

From 5bff3fc311c950ccd5ab42d317f563c2ce549db9 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 19 May 2022 14:03:32 +0100
Subject: [PATCH 054/318] Add variable for CCLO build parameters

---
 cmake/accl.cmake | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cmake/accl.cmake b/cmake/accl.cmake
index 88bc2b64..8b9823d6 100644
--- a/cmake/accl.cmake
+++ b/cmake/accl.cmake
@@ -4,7 +4,8 @@ set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL")
 set(ACCL_UDP_ETH_IF 0 CACHE STRING "Ethernet interface used. On ETHZ: 0 = switch, 1 = direct")
 set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform")
 set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware)
-
+set(ACCL_CCLO_ADDITIONAL_BUILD_ARGS "" CACHE STRING "Add additional build arguments that will be passed to the CCLO makefile")
+set(ACCL_CCLO_BUILD_ARGS ${ACCL_CCLO_ADDITIONAL_BUILD_ARGS})
 # UDP related definitions
 set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/)
 set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core)
@@ -14,7 +15,7 @@ set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HMB)
 if (ACCL_STACK_TYPE STREQUAL "UDP")
     list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_VNX_DIR}/Ethernet/post_sys_link.tcl)
     list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_HLS_IP_FOLDER})
-    set(ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE})
+    list(APPEND ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE})
 endif()
 
 add_custom_command(
@@ -38,7 +39,7 @@ set(ACCL_TCP_CMAC_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/cmac_krnl.xo)
 if (ACCL_STACK_TYPE STREQUAL "TCP")
     list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_TCP_BASE_DIR}/scripts/post_sys_link.tcl)
     list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo)
-    set(ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE} EN_FANIN=1)
+    list(APPEND ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE} EN_FANIN=1)
 endif()
 
 # TODO: This is very specific to the Xilinx build system, because
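The new cache variable lets users forward extra arguments to the CCLO makefile without editing accl.cmake, for example via a hypothetical invocation like `cmake -DACCL_CCLO_ADDITIONAL_BUILD_ARGS="SOME_VAR=1" ..` (the makefile variable name is a placeholder, not a documented CCLO option). Because ACCL_CCLO_BUILD_ARGS is now extended with list(APPEND ...) instead of being overwritten by set(...), the stack-type arguments and the user-supplied ones no longer clobber each other.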
From 3c552f8150bd9c205a9fe3649149e357e37b1068 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 19 May 2022 14:03:48 +0100
Subject: [PATCH 055/318] Fix compile flag handling

---
 cmake/kernelTargets.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake
index 4fa28b81..b7a237a3 100644
--- a/cmake/kernelTargets.cmake
+++ b/cmake/kernelTargets.cmake
@@ -65,7 +65,7 @@ function(generate_kernel_targets_xilinx)
             list(APPEND additional_xos ${ACCL_XOS})
         endif()
         set(xilinx_report_folder "${EXECUTABLE_OUTPUT_PATH}/xilinx_reports")
-        set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA)
+        set(local_CLFLAGS -DXILINX_FPGA)
         list(APPEND local_CLFLAGS --report_dir=${xilinx_report_folder} --log_dir=${xilinx_report_folder}/logs)
         if (is_accl_kernel)
             list(APPEND local_harware_only_flags ${ACCL_LINK_CONFIG})
@@ -76,6 +76,7 @@ function(generate_kernel_targets_xilinx)
         else()
             set(CLFLAGS --config ${XILINX_COMPILE_SETTINGS_FILE})
         endif()
+        list(APPEND local_CLFLAGS ${CLFLAGS})
 
         # build emulation config for device
         add_custom_command(OUTPUT ${EXECUTABLE_OUTPUT_PATH}/emconfig.json

From 8eccea40bd5e2d8db7a6fc588f6cf9db8d07baf0 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 20 May 2022 10:14:12 +0100
Subject: [PATCH 056/318] Update ACCL calls to new dev signature

---
 PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
index 8400fb76..67a3a1e0 100644
--- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
@@ -254,10 +254,10 @@ void accl_exchangeData(
 #endif
       accl_requests[current_parallel_execution] = (accl.send(
           0, *send_buffers[current_parallel_execution], sending_size,
-          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, true));
+          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true));
       accl_requests[current_parallel_execution + gcd] = (accl.recv(
           0, *recv_buffers[current_parallel_execution], sending_size,
-          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, true));
+          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true));
 
       // Increase the counter for parallel executions
       current_parallel_execution = (current_parallel_execution + 1) % gcd;
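For readers following the ACCL API change: the dev branch inserts a compression data type argument in front of the trailing asynchronous flag, which is why both call sites gain ACCL::dataType::none. A hedged sketch of the two shapes, with parameter meanings guessed from the call sites above rather than taken from the authoritative ACCL headers:

    // old:  accl.send(comm, buffer, count, dst, tag, from_fpga,
    //                 stream_flags, run_async)
    // new:  accl.send(comm, buffer, count, dst, tag, from_fpga,
    //                 stream_flags, compress_dtype, run_async)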
From 36c89013864d665b7ec66c5206391a0373363d8b Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 18 May 2022 17:55:35 +0100
Subject: [PATCH 057/318] Work on matrix transpose host code

---
 PTRANS/CMakeLists.txt                          |  5 ++
 PTRANS/src/common/parameters.h.in              |  2 +
 .../execution_types/execution_xrt_accl_pq.hpp  | 52 ++++++++++++++-----
 shared/setup/fpga_setup_accl.cpp               |  4 +-
 4 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/PTRANS/CMakeLists.txt b/PTRANS/CMakeLists.txt
index 71e64026..ef4c4a47 100755
--- a/PTRANS/CMakeLists.txt
+++ b/PTRANS/CMakeLists.txt
@@ -18,6 +18,11 @@ set(HOST_EMULATION_REORDER No CACHE BOOL "Reorder the scheduling of FPGA kernels
 
 mark_as_advanced(READ_KERNEL_NAME WRITE_KERNEL_NAME USE_BUFFER_WRITE_RECT_FOR_A XILINX_UNROLL_INNER_LOOPS)
 
+if (USE_ACCL)
+    math(EXPR calculate_accl_buffer_size "${BLOCK_SIZE} * ${BLOCK_SIZE} * 8")
+    set(ACCL_BUFFER_SIZE ${calculate_accl_buffer_size} CACHE STRING "Size of ACCL buffers in bytes")
+endif()
+
 set(USE_MPI Yes)
 set(USE_OPENMP Yes)
 set(USE_DEPRECATED_HPP_HEADER No)

diff --git a/PTRANS/src/common/parameters.h.in b/PTRANS/src/common/parameters.h.in
index 68b50dd7..e42792ff 100644
--- a/PTRANS/src/common/parameters.h.in
+++ b/PTRANS/src/common/parameters.h.in
@@ -16,6 +16,8 @@
 #define NUM_REPLICATIONS @NUM_REPLICATIONS@
 #cmakedefine HOST_EMULATION_REORDER
 
+#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@
+
 /**
 * Kernel Parameters
 */

diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
index 67a3a1e0..8d3edac5 100644
--- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
@@ -62,6 +62,7 @@ void accl_exchangeData(
     acclBuffersA.push_back(accl.create_buffer<float>(
         bo, data.blockSize * data.blockSize * data.numBlocks,
         ACCL::dataType::float32));
+    acclBuffersA.back()->sync_from_device();
   }
 
   if (pq_width == pq_height) {
@@ -78,17 +79,38 @@ void accl_exchangeData(
     //  .  .  .  2
     //  1  .  .  .
     //  3  2  .  .
-
-    // auto AcclBufferA_recv = accl.create_buffer<float>(
-    //     data.blockSize * data.blockSize * data.numBlocks,
-    //     ACCL::dataType::float32);
-    // AcclBufferA_recv->sync_to_device();
+    auto acclBufferA_recv = accl.create_buffer<float>(
+        data.blockSize * data.blockSize * data.numBlocks,
+        ACCL::dataType::float32);
+    acclBufferA_recv->sync_to_device();
     // Send and receive matrix A using ACCL directly on FPGA
-    accl.send(0, *acclBuffersA[0],
-              data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0,
-              true, ACCL::streamFlags::NO_STREAM);
-    accl.recv(0, *acclBuffersA[0],
-              data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0,
-              true, ACCL::streamFlags::NO_STREAM);
+    if (mpi_comm_rank < pair_rank) {
+      for (int block_num = 0; block_num < data.numBlocks; block_num++) {
+        accl.send(0,
+                  *acclBuffersA[0]->slice(
+                      data.blockSize * data.blockSize * block_num,
+                      data.blockSize * data.blockSize * (block_num + 1)),
+                  data.blockSize * data.blockSize, pair_rank, 0, true,
+                  ACCL::streamFlags::NO_STREAM);
+      }
+      accl.recv(0, *acclBufferA_recv,
+                data.blockSize * data.blockSize * data.numBlocks, pair_rank,
+                1, true, ACCL::streamFlags::NO_STREAM);
+    } else {
+      accl.recv(0, *acclBufferA_recv,
+                data.blockSize * data.blockSize * data.numBlocks, pair_rank,
+                0, true, ACCL::streamFlags::NO_STREAM);
+      for (int block_num = 0; block_num < data.numBlocks; block_num++) {
+        accl.send(0,
+                  *acclBuffersA[0]->slice(
+                      data.blockSize * data.blockSize * block_num,
+                      data.blockSize * data.blockSize * (block_num + 1)),
+                  data.blockSize * data.blockSize, pair_rank, 1, true,
+                  ACCL::streamFlags::NO_STREAM);
+      }
+    }
+    accl.copy(*acclBufferA_recv, *acclBuffersA[0],
+              data.blockSize * data.blockSize * data.numBlocks, true, true);
   }
 } else {
   // Taken from "Parallel matrix transpose algorithms on distributed memory
@@ -254,10 +276,12 @@ void accl_exchangeData(
 #endif
       accl_requests[current_parallel_execution] = (accl.send(
           0, *send_buffers[current_parallel_execution], sending_size,
-          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true));
+          send_rank, 0, true, ACCL::streamFlags::NO_STREAM,
+          ACCL::dataType::none, true));
       accl_requests[current_parallel_execution + gcd] = (accl.recv(
           0, *recv_buffers[current_parallel_execution], sending_size,
-          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true));
+          send_rank, 0, true, ACCL::streamFlags::NO_STREAM,
+          ACCL::dataType::none, true));
 
       // Increase the counter for parallel executions
       current_parallel_execution = (current_parallel_execution + 1) % gcd;
@@ -569,6 +593,10 @@ static std::unique_ptr calculate(
       }
     }
     endTransfer = std::chrono::high_resolution_clock::now();
+
+    accl_exchangeData(*config.accl, handler, data, bufferListA,
+                      config.programSettings->matrixSize / data.blockSize);
+
     transferTime += std::chrono::duration_cast<std::chrono::duration<double>>(
         endTransfer - startTransfer);
     transferTimings.push_back(transferTime.count());

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index b4753430..5ce08a41 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -32,7 +32,7 @@ std::unique_ptr<ACCL::ACCL> fpgaSetupACCL(xrt::device &device, xrt::uuid &progra
   std::vector<ACCL::rank_t> ranks = {};
   for (int i = 0; i < current_size; ++i) {
     // TODO: Replace the ip addresses and ports here for execution of real hardware?
-    ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, 1024};
+    ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, ACCL_BUFFER_SIZE};
     ranks.emplace_back(new_rank);
   }
   if (!useAcclEmulation) {
@@ -45,7 +45,7 @@
   } else {
     // TODO: Add start port here. Currently hardcoded!
     return std::unique_ptr<ACCL::ACCL>(
-        new ACCL::ACCL(ranks, current_rank, 5500, device, ACCL::networkProtocol::TCP, 16, 1024));
+        new ACCL::ACCL(ranks, current_rank, 5500, device, ACCL::networkProtocol::TCP, 16, ACCL_BUFFER_SIZE));
   }
 }

From 32f6586569c4a5ee872f2862317a557216e632d4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 18 May 2022 17:55:51 +0100
Subject: [PATCH 058/318] Set ACCL buffer size with cmake

---
 cmake/accl.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/accl.cmake b/cmake/accl.cmake
index 8b9823d6..2875657d 100644
--- a/cmake/accl.cmake
+++ b/cmake/accl.cmake
@@ -3,6 +3,7 @@
 set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL")
 set(ACCL_UDP_ETH_IF 0 CACHE STRING "Ethernet interface used. On ETHZ: 0 = switch, 1 = direct")
 set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform")
+set(ACCL_BUFFER_SIZE 8192 CACHE STRING "Size of ACCL buffers in bytes")
 set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware)
 set(ACCL_CCLO_ADDITIONAL_BUILD_ARGS "" CACHE STRING "Add additional build arguments that will be passed to the CCLO makefile")
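A quick cross-check of the two buffer-size defaults introduced above: the PTRANS build derives its ACCL buffer size from the block size as BLOCK_SIZE * BLOCK_SIZE * 8 bytes, so a hypothetical BLOCK_SIZE of 512 gives 512 * 512 * 8 = 2097152 bytes (2 MiB) per buffer, while builds that do not override the variable fall back to the generic 8192-byte default in accl.cmake. The factor 8 presumably leaves headroom for 8-byte elements even though the buffers in the PTRANS host code are created as float32.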
From e59049bf71b0359b30d8dedcf8076671c34ced2c Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 20 Apr 2022 15:10:20 +0100
Subject: [PATCH 059/318] Add support for new base impl for LINPACK

---
 LINPACK/src/host/CMakeLists.txt               |   4 +-
 .../host/execution_types/execution_iec.hpp    |   7 +-
 .../host/execution_types/execution_pcie.hpp   |   7 +-
 LINPACK/src/host/linpack_benchmark.cpp        | 713 ------------------
 LINPACK/src/host/linpack_benchmark.hpp        | 689 +++++++++++------
 LINPACK/src/host/linpack_data.cpp             | 259 +++++++
 LINPACK/src/host/linpack_data.hpp             | 274 +++++++
 LINPACK/src/host/main.cpp                     |   2 +-
 8 files changed, 997 insertions(+), 958 deletions(-)
 delete mode 100644 LINPACK/src/host/linpack_benchmark.cpp
 create mode 100644 LINPACK/src/host/linpack_data.cpp
 create mode 100644 LINPACK/src/host/linpack_data.hpp

diff --git a/LINPACK/src/host/CMakeLists.txt b/LINPACK/src/host/CMakeLists.txt
index d8feb95d..5422f31f 100755
--- a/LINPACK/src/host/CMakeLists.txt
+++ b/LINPACK/src/host/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase)
 
-set(HOST_SOURCE linpack_benchmark.cpp gmres.c blas.c)
+set(HOST_SOURCE linpack_data.cpp gmres.c blas.c)
 
 set(HOST_EXE_NAME Linpack)
 set(LIB_NAME lp)
@@ -17,6 +17,7 @@ if (INTELFPGAOPENCL_FOUND)
         target_compile_definitions(${LIB_NAME}_intel PRIVATE -DCL_VERSION_2_0)
     endif()
     target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA)
+    target_compile_definitions(${HOST_EXE_NAME}_intel PRIVATE -DINTEL_FPGA)
     target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}")
     add_test(NAME test_intel_host_executable COMMAND $<TARGET_FILE:${HOST_EXE_NAME}_intel> -h)
 endif()
@@ -30,6 +31,7 @@ if (Vitis_FOUND)
     target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base)
     target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx)
     target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA)
+    target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA)
     target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")
     add_test(NAME test_xilinx_host_executable COMMAND $<TARGET_FILE:${HOST_EXE_NAME}_xilinx> -h)
 endif()

diff --git a/LINPACK/src/host/execution_types/execution_iec.hpp b/LINPACK/src/host/execution_types/execution_iec.hpp
index b98bcc31..3c232f41 100644
--- a/LINPACK/src/host/execution_types/execution_iec.hpp
+++ b/LINPACK/src/host/execution_types/execution_iec.hpp
@@ -35,7 +35,7 @@ SOFTWARE.
 #endif
 
 #include "parameters.h"
-#include "linpack_benchmark.hpp"
+#include "linpack_data.hpp"
 
 namespace linpack {
 namespace execution {
@@ -44,8 +44,9 @@ namespace iec {
 /*
 Prepare kernels and execute benchmark for a bitstream that makes use of intel external channels
 */
+template<class TDevice, class TContext, class TProgram>
 std::unique_ptr<linpack::LinpackExecutionTimings>
-calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&config,
+calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings, TDevice, TContext, TProgram>&config,
           linpack::LinpackData& data) {
 
     int err;
@@ -735,4 +736,4 @@ calculate(const hpcc_base::ExecutionSettings&co
 } // namespace execution
 } // namespace linpack
 
-#endif
\ No newline at end of file
+#endif

diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp
index 51b9c546..5462f025 100644
--- a/LINPACK/src/host/execution_types/execution_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_pcie.hpp
@@ -39,7 +39,7 @@ SOFTWARE.
 #endif
 
 #include "parameters.h"
-#include "linpack_benchmark.hpp"
+#include "linpack_data.hpp"
 
 namespace linpack {
 namespace execution {
@@ -50,8 +50,9 @@ namespace pcie {
 /*
 Implementation for the single kernel.
  @copydoc bm_execution::calculate()
*/
+template<class TDevice, class TContext, class TProgram>
 std::unique_ptr<linpack::LinpackExecutionTimings>
-calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&config,
+calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings, TDevice, TContext, TProgram>&config,
           linpack::LinpackData& data) {
 
     cl_int err;
@@ -729,4 +730,4 @@ calculate(const hpcc_base::ExecutionSettings&co
 } // namespace execution
 } // namespace linpack
 
-#endif
\ No newline at end of file
+#endif
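With the device, context, and program types lifted into template parameters, the same host-side solver path can be compiled against both the OpenCL and the XRT back ends; the concrete types are deduced from the ExecutionSettings argument, so call sites stay unchanged. A hedged usage sketch (the exact type lists are assumptions based on the signatures above, not taken from the final headers):

    // OpenCL back end deduces TDevice=cl::Device, TContext=cl::Context,
    // TProgram=cl::Program from the settings object:
    auto timings = linpack::execution::pcie::calculate(*executionSettings, data);
    // an XRT build would instead deduce e.g. TDevice=xrt::device and
    // TProgram=xrt::uuid from its own ExecutionSettings instantiation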
diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp
deleted file mode 100644
index d60be9d1..00000000
--- a/LINPACK/src/host/linpack_benchmark.cpp
+++ /dev/null
@@ -1,713 +0,0 @@
-//
-// Created by Marius Meyer on 04.12.19.
-//
-
-/*
-Copyright (c) 2019 Marius Meyer
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-*/
-
-#include "linpack_benchmark.hpp"
-
-/* C++ standard library headers */
-#include <memory>
-#include <random>
-
-/* Project's headers */
-#include "communication_types.hpp"
-#include "execution_types/execution_types.hpp"
-#include "parameters.h"
-
-linpack::LinpackProgramSettings::LinpackProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results),
-    matrixSize(results["m"].as<uint>() * (1 << (results["b"].as<uint>()))), blockSize(1 << (results["b"].as<uint>())),
-    isEmulationKernel(results.count("emulation") > 0), isDiagonallyDominant(results.count("uniform") == 0),
-    torus_width(results["p"].as<uint>()) {
-    int mpi_comm_rank;
-    int mpi_comm_size;
-    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size);
-    // calculate the row and column of the MPI rank in the torus
-    if (mpi_comm_size % torus_width != 0) {
-        throw std::runtime_error("MPI size not dividable by P=" + std::to_string(torus_width) + "!");
-    }
-    torus_height = mpi_comm_size / torus_width;
-    torus_row = (mpi_comm_rank / torus_width);
-    torus_col = (mpi_comm_rank % torus_width);
-}
-
-std::map<std::string, std::string>
-linpack::LinpackProgramSettings::getSettingsMap() {
-    auto map = hpcc_base::BaseSettings::getSettingsMap();
-    map["Matrix Size"] = std::to_string(matrixSize);
-    map["Block Size"] = std::to_string(blockSize);
-    map["Emulate"] = (isEmulationKernel) ? "Yes" : "No";
-    map["Data Type"] = STR(HOST_DATA_TYPE);
-    map["FPGA Torus"] = "P=" + std::to_string(torus_width) + ", Q=" + std::to_string(torus_height);
-    return map;
-}
-
-linpack::LinpackData::LinpackData(cl::Context context, size_t width, size_t height) : norma(0.0), context(context),
-    matrix_width(width), matrix_height(height) {
-#ifdef USE_SVM
-    A = reinterpret_cast<HOST_DATA_TYPE*>(
-                        clSVMAlloc(context(), 0 ,
-                        size * size * sizeof(HOST_DATA_TYPE), 1024));
-    b = reinterpret_cast<HOST_DATA_TYPE*>(
-                        clSVMAlloc(context(), 0 ,
-                        size * sizeof(HOST_DATA_TYPE), 1024));
-    ipvt = reinterpret_cast<cl_int*>(
-                        clSVMAlloc(context(), 0 ,
-                        size * sizeof(cl_int), 1024));
-#else
-    posix_memalign(reinterpret_cast<void**>(&A), 4096, width * height * sizeof(HOST_DATA_TYPE));
-    posix_memalign(reinterpret_cast<void**>(&b), 4096, width * sizeof(HOST_DATA_TYPE));
-    posix_memalign(reinterpret_cast<void**>(&ipvt), 4096, height * sizeof(cl_int));
-#endif
-    }
-
-linpack::LinpackData::~LinpackData() {
-#ifdef USE_SVM
-    clSVMFree(context(), reinterpret_cast<void*>(A));
-    clSVMFree(context(), reinterpret_cast<void*>(b));
-    clSVMFree(context(), reinterpret_cast<void*>(ipvt));
-#else
-    free(A);
-    free(b);
-    free(ipvt);
-#endif
-}
-
-linpack::LinpackBenchmark::LinpackBenchmark(int argc, char* argv[]) : HpccFpgaBenchmark(argc, argv) {
-    setupBenchmark(argc, argv);
-}
-
-void
-linpack::LinpackBenchmark::addAdditionalParseOptions(cxxopts::Options &options) {
-    options.add_options()
-        ("m", "Global matrix size in number of blocks in one dimension. Local matrix sizes will be determined by PQ grid.",
-            cxxopts::value<uint>()->default_value(std::to_string(DEFAULT_MATRIX_SIZE)))
-        ("b", "Log2 of the block size in number of values in one dimension",
-            cxxopts::value<uint>()->default_value(std::to_string(LOCAL_MEM_BLOCK_LOG)))
-        ("p", "Width of the FPGA grid. The heigth (Q) will be calculated from mpi_size / P.",
-            cxxopts::value<uint>()->default_value(std::to_string(DEFAULT_P_VALUE)))
-        ("uniform", "Generate a uniform matrix instead of a diagonally dominant. This has to be supported by the FPGA kernel!")
-        ("emulation", "Use kernel arguments for emulation. This may be necessary to simulate persistent local memory on the FPGA");
-}
-
-std::unique_ptr<linpack::LinpackExecutionTimings>
-linpack::LinpackBenchmark::executeKernel(LinpackData &data) {
-    std::unique_ptr<linpack::LinpackExecutionTimings> timings;
-    switch (executionSettings->programSettings->communicationType) {
-        case hpcc_base::CommunicationType::pcie_mpi : timings = execution::pcie::calculate(*executionSettings, data); break;
-        case hpcc_base::CommunicationType::intel_external_channels: timings = execution::iec::calculate(*executionSettings, data); break;
-        default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType));
-    }
-#ifdef DISTRIBUTED_VALIDATION
-    distributed_gesl_nopvt_ref(data);
-#endif
-    return timings;
-}
-
-void
-linpack::LinpackBenchmark::collectAndPrintResults(const linpack::LinpackExecutionTimings &output) {
-    // Calculate performance for kernel execution plus data transfer
-    double tmean = 0;
-    double tlumean = 0;
-    double tslmean = 0;
-    double tmin = std::numeric_limits<double>::max();
-    double lu_min = std::numeric_limits<double>::max();
-    double sl_min = std::numeric_limits<double>::max();
-
-#ifndef NDEBUG
-    std::cout << "Rank " << mpi_comm_rank << ": Result collection started" << std::endl;
-#endif
-
-    std::vector<double> global_lu_times(output.gefaTimings.size());
-    MPI_Reduce(output.gefaTimings.data(), global_lu_times.data(), output.gefaTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-    std::vector<double> global_sl_times(output.geslTimings.size());
-    MPI_Reduce(output.geslTimings.data(), global_sl_times.data(), output.geslTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-#ifndef NDEBUG
-    std::cout << "Rank " << mpi_comm_rank << ": Result collection done" << std::endl;
-#endif
-
-
-    if (mpi_comm_rank > 0) {
-        // Only the master rank needs to calculate and print result
-        return;
-    }
-
-    double total_matrix_size = static_cast<double>(executionSettings->programSettings->matrixSize);
-    double gflops_lu = ((2.0e0*total_matrix_size * total_matrix_size * total_matrix_size)/ 3.0) / 1.0e9;
-    double gflops_sl = (2.0*(total_matrix_size * total_matrix_size))/1.0e9;
-    for (int i =0; i < global_lu_times.size(); i++) {
-        double currentTime = global_lu_times[i] + global_sl_times[i];
-        tmean += currentTime;
-        tlumean += global_lu_times[i];
-        tslmean += global_sl_times[i];
-        if (currentTime < tmin) {
-            tmin = currentTime;
-        }
-        if (global_lu_times[i] < lu_min) {
-            lu_min = global_lu_times[i];
-        }
-        if (global_sl_times[i] < sl_min) {
-            sl_min = global_sl_times[i];
-        }
-    }
-    tmean = tmean / global_lu_times.size();
-    tlumean = tlumean / global_lu_times.size();
-    tslmean = tslmean / global_sl_times.size();
-
-    std::cout << std::setw(ENTRY_SPACE)
-              << "Method" << std::setw(ENTRY_SPACE)
-              << "best" << std::setw(ENTRY_SPACE) << "mean"
-              << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl;
-
-    std::cout << std::setw(ENTRY_SPACE) << "total" << std::setw(ENTRY_SPACE)
-              << tmin << std::setw(ENTRY_SPACE) << tmean
-              << std::setw(ENTRY_SPACE) << ((gflops_lu + gflops_sl) / tmin)
-              << std::endl;
-
-    std::cout << std::setw(ENTRY_SPACE) << "GEFA" << std::setw(ENTRY_SPACE)
-              << lu_min << std::setw(ENTRY_SPACE) << tlumean
-              << std::setw(ENTRY_SPACE) << ((gflops_lu) / lu_min)
-              << std::endl;
-
-    std::cout << std::setw(ENTRY_SPACE) << "GESL" << std::setw(ENTRY_SPACE)
-              << sl_min << std::setw(ENTRY_SPACE) << tslmean
-              << std::setw(ENTRY_SPACE) << (gflops_sl / sl_min)
-              << std::endl;
-}
-
-std::unique_ptr<linpack::LinpackData>
-linpack::LinpackBenchmark::generateInputData() {
-    int local_matrix_width = executionSettings->programSettings->matrixSize / executionSettings->programSettings->torus_width;
-    int local_matrix_height = executionSettings->programSettings->matrixSize / executionSettings->programSettings->torus_height;
-
-    if ((executionSettings->programSettings->matrixSize / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width > 0 ||
-        (executionSettings->programSettings->matrixSize / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_height > 0) {
-        throw std::runtime_error("Global matrix size must be multiple of LCM of PQ grid!");
-    }
-
-    auto d = std::unique_ptr<linpack::LinpackData>(new linpack::LinpackData(*executionSettings->context ,local_matrix_width, local_matrix_height));
-    std::mt19937 gen(this->mpi_comm_rank);
-    std::uniform_real_distribution<> dis(0.0, 1.0);
-    d->norma = 0.0;
-    d->normb = 0.0;
-
-
-    /*
-    Generate a matrix by using pseudo random number in the range (0,1)
-    */
-    for (int j = 0; j < local_matrix_height; j++) {
-        // fill a single column of the matrix
-        for (int i = 0; i < local_matrix_width; i++) {
-            HOST_DATA_TYPE temp = dis(gen);
-            d->A[local_matrix_width*j+i] = temp;
-            d->norma = (temp > d->norma) ? temp : d->norma;
-        }
-    }
-
-
-    // If the matrix should be diagonally dominant, we need to exchange the sum of the rows with
-    // the ranks that share blocks in the same column
-    if (executionSettings->programSettings->isDiagonallyDominant) {
-        // create a communicator to exchange the rows
-        MPI_Comm row_communicator;
-        MPI_Comm_split(MPI_COMM_WORLD, executionSettings->programSettings->torus_row, 0,&row_communicator);
-
-        // Caclulate the sum for every row and insert in into the matrix
-        for (int local_matrix_row = 0; local_matrix_row < local_matrix_height; local_matrix_row++) {
-            int blockSize = executionSettings->programSettings->blockSize;
-            int global_matrix_row = executionSettings->programSettings->torus_row * blockSize + (local_matrix_row / blockSize) * blockSize * executionSettings->programSettings->torus_height + (local_matrix_row % blockSize);
-            int local_matrix_col = (global_matrix_row - executionSettings->programSettings->torus_col * blockSize) / (blockSize * executionSettings->programSettings->torus_width) * blockSize + (global_matrix_row % blockSize);
-            int diagonal_rank = (global_matrix_row / blockSize) % executionSettings->programSettings->torus_width;
-            bool diagonal_on_this_rank = diagonal_rank == executionSettings->programSettings->torus_col;
-            // set the diagonal elements of the matrix to 0
-            if (diagonal_on_this_rank) {
-                d->A[local_matrix_width*local_matrix_row + local_matrix_col] = 0.0;
-            }
-            HOST_DATA_TYPE local_row_sum = 0.0;
-            for (int i = 0; i < local_matrix_width; i++) {
-                local_row_sum += d->A[local_matrix_width*local_matrix_row + i];
-            }
-            HOST_DATA_TYPE row_sum = 0.0;
-            MPI_Reduce(&local_row_sum, &row_sum, 1, MPI_DATA_TYPE, MPI_SUM, diagonal_rank, row_communicator);
-            // insert row sum into matrix if it contains the diagonal block
-            if (diagonal_on_this_rank) {
-                // update norm of local matrix
-                d->norma = (row_sum > d->norma) ? row_sum : d->norma;
-                d->A[local_matrix_width*local_matrix_row + local_matrix_col] = row_sum;
-            }
-        }
-    }
-
-    // initialize other vectors
-    for (int i = 0; i < local_matrix_width; i++) {
-        d->b[i] = 0.0;
-    }
-    for (int i = 0; i < local_matrix_height; i++) {
-        d->ipvt[i] = i;
-    }
-
-    MPI_Comm col_communicator;
-    MPI_Comm_split(MPI_COMM_WORLD, executionSettings->programSettings->torus_col, 0,&col_communicator);
-
-    // Generate vector b by accumulating the columns of the matrix.
-    // This will lead to a result vector x with ones on every position
-    // Every rank will have a valid part of the final b vector stored
-    for (int j = 0; j < local_matrix_width; j++) {
-        HOST_DATA_TYPE local_col_sum = 0.0;
-        for (int i = 0; i < local_matrix_height; i++) {
-            local_col_sum += d->A[local_matrix_width*i+j];
-        }
-        MPI_Allreduce(&local_col_sum, &(d->b[j]), 1, MPI_DATA_TYPE, MPI_SUM, col_communicator);
-        d->normb = (d->b[j] > d->normb) ? d->b[j] : d->normb;
-    }
-    return d;
-}
-
-bool
-linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &data) {
-    uint n= executionSettings->programSettings->matrixSize;
-    uint matrix_width = data.matrix_width;
-    uint matrix_height = data.matrix_height;
-    double residn;
-    double resid = 0.0;
-    double normx = 0.0;
-#ifndef DISTRIBUTED_VALIDATION
-    if (mpi_comm_rank > 0) {
-        for (int j = 0; j < matrix_height; j++) {
-            for (int i = 0; i < matrix_width; i+= executionSettings->programSettings->blockSize) {
-                MPI_Send(&data.A[matrix_width * j + i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
-            }
-        }
-        if (executionSettings->programSettings->torus_row == 0) {
-            for (int i = 0; i < matrix_width; i+= executionSettings->programSettings->blockSize) {
-                MPI_Send(&data.b[i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
-            }
-        }
-        residn = 0;
-    }
-    else {
-        MPI_Status status;
-        size_t current_offset = 0;
-        std::vector<HOST_DATA_TYPE> total_b_original(n);
-        std::vector<HOST_DATA_TYPE> total_b(n);
-        std::vector<HOST_DATA_TYPE> total_a(n*n);
-        for (int j = 0; j < n; j++) {
-            for (int i = 0; i < n; i+= executionSettings->programSettings->blockSize) {
-                int recvcol= (i / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width;
-                int recvrow= (j / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_height;
-                int recvrank = executionSettings->programSettings->torus_width * recvrow + recvcol;
-                if (recvrank > 0) {
-                    MPI_Recv(&total_a[j * n + i],executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 0, MPI_COMM_WORLD, &status);
-                }
-                else {
-                    for (int k=0; k < executionSettings->programSettings->blockSize; k++) {
-                        total_a[j * n + i + k] = data.A[current_offset + k];
-                    }
-                    current_offset += executionSettings->programSettings->blockSize;
-                }
-            }
-        }
-        current_offset = 0;
-        for (int i = 0; i < n; i+= executionSettings->programSettings->blockSize) {
-            int recvcol= (i / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width;
-            if (recvcol > 0) {
-                MPI_Recv(&total_b[i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvcol, 0, MPI_COMM_WORLD, &status);
-            }
-            else {
-                for (int k=0; k < executionSettings->programSettings->blockSize; k++) {
-                    total_b[i + k] = data.b[current_offset + k];
-                }
-                current_offset += executionSettings->programSettings->blockSize;
-            }
-        }
-
-        std::copy(total_b.begin(), total_b.end(), total_b_original.begin());
-        gesl_ref_nopvt(total_a.data(), total_b.data(), n, n);
-
-        for (int i = 0; i < n; i++) {
-            resid = (resid > std::abs(total_b[i] - 1)) ? resid : std::abs(total_b[i] - 1);
-            normx = (normx > std::abs(total_b_original[i])) ? normx : std::abs(total_b_original[i]);
-        }
-    }
-#else
-    double local_resid = 0;
-    double local_normx = data.normb;
-    #pragma omp parallel for reduction(max:local_resid)
-    for (int i = 0; i < data.matrix_width; i++) {
-        local_resid = (local_resid > std::abs(data.b[i] - 1)) ? local_resid : std::abs(data.b[i] - 1);
-    }
-#ifndef NDEBUG
-    std::cout << "Rank " << mpi_comm_rank << ": resid=" << local_resid << ", normx=" << local_normx << std::endl;
-#endif
-
-    MPI_Reduce(&local_resid, &resid, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-    MPI_Reduce(&local_normx, &normx, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-#endif
-
-
-    HOST_DATA_TYPE eps = std::numeric_limits<HOST_DATA_TYPE>::epsilon();
-    residn = resid / (static_cast<double>(n)*normx*eps);
-
-    #ifndef NDEBUG
-    if (residn > 1 && mpi_comm_size == 1) {
-        auto ref_result = generateInputData();
-        // For each column right of current diagonal element
-        for (int j = 0; j < n; j++) {
-            // For each element below it
-            for (int i = 0; i < n; i++) {
-                std::cout << ref_result->A[n * j + i] << ", ";
-            }
-            std::cout << std::endl;
-        }
-        std::cout << std::endl;
-        // For each column right of current diagonal element
-        for (int j = 0; j < n; j++) {
-            // For each element below it
-            for (int i = 0; i < n; i++) {
-                std::cout << data.A[n * j + i] << ", ";
-            }
-            std::cout << std::endl;
-        }
-        std::cout << std::endl;
-        if (executionSettings->programSettings->isDiagonallyDominant) {
-            linpack::gefa_ref_nopvt(ref_result->A, n, n);
-            linpack::gesl_ref_nopvt(ref_result->A, ref_result->b, n, n);
-        }
-        else {
-            linpack::gefa_ref(ref_result->A, n, n, ref_result->ipvt);
-            linpack::gesl_ref(ref_result->A, ref_result->b, ref_result->ipvt, n, n);
-        }
-        // For each column right of current diagonal element
-        for (int j = 0; j < n; j++) {
-            // For each element below it
-            for (int i = 0; i < n; i++) {
-                std::cout << std::abs(ref_result->A[n * j + i] - data.A[n * j + i]) << ", ";
-            }
-            std::cout << std::endl;
-        }
-        std::cout << std::endl;
-    }
-    #endif
-
-    if (mpi_comm_rank == 0) {
-        //std::cout << resid << ", " << norma << ", " << normx << std::endl;
-        std::cout << " norm. resid        resid       "\
-                     "machep " << std::endl;
-        std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE)
-                  << resid << std::setw(ENTRY_SPACE) << eps << std::endl;
-        return residn < 1;
-    }
-    else {
-        return true;
-    }
-}
-
-void
-linpack::LinpackBenchmark::distributed_gesl_nopvt_ref(linpack::LinpackData& data) {
-    uint global_matrix_size = executionSettings->programSettings->matrixSize;
-    uint matrix_width = data.matrix_width;
-    uint matrix_height = data.matrix_height;
-    uint block_size = executionSettings->programSettings->blockSize;
-    // create a communicator to exchange the rows
-    MPI_Comm row_communicator;
-    MPI_Comm_split(MPI_COMM_WORLD, executionSettings->programSettings->torus_row, 0,&row_communicator);
-    MPI_Comm col_communicator;
-    MPI_Comm_split(MPI_COMM_WORLD, executionSettings->programSettings->torus_col, 0,&col_communicator);
-    std::vector<HOST_DATA_TYPE> b_tmp(matrix_width);
-
-    for (int k = 0; k < b_tmp.size(); k++) {
-        b_tmp[k] = data.b[k];
-    }
-
-    // solve l*y = b
-    // For each row in matrix
-    for (int k = 0; k < global_matrix_size - 1; k++) {
-        size_t local_k_index_col = k / (block_size * executionSettings->programSettings->torus_width) * block_size;
-        size_t local_k_index_row = k / (block_size * executionSettings->programSettings->torus_height) * block_size;
-        size_t remaining_k_col = k % (block_size * executionSettings->programSettings->torus_width);
-        size_t remaining_k_row = k % (block_size * executionSettings->programSettings->torus_height);
-        size_t start_offset = local_k_index_col;
-        if (remaining_k_col / block_size > executionSettings->programSettings->torus_col){
-            local_k_index_col += block_size;
-            start_offset = local_k_index_col;
-        }
-        else if (remaining_k_col / block_size == executionSettings->programSettings->torus_col) {
-            local_k_index_col += (remaining_k_col % block_size);
-            start_offset = local_k_index_col + 1;
-        }
-        if (remaining_k_row / block_size > executionSettings->programSettings->torus_row){
-            local_k_index_row += block_size;
-        }
-        else if (remaining_k_row / block_size == executionSettings->programSettings->torus_row) {
-            local_k_index_row += (remaining_k_row % block_size);
-        }
-
-        int row_diagonal_rank = (k / block_size) % executionSettings->programSettings->torus_height;
-        int col_diagonal_rank = (k / block_size) % executionSettings->programSettings->torus_width;
-        std::vector<HOST_DATA_TYPE> tmp_scaled_b(matrix_width, 0.0);
-        if (row_diagonal_rank == executionSettings->programSettings->torus_row) {
-            HOST_DATA_TYPE current_k;
-            current_k = (local_k_index_col < matrix_width) ? b_tmp[local_k_index_col] : 0.0;
-            MPI_Bcast(&current_k, 1, MPI_DATA_TYPE, col_diagonal_rank, row_communicator);
-            // For each row below add
-            for (int i = start_offset; i < matrix_width; i++) {
-                // add solved upper row to current row
-                tmp_scaled_b[i] = current_k * data.A[matrix_width * local_k_index_row + i];
-            }
-        }
-        MPI_Bcast(&tmp_scaled_b.data()[start_offset], matrix_width - start_offset, MPI_DATA_TYPE, row_diagonal_rank, col_communicator);
-        for (int i = start_offset; i < matrix_width; i++) {
-            // add solved upper row to current row
-            b_tmp[i] += tmp_scaled_b[i];
-        }
-    }
-
-    // now solve u*x = y
-    for (int k = global_matrix_size - 1; k >= 0; k--) {
-        size_t local_k_index_col = k / (block_size * executionSettings->programSettings->torus_width) * block_size;
-        size_t local_k_index_row = k / (block_size * executionSettings->programSettings->torus_height) * block_size;
-        size_t remaining_k_col = k % (block_size * executionSettings->programSettings->torus_width);
-        size_t remaining_k_row = k % (block_size * executionSettings->programSettings->torus_height);
-        if (remaining_k_col / block_size > executionSettings->programSettings->torus_col){
-            local_k_index_col += block_size;
-        }
-        else if (remaining_k_col / block_size == executionSettings->programSettings->torus_col) {
-            local_k_index_col += remaining_k_col % block_size;
-        }
-        if (remaining_k_row / block_size > executionSettings->programSettings->torus_row){
-            local_k_index_row += block_size;
-        }
-        else if (remaining_k_row / block_size == executionSettings->programSettings->torus_row) {
-            local_k_index_row += remaining_k_row % block_size;
-        }
-
-        HOST_DATA_TYPE scale_element = (local_k_index_col < matrix_width && local_k_index_row < matrix_height) ? b_tmp[local_k_index_col] * data.A[matrix_width * local_k_index_row + local_k_index_col] : 0.0;
-        int row_diagonal_rank = (k / block_size) % executionSettings->programSettings->torus_height;
-        int col_diagonal_rank = (k / block_size) % executionSettings->programSettings->torus_width;
-        MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, row_diagonal_rank, col_communicator);
-        if (col_diagonal_rank == executionSettings->programSettings->torus_col) {
-            b_tmp[local_k_index_col] = -scale_element;
-        }
-        MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, col_diagonal_rank, row_communicator);
-        size_t end_offset = local_k_index_col;
-
-        std::vector<HOST_DATA_TYPE> tmp_scaled_b(matrix_width, 0.0);
-        if (row_diagonal_rank == executionSettings->programSettings->torus_row) {
-            // For each row below add
-            for (int i = 0; i < end_offset; i++) {
-                tmp_scaled_b[i] = scale_element * data.A[matrix_width * local_k_index_row + i];
-            }
-        }
-        MPI_Bcast(tmp_scaled_b.data(), end_offset, MPI_DATA_TYPE, row_diagonal_rank, col_communicator);
-        for (int i = 0; i < end_offset; i++) {
-            // add solved upper row to current row
-            b_tmp[i] += tmp_scaled_b[i];
-        }
-    }
-    for (int k = 0; k < b_tmp.size(); k++) {
-        data.b[k] = b_tmp[k];
-    }
-
-#ifndef NDEBUG
-    MPI_Barrier(MPI_COMM_WORLD);
-    for (int rank = 0; rank < mpi_comm_size; rank++) {
-        if (rank == mpi_comm_rank) {
-            double sum = 0;
-            double max = 0;
-            for (int k = 0; k < matrix_width; k++) {
-                sum += std::abs(data.b[k]);
-                if (std::abs(data.b[k] - 1) > 0.1 || data.b[k] == NAN) {
-                    std::cout << "Rank " << mpi_comm_rank << " Pos: " << k << " Value: " << std::abs(data.b[k]) << std::endl;
-                }
-            }
-            std::cout << "Rank " << mpi_comm_rank << " Dist.Sum: " << sum << " Max: " << max << std::endl;
-        }
-        MPI_Barrier(MPI_COMM_WORLD);
-    }
-#endif
-}
-
-/**
-Standard LU factorization on a block with fixed size
-
-Case 1 of
Zhangs description -*/ -void -linpack::gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt) { - for (int i = 0; i < n; i++) { - ipvt[i] = i; - } - // For each diagnonal element - for (int k = 0; k < n - 1; k++) { - HOST_DATA_TYPE max_val = fabs(a[k * lda + k]); - int pvt_index = k; - for (int i = k + 1; i < n; i++) { - if (max_val < fabs(a[k * lda + i])) { - pvt_index = i; - max_val = fabs(a[k * lda + i]); - } - } - - for (int i = k; i < n; i++) { - HOST_DATA_TYPE tmp_val = a[i * lda + k]; - a[i * lda + k] = a[i * lda + pvt_index]; - a[i * lda + pvt_index] = tmp_val; - } - ipvt[k] = pvt_index; - - // For each element below it - for (int i = k + 1; i < n; i++) { - a[k * lda + i] *= -1.0 / a[k * lda + k]; - } - // For each column right of current diagonal element - for (int j = k + 1; j < n; j++) { - // For each element below it - for (int i = k+1; i < n; i++) { - a[j * lda + i] += a[k * lda + i] * a[j * lda + k]; - } - } - -#ifdef DEBUG - std::cout << "A(k=" << k <<"): " << std::endl; - for (int i= 0; i < n; i++) { - for (int j=0; j < n; j++) { - std::cout << a[i*lda + j] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; -#endif - - } -} - -void -linpack::gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda) { - auto b_tmp = new HOST_DATA_TYPE[n]; - { - for (int k = 0; k < n; k++) { - b_tmp[k] = b[k]; - } - - // solve l*y = b - // For each row in matrix - for (int k = 0; k < n - 1; k++) { - if (ipvt[k] != k) { - HOST_DATA_TYPE tmp = b_tmp[k]; - b_tmp[k] = b_tmp[ipvt[k]]; - b_tmp[ipvt[k]] = tmp; - } - // For each row below add - for (int i = k + 1; i < n; i++) { - // add solved upper row to current row - b_tmp[i] += b_tmp[k] * a[lda * k + i]; - } - } - - // now solve u*x = y - for (int k = n - 1; k >= 0; k--) { - b_tmp[k] = b_tmp[k] / a[lda * k + k]; - for (int i = 0; i < k; i++) { - b_tmp[i] -= b_tmp[k] * a[lda * k + i]; - } - } - for (int k = 0; k < n; k++) { - b[k] = b_tmp[k]; - } - } - delete [] b_tmp; -} - -void linpack::dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m, bool transposed) { - for (int i=0; i < n1; i++) { - for (int j=0; j < n2; j++) { - y[i] = y[i] + x[j] * (transposed ? m[ldm*i + j] :m[ldm*j + i]); - } - } -} - -void -linpack::gefa_ref_nopvt(HOST_DATA_TYPE* a, unsigned n, unsigned lda) { - // For each diagnonal element - for (int k = 0; k < n; k++) { - // Store negatie invers of diagonal elements to get rid of some divisions afterwards! 
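// In effect the diagonal now holds -1/a[k][k]: the column scaling on the next
// line becomes a multiplication and the trailing update needs no division at
// all. E.g. for the 2x2 block [[2, 4], [1, 3]] the diagonal 2 is replaced by
// -0.5, the subdiagonal becomes 1 * -0.5 = -0.5, and the trailing element
// updates to 3 + (-0.5) * 4 = 1, which is exactly U(2,2) of the LU factors.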
- a[k * lda + k] = -1.0 / a[k * lda + k]; - // For each element below it - for (int i = k + 1; i < n; i++) { - a[k * lda + i] *= a[k * lda + k]; - } - // For each column right of current diagonal element - for (int j = k + 1; j < n; j++) { - // For each element below it - for (int i = k+1; i < n; i++) { - a[j * lda + i] += a[k * lda + i] * a[j * lda + k]; - } - } - -#ifdef DEBUG - std::cout << "A(k=" << k << "): " << std::endl; - for (int i= 0; i < n; i++) { - for (int j=0; j < n; j++) { - std::cout << a[i*lda + j] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; -#endif - - } -} - - -void -linpack::gesl_ref_nopvt(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, unsigned n, unsigned lda) { - auto b_tmp = new HOST_DATA_TYPE[n]; - - for (int k = 0; k < n; k++) { - b_tmp[k] = b[k]; - } - - // solve l*y = b - // For each row in matrix - for (int k = 0; k < n - 1; k++) { - // For each row below add - for (int i = k + 1; i < n; i++) { - // add solved upper row to current row - b_tmp[i] += b_tmp[k] * a[lda * k + i]; - } - } - - // now solve u*x = y - for (int k = n - 1; k >= 0; k--) { - HOST_DATA_TYPE scale = b_tmp[k] * a[lda * k + k]; - b_tmp[k] = -scale; - for (int i = 0; i < k; i++) { - b_tmp[i] += scale * a[lda * k + i]; - } - } - for (int k = 0; k < n; k++) { - b[k] = b_tmp[k]; - } - delete [] b_tmp; -} diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index c05b323a..b79fa65a 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -26,10 +26,13 @@ SOFTWARE. /* C++ standard library headers */ #include #include +#include /* Project's headers */ #include "hpcc_benchmark.hpp" +#include "execution_types/execution_types.hpp" #include "parameters.h" +#include "linpack_data.hpp" extern "C" { #include "gmres.h" } @@ -40,177 +43,12 @@ extern "C" { */ namespace linpack { -/** - * @brief The Linpack specific program settings - * - */ -class LinpackProgramSettings : public hpcc_base::BaseSettings { - -public: - /** - * @brief The size of the local matrix in number of blocks in one dimension - * - */ - uint matrixSize; - - /** - * @brief Size of a single block of the matrix in values in one dimension - * - */ - uint blockSize; - - /** - * @brief Indicates if the generated input matrix should be diagonally dominant - * - */ - bool isDiagonallyDominant; - - /** - * @brief True, if the used kernel is an emulation kernel. Different kernel arguments may be used in this case to - * simulate persistent local memory. - * - */ - bool isEmulationKernel; - - /** - * @brief The row position of this MPI rank in the torus - * - */ - int torus_row; - - /** - * @brief The rcolumn position of this MPI rank in the torus - * - */ - int torus_col; - - /** - * @brief Width of the torus in number of ranks - * - */ - int torus_width; - - /** - * @brief Height of the FPGA torus in number of ranks - * - */ - int torus_height; - - /** - * @brief Construct a new Linpack Program Settings object - * - * @param results the result map from parsing the program input parameters - */ - LinpackProgramSettings(cxxopts::ParseResult &results); - - /** - * @brief Get a map of the settings. This map will be used to print the final configuration. - * - * @return a map of program parameters. keys are the name of the parameter. 
- */ - std::map getSettingsMap() override; - -}; - -/** - * @brief Data class containing the data the kernel is exeucted with - * - */ -class LinpackData { - -public: - - /** - * @brief The input matrix representing the left side of the linear equation system - * - */ - HOST_DATA_TYPE *A; - - /** - * @brief The input vector the right side of the linear equation system - * - */ - HOST_DATA_TYPE *b; - - /** - * @brief A vector that can be used to store pivoting information - * - */ - cl_int* ipvt; - - /** - * @brief Width of the local matrix in values - * - */ - size_t matrix_width; - - /** - * @brief Height of the local matrix in values - * - */ - size_t matrix_height; - - /** - * @brief The context that is used to allocate memory in SVM mode - * - */ - cl::Context context; - - /** - * @brief The maximum value of A that will be used for the error calculation - * - */ - HOST_DATA_TYPE norma; - - /** - * @brief The maximum value of A that will be used for the error calculation - * - */ - HOST_DATA_TYPE normb; - - /** - * @brief Construct a new Linpack Data object - * - * @param context The OpenCL context used to allocate memory in SVM mode - * @param width width of the local matrix in values - * @param height height of the local matrix in values - */ - LinpackData(cl::Context context, size_t width, size_t height); - - /** - * @brief Destroy the Linpack Data object. Free the allocated memory - * - */ - ~LinpackData(); - -}; - -/** - * @brief Measured execution timing from the kernel execution - * - */ -class LinpackExecutionTimings { -public: - /** - * @brief A vector containing the timings for all repetitions for the kernel execution for the gefa kernel - * - */ - std::vector gefaTimings; - - /** - * @brief A vector containing the timings for all repetitions for the kernel execution for the gesl kernel - * - */ - std::vector geslTimings; - - -}; - /** * @brief Implementation of the Linpack benchmark * */ -class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark { +template +class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark { protected: @@ -220,7 +58,18 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) + ("b", "Log2 of the block size in number of values in one dimension", + cxxopts::value()->default_value(std::to_string(LOCAL_MEM_BLOCK_LOG))) + ("p", "Width of the FPGA grid. The heigth (Q) will be calculated from mpi_size / P.", + cxxopts::value()->default_value(std::to_string(DEFAULT_P_VALUE))) + ("uniform", "Generate a uniform matrix instead of a diagonally dominant. This has to be supported by the FPGA kernel!") + ("emulation", "Use kernel arguments for emulation. 
This may be necessary to simulate persistent local memory on the FPGA"); +} + /** * @brief Distributed solving of l*y=b and u*x = y @@ -228,7 +77,130 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmarkexecutionSettings->programSettings->matrixSize; + uint matrix_width = data.matrix_width; + uint matrix_height = data.matrix_height; + uint block_size = this->executionSettings->programSettings->blockSize; + // create a communicator to exchange the rows + MPI_Comm row_communicator; + MPI_Comm_split(MPI_COMM_WORLD, this->executionSettings->programSettings->torus_row, 0,&row_communicator); + MPI_Comm col_communicator; + MPI_Comm_split(MPI_COMM_WORLD, this->executionSettings->programSettings->torus_col, 0,&col_communicator); + std::vector b_tmp(matrix_width); + + for (int k = 0; k < b_tmp.size(); k++) { + b_tmp[k] = data.b[k]; + } + + // solve l*y = b + // For each row in matrix + for (int k = 0; k < global_matrix_size - 1; k++) { + size_t local_k_index_col = k / (block_size * this->executionSettings->programSettings->torus_width) * block_size; + size_t local_k_index_row = k / (block_size * this->executionSettings->programSettings->torus_height) * block_size; + size_t remaining_k_col = k % (block_size * this->executionSettings->programSettings->torus_width); + size_t remaining_k_row = k % (block_size * this->executionSettings->programSettings->torus_height); + size_t start_offset = local_k_index_col; + if (remaining_k_col / block_size > this->executionSettings->programSettings->torus_col){ + local_k_index_col += block_size; + start_offset = local_k_index_col; + } + else if (remaining_k_col / block_size == this->executionSettings->programSettings->torus_col) { + local_k_index_col += (remaining_k_col % block_size); + start_offset = local_k_index_col + 1; + } + if (remaining_k_row / block_size > this->executionSettings->programSettings->torus_row){ + local_k_index_row += block_size; + } + else if (remaining_k_row / block_size == this->executionSettings->programSettings->torus_row) { + local_k_index_row += (remaining_k_row % block_size); + } + + int row_diagonal_rank = (k / block_size) % this->executionSettings->programSettings->torus_height; + int col_diagonal_rank = (k / block_size) % this->executionSettings->programSettings->torus_width; + std::vector tmp_scaled_b(matrix_width, 0.0); + if (row_diagonal_rank == this->executionSettings->programSettings->torus_row) { + HOST_DATA_TYPE current_k; + current_k = (local_k_index_col < matrix_width) ? 
b_tmp[local_k_index_col] : 0.0;
+            MPI_Bcast(&current_k, 1, MPI_DATA_TYPE, col_diagonal_rank, row_communicator);
+            // For each row below add
+            for (int i = start_offset; i < matrix_width; i++) {
+                // add solved upper row to current row
+                tmp_scaled_b[i] = current_k * data.A[matrix_width * local_k_index_row + i];
+            }
+        }
+        MPI_Bcast(&tmp_scaled_b.data()[start_offset], matrix_width - start_offset, MPI_DATA_TYPE, row_diagonal_rank, col_communicator);
+        for (int i = start_offset; i < matrix_width; i++) {
+            // add solved upper row to current row
+            b_tmp[i] += tmp_scaled_b[i];
+        }
+    }
+
+    // now solve u*x = y
+    for (int k = global_matrix_size - 1; k >= 0; k--) {
+        size_t local_k_index_col = k / (block_size * this->executionSettings->programSettings->torus_width) * block_size;
+        size_t local_k_index_row = k / (block_size * this->executionSettings->programSettings->torus_height) * block_size;
+        size_t remaining_k_col = k % (block_size * this->executionSettings->programSettings->torus_width);
+        size_t remaining_k_row = k % (block_size * this->executionSettings->programSettings->torus_height);
+        if (remaining_k_col / block_size > this->executionSettings->programSettings->torus_col) {
+            local_k_index_col += block_size;
+        }
+        else if (remaining_k_col / block_size == this->executionSettings->programSettings->torus_col) {
+            local_k_index_col += remaining_k_col % block_size;
+        }
+        if (remaining_k_row / block_size > this->executionSettings->programSettings->torus_row) {
+            local_k_index_row += block_size;
+        }
+        else if (remaining_k_row / block_size == this->executionSettings->programSettings->torus_row) {
+            local_k_index_row += remaining_k_row % block_size;
+        }
+
+        HOST_DATA_TYPE scale_element = (local_k_index_col < matrix_width && local_k_index_row < matrix_height) ? b_tmp[local_k_index_col] * data.A[matrix_width * local_k_index_row + local_k_index_col] : 0.0;
+        int row_diagonal_rank = (k / block_size) % this->executionSettings->programSettings->torus_height;
+        int col_diagonal_rank = (k / block_size) % this->executionSettings->programSettings->torus_width;
+        MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, row_diagonal_rank, col_communicator);
+        if (col_diagonal_rank == this->executionSettings->programSettings->torus_col) {
+            b_tmp[local_k_index_col] = -scale_element;
+        }
+        MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, col_diagonal_rank, row_communicator);
+        size_t end_offset = local_k_index_col;
+
+        std::vector<HOST_DATA_TYPE> tmp_scaled_b(matrix_width, 0.0);
+        if (row_diagonal_rank == this->executionSettings->programSettings->torus_row) {
+            // For each row below add
+            for (int i = 0; i < end_offset; i++) {
+                tmp_scaled_b[i] = scale_element * data.A[matrix_width * local_k_index_row + i];
+            }
+        }
+        MPI_Bcast(tmp_scaled_b.data(), end_offset, MPI_DATA_TYPE, row_diagonal_rank, col_communicator);
+        for (int i = 0; i < end_offset; i++) {
+            // add solved upper row to current row
+            b_tmp[i] += tmp_scaled_b[i];
+        }
+    }
+    for (int k = 0; k < b_tmp.size(); k++) {
+        data.b[k] = b_tmp[k];
+    }
+
+#ifndef NDEBUG
+    MPI_Barrier(MPI_COMM_WORLD);
+    for (int rank = 0; rank < this->mpi_comm_size; rank++) {
+        if (rank == this->mpi_comm_rank) {
+            double sum = 0;
+            double max = 0;
+            for (int k = 0; k < matrix_width; k++) {
+                sum += std::abs(data.b[k]);
+                if (std::abs(data.b[k] - 1) > 0.1 || std::isnan(data.b[k])) {
+                    std::cout << "Rank " << this->mpi_comm_rank << " Pos: " << k << " Value: " << std::abs(data.b[k]) << std::endl;
+                }
+            }
+            std::cout << "Rank " << this->mpi_comm_rank << " Dist.Sum: " << sum << " Max: " << max << std::endl;
+        }
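+        // The barrier below acts as a turnstile: rank r waits until ranks
+        // 0..r-1 have taken their turn, so each rank prints its debug summary
+        // in rank order.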
MPI_Barrier(MPI_COMM_WORLD); + } +#endif +} + public: @@ -238,7 +210,93 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark The input and output data of the benchmark */ std::unique_ptr - generateInputData() override; + generateInputData() override { + int local_matrix_width = this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->torus_width; + int local_matrix_height = this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->torus_height; + + if ((this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_width > 0 || + (this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_height > 0) { + throw std::runtime_error("Global matrix size must be multiple of LCM of PQ grid!"); + } + + auto d = std::unique_ptr(new linpack::LinpackData(*this->executionSettings->context ,local_matrix_width, local_matrix_height)); + std::mt19937 gen(this->mpi_comm_rank); + std::uniform_real_distribution<> dis(0.0, 1.0); + d->norma = 0.0; + d->normb = 0.0; + + + /* + Generate a matrix by using pseudo random number in the range (0,1) + */ + for (int j = 0; j < local_matrix_height; j++) { + // fill a single column of the matrix + for (int i = 0; i < local_matrix_width; i++) { + HOST_DATA_TYPE temp = dis(gen); + d->A[local_matrix_width*j+i] = temp; + d->norma = (temp > d->norma) ? temp : d->norma; + } + } + + + // If the matrix should be diagonally dominant, we need to exchange the sum of the rows with + // the ranks that share blocks in the same column + if (this->executionSettings->programSettings->isDiagonallyDominant) { + // create a communicator to exchange the rows + MPI_Comm row_communicator; + MPI_Comm_split(MPI_COMM_WORLD, this->executionSettings->programSettings->torus_row, 0,&row_communicator); + + // Caclulate the sum for every row and insert in into the matrix + for (int local_matrix_row = 0; local_matrix_row < local_matrix_height; local_matrix_row++) { + int blockSize = this->executionSettings->programSettings->blockSize; + int global_matrix_row = this->executionSettings->programSettings->torus_row * blockSize + (local_matrix_row / blockSize) * blockSize * this->executionSettings->programSettings->torus_height + (local_matrix_row % blockSize); + int local_matrix_col = (global_matrix_row - this->executionSettings->programSettings->torus_col * blockSize) / (blockSize * this->executionSettings->programSettings->torus_width) * blockSize + (global_matrix_row % blockSize); + int diagonal_rank = (global_matrix_row / blockSize) % this->executionSettings->programSettings->torus_width; + bool diagonal_on_this_rank = diagonal_rank == this->executionSettings->programSettings->torus_col; + // set the diagonal elements of the matrix to 0 + if (diagonal_on_this_rank) { + d->A[local_matrix_width*local_matrix_row + local_matrix_col] = 0.0; + } + HOST_DATA_TYPE local_row_sum = 0.0; + for (int i = 0; i < local_matrix_width; i++) { + local_row_sum += d->A[local_matrix_width*local_matrix_row + i]; + } + HOST_DATA_TYPE row_sum = 0.0; + MPI_Reduce(&local_row_sum, &row_sum, 1, MPI_DATA_TYPE, MPI_SUM, diagonal_rank, row_communicator); + // insert row sum into matrix if it contains the diagonal block + if (diagonal_on_this_rank) { + // update norm of local matrix + d->norma = (row_sum > d->norma) ? 
row_sum : d->norma;
+                    d->A[local_matrix_width*local_matrix_row + local_matrix_col] = row_sum;
+                }
+            }
+        }
+
+        // initialize other vectors
+        for (int i = 0; i < local_matrix_width; i++) {
+            d->b[i] = 0.0;
+        }
+        for (int i = 0; i < local_matrix_height; i++) {
+            d->ipvt[i] = i;
+        }
+
+        MPI_Comm col_communicator;
+        MPI_Comm_split(MPI_COMM_WORLD, this->executionSettings->programSettings->torus_col, 0, &col_communicator);
+
+        // Generate vector b by accumulating the columns of the matrix.
+        // This will lead to a result vector x with ones on every position
+        // Every rank will have a valid part of the final b vector stored
+        for (int j = 0; j < local_matrix_width; j++) {
+            HOST_DATA_TYPE local_col_sum = 0.0;
+            for (int i = 0; i < local_matrix_height; i++) {
+                local_col_sum += d->A[local_matrix_width*i+j];
+            }
+            MPI_Allreduce(&local_col_sum, &(d->b[j]), 1, MPI_DATA_TYPE, MPI_SUM, col_communicator);
+            d->normb = (d->b[j] > d->normb) ? d->b[j] : d->normb;
+        }
+        return d;
+    }
+
     /**
      * @brief Linpack specific implementation of the kernel execution
@@ -247,7 +305,19 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark
      * @return Measured runtimes of the kernel execution
      */
     std::unique_ptr<LinpackExecutionTimings>
-    executeKernel(LinpackData &data) override;
+    executeKernel(LinpackData &data) override {
+        std::unique_ptr<LinpackExecutionTimings> timings;
+        switch (this->executionSettings->programSettings->communicationType) {
+            case hpcc_base::CommunicationType::pcie_mpi : timings = execution::pcie::calculate(*this->executionSettings, data); break;
+            case hpcc_base::CommunicationType::intel_external_channels: timings = execution::iec::calculate(*this->executionSettings, data); break;
+            default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(this->executionSettings->programSettings->communicationType));
+        }
+#ifdef DISTRIBUTED_VALIDATION
+        distributed_gesl_nopvt_ref(data);
+#endif
+        return timings;
+    }
+
     /**
      * @brief Linpack specific implementation of the execution validation
@@ -257,7 +327,144 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark
      * @return true If the validation is successful
      * @return false otherwise
      */
     bool
-    validateOutput(linpack::LinpackData &data) override;
+    validateOutput(linpack::LinpackData &data) override {
+    uint n = this->executionSettings->programSettings->matrixSize;
+    uint matrix_width = data.matrix_width;
+    uint matrix_height = data.matrix_height;
+    double residn;
+    double resid = 0.0;
+    double normx = 0.0;
+#ifndef DISTRIBUTED_VALIDATION
+    if (this->mpi_comm_rank > 0) {
+        for (int j = 0; j < matrix_height; j++) {
+            for (int i = 0; i < matrix_width; i += this->executionSettings->programSettings->blockSize) {
+                MPI_Send(&data.A[matrix_width * j + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
+            }
+        }
+        if (this->executionSettings->programSettings->torus_row == 0) {
+            for (int i = 0; i < matrix_width; i += this->executionSettings->programSettings->blockSize) {
+                MPI_Send(&data.b[i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
+            }
+        }
+        residn = 0;
+    }
+    else {
+        MPI_Status status;
+        size_t current_offset = 0;
+        std::vector<HOST_DATA_TYPE> total_b_original(n);
+        std::vector<HOST_DATA_TYPE> total_b(n);
+        std::vector<HOST_DATA_TYPE> total_a(n*n);
+        for (int j = 0; j < n; j++) {
+            for (int i = 0; i < n; i += this->executionSettings->programSettings->blockSize) {
+                int recvcol = (i / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_width;
+                int recvrow = (j / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_height;
+                int recvrank = this->executionSettings->programSettings->torus_width * recvrow + recvcol;
+                if (recvrank > 0) {
+                    MPI_Recv(&total_a[j * n + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 0, MPI_COMM_WORLD, &status);
+                }
+                else {
+                    for (int k = 0; k < this->executionSettings->programSettings->blockSize; k++) {
+                        total_a[j * n + i + k] = data.A[current_offset + k];
+                    }
+                    current_offset += this->executionSettings->programSettings->blockSize;
+                }
+            }
+        }
+        current_offset = 0;
+        for (int i = 0; i < n; i += this->executionSettings->programSettings->blockSize) {
+            int recvcol = (i / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_width;
+            if (recvcol > 0) {
+                MPI_Recv(&total_b[i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvcol, 0, MPI_COMM_WORLD, &status);
+            }
+            else {
+                for (int k = 0; k < this->executionSettings->programSettings->blockSize; k++) {
+                    total_b[i + k] = data.b[current_offset + k];
+                }
+                current_offset += this->executionSettings->programSettings->blockSize;
+            }
+        }
+
+        std::copy(total_b.begin(), total_b.end(), total_b_original.begin());
+        gesl_ref_nopvt(total_a.data(), total_b.data(), n, n);
+
+        for (int i = 0; i < n; i++) {
+            resid = (resid > std::abs(total_b[i] - 1)) ? resid : std::abs(total_b[i] - 1);
+            normx = (normx > std::abs(total_b_original[i])) ? normx : std::abs(total_b_original[i]);
+        }
+    }
+#else
+    double local_resid = 0;
+    double local_normx = data.normb;
+    #pragma omp parallel for reduction(max:local_resid)
+    for (int i = 0; i < data.matrix_width; i++) {
+        local_resid = (local_resid > std::abs(data.b[i] - 1)) ? local_resid : std::abs(data.b[i] - 1);
+    }
+#ifndef NDEBUG
+    std::cout << "Rank " << this->mpi_comm_rank << ": resid=" << local_resid << ", normx=" << local_normx << std::endl;
+#endif
+
+    MPI_Reduce(&local_resid, &resid, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&local_normx, &normx, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+#endif
+
+    HOST_DATA_TYPE eps = std::numeric_limits<HOST_DATA_TYPE>::epsilon();
+    residn = resid / (static_cast<double>(n)*normx*eps);
+
+    #ifndef NDEBUG
+    if (residn > 1 && this->mpi_comm_size == 1) {
+        auto ref_result = generateInputData();
+        // Print the reference input matrix
+        for (int j = 0; j < n; j++) {
+            for (int i = 0; i < n; i++) {
+                std::cout << ref_result->A[n * j + i] << ", ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+        // Print the matrix returned by the FPGA
+        for (int j = 0; j < n; j++) {
+            for (int i = 0; i < n; i++) {
+                std::cout << data.A[n * j + i] << ", ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+        if (this->executionSettings->programSettings->isDiagonallyDominant) {
+            linpack::gefa_ref_nopvt(ref_result->A, n, n);
+            linpack::gesl_ref_nopvt(ref_result->A, ref_result->b, n, n);
+        }
+        else {
+            linpack::gefa_ref(ref_result->A, n, n, ref_result->ipvt);
+            linpack::gesl_ref(ref_result->A, ref_result->b, ref_result->ipvt, n, n);
+        }
+        // Print the element-wise difference to the reference factorization
+        for (int j = 0; j < n; j++) {
+            for (int i = 0; i < n; i++) {
+                std::cout << std::abs(ref_result->A[n * j + i] - data.A[n * j + i]) << ", ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+    }
+    #endif
+
+    if (this->mpi_comm_rank == 0) {
+        //std::cout << resid << ", " << norma << ", " << normx << std::endl;
+        std::cout << " norm.
resid resid "\ + "machep " << std::endl; + std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE) + << resid << std::setw(ENTRY_SPACE) << eps << std::endl; + return residn < 1; + } + else { + return true; + } +} + /** * @brief Linpack specific implementation of printing the execution results @@ -265,7 +472,75 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark::max(); + double lu_min = std::numeric_limits::max(); + double sl_min = std::numeric_limits::max(); + +#ifndef NDEBUG + std::cout << "Rank " << this->mpi_comm_rank << ": Result collection started" << std::endl; +#endif + + std::vector global_lu_times(output.gefaTimings.size()); + MPI_Reduce(output.gefaTimings.data(), global_lu_times.data(), output.gefaTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + std::vector global_sl_times(output.geslTimings.size()); + MPI_Reduce(output.geslTimings.data(), global_sl_times.data(), output.geslTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); +#ifndef NDEBUG + std::cout << "Rank " << this->mpi_comm_rank << ": Result collection done" << std::endl; +#endif + + + if (this->mpi_comm_rank > 0) { + // Only the master rank needs to calculate and print result + return; + } + + double total_matrix_size = static_cast(this->executionSettings->programSettings->matrixSize); + double gflops_lu = ((2.0e0*total_matrix_size * total_matrix_size * total_matrix_size)/ 3.0) / 1.0e9; + double gflops_sl = (2.0*(total_matrix_size * total_matrix_size))/1.0e9; + for (int i =0; i < global_lu_times.size(); i++) { + double currentTime = global_lu_times[i] + global_sl_times[i]; + tmean += currentTime; + tlumean += global_lu_times[i]; + tslmean += global_sl_times[i]; + if (currentTime < tmin) { + tmin = currentTime; + } + if (global_lu_times[i] < lu_min) { + lu_min = global_lu_times[i]; + } + if (global_sl_times[i] < sl_min) { + sl_min = global_sl_times[i]; + } + } + tmean = tmean / global_lu_times.size(); + tlumean = tlumean / global_lu_times.size(); + tslmean = tslmean / global_sl_times.size(); + + std::cout << std::setw(ENTRY_SPACE) + << "Method" << std::setw(ENTRY_SPACE) + << "best" << std::setw(ENTRY_SPACE) << "mean" + << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; + + std::cout << std::setw(ENTRY_SPACE) << "total" << std::setw(ENTRY_SPACE) + << tmin << std::setw(ENTRY_SPACE) << tmean + << std::setw(ENTRY_SPACE) << ((gflops_lu + gflops_sl) / tmin) + << std::endl; + + std::cout << std::setw(ENTRY_SPACE) << "GEFA" << std::setw(ENTRY_SPACE) + << lu_min << std::setw(ENTRY_SPACE) << tlumean + << std::setw(ENTRY_SPACE) << ((gflops_lu) / lu_min) + << std::endl; + + std::cout << std::setw(ENTRY_SPACE) << "GESL" << std::setw(ENTRY_SPACE) + << sl_min << std::setw(ENTRY_SPACE) << tslmean + << std::setw(ENTRY_SPACE) << (gflops_sl / sl_min) + << std::endl; +} /** * @brief Construct a new Linpack Benchmark object @@ -273,7 +548,9 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark(argc, argv) { + this->setupBenchmark(argc, argv); + } /** * @brief Construct a new Linpack Benchmark object @@ -282,69 +559,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark=n -@param ipvt array of pivoting indices - -*/ -void gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt); - -/** -Solve linear equations using its LU decomposition. 
-Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU -where A is a matrix of size n*n - -@param a the matrix a in LU representation calculated by gefa call -@param b vector b of the given equation -@param ipvt vector containing pivoting information -@param n size of matrix A -@param lda row with of the matrix. must be >=n - -*/ -void gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda); - -/** -Gaussian elemination reference implementation without pivoting. -Can be used in exchange with kernel functions for functionality testing - -@param a the matrix with size of n*n -@param n size of matrix A -@param lda row with of the matrix. must be >=n - -*/ -void gefa_ref_nopvt(HOST_DATA_TYPE* a, unsigned n, unsigned lda); - -/** -Solve linear equations using its LU decomposition without pivoting. -Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU -where A is a matrix of size n*n - -@param a the matrix a in LU representation calculated by gefa call -@param b vector b of the given equation -@param n size of matrix A -@param lda row with of the matrix. must be >=n - -*/ -void gesl_ref_nopvt(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, unsigned n, unsigned lda); - -} // namespace stream +} // namespace linpack #endif // SRC_HOST_STREAM_BENCHMARK_H_ diff --git a/LINPACK/src/host/linpack_data.cpp b/LINPACK/src/host/linpack_data.cpp new file mode 100644 index 00000000..951c37c2 --- /dev/null +++ b/LINPACK/src/host/linpack_data.cpp @@ -0,0 +1,259 @@ +// +// Created by Marius Meyer on 04.12.19. +// + +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/
+
+#include "linpack_benchmark.hpp"
+
+/* C++ standard library headers */
+#include
+#include
+
+/* Project's headers */
+#include "communication_types.hpp"
+#include "execution_types/execution_types.hpp"
+#include "parameters.h"
+
+linpack::LinpackProgramSettings::LinpackProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results),
+    matrixSize(results["m"].as<uint>() * (1 << (results["b"].as<uint>()))), blockSize(1 << (results["b"].as<uint>())),
+    isEmulationKernel(results.count("emulation") > 0), isDiagonallyDominant(results.count("uniform") == 0),
+    torus_width(results["p"].as<uint>()) {
+    int mpi_comm_rank;
+    int mpi_comm_size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size);
+    // calculate the row and column of the MPI rank in the torus
+    if (mpi_comm_size % torus_width != 0) {
+        throw std::runtime_error("MPI size not dividable by P=" + std::to_string(torus_width) + "!");
+    }
+    torus_height = mpi_comm_size / torus_width;
+    torus_row = (mpi_comm_rank / torus_width);
+    torus_col = (mpi_comm_rank % torus_width);
+}
+
+std::map<std::string, std::string>
+linpack::LinpackProgramSettings::getSettingsMap() {
+    auto map = hpcc_base::BaseSettings::getSettingsMap();
+    map["Matrix Size"] = std::to_string(matrixSize);
+    map["Block Size"] = std::to_string(blockSize);
+    map["Emulate"] = (isEmulationKernel) ? "Yes" : "No";
+    map["Data Type"] = STR(HOST_DATA_TYPE);
+    map["FPGA Torus"] = "P=" + std::to_string(torus_width) + ", Q=" + std::to_string(torus_height);
+    return map;
+}
+
+linpack::LinpackData::LinpackData(cl::Context context, size_t width, size_t height) : norma(0.0), context(context),
+    matrix_width(width), matrix_height(height) {
+#ifdef USE_SVM
+    A = reinterpret_cast<HOST_DATA_TYPE*>(
+                        clSVMAlloc(context(), 0 ,
+                        width * height * sizeof(HOST_DATA_TYPE), 1024));
+    b = reinterpret_cast<HOST_DATA_TYPE*>(
+                        clSVMAlloc(context(), 0 ,
+                        width * sizeof(HOST_DATA_TYPE), 1024));
+    ipvt = reinterpret_cast<cl_int*>(
+                        clSVMAlloc(context(), 0 ,
+                        height * sizeof(cl_int), 1024));
+#else
+    posix_memalign(reinterpret_cast<void**>(&A), 4096, width * height * sizeof(HOST_DATA_TYPE));
+    posix_memalign(reinterpret_cast<void**>(&b), 4096, width * sizeof(HOST_DATA_TYPE));
+    posix_memalign(reinterpret_cast<void**>(&ipvt), 4096, height * sizeof(cl_int));
+#endif
+}
+
+linpack::LinpackData::~LinpackData() {
+#ifdef USE_SVM
+    clSVMFree(context(), reinterpret_cast<void*>(A));
+    clSVMFree(context(), reinterpret_cast<void*>(b));
+    clSVMFree(context(), reinterpret_cast<void*>(ipvt));
+#else
+    free(A);
+    free(b);
+    free(ipvt);
+#endif
+}
+
+/**
+Standard LU factorization on a block with fixed size
+
+Case 1 of Zhang's description
+*/
+void
+linpack::gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt) {
+    for (int i = 0; i < n; i++) {
+        ipvt[i] = i;
+    }
+    // For each diagonal element
+    for (int k = 0; k < n - 1; k++) {
+        HOST_DATA_TYPE max_val = fabs(a[k * lda + k]);
+        int pvt_index = k;
+        for (int i = k + 1; i < n; i++) {
+            if (max_val < fabs(a[k * lda + i])) {
+                pvt_index = i;
+                max_val = fabs(a[k * lda + i]);
+            }
+        }
+
+        for (int i = k; i < n; i++) {
+            HOST_DATA_TYPE tmp_val = a[i * lda + k];
+            a[i * lda + k] = a[i * lda + pvt_index];
+            a[i * lda + pvt_index] = tmp_val;
+        }
+        ipvt[k] = pvt_index;
+
+        // For each element below it
+        for (int i = k + 1; i < n; i++) {
+            a[k * lda + i] *= -1.0 / a[k * lda + k];
+        }
+        // For each column right of current diagonal element
+        for (int j = k + 1; j < n; j++) {
+            // For each element below it
+            for (int i = k+1; i < n; i++) {
+                a[j * lda + i] += a[k * lda + i] * a[j * lda + k];
+            }
+        }
+
+#ifdef DEBUG
+        std::cout << "A(k=" << k << "): " << std::endl;
+        for (int i= 0; i < n; i++) {
+            for (int j=0; j < n; j++) {
+                std::cout << a[i*lda + j] << ", ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+#endif
+
+    }
+}
+
+void
+linpack::gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda) {
+    auto b_tmp = new HOST_DATA_TYPE[n];
+    {
+        for (int k = 0; k < n; k++) {
+            b_tmp[k] = b[k];
+        }
+
+        // solve l*y = b
+        // For each row in matrix
+        for (int k = 0; k < n - 1; k++) {
+            if (ipvt[k] != k) {
+                HOST_DATA_TYPE tmp = b_tmp[k];
+                b_tmp[k] = b_tmp[ipvt[k]];
+                b_tmp[ipvt[k]] = tmp;
+            }
+            // For each row below add
+            for (int i = k + 1; i < n; i++) {
+                // add solved upper row to current row
+                b_tmp[i] += b_tmp[k] * a[lda * k + i];
+            }
+        }
+
+        // now solve u*x = y
+        for (int k = n - 1; k >= 0; k--) {
+            b_tmp[k] = b_tmp[k] / a[lda * k + k];
+            for (int i = 0; i < k; i++) {
+                b_tmp[i] -= b_tmp[k] * a[lda * k + i];
+            }
+        }
+        for (int k = 0; k < n; k++) {
+            b[k] = b_tmp[k];
+        }
+    }
+    delete [] b_tmp;
+}
+
+void linpack::dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m, bool transposed) {
+    for (int i=0; i < n1; i++) {
+        for (int j=0; j < n2; j++) {
+            y[i] = y[i] + x[j] * (transposed ? m[ldm*i + j] : m[ldm*j + i]);
+        }
+    }
+}
+
+void
+linpack::gefa_ref_nopvt(HOST_DATA_TYPE* a, unsigned n, unsigned lda) {
+    // For each diagonal element
+    for (int k = 0; k < n; k++) {
+        // Store negative inverse of diagonal elements to get rid of some divisions afterwards!
+        a[k * lda + k] = -1.0 / a[k * lda + k];
+        // For each element below it
+        for (int i = k + 1; i < n; i++) {
+            a[k * lda + i] *= a[k * lda + k];
+        }
+        // For each column right of current diagonal element
+        for (int j = k + 1; j < n; j++) {
+            // For each element below it
+            for (int i = k+1; i < n; i++) {
+                a[j * lda + i] += a[k * lda + i] * a[j * lda + k];
+            }
+        }
+
+#ifdef DEBUG
+        std::cout << "A(k=" << k << "): " << std::endl;
+        for (int i= 0; i < n; i++) {
+            for (int j=0; j < n; j++) {
+                std::cout << a[i*lda + j] << ", ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+#endif
+
+    }
+}
+
+
+void
+linpack::gesl_ref_nopvt(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, unsigned n, unsigned lda) {
+    auto b_tmp = new HOST_DATA_TYPE[n];
+
+    for (int k = 0; k < n; k++) {
+        b_tmp[k] = b[k];
+    }
+
+    // solve l*y = b
+    // For each row in matrix
+    for (int k = 0; k < n - 1; k++) {
+        // For each row below add
+        for (int i = k + 1; i < n; i++) {
+            // add solved upper row to current row
+            b_tmp[i] += b_tmp[k] * a[lda * k + i];
+        }
+    }
+
+    // now solve u*x = y
+    for (int k = n - 1; k >= 0; k--) {
+        HOST_DATA_TYPE scale = b_tmp[k] * a[lda * k + k];
+        b_tmp[k] = -scale;
+        for (int i = 0; i < k; i++) {
+            b_tmp[i] += scale * a[lda * k + i];
+        }
+    }
+    for (int k = 0; k < n; k++) {
+        b[k] = b_tmp[k];
+    }
+    delete [] b_tmp;
+}
diff --git a/LINPACK/src/host/linpack_data.hpp b/LINPACK/src/host/linpack_data.hpp
new file mode 100644
index 00000000..51324a5c
--- /dev/null
+++ b/LINPACK/src/host/linpack_data.hpp
@@ -0,0 +1,274 @@
+/*
+Copyright (c) 2022 Marius Meyer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above
copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef SRC_HOST_LINPACK_DATA_H_ +#define SRC_HOST_LINPACK_DATA_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "hpcc_benchmark.hpp" +#include "parameters.h" +extern "C" { + #include "gmres.h" +} + +/** + * @brief Contains all classes and methods needed by the LINPACK benchmark + * + */ +namespace linpack { + +/** + * @brief The Linpack specific program settings + * + */ +class LinpackProgramSettings : public hpcc_base::BaseSettings { + +public: + /** + * @brief The size of the local matrix in number of blocks in one dimension + * + */ + uint matrixSize; + + /** + * @brief Size of a single block of the matrix in values in one dimension + * + */ + uint blockSize; + + /** + * @brief Indicates if the generated input matrix should be diagonally dominant + * + */ + bool isDiagonallyDominant; + + /** + * @brief True, if the used kernel is an emulation kernel. Different kernel arguments may be used in this case to + * simulate persistent local memory. + * + */ + bool isEmulationKernel; + + /** + * @brief The row position of this MPI rank in the torus + * + */ + int torus_row; + + /** + * @brief The rcolumn position of this MPI rank in the torus + * + */ + int torus_col; + + /** + * @brief Width of the torus in number of ranks + * + */ + int torus_width; + + /** + * @brief Height of the FPGA torus in number of ranks + * + */ + int torus_height; + + /** + * @brief Construct a new Linpack Program Settings object + * + * @param results the result map from parsing the program input parameters + */ + LinpackProgramSettings(cxxopts::ParseResult &results); + + /** + * @brief Get a map of the settings. This map will be used to print the final configuration. + * + * @return a map of program parameters. keys are the name of the parameter. 
+     */
+    std::map<std::string, std::string> getSettingsMap() override;
+
+};
+
+/**
+ * @brief Data class containing the data the kernel is executed with
+ *
+ */
+class LinpackData {
+
+public:
+
+    /**
+     * @brief The input matrix representing the left side of the linear equation system
+     *
+     */
+    HOST_DATA_TYPE *A;
+
+    /**
+     * @brief The input vector the right side of the linear equation system
+     *
+     */
+    HOST_DATA_TYPE *b;
+
+    /**
+     * @brief A vector that can be used to store pivoting information
+     *
+     */
+    cl_int* ipvt;
+
+    /**
+     * @brief Width of the local matrix in values
+     *
+     */
+    size_t matrix_width;
+
+    /**
+     * @brief Height of the local matrix in values
+     *
+     */
+    size_t matrix_height;
+
+    /**
+     * @brief The context that is used to allocate memory in SVM mode
+     *
+     */
+    cl::Context context;
+
+    /**
+     * @brief The maximum value of A that will be used for the error calculation
+     *
+     */
+    HOST_DATA_TYPE norma;
+
+    /**
+     * @brief The maximum value of b that will be used for the error calculation
+     *
+     */
+    HOST_DATA_TYPE normb;
+
+    /**
+     * @brief Construct a new Linpack Data object
+     *
+     * @param context The OpenCL context used to allocate memory in SVM mode
+     * @param width width of the local matrix in values
+     * @param height height of the local matrix in values
+     */
+    LinpackData(cl::Context context, size_t width, size_t height);
+
+    /**
+     * @brief Destroy the Linpack Data object. Free the allocated memory
+     *
+     */
+    ~LinpackData();
+
+};
+
+/**
+ * @brief Measured execution timing from the kernel execution
+ *
+ */
+class LinpackExecutionTimings {
+public:
+    /**
+     * @brief A vector containing the timings for all repetitions for the kernel execution for the gefa kernel
+     *
+     */
+    std::vector<double> gefaTimings;
+
+    /**
+     * @brief A vector containing the timings for all repetitions for the kernel execution for the gesl kernel
+     *
+     */
+    std::vector<double> geslTimings;
+
+
+};
+
+/**
+ *
+ *
+ * @param n1
+ * @param y
+ * @param n2
+ * @param ldm
+ * @param x
+ * @param m
+ */
+void dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m, bool transposed);
+
+/**
+Gaussian elimination reference implementation with partial pivoting.
+Can be used in exchange with kernel functions for functionality testing
+
+@param a the matrix with size of n*n
+@param n size of matrix A
+@param lda row width of the matrix. must be >=n
+@param ipvt array of pivoting indices
+
+*/
+void gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt);
+
+/**
+Solve linear equations using its LU decomposition.
+Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU
+where A is a matrix of size n*n
+
+@param a the matrix a in LU representation calculated by gefa call
+@param b vector b of the given equation
+@param ipvt vector containing pivoting information
+@param n size of matrix A
+@param lda row width of the matrix. must be >=n
+
+*/
+void gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda);
+
+/**
+Gaussian elimination reference implementation without pivoting.
+Can be used in exchange with kernel functions for functionality testing
+
+@param a the matrix with size of n*n
+@param n size of matrix A
+@param lda row width of the matrix. must be >=n
+
+*/
+void gefa_ref_nopvt(HOST_DATA_TYPE* a, unsigned n, unsigned lda);
+
+/**
+Solve linear equations using its LU decomposition without pivoting.
+Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU +where A is a matrix of size n*n + +@param a the matrix a in LU representation calculated by gefa call +@param b vector b of the given equation +@param n size of matrix A +@param lda row with of the matrix. must be >=n + +*/ +void gesl_ref_nopvt(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, unsigned n, unsigned lda); + + +} +#endif // SRC_HOST_LINPACK_DATA_H__ diff --git a/LINPACK/src/host/main.cpp b/LINPACK/src/host/main.cpp index d05a7319..73dcc570 100644 --- a/LINPACK/src/host/main.cpp +++ b/LINPACK/src/host/main.cpp @@ -12,7 +12,7 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark - LinpackBenchmark bm(argc, argv); + LinpackBenchmark bm(argc, argv); bool success = bm.executeBenchmark(); if (success) { return 0; From b8ecac71abd00e90bffb174d5fbf02a546b7a8c7 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 28 Apr 2022 19:02:16 +0100 Subject: [PATCH 060/318] First compilable version HPL ACCL. No communicators! --- .../execution_accl_buffers.hpp | 466 ++++++++++++++++++ .../host/execution_types/execution_iec.hpp | 3 +- .../host/execution_types/execution_pcie.hpp | 3 +- .../host/execution_types/execution_types.hpp | 8 +- LINPACK/src/host/linpack_benchmark.hpp | 5 + LINPACK/src/host/linpack_data.cpp | 3 +- LINPACK/src/host/main.cpp | 5 + 7 files changed, 486 insertions(+), 7 deletions(-) create mode 100644 LINPACK/src/host/execution_types/execution_accl_buffers.hpp diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp new file mode 100644 index 00000000..4a2a7907 --- /dev/null +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -0,0 +1,466 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef EXECUTION_TYPES_EXECUTION_ACCL_BUFFERS_HPP +#define EXECUTION_TYPES_EXECUTION_ACCL_BUFFERS_HPP + +/* C++ standard library headers */ +#include +#include +#include +#include +#include +#include + +/* External library headers */ +#ifdef _OPENMP +#include "omp.h" +#endif + +#include "linpack_data.hpp" +#include "parameters.h" + +namespace linpack { +namespace execution { +namespace accl_buffers { + +/* + Prepare kernels and execute benchmark + + @copydoc bm_execution::calculate() +*/ +std::unique_ptr calculate( + const hpcc_base::ExecutionSettings &config, + linpack::LinpackData &data) { + + cl_int err; + + int num_omp_threads = 1; +#ifdef _OPENMP + num_omp_threads = omp_get_num_threads(); +#endif + + uint blocks_per_row = data.matrix_width / config.programSettings->blockSize; + uint blocks_per_col = data.matrix_height / config.programSettings->blockSize; + + // TODO: Allow to handle Communicators in ACCL! + // // Communicate with all ranks in the same row of the torus + // // Configure ACCL Communicators + + // // Create Ranks. This must be the same configuration as used for + // // the global communicator! + // std::vector all_accl_ranks = {}; + // for (int i = 0; i < config.programSettings->torus_width * config.programSettings->torus_; ++i) { + // // TODO: Replace the ip addresses and ports here for execution of real hardware? + // ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, 1024}; + // all_accl_ranks.emplace_back(new_rank); + // } + + // std::vector row_ranks; + // std::vector col_ranks; + + // for (int i = 0; i < config.programSettings->torus_width; i++) { + // row_ranks.push_back(all_accl_ranks[i]); + // } + // for (int i = config.programSettings->torus_col; i < all_accl_ranks.size(); + // i += config.programSettings->torus_width) { + // col_ranks.push_back(all_accl_ranks[config.programSettings->torus_row * + // config.programSettings->torus_width + + // i]); + // } + + // // Row communicator should now be index 1 + // config.accl->configure_communicator(row_ranks, + // config.programSettings->torus_col); + // // Column communicator should now be index 2 + // config.accl->configure_communicator(col_ranks, + // config.programSettings->torus_row); + + // TODO: Select the correct memory groups! + // Create Buffers for input and output + // TODO: Need to set a memory group for the buffers here! 
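+  // The trailing 0 in the xrt::bo constructors below is the memory bank /
+  // group index that the TODO above refers to. A minimal sketch of deriving
+  // it from the kernel interface instead of hard-coding it (assuming the
+  // matrix buffer is argument 0 of the "lu" kernel in this bitstream):
+  //
+  //   auto lu = xrt::kernel(*config.device, *config.program, "lu");
+  //   xrt::bo Buffer_a(*config.device, data.A,
+  //                    sizeof(HOST_DATA_TYPE) * data.matrix_height * data.matrix_width,
+  //                    lu.group_id(0));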
+ xrt::bo Buffer_a( + *config.device, data.A, + sizeof(HOST_DATA_TYPE) * data.matrix_height * data.matrix_width, 0); + xrt::bo Buffer_b(*config.device, data.b, + sizeof(HOST_DATA_TYPE) * data.matrix_width, 0); + xrt::bo Buffer_pivot(*config.device, data.ipvt, + sizeof(cl_int) * data.matrix_height, 0); + + /* --- Setup MPI communication and required additional buffers --- */ + + // Buffers only used to store data received over the network layer + // The content will not be modified by the host + auto Buffer_lu1 = config.accl->create_buffer( + (config.programSettings->blockSize) * (config.programSettings->blockSize), + ACCL::dataType::float32, 1); + auto Buffer_lu2 = config.accl->create_buffer( + (config.programSettings->blockSize) * (config.programSettings->blockSize), + ACCL::dataType::float32, 1); + + std::vector>> Buffer_left_list; + std::vector>> Buffer_top_list; + + // Create two sets of communication buffers to allow overlap of communication + // and matrix multiplications + for (int rep = 0; rep < 2; rep++) { + Buffer_left_list.emplace_back(); + Buffer_top_list.emplace_back(); + for (int i = 0; i < blocks_per_row; i++) { + Buffer_top_list.back().push_back(config.accl->create_buffer( + config.programSettings->blockSize * + (config.programSettings->blockSize), + ACCL::dataType::float32, 1)); + } + + for (int i = 0; i < blocks_per_col; i++) { + Buffer_left_list.back().push_back(config.accl->create_buffer( + config.programSettings->blockSize * + (config.programSettings->blockSize), + ACCL::dataType::float32, 1)); + } + } + + /* --- Execute actual benchmark kernels --- */ + + double t; + std::vector gefaExecutionTimes; + std::vector geslExecutionTimes; + std::vector gefaWaitTimes; + for (int i = 0; i < config.programSettings->numRepetitions; i++) { + + Buffer_a.sync(XCL_BO_SYNC_BO_TO_DEVICE); + Buffer_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // Command queues + // A new command queue is created for every iteration of the algorithm to + // reduce the overhead of too large queues + std::vector inner_mms; + std::thread flush_thread; + + std::chrono::time_point t1, t2, twait1, + twait2; + std::chrono::duration currentwaittime = + std::chrono::duration::zero(); + + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << "Start! " << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + t1 = std::chrono::high_resolution_clock::now(); + + int kernel_offset = 0; +#pragma omp parallel + { + +#pragma omp single + uint current_replication = 0; + + // For every row of blocks create kernels and enqueue them + for (int block_row = 0; block_row < config.programSettings->matrixSize / + config.programSettings->blockSize; + block_row++) { + + int local_block_row_remainder = + (block_row % config.programSettings->torus_height); + int local_block_row = + (block_row / config.programSettings->torus_height); + int local_block_col_remainder = + (block_row % config.programSettings->torus_width); + int local_block_col = (block_row / config.programSettings->torus_width); + bool in_same_row_as_lu = + local_block_row_remainder == config.programSettings->torus_row; + bool in_same_col_as_lu = + local_block_col_remainder == config.programSettings->torus_col; + int start_row_index = + local_block_row + + ((local_block_row_remainder >= config.programSettings->torus_row) + ? 1 + : 0); + int start_col_index = + local_block_col + + ((local_block_col_remainder >= config.programSettings->torus_col) + ? 1 + : 0); + int num_left_blocks = + (in_same_col_as_lu) ? 
blocks_per_col - start_row_index : 0; + int num_top_blocks = + (in_same_row_as_lu) ? blocks_per_row - start_col_index : 0; + int num_inner_block_rows = (blocks_per_col - start_row_index); + int num_inner_block_cols = + (num_inner_block_rows > 0) ? (blocks_per_row - start_col_index) : 0; + num_inner_block_rows = + (num_inner_block_cols > 0) ? num_inner_block_rows : 0; + bool is_calulating_lu_block = (in_same_col_as_lu && in_same_row_as_lu); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col + << " Start iteration " << block_row << std::endl; +#endif + + uint total_inner_updates_first_row = num_inner_block_cols; + uint updates_per_replication = + total_inner_updates_first_row / + config.programSettings->kernelReplications; + uint total_inner_updates = + (num_inner_block_cols - 1) * (num_inner_block_rows - 1); + uint total_updates_per_replication = + total_inner_updates / config.programSettings->kernelReplications; + uint current_update = 0; + + std::vector comm_kernel_runs; + +#pragma omp single + { + + if (is_calulating_lu_block) { + // create the LU kernel + auto lu_kernel = xrt::kernel(*config.device, *config.program, "lu"); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " LU " + << local_block_row << "," << local_block_col << std::endl; +#endif + auto lu_run = + lu_kernel(Buffer_a, Buffer_lu1, Buffer_lu2, local_block_col, + local_block_row, blocks_per_row); + lu_run.wait(); + } + + // Exchange LU blocks on all ranks to prevent stalls in MPI broadcast + // All tasks until now need to be executed so we can use the result of + // the LU factorization and communicate it via MPI with the other + // FPGAs + + // Broadcast LU block in column to update all left blocks + config.accl->bcast(2, *Buffer_lu2, + config.programSettings->blockSize * + config.programSettings->blockSize, + local_block_row_remainder, true, true); + // Broadcast LU block in row to update all top blocks + config.accl->bcast(1, *Buffer_lu2, + config.programSettings->blockSize * + config.programSettings->blockSize, + local_block_col_remainder, true, true); + } + + if (num_top_blocks > 0) { + +// Create top kernels +#pragma omp for + for (int tops = start_col_index; tops < blocks_per_row; tops++) { + xrt::kernel k(*config.device, *config.program, "top_update"); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Top " + << local_block_row << "," << tops << std::endl; +#endif + + comm_kernel_runs.push_back( + k(Buffer_a, + Buffer_top_list[block_row % 2][tops - start_col_index], + Buffer_lu1, (tops == start_col_index), tops, local_block_row, + blocks_per_row)); + } + } + if (num_left_blocks > 0) { + +// Create left kernels +#pragma omp for + for (int tops = start_row_index; tops < blocks_per_col; tops++) { + xrt::kernel k(*config.device, *config.program, "left_update"); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Left " << tops + << "," << local_block_col << std::endl; +#endif + comm_kernel_runs.push_back( + k(Buffer_a, + Buffer_left_list[block_row % 2][tops - start_row_index], + Buffer_lu2, (tops == start_row_index), local_block_col, tops, + blocks_per_row)); + } + } + +#pragma omp single + { + // Wait until all top and left blocks are calculated + std::for_each(comm_kernel_runs.begin(), comm_kernel_runs.end(), + 
[](xrt::run &e) { e.wait(); }); + + // Send the left and top blocks to all other ranks so they can be used + // to update all inner blocks + for (int lbi = 0; + lbi < + std::max(static_cast(blocks_per_col - local_block_col), 0); + lbi++) { + config.accl->bcast(1, *Buffer_left_list[block_row % 2][lbi], + config.programSettings->blockSize * + config.programSettings->blockSize, + local_block_col_remainder, true, true); + } + for (int tbi = 0; + tbi < + std::max(static_cast(blocks_per_row - local_block_row), 0); + tbi++) { + config.accl->bcast(2, *Buffer_top_list[block_row % 2][tbi], + config.programSettings->blockSize * + config.programSettings->blockSize, + local_block_row_remainder, true, true); + } + + // update all remaining inner blocks using only global memory + } + + std::vector outer_mms; + + // Wait for previous inner MMs to complete. + // They may need to be reused by the next outer MM calls! + std::for_each(inner_mms.begin(), inner_mms.end(), + [](xrt::run &e) { e.wait(); }); + +#pragma omp for + for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { + + // select the matrix multiplication kernel that should be used for + // this block updated + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols); + int block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows + lbi); + + outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], + Buffer_top_list[block_row % 2][0], block_col, + block_row, blocks_per_row)); + } + +#pragma omp for + for (int tbi = 0; tbi < num_inner_block_cols; tbi++) { + + // select the matrix multiplication kernel that should be used for + // this block updated + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols + tbi); + int block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows); + + outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][0], + Buffer_top_list[block_row % 2][tbi], block_col, + block_row, blocks_per_row)); + } + + // Clear inner MM runs vector for this iteration + // All runs have completed before scheduling the outer MMs + inner_mms.clear(); + +#pragma omp for collapse(2) schedule(static) + for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { + for (int tbi = 1; tbi < num_inner_block_cols; tbi++) { + // select the matrix multiplication kernel that should be used for + // this block updated + + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols + tbi); + int block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows + lbi); + + inner_mms.push_back(k(Buffer_a, + Buffer_left_list[block_row % 2][lbi], + Buffer_top_list[block_row % 2][tbi], + block_col, block_row, blocks_per_row)); + } + } + +#ifndef NDEBUG + MPI_Barrier(MPI_COMM_WORLD); + if (is_calulating_lu_block) + std::cout << "---------------" << std::endl; +#endif + + // Wait for all outer MMs to complete because the results are required + // by the next communication phase + std::for_each(outer_mms.begin(), outer_mms.end(), + [](xrt::run &e) { e.wait(); }); + } + } + +#ifdef NDEBUG + t2 = 
        std::chrono::high_resolution_clock::now();
+    std::cout << "Torus " << config.programSettings->torus_row << ","
+              << config.programSettings->torus_col << "End! " << std::endl;
+#endif
+
+#ifndef NDEBUG
+    std::cout << "Torus " << config.programSettings->torus_row << ","
+              << config.programSettings->torus_col
+              << "Wait time: " << currentwaittime.count() << "s" << std::endl;
+    std::cout << "Torus " << config.programSettings->torus_row << ","
+              << config.programSettings->torus_col << " Exit " << i
+              << std::endl;
+#endif
+
+    std::chrono::duration<double> timespan =
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
+    gefaExecutionTimes.push_back(timespan.count());
+
+    // Execute GESL
+    t1 = std::chrono::high_resolution_clock::now();
+    t2 = std::chrono::high_resolution_clock::now();
+    timespan =
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
+    geslExecutionTimes.push_back(timespan.count());
+  }
+
+  /* --- Read back results from Device --- */
+
+  Buffer_a.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  if (!config.programSettings->isDiagonallyDominant) {
+    Buffer_pivot.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  }
+
+  std::unique_ptr<linpack::LinpackExecutionTimings> results(
+      new linpack::LinpackExecutionTimings{gefaExecutionTimes,
+                                           geslExecutionTimes});
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  return results;
+}
+
+} // namespace accl_buffers
+} // namespace execution
+} // namespace linpack
+
+#endif
diff --git a/LINPACK/src/host/execution_types/execution_iec.hpp b/LINPACK/src/host/execution_types/execution_iec.hpp
index 3c232f41..b07ed6a6 100644
--- a/LINPACK/src/host/execution_types/execution_iec.hpp
+++ b/LINPACK/src/host/execution_types/execution_iec.hpp
@@ -44,9 +44,8 @@ namespace iec {
 /*
 Prepare kernels and execute benchmark for a bitstream that makes use of intel external channels
 */
-template <class TDevice, class TContext, class TProgram>
 std::unique_ptr<linpack::LinpackExecutionTimings>
-calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings, TDevice, TContext, TProgram>&config,
+calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&config,
           linpack::LinpackData& data) {
 
     int err;
diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp
index 5462f025..5ef4ad27 100644
--- a/LINPACK/src/host/execution_types/execution_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_pcie.hpp
@@ -50,9 +50,8 @@ namespace pcie {
 
 @copydoc bm_execution::calculate()
 */
-template <class TDevice, class TContext, class TProgram>
 std::unique_ptr<linpack::LinpackExecutionTimings>
-calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings, TDevice, TContext, TProgram>&config,
+calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&config,
           linpack::LinpackData& data) {
 
     cl_int err;
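Note on the timing scheme used by the calculate() implementations above: all ranks align on an MPI barrier, the factorization kernels run, and the elapsed wall time is appended to a per-repetition vector. A condensed, self-contained sketch of that pattern follows; run_factorization() is only a stand-in for the kernel scheduling above, not a function from this repository.

    #include <chrono>
    #include <vector>
    #include <mpi.h>

    std::vector<double> gefaExecutionTimes;

    void run_factorization() { /* stand-in for the enqueued kernel runs */ }

    void time_one_repetition() {
      MPI_Barrier(MPI_COMM_WORLD); // align all ranks before taking t1
      auto t1 = std::chrono::high_resolution_clock::now();
      run_factorization();
      auto t2 = std::chrono::high_resolution_clock::now();
      std::chrono::duration<double> timespan =
          std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
      gefaExecutionTimes.push_back(timespan.count());
    }

Note also that the GESL phase above is only stubbed so far: t1 and t2 are taken back to back, so the recorded solve time is effectively zero.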
diff --git a/LINPACK/src/host/execution_types/execution_types.hpp b/LINPACK/src/host/execution_types/execution_types.hpp
index 975dd4cf..1990336d 100644
--- a/LINPACK/src/host/execution_types/execution_types.hpp
+++ b/LINPACK/src/host/execution_types/execution_types.hpp
@@ -22,7 +22,13 @@ SOFTWARE.
 #ifndef EXECUTION_TYPES_HPP
 #define EXECUTION_TYPES_HPP
 
+#ifdef USE_OCL_HOST
 #include "execution_types/execution_pcie.hpp"
 #include "execution_types/execution_iec.hpp"
-
+#endif
+#ifdef USE_XRT_HOST
+#ifdef USE_ACCL
+#include "execution_types/execution_accl_buffers.hpp"
+#endif
+#endif
 #endif
\ No newline at end of file
diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp
index b79fa65a..eed54a44 100644
--- a/LINPACK/src/host/linpack_benchmark.hpp
+++ b/LINPACK/src/host/linpack_benchmark.hpp
@@ -308,8 +308,13 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark
         std::unique_ptr<linpack::LinpackExecutionTimings> timings;
         switch (this->executionSettings->programSettings->communicationType) {
+#ifdef USE_OCL_HOST
             case hpcc_base::CommunicationType::pcie_mpi : timings = execution::pcie::calculate(*this->executionSettings, data); break;
             case hpcc_base::CommunicationType::intel_external_channels: timings = execution::iec::calculate(*this->executionSettings, data); break;
+#endif
+#ifdef USE_XRT_HOST
+            case hpcc_base::CommunicationType::accl : timings = execution::accl_buffers::calculate(*this->executionSettings, data); break;
+#endif
             default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(this->executionSettings->programSettings->communicationType));
         }
 #ifdef DISTRIBUTED_VALIDATION
diff --git a/LINPACK/src/host/linpack_data.cpp b/LINPACK/src/host/linpack_data.cpp
index 951c37c2..2c724796 100644
--- a/LINPACK/src/host/linpack_data.cpp
+++ b/LINPACK/src/host/linpack_data.cpp
@@ -24,7 +24,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
 
-#include "linpack_benchmark.hpp"
+#include "linpack_data.hpp"
 
 /* C++ standard library headers */
 #include
@@ -32,7 +32,6 @@ SOFTWARE.
 
 /* Project's headers */
 #include "communication_types.hpp"
-#include "execution_types/execution_types.hpp"
 #include "parameters.h"
 
 linpack::LinpackProgramSettings::LinpackProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results),
diff --git a/LINPACK/src/host/main.cpp b/LINPACK/src/host/main.cpp
index 73dcc570..cfd89914 100644
--- a/LINPACK/src/host/main.cpp
+++ b/LINPACK/src/host/main.cpp
@@ -12,7 +12,12 @@ The program entry point
 int main(int argc, char *argv[]) {
     // Setup benchmark
+#ifdef USE_OCL_HOST
     LinpackBenchmark bm(argc, argv);
+#endif
+#ifdef USE_XRT_HOST
+    LinpackBenchmark bm(argc, argv);
+#endif
     bool success = bm.executeBenchmark();
     if (success) {
         return 0;

From c902b8243dbe4e7e55e0a13c6287f27ba8b3d42b Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 28 Apr 2022 19:02:27 +0100
Subject: [PATCH 061/318] Add ACCL config for HPL

---
 ...linx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake

diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake
new file mode 100644
index 00000000..941a1d78
--- /dev/null
+++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake
@@ -0,0 +1,28 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# LINPACK specific options
+set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
+set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
+set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
+
+set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE)
+set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini" CACHE STRING "Link settings file" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
+set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE)

From 264ac09db8a768e054fdfc6166195d37c2fbbf2d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 29 Apr 2022 15:24:20 +0100
Subject: [PATCH 062/318] Update for new communicator interface

---
 .../execution_accl_buffers.hpp                | 96 +++++++++----------
 1 file changed, 46 insertions(+), 50 deletions(-)

diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp
index 4a2a7907..6424a99f 100644
--- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp
+++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp
@@ -48,8 +48,8 @@ namespace accl_buffers {
 
 @copydoc bm_execution::calculate()
 */
 std::unique_ptr<linpack::LinpackExecutionTimings> calculate(
-    const hpcc_base::ExecutionSettings &config,
+    const hpcc_base::ExecutionSettings &config,
     linpack::LinpackData &data) {
 
   cl_int err;
@@ -62,38 +62,32 @@ std::unique_ptr<linpack::LinpackExecutionTimings> calculate(
   uint blocks_per_row = data.matrix_width / config.programSettings->blockSize;
   uint blocks_per_col = data.matrix_height / config.programSettings->blockSize;
 
-  // TODO: Allow to handle Communicators in ACCL!
-  // // Communicate with all ranks in the same row of the torus
-  // // Configure ACCL Communicators
-
-  // // Create Ranks. This must be the same configuration as used for
-  // // the global communicator!
-  // std::vector<ACCL::rank_t> all_accl_ranks = {};
-  // for (int i = 0; i < config.programSettings->torus_width * config.programSettings->torus_height; ++i) {
-  //   // TODO: Replace the ip addresses and ports here for execution of real hardware?
- // ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, 1024}; - // all_accl_ranks.emplace_back(new_rank); - // } - - // std::vector row_ranks; - // std::vector col_ranks; - - // for (int i = 0; i < config.programSettings->torus_width; i++) { - // row_ranks.push_back(all_accl_ranks[i]); - // } - // for (int i = config.programSettings->torus_col; i < all_accl_ranks.size(); - // i += config.programSettings->torus_width) { - // col_ranks.push_back(all_accl_ranks[config.programSettings->torus_row * - // config.programSettings->torus_width + - // i]); - // } - - // // Row communicator should now be index 1 - // config.accl->configure_communicator(row_ranks, - // config.programSettings->torus_col); - // // Column communicator should now be index 2 - // config.accl->configure_communicator(col_ranks, - // config.programSettings->torus_row); + // Communicate with all ranks in the same row of the torus + // Configure ACCL Communicators + + // Get group of global communicator + std::vector all_accl_ranks = + config.accl->get_comm_group(ACCL::GLOBAL_COMM); + + std::vector row_ranks; + std::vector col_ranks; + + // Create sub-groups for rows and columns + for (int i = 0; i < config.programSettings->torus_width; i++) { + row_ranks.push_back(all_accl_ranks[i]); + } + for (int i = config.programSettings->torus_col; i < all_accl_ranks.size(); + i += config.programSettings->torus_width) { + col_ranks.push_back(all_accl_ranks[config.programSettings->torus_row * + config.programSettings->torus_width + + i]); + } + + // Create communicators from sub-groups + ACCL::CommunicatorId row_comm = config.accl->configure_communicator( + row_ranks, config.programSettings->torus_col); + ACCL::CommunicatorId col_comm = config.accl->configure_communicator( + col_ranks, config.programSettings->torus_row); // TODO: Select the correct memory groups! 
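The rank grouping above splits the global ACCL communicator into one row group and one column group per process, which is the same 2D torus decomposition the plain MPI host code later in this series builds with MPI_Comm_split. A minimal MPI sketch of that split, with torus_row/torus_col standing for the coordinates kept in the program settings (an analogy only, not the ACCL API):

    #include <mpi.h>

    // Ranks sharing a color end up in the same communicator; the key orders
    // them inside it, so each row communicator is sorted by column and each
    // column communicator by row.
    void build_torus_comms(int torus_row, int torus_col,
                           MPI_Comm *row_comm, MPI_Comm *col_comm) {
      MPI_Comm_split(MPI_COMM_WORLD, torus_row, torus_col, row_comm);
      MPI_Comm_split(MPI_COMM_WORLD, torus_col, torus_row, col_comm);
    }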
// Create Buffers for input and output @@ -126,17 +120,19 @@ std::unique_ptr calculate( Buffer_left_list.emplace_back(); Buffer_top_list.emplace_back(); for (int i = 0; i < blocks_per_row; i++) { - Buffer_top_list.back().push_back(config.accl->create_buffer( - config.programSettings->blockSize * - (config.programSettings->blockSize), - ACCL::dataType::float32, 1)); + Buffer_top_list.back().push_back( + config.accl->create_buffer( + config.programSettings->blockSize * + (config.programSettings->blockSize), + ACCL::dataType::float32, 1)); } for (int i = 0; i < blocks_per_col; i++) { - Buffer_left_list.back().push_back(config.accl->create_buffer( - config.programSettings->blockSize * - (config.programSettings->blockSize), - ACCL::dataType::float32, 1)); + Buffer_left_list.back().push_back( + config.accl->create_buffer( + config.programSettings->blockSize * + (config.programSettings->blockSize), + ACCL::dataType::float32, 1)); } } @@ -253,15 +249,15 @@ std::unique_ptr calculate( // FPGAs // Broadcast LU block in column to update all left blocks - config.accl->bcast(2, *Buffer_lu2, + config.accl->bcast(*Buffer_lu2, config.programSettings->blockSize * config.programSettings->blockSize, - local_block_row_remainder, true, true); + local_block_row_remainder, col_comm, true, true); // Broadcast LU block in row to update all top blocks - config.accl->bcast(1, *Buffer_lu2, + config.accl->bcast(*Buffer_lu2, config.programSettings->blockSize * config.programSettings->blockSize, - local_block_col_remainder, true, true); + local_block_col_remainder, row_comm, true, true); } if (num_top_blocks > 0) { @@ -314,19 +310,19 @@ std::unique_ptr calculate( lbi < std::max(static_cast(blocks_per_col - local_block_col), 0); lbi++) { - config.accl->bcast(1, *Buffer_left_list[block_row % 2][lbi], + config.accl->bcast(*Buffer_left_list[block_row % 2][lbi], config.programSettings->blockSize * config.programSettings->blockSize, - local_block_col_remainder, true, true); + local_block_col_remainder, row_comm, true, true); } for (int tbi = 0; tbi < std::max(static_cast(blocks_per_row - local_block_row), 0); tbi++) { - config.accl->bcast(2, *Buffer_top_list[block_row % 2][tbi], + config.accl->bcast(*Buffer_top_list[block_row % 2][tbi], config.programSettings->blockSize * config.programSettings->blockSize, - local_block_row_remainder, true, true); + local_block_row_remainder, col_comm, true, true); } // update all remaining inner blocks using only global memory From 94b5deb811e473ba125d638d6211645681e1f956 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 29 Apr 2022 16:18:44 +0100 Subject: [PATCH 063/318] Fix creation of row communicator --- LINPACK/src/host/execution_types/execution_accl_buffers.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp index 6424a99f..e1c2c5c0 100644 --- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -73,7 +73,11 @@ std::unique_ptr calculate( std::vector col_ranks; // Create sub-groups for rows and columns - for (int i = 0; i < config.programSettings->torus_width; i++) { + for (int i = config.programSettings->torus_row * + config.programSettings->torus_width; + i < config.programSettings->torus_row * + (config.programSettings->torus_width + 1); + i++) { row_ranks.push_back(all_accl_ranks[i]); } for (int i = config.programSettings->torus_col; i < all_accl_ranks.size(); 
From 03baabe594b17b011ee0e984d43384baa9bd8dce Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 6 May 2022 18:19:09 +0100 Subject: [PATCH 064/318] Change communicator call to new version --- LINPACK/src/host/execution_types/execution_accl_buffers.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp index e1c2c5c0..fd58d75c 100644 --- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -88,9 +88,9 @@ std::unique_ptr calculate( } // Create communicators from sub-groups - ACCL::CommunicatorId row_comm = config.accl->configure_communicator( + ACCL::CommunicatorId row_comm = config.accl->create_communicator( row_ranks, config.programSettings->torus_col); - ACCL::CommunicatorId col_comm = config.accl->configure_communicator( + ACCL::CommunicatorId col_comm = config.accl->create_communicator( col_ranks, config.programSettings->torus_row); // TODO: Select the correct memory groups! From 3fa0e842dc83ab64214f54c933d02108e90dec57 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 13 May 2022 17:34:18 +0100 Subject: [PATCH 065/318] Add XRT PCIE host code --- .../host/execution_types/execution_types.hpp | 1 + .../execution_types/execution_xrt_pcie.hpp | 493 ++++++++++++++++++ LINPACK/src/host/linpack_benchmark.hpp | 1 + 3 files changed, 495 insertions(+) create mode 100644 LINPACK/src/host/execution_types/execution_xrt_pcie.hpp diff --git a/LINPACK/src/host/execution_types/execution_types.hpp b/LINPACK/src/host/execution_types/execution_types.hpp index 1990336d..294115ea 100644 --- a/LINPACK/src/host/execution_types/execution_types.hpp +++ b/LINPACK/src/host/execution_types/execution_types.hpp @@ -29,6 +29,7 @@ SOFTWARE. #ifdef USE_XRT_HOST #ifdef USE_ACCL #include "execution_types/execution_accl_buffers.hpp" +#include "execution_types/execution_xrt_pcie.hpp" #endif #endif #endif \ No newline at end of file diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp new file mode 100644 index 00000000..58cd0acc --- /dev/null +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -0,0 +1,493 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef EXECUTION_TYPES_EXECUTION_XRT_PCIE_HPP +#define EXECUTION_TYPES_EXECUTION_XRT_PCIE_HPP + +/* C++ standard library headers */ +#include +#include +#include +#include +#include +#include + +/* External library headers */ +#ifdef _OPENMP +#include "omp.h" +#endif + +#include "linpack_data.hpp" +#include "parameters.h" + +namespace linpack { +namespace execution { +namespace xrt_pcie { + +/* + Prepare kernels and execute benchmark + + @copydoc bm_execution::calculate() +*/ +std::unique_ptr calculate( + const hpcc_base::ExecutionSettings &config, + linpack::LinpackData &data) { + + cl_int err; + + int num_omp_threads = 1; +#ifdef _OPENMP + num_omp_threads = omp_get_num_threads(); +#endif + + uint blocks_per_row = data.matrix_width / config.programSettings->blockSize; + uint blocks_per_col = data.matrix_height / config.programSettings->blockSize; + + // Communicate with all ranks in the same row of the torus + MPI_Comm row_communicator; + MPI_Comm col_communicator; + + MPI_Comm_split(MPI_COMM_WORLD, config.programSettings->torus_row, 0, + &row_communicator); + MPI_Comm_split(MPI_COMM_WORLD, config.programSettings->torus_col, 0, + &col_communicator); + + // TODO: Select the correct memory groups! + // Create Buffers for input and output + // TODO: Need to set a memory group for the buffers here! + + auto lu_tmp_kernel = xrt::kernel(*config.device, *config.program, "lu"); + xrt::bo Buffer_a( + *config.device, data.A, + sizeof(HOST_DATA_TYPE) * data.matrix_height * data.matrix_width, lu_tmp_kernel.group_id(0)); + xrt::bo Buffer_b(*config.device, data.b, + sizeof(HOST_DATA_TYPE) * data.matrix_width, lu_tmp_kernel.group_id(0)); + xrt::bo Buffer_pivot(*config.device, data.ipvt, + sizeof(cl_int) * data.matrix_height, lu_tmp_kernel.group_id(0)); + + /* --- Setup MPI communication and required additional buffers --- */ + HOST_DATA_TYPE *lu_block, *lu_trans_block; + posix_memalign(reinterpret_cast(&lu_block), 4096, + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize)); + posix_memalign(reinterpret_cast(&lu_trans_block), 4096, + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize)); + + // Buffers only used to store data received over the network layer + // The content will not be modified by the host + xrt::bo Buffer_lu1(*config.device, lu_trans_block, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(1)); + xrt::bo Buffer_lu2(*config.device, lu_block, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(2)); + + std::vector> Buffer_left_list(2); + std::vector> Buffer_top_list(2); + std::vector> left_blocks; + std::vector> top_blocks; + + for (int double_buffer = 0; double_buffer < 2; double_buffer++) { + top_blocks.emplace_back(blocks_per_row); + left_blocks.emplace_back(blocks_per_col); + for (int i = 0; i < blocks_per_row; i++) { + posix_memalign( + reinterpret_cast(&(top_blocks[double_buffer][i])), 4096, + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize)); + Buffer_top_list[double_buffer].emplace_back( + *config.device, top_blocks[double_buffer][i], + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(0)); + } + + for (int i = 0; i < blocks_per_col; i++) { + posix_memalign( + 
reinterpret_cast(&(left_blocks[double_buffer][i])), 4096, + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize)); + Buffer_left_list[double_buffer].emplace_back( + *config.device, left_blocks[double_buffer][i], + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(2)); + } + } + + /* --- Execute actual benchmark kernels --- */ + + double t; + std::vector gefaExecutionTimes; + std::vector geslExecutionTimes; + std::vector gefaWaitTimes; + for (int i = 0; i < config.programSettings->numRepetitions; i++) { + + Buffer_a.sync(XCL_BO_SYNC_BO_TO_DEVICE); + Buffer_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // Command queues + // A new command queue is created for every iteration of the algorithm to + // reduce the overhead of too large queues + std::vector inner_mms; + std::thread flush_thread; + + std::chrono::time_point t1, t2, twait1, + twait2; + std::chrono::duration currentwaittime = + std::chrono::duration::zero(); + + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << "Start! " << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + t1 = std::chrono::high_resolution_clock::now(); + + int kernel_offset = 0; +#pragma omp parallel + { + +#pragma omp single + uint current_replication = 0; + + // For every row of blocks create kernels and enqueue them + for (int block_row = 0; block_row < config.programSettings->matrixSize / + config.programSettings->blockSize; + block_row++) { + + int local_block_row_remainder = + (block_row % config.programSettings->torus_height); + int local_block_row = + (block_row / config.programSettings->torus_height); + int local_block_col_remainder = + (block_row % config.programSettings->torus_width); + int local_block_col = (block_row / config.programSettings->torus_width); + bool in_same_row_as_lu = + local_block_row_remainder == config.programSettings->torus_row; + bool in_same_col_as_lu = + local_block_col_remainder == config.programSettings->torus_col; + int start_row_index = + local_block_row + + ((local_block_row_remainder >= config.programSettings->torus_row) + ? 1 + : 0); + int start_col_index = + local_block_col + + ((local_block_col_remainder >= config.programSettings->torus_col) + ? 1 + : 0); + int num_left_blocks = + (in_same_col_as_lu) ? blocks_per_col - start_row_index : 0; + int num_top_blocks = + (in_same_row_as_lu) ? blocks_per_row - start_col_index : 0; + int num_inner_block_rows = (blocks_per_col - start_row_index); + int num_inner_block_cols = + (num_inner_block_rows > 0) ? (blocks_per_row - start_col_index) : 0; + num_inner_block_rows = + (num_inner_block_cols > 0) ? 
num_inner_block_rows : 0; + bool is_calulating_lu_block = (in_same_col_as_lu && in_same_row_as_lu); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col + << " Start iteration " << block_row << std::endl; +#endif + + uint total_inner_updates_first_row = num_inner_block_cols; + uint updates_per_replication = + total_inner_updates_first_row / + config.programSettings->kernelReplications; + uint total_inner_updates = + (num_inner_block_cols - 1) * (num_inner_block_rows - 1); + uint total_updates_per_replication = + total_inner_updates / config.programSettings->kernelReplications; + uint current_update = 0; + + std::vector comm_kernel_runs; + +#pragma omp single + { + + if (is_calulating_lu_block) { + // create the LU kernel + auto lu_kernel = xrt::kernel(*config.device, *config.program, "lu"); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " LU " + << local_block_row << "," << local_block_col << std::endl; +#endif + auto lu_run = + lu_kernel(Buffer_a, Buffer_lu1, Buffer_lu2, local_block_col, + local_block_row, blocks_per_row); + ert_cmd_state state = lu_run.wait(); + if (state != ERT_CMD_STATE_COMPLETED) { + std::cerr << "Execution Lu failed: " << state << std::endl; + } + Buffer_lu1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + Buffer_lu2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } + + // Broadcast LU block in column to update all left blocks + MPI_Bcast(lu_block, + config.programSettings->blockSize * + config.programSettings->blockSize, + MPI_DATA_TYPE, local_block_row_remainder, col_communicator); + // Broadcast LU block in row to update all top blocks + MPI_Bcast(lu_trans_block, + config.programSettings->blockSize * + config.programSettings->blockSize, + MPI_DATA_TYPE, local_block_col_remainder, row_communicator); + } + + if (num_top_blocks > 0) { + + Buffer_lu1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + +// Create top kernels +#pragma omp for + for (int tops = start_col_index; tops < blocks_per_row; tops++) { + xrt::kernel k(*config.device, *config.program, "top_update"); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Top " + << local_block_row << "," << tops << std::endl; +#endif + + comm_kernel_runs.push_back( + k(Buffer_a, + Buffer_top_list[block_row % 2][tops - start_col_index], + Buffer_lu1, (tops == start_col_index), tops, local_block_row, + blocks_per_row)); + } + } + if (num_left_blocks > 0) { + + Buffer_lu2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + +// Create left kernels +#pragma omp for + for (int tops = start_row_index; tops < blocks_per_col; tops++) { + xrt::kernel k(*config.device, *config.program, "left_update"); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Left " << tops + << "," << local_block_col << std::endl; +#endif + comm_kernel_runs.push_back( + k(Buffer_a, + Buffer_left_list[block_row % 2][tops - start_row_index], + Buffer_lu2, (tops == start_row_index), local_block_col, tops, + blocks_per_row)); + } + } + +#pragma omp single + { + // Wait until all top and left blocks are calculated + std::for_each(comm_kernel_runs.begin(), comm_kernel_runs.end(), + [](xrt::run &e) { e.wait(); }); + + // Send the left and top blocks to all other ranks so they can be used + // to update all inner blocks + for (int lbi = 0; + lbi < + std::max(static_cast(blocks_per_col - local_block_col), 0); + lbi++) { 
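          // Editorial note on the statements below: with plain PCIe plus MPI,
          // every broadcast block takes a device -> host -> network -> host ->
          // device round trip, since there is no direct path between FPGA
          // memories. The bo is synced from the device, the host copy is
          // broadcast with MPI_Bcast, and the received data is synced back
          // before the dependent update kernels are scheduled.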
+ Buffer_left_list[block_row % 2][lbi].sync( + XCL_BO_SYNC_BO_FROM_DEVICE); + MPI_Bcast(left_blocks[block_row % 2][lbi], + config.programSettings->blockSize * + config.programSettings->blockSize, + MPI_DATA_TYPE, local_block_col_remainder, + row_communicator); + Buffer_left_list[block_row % 2][lbi].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + for (int tbi = 0; + tbi < + std::max(static_cast(blocks_per_row - local_block_row), 0); + tbi++) { + Buffer_top_list[block_row % 2][tbi].sync( + XCL_BO_SYNC_BO_FROM_DEVICE); + MPI_Bcast(top_blocks[block_row % 2][tbi], + config.programSettings->blockSize * + config.programSettings->blockSize, + MPI_DATA_TYPE, local_block_row_remainder, + col_communicator); + Buffer_top_list[block_row % 2][tbi].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + + // update all remaining inner blocks using only global memory + } + + std::vector outer_mms; + + // Wait for previous inner MMs to complete. + // They may need to be reused by the next outer MM calls! + std::for_each(inner_mms.begin(), inner_mms.end(), + [](xrt::run &e) { e.wait(); }); + +#pragma omp for + for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { + + // select the matrix multiplication kernel that should be used for + // this block updated + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols); + int block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows + lbi); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM row " + << block_row << "," << block_col << std::endl; +#endif + + outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], + Buffer_top_list[block_row % 2][0], block_col, + block_row, blocks_per_row)); + } + +#pragma omp for + for (int tbi = 0; tbi < num_inner_block_cols; tbi++) { + + // select the matrix multiplication kernel that should be used for + // this block updated + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols + tbi); + int block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM col " + << block_row << "," << block_col << std::endl; +#endif + + outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][0], + Buffer_top_list[block_row % 2][tbi], block_col, + block_row, blocks_per_row)); + } + + // Clear inner MM runs vector for this iteration + // All runs have completed before scheduling the outer MMs + inner_mms.clear(); + +#pragma omp for collapse(2) schedule(static) + for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { + for (int tbi = 1; tbi < num_inner_block_cols; tbi++) { + // select the matrix multiplication kernel that should be used for + // this block updated + + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols + tbi); + int block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows + lbi); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << 
config.programSettings->torus_col << " MM " + << block_row << "," << block_col << std::endl; +#endif + + inner_mms.push_back(k(Buffer_a, + Buffer_left_list[block_row % 2][lbi], + Buffer_top_list[block_row % 2][tbi], + block_col, block_row, blocks_per_row)); + } + } + +#ifndef NDEBUG + MPI_Barrier(MPI_COMM_WORLD); + if (is_calulating_lu_block) + std::cout << "---------------" << std::endl; +#endif + + // Wait for all outer MMs to complete because the results are required + // by the next communication phase + std::for_each(outer_mms.begin(), outer_mms.end(), + [](xrt::run &e) { e.wait(); }); + } + } + + t2 = std::chrono::high_resolution_clock::now(); + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << "End! " << std::endl; + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col + << "Wait time: " << currentwaittime.count() << "s" << std::endl; + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Exit " << i + << std::endl; +#endif + + std::chrono::duration timespan = + std::chrono::duration_cast>(t2 - t1); + gefaExecutionTimes.push_back(timespan.count()); + + // Execute GESL + t1 = std::chrono::high_resolution_clock::now(); + t2 = std::chrono::high_resolution_clock::now(); + timespan = + std::chrono::duration_cast>(t2 - t1); + geslExecutionTimes.push_back(timespan.count()); + } + + /* --- Read back results from Device --- */ + + Buffer_a.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + if (!config.programSettings->isDiagonallyDominant) { + Buffer_pivot.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } + + std::unique_ptr results( + new linpack::LinpackExecutionTimings{gefaExecutionTimes, + geslExecutionTimes}); + + MPI_Barrier(MPI_COMM_WORLD); + + return results; +} + +} // namespace xrt_pcie +} // namespace execution +} // namespace linpack + +#endif diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index eed54a44..c6656ffd 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -313,6 +313,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmarkexecutionSettings, data); break; #endif #ifdef USE_XRT_HOST + case hpcc_base::CommunicationType::pcie_mpi : timings = execution::xrt_pcie::calculate(*this->executionSettings, data); break; case hpcc_base::CommunicationType::accl : timings = execution::accl_buffers::calculate(*this->executionSettings, data); break; #endif default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(this->executionSettings->programSettings->communicationType)); From f578a06ac3757eac5ebdbd3289a52336bec6f0bb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 17 May 2022 12:09:37 +0100 Subject: [PATCH 066/318] Fix xrt scheduling --- .../execution_types/execution_xrt_pcie.hpp | 80 ++++++++----------- 1 file changed, 32 insertions(+), 48 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp index 58cd0acc..7c239aae 100644 --- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -85,22 +85,15 @@ std::unique_ptr calculate( sizeof(cl_int) * data.matrix_height, lu_tmp_kernel.group_id(0)); /* --- Setup MPI communication and required additional buffers --- */ - HOST_DATA_TYPE *lu_block, *lu_trans_block; - 
posix_memalign(reinterpret_cast(&lu_block), 4096, - sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * - (config.programSettings->blockSize)); - posix_memalign(reinterpret_cast(&lu_trans_block), 4096, - sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * - (config.programSettings->blockSize)); // Buffers only used to store data received over the network layer // The content will not be modified by the host - xrt::bo Buffer_lu1(*config.device, lu_trans_block, + xrt::bo Buffer_lu1(*config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), lu_tmp_kernel.group_id(1)); - xrt::bo Buffer_lu2(*config.device, lu_block, + xrt::bo Buffer_lu2(*config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), @@ -108,31 +101,19 @@ std::unique_ptr calculate( std::vector> Buffer_left_list(2); std::vector> Buffer_top_list(2); - std::vector> left_blocks; - std::vector> top_blocks; for (int double_buffer = 0; double_buffer < 2; double_buffer++) { - top_blocks.emplace_back(blocks_per_row); - left_blocks.emplace_back(blocks_per_col); for (int i = 0; i < blocks_per_row; i++) { - posix_memalign( - reinterpret_cast(&(top_blocks[double_buffer][i])), 4096, - sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * - (config.programSettings->blockSize)); Buffer_top_list[double_buffer].emplace_back( - *config.device, top_blocks[double_buffer][i], + *config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), lu_tmp_kernel.group_id(0)); } for (int i = 0; i < blocks_per_col; i++) { - posix_memalign( - reinterpret_cast(&(left_blocks[double_buffer][i])), 4096, - sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * - (config.programSettings->blockSize)); Buffer_left_list[double_buffer].emplace_back( - *config.device, left_blocks[double_buffer][i], + *config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), lu_tmp_kernel.group_id(2)); @@ -151,8 +132,8 @@ std::unique_ptr calculate( Buffer_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); // Command queues - // A new command queue is created for every iteration of the algorithm to - // reduce the overhead of too large queues + // A new command queue is created for every iteration of the + // algorithm to reduce the overhead of too large queues std::vector inner_mms; std::thread flush_thread; @@ -252,12 +233,12 @@ std::unique_ptr calculate( } // Broadcast LU block in column to update all left blocks - MPI_Bcast(lu_block, + MPI_Bcast(Buffer_lu2.map(), config.programSettings->blockSize * config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, col_communicator); // Broadcast LU block in row to update all top blocks - MPI_Bcast(lu_trans_block, + MPI_Bcast(Buffer_lu1.map(), config.programSettings->blockSize * config.programSettings->blockSize, MPI_DATA_TYPE, local_block_col_remainder, row_communicator); @@ -319,7 +300,7 @@ std::unique_ptr calculate( lbi++) { Buffer_left_list[block_row % 2][lbi].sync( XCL_BO_SYNC_BO_FROM_DEVICE); - MPI_Bcast(left_blocks[block_row % 2][lbi], + MPI_Bcast(Buffer_left_list[block_row % 2][lbi].map(), config.programSettings->blockSize * config.programSettings->blockSize, MPI_DATA_TYPE, local_block_col_remainder, @@ -332,7 +313,7 @@ std::unique_ptr calculate( tbi++) { Buffer_top_list[block_row % 2][tbi].sync( XCL_BO_SYNC_BO_FROM_DEVICE); - MPI_Bcast(top_blocks[block_row % 2][tbi], + 
MPI_Bcast(Buffer_top_list[block_row % 2][tbi].map(), config.programSettings->blockSize * config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, @@ -355,24 +336,25 @@ std::unique_ptr calculate( // select the matrix multiplication kernel that should be used for // this block updated - xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + xrt::kernel k(*config.device, *config.program, + "inner_update_mm0"); - int block_col = static_cast( + int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols); - int block_row = static_cast( + int current_block_row = static_cast( (data.matrix_height / config.programSettings->blockSize) - num_inner_block_rows + lbi); #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," - << config.programSettings->torus_col << " MM row " - << block_row << "," << block_col << std::endl; + << config.programSettings->torus_col << " MM col " + << current_block_row << "," << current_block_col << std::endl; #endif outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], - Buffer_top_list[block_row % 2][0], block_col, - block_row, blocks_per_row)); + Buffer_top_list[block_row % 2][0], current_block_col, + current_block_row, blocks_per_row)); } #pragma omp for @@ -380,24 +362,25 @@ std::unique_ptr calculate( // select the matrix multiplication kernel that should be used for // this block updated - xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + xrt::kernel k(*config.device, *config.program, + "inner_update_mm0"); - int block_col = static_cast( + int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols + tbi); - int block_row = static_cast( + int current_block_row = static_cast( (data.matrix_height / config.programSettings->blockSize) - num_inner_block_rows); #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," - << config.programSettings->torus_col << " MM col " - << block_row << "," << block_col << std::endl; + << config.programSettings->torus_col << " MM row " + << current_block_row << "," << current_block_col << std::endl; #endif outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][0], - Buffer_top_list[block_row % 2][tbi], block_col, - block_row, blocks_per_row)); + Buffer_top_list[block_row % 2][tbi], current_block_col, + current_block_row, blocks_per_row)); } // Clear inner MM runs vector for this iteration @@ -410,25 +393,26 @@ std::unique_ptr calculate( // select the matrix multiplication kernel that should be used for // this block updated - xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + xrt::kernel k(*config.device, *config.program, + "inner_update_mm0"); - int block_col = static_cast( + int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols + tbi); - int block_row = static_cast( + int current_block_row = static_cast( (data.matrix_height / config.programSettings->blockSize) - num_inner_block_rows + lbi); #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " MM " - << block_row << "," << block_col << std::endl; + << current_block_row << "," << current_block_col << std::endl; #endif inner_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], Buffer_top_list[block_row % 2][tbi], - block_col, block_row, blocks_per_row)); + current_block_col, current_block_row, 
blocks_per_row)); } } From ecdb80d26446bc0f0d68f600f80df9c8db37379c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 17 May 2022 15:32:16 +0100 Subject: [PATCH 067/318] Refactoring XRT host code --- .../execution_types/execution_xrt_pcie.hpp | 49 ++++++++++--------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp index 7c239aae..b054a132 100644 --- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -76,13 +76,16 @@ std::unique_ptr calculate( // TODO: Need to set a memory group for the buffers here! auto lu_tmp_kernel = xrt::kernel(*config.device, *config.program, "lu"); - xrt::bo Buffer_a( - *config.device, data.A, - sizeof(HOST_DATA_TYPE) * data.matrix_height * data.matrix_width, lu_tmp_kernel.group_id(0)); + xrt::bo Buffer_a(*config.device, data.A, + sizeof(HOST_DATA_TYPE) * data.matrix_height * + data.matrix_width, + lu_tmp_kernel.group_id(0)); xrt::bo Buffer_b(*config.device, data.b, - sizeof(HOST_DATA_TYPE) * data.matrix_width, lu_tmp_kernel.group_id(0)); + sizeof(HOST_DATA_TYPE) * data.matrix_width, + lu_tmp_kernel.group_id(0)); xrt::bo Buffer_pivot(*config.device, data.ipvt, - sizeof(cl_int) * data.matrix_height, lu_tmp_kernel.group_id(0)); + sizeof(cl_int) * data.matrix_height, + lu_tmp_kernel.group_id(0)); /* --- Setup MPI communication and required additional buffers --- */ @@ -336,8 +339,7 @@ std::unique_ptr calculate( // select the matrix multiplication kernel that should be used for // this block updated - xrt::kernel k(*config.device, *config.program, - "inner_update_mm0"); + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - @@ -349,12 +351,14 @@ std::unique_ptr calculate( #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " MM col " - << current_block_row << "," << current_block_col << std::endl; + << current_block_row << "," << current_block_col + << std::endl; #endif outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], - Buffer_top_list[block_row % 2][0], current_block_col, - current_block_row, blocks_per_row)); + Buffer_top_list[block_row % 2][0], + current_block_col, current_block_row, + blocks_per_row)); } #pragma omp for @@ -362,8 +366,7 @@ std::unique_ptr calculate( // select the matrix multiplication kernel that should be used for // this block updated - xrt::kernel k(*config.device, *config.program, - "inner_update_mm0"); + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - @@ -375,12 +378,14 @@ std::unique_ptr calculate( #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " MM row " - << current_block_row << "," << current_block_col << std::endl; + << current_block_row << "," << current_block_col + << std::endl; #endif outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][0], - Buffer_top_list[block_row % 2][tbi], current_block_col, - current_block_row, blocks_per_row)); + Buffer_top_list[block_row % 2][tbi], + current_block_col, current_block_row, + blocks_per_row)); } // Clear inner MM runs vector for this iteration @@ -393,8 +398,7 @@ std::unique_ptr calculate( // select the 
matrix multiplication kernel that should be used for // this block updated - xrt::kernel k(*config.device, *config.program, - "inner_update_mm0"); + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - @@ -406,13 +410,14 @@ std::unique_ptr calculate( #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " MM " - << current_block_row << "," << current_block_col << std::endl; + << current_block_row << "," << current_block_col + << std::endl; #endif - inner_mms.push_back(k(Buffer_a, - Buffer_left_list[block_row % 2][lbi], - Buffer_top_list[block_row % 2][tbi], - current_block_col, current_block_row, blocks_per_row)); + inner_mms.push_back( + k(Buffer_a, Buffer_left_list[block_row % 2][lbi], + Buffer_top_list[block_row % 2][tbi], current_block_col, + current_block_row, blocks_per_row)); } } From 5ce867f7ae50565df5c04e8d263bef528bd73a93 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 17 May 2022 15:32:39 +0100 Subject: [PATCH 068/318] Fix single FPGA ACCL host code --- .../execution_accl_buffers.hpp | 151 ++++++++++++------ 1 file changed, 105 insertions(+), 46 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp index fd58d75c..2f3922ae 100644 --- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -82,9 +82,7 @@ std::unique_ptr calculate( } for (int i = config.programSettings->torus_col; i < all_accl_ranks.size(); i += config.programSettings->torus_width) { - col_ranks.push_back(all_accl_ranks[config.programSettings->torus_row * - config.programSettings->torus_width + - i]); + col_ranks.push_back(all_accl_ranks[i]); } // Create communicators from sub-groups @@ -93,27 +91,50 @@ std::unique_ptr calculate( ACCL::CommunicatorId col_comm = config.accl->create_communicator( col_ranks, config.programSettings->torus_row); - // TODO: Select the correct memory groups! - // Create Buffers for input and output - // TODO: Need to set a memory group for the buffers here! - xrt::bo Buffer_a( - *config.device, data.A, - sizeof(HOST_DATA_TYPE) * data.matrix_height * data.matrix_width, 0); + // Create global memory buffers + auto lu_tmp_kernel = xrt::kernel(*config.device, *config.program, "lu"); + xrt::bo Buffer_a(*config.device, data.A, + sizeof(HOST_DATA_TYPE) * data.matrix_height * + data.matrix_width, + lu_tmp_kernel.group_id(0)); xrt::bo Buffer_b(*config.device, data.b, - sizeof(HOST_DATA_TYPE) * data.matrix_width, 0); + sizeof(HOST_DATA_TYPE) * data.matrix_width, + lu_tmp_kernel.group_id(0)); xrt::bo Buffer_pivot(*config.device, data.ipvt, - sizeof(cl_int) * data.matrix_height, 0); + sizeof(cl_int) * data.matrix_height, + lu_tmp_kernel.group_id(0)); + + // TODO: To make this code work with the ACCL simulator, we need to create + // buffers using bos. This vector is used to store these bos during execution. + // They will be accessed via the ACCL buffers are not required in the code + // itself. Fixing the simulator code of ACCL to always create a bo would fix + // this issue. 
+ std::vector tmp_bos; /* --- Setup MPI communication and required additional buffers --- */ // Buffers only used to store data received over the network layer // The content will not be modified by the host + tmp_bos.emplace_back(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(1)); auto Buffer_lu1 = config.accl->create_buffer( + tmp_bos.back(), (config.programSettings->blockSize) * (config.programSettings->blockSize), - ACCL::dataType::float32, 1); + ACCL::dataType::float32); + tmp_bos.emplace_back(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(2)); auto Buffer_lu2 = config.accl->create_buffer( + tmp_bos.back(), (config.programSettings->blockSize) * (config.programSettings->blockSize), - ACCL::dataType::float32, 1); + ACCL::dataType::float32); + Buffer_lu1->sync_to_device(); + Buffer_lu2->sync_to_device(); std::vector>> Buffer_left_list; std::vector>> Buffer_top_list; @@ -124,19 +145,33 @@ std::unique_ptr calculate( Buffer_left_list.emplace_back(); Buffer_top_list.emplace_back(); for (int i = 0; i < blocks_per_row; i++) { + tmp_bos.emplace_back(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(0)); Buffer_top_list.back().push_back( config.accl->create_buffer( - config.programSettings->blockSize * + tmp_bos.back(), + (config.programSettings->blockSize) * (config.programSettings->blockSize), - ACCL::dataType::float32, 1)); + ACCL::dataType::float32)); + Buffer_top_list.back().back()->sync_to_device(); } for (int i = 0; i < blocks_per_col; i++) { + tmp_bos.emplace_back(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(2)); Buffer_left_list.back().push_back( config.accl->create_buffer( - config.programSettings->blockSize * + tmp_bos.back(), + (config.programSettings->blockSize) * (config.programSettings->blockSize), - ACCL::dataType::float32, 1)); + ACCL::dataType::float32)); + Buffer_left_list.back().back()->sync_to_device(); } } @@ -242,9 +277,12 @@ std::unique_ptr calculate( << local_block_row << "," << local_block_col << std::endl; #endif auto lu_run = - lu_kernel(Buffer_a, Buffer_lu1, Buffer_lu2, local_block_col, - local_block_row, blocks_per_row); - lu_run.wait(); + lu_kernel(Buffer_a, *Buffer_lu1->bo(), *Buffer_lu2->bo(), + local_block_col, local_block_row, blocks_per_row); + ert_cmd_state state = lu_run.wait(); + if (state != ERT_CMD_STATE_COMPLETED) { + std::cerr << "Execution Lu failed: " << state << std::endl; + } } // Exchange LU blocks on all ranks to prevent stalls in MPI broadcast @@ -278,9 +316,9 @@ std::unique_ptr calculate( comm_kernel_runs.push_back( k(Buffer_a, - Buffer_top_list[block_row % 2][tops - start_col_index], - Buffer_lu1, (tops == start_col_index), tops, local_block_row, - blocks_per_row)); + *Buffer_top_list[block_row % 2][tops - start_col_index]->bo(), + *Buffer_lu1->bo(), (tops == start_col_index), tops, + local_block_row, blocks_per_row)); } } if (num_left_blocks > 0) { @@ -294,11 +332,11 @@ std::unique_ptr calculate( << config.programSettings->torus_col << " Left " << tops << "," << local_block_col << std::endl; #endif - comm_kernel_runs.push_back( - k(Buffer_a, - Buffer_left_list[block_row % 2][tops - start_row_index], - Buffer_lu2, (tops == start_row_index), 
local_block_col, tops, - blocks_per_row)); + comm_kernel_runs.push_back(k( + Buffer_a, + *Buffer_left_list[block_row % 2][tops - start_row_index]->bo(), + *Buffer_lu2->bo(), (tops == start_row_index), local_block_col, + tops, blocks_per_row)); } } @@ -346,16 +384,24 @@ std::unique_ptr calculate( // this block updated xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); - int block_col = static_cast( + int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols); - int block_row = static_cast( + int current_block_row = static_cast( (data.matrix_height / config.programSettings->blockSize) - num_inner_block_rows + lbi); - outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], - Buffer_top_list[block_row % 2][0], block_col, - block_row, blocks_per_row)); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM col " + << current_block_row << "," << current_block_col + << std::endl; +#endif + + outer_mms.push_back( + k(Buffer_a, *Buffer_left_list[block_row % 2][lbi]->bo(), + *Buffer_top_list[block_row % 2][0]->bo(), current_block_col, + current_block_row, blocks_per_row)); } #pragma omp for @@ -365,16 +411,24 @@ std::unique_ptr calculate( // this block updated xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); - int block_col = static_cast( + int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols + tbi); - int block_row = static_cast( + int current_block_row = static_cast( (data.matrix_height / config.programSettings->blockSize) - num_inner_block_rows); - outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][0], - Buffer_top_list[block_row % 2][tbi], block_col, - block_row, blocks_per_row)); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM row " + << current_block_row << "," << current_block_col + << std::endl; +#endif + + outer_mms.push_back( + k(Buffer_a, *Buffer_left_list[block_row % 2][0]->bo(), + *Buffer_top_list[block_row % 2][tbi]->bo(), current_block_col, + current_block_row, blocks_per_row)); } // Clear inner MM runs vector for this iteration @@ -389,17 +443,24 @@ std::unique_ptr calculate( xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); - int block_col = static_cast( + int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols + tbi); - int block_row = static_cast( + int current_block_row = static_cast( (data.matrix_height / config.programSettings->blockSize) - num_inner_block_rows + lbi); - inner_mms.push_back(k(Buffer_a, - Buffer_left_list[block_row % 2][lbi], - Buffer_top_list[block_row % 2][tbi], - block_col, block_row, blocks_per_row)); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM " + << current_block_row << "," << current_block_col + << std::endl; +#endif + + inner_mms.push_back( + k(Buffer_a, *Buffer_left_list[block_row % 2][lbi]->bo(), + *Buffer_top_list[block_row % 2][tbi]->bo(), current_block_col, + current_block_row, blocks_per_row)); } } @@ -416,11 +477,9 @@ std::unique_ptr calculate( } } -#ifdef NDEBUG t2 = std::chrono::high_resolution_clock::now(); std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << "End! 
" << std::endl; -#endif #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," From c883f3532f8b9eb9786a40f552025279c56b1907 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 18 May 2022 11:34:19 +0100 Subject: [PATCH 069/318] Refactor XRT host codes --- .../execution_accl_buffers.hpp | 33 ++++++++++--------- .../execution_types/execution_xrt_pcie.hpp | 23 +++++++------ 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp index 2f3922ae..29645a51 100644 --- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -73,10 +73,10 @@ std::unique_ptr calculate( std::vector col_ranks; // Create sub-groups for rows and columns - for (int i = config.programSettings->torus_row * - config.programSettings->torus_width; - i < config.programSettings->torus_row * - (config.programSettings->torus_width + 1); + for (int i = config.programSettings->torus_width * + config.programSettings->torus_row; + i < config.programSettings->torus_width * + (config.programSettings->torus_row + 1); i++) { row_ranks.push_back(all_accl_ranks[i]); } @@ -296,12 +296,11 @@ std::unique_ptr calculate( config.programSettings->blockSize, local_block_row_remainder, col_comm, true, true); // Broadcast LU block in row to update all top blocks - config.accl->bcast(*Buffer_lu2, + config.accl->bcast(*Buffer_lu1, config.programSettings->blockSize * config.programSettings->blockSize, local_block_col_remainder, row_comm, true, true); } - if (num_top_blocks > 0) { // Create top kernels @@ -343,8 +342,9 @@ std::unique_ptr calculate( #pragma omp single { // Wait until all top and left blocks are calculated - std::for_each(comm_kernel_runs.begin(), comm_kernel_runs.end(), - [](xrt::run &e) { e.wait(); }); + for (auto &run : comm_kernel_runs) { + run.wait(); + } // Send the left and top blocks to all other ranks so they can be used // to update all inner blocks @@ -366,7 +366,6 @@ std::unique_ptr calculate( config.programSettings->blockSize, local_block_row_remainder, col_comm, true, true); } - // update all remaining inner blocks using only global memory } @@ -374,8 +373,9 @@ std::unique_ptr calculate( // Wait for previous inner MMs to complete. // They may need to be reused by the next outer MM calls! 
- std::for_each(inner_mms.begin(), inner_mms.end(), - [](xrt::run &e) { e.wait(); }); + for (auto &run : inner_mms) { + run.wait(); + } #pragma omp for for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { @@ -464,16 +464,17 @@ std::unique_ptr calculate( } } + // Wait for all outer MMs to complete because the results are required + // by the next communication phase + for (auto &run : outer_mms) { + run.wait(); + } + #ifndef NDEBUG MPI_Barrier(MPI_COMM_WORLD); if (is_calulating_lu_block) std::cout << "---------------" << std::endl; #endif - - // Wait for all outer MMs to complete because the results are required - // by the next communication phase - std::for_each(outer_mms.begin(), outer_mms.end(), - [](xrt::run &e) { e.wait(); }); } } diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp index b054a132..33330ea4 100644 --- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -292,9 +292,10 @@ std::unique_ptr calculate( #pragma omp single { // Wait until all top and left blocks are calculated - std::for_each(comm_kernel_runs.begin(), comm_kernel_runs.end(), - [](xrt::run &e) { e.wait(); }); - + for (auto &run : comm_kernel_runs) { + run.wait(); + } + // Send the left and top blocks to all other ranks so they can be used // to update all inner blocks for (int lbi = 0; @@ -331,8 +332,9 @@ std::unique_ptr calculate( // Wait for previous inner MMs to complete. // They may need to be reused by the next outer MM calls! - std::for_each(inner_mms.begin(), inner_mms.end(), - [](xrt::run &e) { e.wait(); }); + for (auto &run : inner_mms) { + run.wait(); + } #pragma omp for for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { @@ -421,16 +423,17 @@ std::unique_ptr calculate( } } + // Wait for all outer MMs to complete because the results are required + // by the next communication phase + for (auto &run : outer_mms) { + run.wait(); + } + #ifndef NDEBUG MPI_Barrier(MPI_COMM_WORLD); if (is_calulating_lu_block) std::cout << "---------------" << std::endl; #endif - - // Wait for all outer MMs to complete because the results are required - // by the next communication phase - std::for_each(outer_mms.begin(), outer_mms.end(), - [](xrt::run &e) { e.wait(); }); } } From 239d0a2c89290f00830b26e7bfb8290086403140 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 19 May 2022 14:16:38 +0100 Subject: [PATCH 070/318] Initialize kernels only once and reuse --- .../execution_types/execution_xrt_pcie.hpp | 53 +++++++------------ 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp index 33330ea4..6de18915 100644 --- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -71,21 +71,21 @@ std::unique_ptr calculate( MPI_Comm_split(MPI_COMM_WORLD, config.programSettings->torus_col, 0, &col_communicator); - // TODO: Select the correct memory groups! - // Create Buffers for input and output - // TODO: Need to set a memory group for the buffers here! 
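
The range-based wait loops above and the kernel objects hoisted below follow two patterns worth making explicit. First, every phase of these hosts ends with the same wait-and-check idiom on a batch of xrt::run handles; a minimal sketch, assuming run.wait() returns an ert_cmd_state as in the lu call earlier (wait_all is a hypothetical helper, not part of these patches):

    // Hypothetical helper: wait on a batch of XRT kernel runs and report
    // any command that did not complete successfully.
    // Assumes <vector>, <iostream> and the XRT native C++ API (xrt::run).
    inline void wait_all(std::vector<xrt::run> &runs, const char *phase) {
      for (auto &run : runs) {
        ert_cmd_state state = run.wait();
        if (state != ERT_CMD_STATE_COMPLETED) {
          std::cerr << "Execution " << phase << " failed: " << state << std::endl;
        }
      }
      runs.clear();
    }

Second, creating each xrt::kernel once also resolves the memory-group TODOs above: xrt::kernel::group_id(arg) returns the memory bank group wired to the given kernel argument, so each buffer can be allocated in the bank the kernel actually accesses. A sketch of the allocation pattern used below (names illustrative):

    // Allocate a block-sized buffer in the bank connected to argument 1 of "lu".
    xrt::kernel lu(device, program, "lu");
    xrt::bo lu_buffer(device,
                      sizeof(HOST_DATA_TYPE) * block_size * block_size,
                      lu.group_id(1));
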
+ xrt::kernel kernel_mm(*config.device, *config.program, "inner_update_mm0"); + xrt::kernel kernel_lu(*config.device, *config.program, "lu"); + xrt::kernel kernel_top(*config.device, *config.program, "top_update"); + xrt::kernel kernel_left(*config.device, *config.program, "left_update"); - auto lu_tmp_kernel = xrt::kernel(*config.device, *config.program, "lu"); xrt::bo Buffer_a(*config.device, data.A, sizeof(HOST_DATA_TYPE) * data.matrix_height * data.matrix_width, - lu_tmp_kernel.group_id(0)); + kernel_lu.group_id(0)); xrt::bo Buffer_b(*config.device, data.b, sizeof(HOST_DATA_TYPE) * data.matrix_width, - lu_tmp_kernel.group_id(0)); + kernel_lu.group_id(0)); xrt::bo Buffer_pivot(*config.device, data.ipvt, sizeof(cl_int) * data.matrix_height, - lu_tmp_kernel.group_id(0)); + kernel_lu.group_id(0)); /* --- Setup MPI communication and required additional buffers --- */ @@ -95,12 +95,12 @@ std::unique_ptr calculate( sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), - lu_tmp_kernel.group_id(1)); + kernel_lu.group_id(1)); xrt::bo Buffer_lu2(*config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), - lu_tmp_kernel.group_id(2)); + kernel_lu.group_id(2)); std::vector> Buffer_left_list(2); std::vector> Buffer_top_list(2); @@ -111,7 +111,7 @@ std::unique_ptr calculate( *config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), - lu_tmp_kernel.group_id(0)); + kernel_lu.group_id(0)); } for (int i = 0; i < blocks_per_col; i++) { @@ -119,7 +119,7 @@ std::unique_ptr calculate( *config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), - lu_tmp_kernel.group_id(2)); + kernel_lu.group_id(2)); } } @@ -216,8 +216,6 @@ std::unique_ptr calculate( { if (is_calulating_lu_block) { - // create the LU kernel - auto lu_kernel = xrt::kernel(*config.device, *config.program, "lu"); #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," @@ -225,7 +223,7 @@ std::unique_ptr calculate( << local_block_row << "," << local_block_col << std::endl; #endif auto lu_run = - lu_kernel(Buffer_a, Buffer_lu1, Buffer_lu2, local_block_col, + kernel_lu(Buffer_a, Buffer_lu1, Buffer_lu2, local_block_col, local_block_row, blocks_per_row); ert_cmd_state state = lu_run.wait(); if (state != ERT_CMD_STATE_COMPLETED) { @@ -254,7 +252,6 @@ std::unique_ptr calculate( // Create top kernels #pragma omp for for (int tops = start_col_index; tops < blocks_per_row; tops++) { - xrt::kernel k(*config.device, *config.program, "top_update"); #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Top " @@ -262,7 +259,7 @@ std::unique_ptr calculate( #endif comm_kernel_runs.push_back( - k(Buffer_a, + kernel_top(Buffer_a, Buffer_top_list[block_row % 2][tops - start_col_index], Buffer_lu1, (tops == start_col_index), tops, local_block_row, blocks_per_row)); @@ -275,14 +272,13 @@ std::unique_ptr calculate( // Create left kernels #pragma omp for for (int tops = start_row_index; tops < blocks_per_col; tops++) { - xrt::kernel k(*config.device, *config.program, "left_update"); #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Left " << tops << "," << local_block_col << std::endl; #endif comm_kernel_runs.push_back( - k(Buffer_a, + kernel_left(Buffer_a, Buffer_left_list[block_row % 2][tops 
- start_row_index], Buffer_lu2, (tops == start_row_index), local_block_col, tops, blocks_per_row)); @@ -339,10 +335,6 @@ std::unique_ptr calculate( #pragma omp for for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { - // select the matrix multiplication kernel that should be used for - // this block updated - xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); - int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols); @@ -357,7 +349,7 @@ std::unique_ptr calculate( << std::endl; #endif - outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], + outer_mms.push_back(kernel_mm(Buffer_a, Buffer_left_list[block_row % 2][lbi], Buffer_top_list[block_row % 2][0], current_block_col, current_block_row, blocks_per_row)); @@ -366,10 +358,6 @@ std::unique_ptr calculate( #pragma omp for for (int tbi = 0; tbi < num_inner_block_cols; tbi++) { - // select the matrix multiplication kernel that should be used for - // this block updated - xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); - int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols + tbi); @@ -384,7 +372,7 @@ std::unique_ptr calculate( << std::endl; #endif - outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][0], + outer_mms.push_back(kernel_mm(Buffer_a, Buffer_left_list[block_row % 2][0], Buffer_top_list[block_row % 2][tbi], current_block_col, current_block_row, blocks_per_row)); @@ -397,10 +385,6 @@ std::unique_ptr calculate( #pragma omp for collapse(2) schedule(static) for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { for (int tbi = 1; tbi < num_inner_block_cols; tbi++) { - // select the matrix multiplication kernel that should be used for - // this block updated - - xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - @@ -417,7 +401,7 @@ std::unique_ptr calculate( #endif inner_mms.push_back( - k(Buffer_a, Buffer_left_list[block_row % 2][lbi], + kernel_mm(Buffer_a, Buffer_left_list[block_row % 2][lbi], Buffer_top_list[block_row % 2][tbi], current_block_col, current_block_row, blocks_per_row)); } @@ -428,6 +412,9 @@ std::unique_ptr calculate( for (auto &run : outer_mms) { run.wait(); } + for (auto &run : inner_mms) { + run.wait(); + } #ifndef NDEBUG MPI_Barrier(MPI_COMM_WORLD); From 7aaa838bbb6d6b5a6dbdc61aea9ce854e224b2dc Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 19 May 2022 14:21:28 +0100 Subject: [PATCH 071/318] Print diff for failing non-dist validation --- LINPACK/src/host/linpack_benchmark.hpp | 27 +++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index c6656ffd..dd33c0f0 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -341,13 +341,15 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark 0) { + auto base_data = this->generateInputData(); + if (this->mpi_comm_rank > 0) { for (int j = 0; j < matrix_height; j++) { for (int i = 0; i < matrix_width; i+= this->executionSettings->programSettings->blockSize) { MPI_Send(&data.A[matrix_width * j + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD); + MPI_Send(&base_data->A[matrix_width * j + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 1, 
MPI_COMM_WORLD);
            }
        }
-        if (executionSettings->programSettings->torus_row == 0) {
+        if (this->executionSettings->programSettings->torus_row == 0) {
             for (int i = 0; i < matrix_width; i+= this->executionSettings->programSettings->blockSize) {
                 MPI_Send(&data.b[i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
             }
@@ -360,17 +362,20 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark
         std::vector<HOST_DATA_TYPE> total_b_original(n);
         std::vector<HOST_DATA_TYPE> total_b(n);
         std::vector<HOST_DATA_TYPE> total_a(n*n);
+        std::vector<HOST_DATA_TYPE> total_a_old(n*n);
         for (int j = 0; j < n; j++) {
             for (int i = 0; i < n; i+= this->executionSettings->programSettings->blockSize) {
                 int recvcol= (i / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_width;
                 int recvrow= (j / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_height;
                 int recvrank = this->executionSettings->programSettings->torus_width * recvrow + recvcol;
                 if (recvrank > 0) {
-                    MPI_Recv(&total_a[j * n + i],executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 0, MPI_COMM_WORLD, &status);
+                    MPI_Recv(&total_a[j * n + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 0, MPI_COMM_WORLD, &status);
+                    MPI_Recv(&total_a_old[j * n + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 1, MPI_COMM_WORLD, &status);
                 }
                 else {
                     for (int k=0; k < this->executionSettings->programSettings->blockSize; k++) {
                         total_a[j * n + i + k] = data.A[current_offset + k];
+                        total_a_old[j * n + i + k] = base_data->A[current_offset + k];
                     }
                     current_offset += this->executionSettings->programSettings->blockSize;
                 }
@@ -397,6 +402,22 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark
             resid = (resid > std::abs(total_b[i] - 1)) ? resid : std::abs(total_b[i] - 1);
             normx = (normx > std::abs(total_b_original[i])) ? normx : std::abs(total_b_original[i]);
         }
+
+#ifndef NDEBUG
+        double residn = resid / (static_cast<double>(n)*normx*eps);
+        if (residn > 1.0) {
+            gefa_ref_nopvt(total_a_old.data(), n, n);
+
+            for (int i=0; i < n; i++) {
+                for (int j=0; j < n; j++) {
+                    double error = std::abs(total_a[i * n + j] - total_a_old[i * n + j]);
+                    std::cout << ((error > 1.0e-6) ? error : 0.0) << ",";
+                }
+                std::cout << std::endl;
+            }
+            std::cout << std::endl;
+        }
+#endif
     }
 #else
     double local_resid = 0;

From cf8113a020a443108aca9902468b94db42c5d279 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Fri, 20 May 2022 14:51:04 +0100
Subject: [PATCH 072/318] Add first C++ kernel version

---
 LINPACK/src/device/hpl_torus_PCIE.cpp | 799 ++++++++++++++++++++++++++
 1 file changed, 799 insertions(+)
 create mode 100644 LINPACK/src/device/hpl_torus_PCIE.cpp

diff --git a/LINPACK/src/device/hpl_torus_PCIE.cpp b/LINPACK/src/device/hpl_torus_PCIE.cpp
new file mode 100644
index 00000000..391ee48d
--- /dev/null
+++ b/LINPACK/src/device/hpl_torus_PCIE.cpp
@@ -0,0 +1,799 @@
+/*
+Copyright (c) 2019 Marius Meyer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#include "parameters.h" + +const unsigned block_size = (1 << LOCAL_MEM_BLOCK_LOG); +const unsigned gemm_block = (1 << REGISTER_BLOCK_LOG); +const unsigned gemm_block_mm = (1 << REGISTER_BLOCK_MM_LOG); + +#ifdef KERNEL_lu +/** +Executes a single step of the LU factorization. + +This method takes a partially solved 8x8 matrix and calculates the next step of +the LU factorization The method needs 7 (gemm_block-1) calls to perform a single +LU factorization. This is done to reduce resource usage, since all upcomng calls +are anyway depending on the results of the previous call and there is no way to +pipeline multiple executions. + +A is the input block that might be partially computed +step is the current step and must be a value between 0 to gemm_block-2. After +step gemm_block-2, the block is factorized + */ +void lu_block(const DEVICE_DATA_TYPE A[gemm_block][gemm_block], const int step, + DEVICE_DATA_TYPE A_out[gemm_block][gemm_block]) { + + // Read current line from input + DEVICE_DATA_TYPE line[gemm_block]; + for (int i = 0; i < gemm_block; i++) { + line[i] = A[step][i]; + } + + // calculate the inverse of the diagonal element for the scaling + DEVICE_DATA_TYPE inv_scale_a = -1.0 / line[step]; + + // Scale the current row + for (int i = 0; i < gemm_block; i++) { + if (i > step) { + line[i] = line[i] * inv_scale_a; + } + } + line[step] = inv_scale_a; + + // Update all rows fully unrolled + // The multiply adds are fully independent + //__attribute__((opencl_unroll_hint(gemm_block))) + // Unrolling disabled for this loop to save resources + for (int j = 0; j < gemm_block; j++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE curr_scale = A[j][step]; + // Update a single row. If it is already updated, just write back the value, + // if it is the current row write back the value in "line", else update the + // value + if (j != step) { + for (int i = 0; i < gemm_block; i++) { + A_out[j][i] = + (i > step && j > step) ? A[j][i] + line[i] * curr_scale : A[j][i]; + } + } else { + for (int i = 0; i < gemm_block; i++) { + A_out[j][i] = line[i]; + } + } + } +} + +/** +This function can be used to update blocks using with three different +operations. It will execute the update for a single row in the block. 
The update +is completed after gemm_block calls of this update function + +operation_type: 0 for top = the top row of blocks will need a triangular MM + 1 for left = the left column of blocks will need +a triangular MM, matrices have to be transposed 2 for inner block == all inner +blocks will be updated with a MM + */ +void update_block(const DEVICE_DATA_TYPE a[gemm_block][gemm_block], + const DEVICE_DATA_TYPE top[gemm_block], + const DEVICE_DATA_TYPE left_or_lu[gemm_block], + DEVICE_DATA_TYPE out[gemm_block][gemm_block], + const int current_row, const int operation_type) { + + // Define different operation types of function + const int op_top = 0; + const int op_left = 1; + const int op_inner = 2; + + // Transpose the input matrices if the target is a left block + DEVICE_DATA_TYPE current_block[gemm_block][gemm_block]; + if (operation_type == op_left) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + current_block[ii][jj] = a[jj][ii]; + } + } + } else { + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + current_block[ii][jj] = a[ii][jj]; + } + } + } + + // Generate the first scalling array depending on the operation type + DEVICE_DATA_TYPE scale_row[gemm_block]; + if (operation_type == op_inner) { + for (int jj = 0; jj < gemm_block; jj++) { + scale_row[jj] = top[jj]; + } + } else { + for (int jj = 0; jj < gemm_block; jj++) { + scale_row[jj] = current_block[current_row][jj]; + } + } + if (operation_type == op_top) { + for (int jj = 0; jj < gemm_block; jj++) { + scale_row[jj] *= left_or_lu[current_row]; + } + } + + DEVICE_DATA_TYPE tmp[gemm_block][gemm_block]; + // scale all values with the pre calculated scaling array and the second input + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + // left_or_lu_block are stored transposed to simplify the data access here + tmp[ii][jj] = current_block[ii][jj] + scale_row[jj] * left_or_lu[ii]; + } + } + + // overwrite results that were calculated altough they are not needed for the + // triangular operations left and top + if (operation_type != op_inner) { + for (int ii = 0; ii < gemm_block; ii++) { + if (ii == current_row) { + for (int jj = 0; jj < gemm_block; jj++) { + tmp[ii][jj] = scale_row[jj]; + } + } else if (ii < current_row) { + for (int jj = 0; jj < gemm_block; jj++) { + tmp[ii][jj] = current_block[ii][jj]; + } + } + } + } + + // write result back and transpose if necessary + if (operation_type == op_left) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + out[ii][jj] = tmp[jj][ii]; + } + } + } else { + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + out[ii][jj] = tmp[ii][jj]; + } + } + } +} + +#endif + +extern "C" { + +#ifdef KERNEL_lu +void lu(DEVICE_DATA_TYPE *a, DEVICE_DATA_TYPE *a_block_trans, + DEVICE_DATA_TYPE *a_block, const unsigned int block_col, const unsigned int block_row, + const unsigned int blocks_per_row) { + + DEVICE_DATA_TYPE a_buffer[block_size / gemm_block][block_size / gemm_block] + [gemm_block][gemm_block]; + + // Store current row and column in separate buffers for + // easier access in the deep pipeline + // need to be declared as local to prevent the compiler from + DEVICE_DATA_TYPE top_buffer[block_size / gemm_block][gemm_block]; + DEVICE_DATA_TYPE left_buffer[block_size / gemm_block][gemm_block]; + + // Load block to local memory +load_a_block: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 
0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { +#pragma HLS PIPELINE + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj]; + } + } + } + } + + // For each row in the matrix update whole matrix. + // The iterations depend on each other, so loop pipelining is disabled here +loop_diag: + for (int gk = 0; gk < block_size; gk++) { + + int k = gk / gemm_block; + int kk = gk & (gemm_block - 1); + + // Read in current LU block + DEVICE_DATA_TYPE lu_a_buffer_in[gemm_block][gemm_block]; +load_a_sb: + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + lu_a_buffer_in[ii][jj] = a_buffer[k][k][ii][jj]; + } + } + + DEVICE_DATA_TYPE lu_a_buffer_out[gemm_block][gemm_block]; + DEVICE_DATA_TYPE lu_a_buffer_out_row[gemm_block]; + DEVICE_DATA_TYPE lu_a_buffer_out_col[gemm_block]; + // Calculate next row and column of LU factorization and store in local + // memory buffer + lu_block(lu_a_buffer_in, kk, lu_a_buffer_out); +write_lu_sb: + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[k][k][ii][jj] = lu_a_buffer_out[ii][jj]; + } + } +write_lu_row: + for (int jj = 0; jj < gemm_block; jj++) { + lu_a_buffer_out_row[jj] = lu_a_buffer_out[kk][jj]; + } +write_lu_col: + for (int jj = 0; jj < gemm_block; jj++) { + lu_a_buffer_out_col[jj] = lu_a_buffer_out[jj][kk]; + } + + // The update pipeline does not need to be executed for the last + // row of blocks + if (gk < block_size - gemm_block) { + +update_inner: + // update all left blocks + for (int tj = 1; tj < block_size / gemm_block; tj++) { +#pragma HLS PIPELINE II=1 + + int j = k; + int i = tj; + + if (i > k) { + // copy the correct block in the second input buffer + // this depends on the operations that has to be executed + DEVICE_DATA_TYPE second_input[gemm_block]; + + // left matrix block will be calculated + for (int jj = 0; jj < gemm_block; jj++) { + second_input[jj] = lu_a_buffer_out_row[jj]; + } + DEVICE_DATA_TYPE a_input[gemm_block][gemm_block]; + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_input[ii][jj] = a_buffer[i][j][ii][jj]; + } + } + DEVICE_DATA_TYPE top_input[gemm_block]; + DEVICE_DATA_TYPE out[gemm_block][gemm_block]; + update_block(a_input, top_input, second_input, out, kk, 1); + + for (int ii = 0; ii < gemm_block; ii++) { + left_buffer[i][ii] = out[ii][kk]; + } + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = out[ii][jj]; + } + } + } + } + + // Update all other blocks with the new calculated row and column + // First update top blocks, then update left blocks, then all inner blocks + // ti == 0: top blocks + // ti == 1: left blocks + // ti > 1: inner blocks +update_inner_2: + for (int ti = 0; ti < block_size / gemm_block - k; ti++) { + for (int tj = 1; tj < block_size / gemm_block; tj++) { +#pragma HLS PIPELINE II=1 + + int j = tj; + int i = ti + k; + // always execute the pipeline for whole rows of matrix blocks. + // Only execute update for blocks that are required. 
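+          // (Concretely, the guard below, (i > k || ti == 0) && j > k, masks
+          //  out tiles that need no update in the current step while the loop
+          //  bounds stay fixed.)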
+ // This helps to keep constant latencies between data dependencies of + // the pipeline stages + if ((i > k || ti == 0) && j > k) { + + // copy the correct block in the second input buffer + // this depends on the operations that has to be executed + DEVICE_DATA_TYPE second_input[gemm_block]; + if (ti == 0) { + // top matrix block will be calculated + for (int jj = 0; jj < gemm_block; jj++) { + second_input[jj] = lu_a_buffer_out_col[jj]; + } + } else { + // inner block will be calculated + for (int jj = 0; jj < gemm_block; jj++) { + second_input[jj] = left_buffer[i][jj]; + } + } + DEVICE_DATA_TYPE a_input[gemm_block][gemm_block]; + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_input[ii][jj] = a_buffer[i][j][ii][jj]; + } + } + DEVICE_DATA_TYPE top_input[gemm_block]; + for (int jj = 0; jj < gemm_block; jj++) { + top_input[jj] = top_buffer[j][jj]; + } + DEVICE_DATA_TYPE out[gemm_block][gemm_block]; + update_block(a_input, top_input, second_input, out, kk, + (ti == 0) ? 0 : 2); + if (ti == 0) { + // only update in the first row + for (int jj = 0; jj < gemm_block; jj++) { + top_buffer[j][jj] = out[kk][jj]; + } + } + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = out[ii][jj]; + } + } + } + } + } + } + } + + // Store block to global memory +store_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } + // Store current block in global memory also transposed to allow easier access + // from the top kernel + store_a_bt: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_block_trans[(i * gemm_block + ii) * block_size + j * gemm_block + + jj] = a_buffer[j][i][jj][ii]; + } + } + } + } + +store_a_b: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_block[(i * gemm_block + ii) * block_size + j * gemm_block + jj] = + a_buffer[i][j][ii][jj]; + } + } + } + } +} +#endif + +#ifdef KERNEL_top_update +/** +Update the blocks to the right of the current LU block + + */ +void top_update(DEVICE_DATA_TYPE *a, DEVICE_DATA_TYPE *top_block, + const DEVICE_DATA_TYPE *lu_global_buffer_transposed, + const unsigned int is_first_block, const unsigned int block_col, + const unsigned int block_row, const unsigned int blocks_per_row) { + + // Store current block in local memory + DEVICE_DATA_TYPE + a_buffer[block_size / gemm_block][block_size / gemm_block][gemm_block] + [gemm_block]; + + // Load block to local memory +load_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj]; + } + } + } + } + +// For each row in the matrix update whole matrix. 
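+// (This kernel reads the LU factors from lu_global_buffer_transposed: the lu
+// kernel stores its block a second time in transposed layout (store_a_bt
+// above) so that a logical LU column can be fetched here as one contiguous,
+// burst-friendly row.)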
+// The iterations depend on each other, so loop pipelining is disabled here +diag_exe: + for (int gk = 0; gk < block_size; gk++) { + + int k = gk / gemm_block; + int kk = gk & (gemm_block - 1); + + DEVICE_DATA_TYPE current_lu_col[block_size / gemm_block][gemm_block]; + DEVICE_DATA_TYPE current_row[block_size / gemm_block][gemm_block]; + DEVICE_DATA_TYPE current_scale; + +scale_row: + for (int col = 0; col < block_size / gemm_block; col++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE col_in[gemm_block]; +#pragma HLS array_partition variable=col_in type=complete dim=0 + DEVICE_DATA_TYPE scale_chunk[gemm_block]; +#pragma HLS array_partition variable=col_in type=complete dim=0 + + // get current row chunk + for (int i = 0; i < gemm_block; i++) { + scale_chunk[i] = a_buffer[k][col][kk][i]; + } + + // if current column data is still available read it in and store it in + // buffer + if (col < block_size / gemm_block - k) { + // Load LU data from global memory instead of receiving it from the + // channel + for (int i = 0; i < gemm_block; i++) { + col_in[i] = + lu_global_buffer_transposed[gk * block_size + + (col + k) * gemm_block + i]; + } + if (col == 0) { + current_scale = col_in[kk]; + } + for (int i = 0; i < gemm_block; i++) { + current_lu_col[col][i] = (col > 0 || i > kk) ? col_in[i] : 0.f; + } + } + + // scale current row chunk with the rows scale factor received over the + // external channel + for (int i = 0; i < gemm_block; i++) { + scale_chunk[i] = scale_chunk[i] * current_scale; + } + + for (int i = 0; i < gemm_block; i++) { + current_row[col][i] = scale_chunk[i]; + } + + // Update local memory buffer with chunk + for (int i = 0; i < gemm_block; i++) { + a_buffer[k][col][kk][i] = scale_chunk[i]; + } + } + +// Update all remaining rows +update_rows: + for (int row = k; row < block_size / gemm_block; row++) { +#pragma HLS loop_tripcount min=0 max=block_size/gemm_block avg=block_size/gemm_block/2 + // Update whole rows! 
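+      // (Each pipelined trip below is a register-tile rank-1 update: the tile
+      //  a_buffer[row][curr_col] is incremented by the outer product of the
+      //  LU column chunk current_lu_col[row - k] with the freshly scaled row
+      //  chunk current_row[curr_col].)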
+ for (int curr_col = 0; curr_col < block_size / gemm_block; curr_col++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE colbuf[gemm_block]; + for (int j = 0; j < gemm_block; j++) { + colbuf[j] = current_lu_col[row - k][j]; + } + for (int i = 0; i < gemm_block; i++) { + for (int j = 0; j < gemm_block; j++) { + a_buffer[row][curr_col][i][j] += + colbuf[i] * current_row[curr_col][j]; + } + } + } + } + } + +// Store block to global memory +store_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } +// Store current block separately for easier transmission over host +store_top: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + top_block[(i * gemm_block + ii) * block_size + j * gemm_block + jj] = + a_buffer[i][j][ii][jj]; + } + } + } + } +} +#endif + +#ifdef KERNEL_left_update +/** +Update the blocks below the current LU block + + */ +void left_update(DEVICE_DATA_TYPE * a, + DEVICE_DATA_TYPE * left_block, + const DEVICE_DATA_TYPE * lu_global_buffer, + const unsigned int is_first_block, const unsigned int block_col, + const unsigned int block_row, const unsigned int blocks_per_row) { + + // Store current block in local memory + DEVICE_DATA_TYPE + a_buffer[block_size / gemm_block][block_size / gemm_block][gemm_block] + [gemm_block]; + + // Load block to local memory +load_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj]; + } + } + } + } + + // For each row in the matrix update whole matrix. + // The iterations depend on each other, so loop pipelining is disabled here +diag: + for (int gk = 0; gk < block_size; gk++) { + + int k = gk / gemm_block; + int kk = gk & (gemm_block - 1); + + DEVICE_DATA_TYPE current_lu_row[block_size / gemm_block][gemm_block]; + DEVICE_DATA_TYPE current_col[block_size / gemm_block][gemm_block]; + +first_col: + for (int col = 0; col < block_size / gemm_block; col++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE chunk[gemm_block]; + // get current row chunk + for (int i = 0; i < gemm_block; i++) { + chunk[i] = a_buffer[col][k][i][kk]; + } + + // Store chunk for later update + for (int i = 0; i < gemm_block; i++) { + current_col[col][i] = chunk[i]; + } + + DEVICE_DATA_TYPE row_in[gemm_block]; + + // if current column data is still available read it in and store it in + // buffer + if (col < block_size / gemm_block - k) { + // Load LU data from global memory + for (int i = 0; i < gemm_block; i++) { + row_in[i] = + lu_global_buffer[gk * block_size + (col + k) * gemm_block + i]; + } + for (int i = 0; i < gemm_block; i++) { + current_lu_row[col][i] = (col > 0 || i > kk) ? 
row_in[i] : 0.f; + } + } + } + + // Update all rows + // Update only remaining row chunks +update: + for (int curr_col = 0; curr_col < block_size / gemm_block - k; curr_col++) { +#pragma HLS loop_tripcount min=0 max=block_size/gemm_block avg=block_size/gemm_block/2 + for (int row = 0; row < block_size / gemm_block; row++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE colbuf[gemm_block]; + for (int j = 0; j < gemm_block; j++) { + colbuf[j] = current_col[row][j]; + } + for (int i = 0; i < gemm_block; i++) { + for (int j = 0; j < gemm_block; j++) { + a_buffer[row][curr_col + k][i][j] += + current_lu_row[curr_col][j] * colbuf[i]; + } + } + } + } + } + + // Store block to global memory +store_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } + + // Store current block separately for easier transmission over host +store_left: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + left_block[(i * gemm_block + ii) * block_size + j * gemm_block + jj] = + a_buffer[j][i][jj][ii]; + } + } + } + } +} +#endif + +#ifdef KERNEL_inner_update_mm0 +/** +Update the inner blocks using the left and right column and rows + + */ +void inner_update_mm0( + DEVICE_DATA_TYPE *a, const DEVICE_DATA_TYPE *left_global_buffer, + const DEVICE_DATA_TYPE *top_global_buffer, const unsigned int block_col, + const unsigned int block_row, const unsigned int blocks_per_row) { + + // Store current block in local memory + DEVICE_DATA_TYPE a_buffer[block_size / gemm_block_mm] + [block_size / gemm_block_mm][gemm_block_mm] + [gemm_block_mm]; + DEVICE_DATA_TYPE top_buffer[block_size / gemm_block_mm] + [block_size / gemm_block_mm][gemm_block_mm] + [gemm_block_mm]; + DEVICE_DATA_TYPE left_buffer[block_size / gemm_block_mm] + [block_size / gemm_block_mm][gemm_block_mm] + [gemm_block_mm]; + + // If Xilinx FPGA, load blocks in separate pipelines to achieve memory bursts! 
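+  // (Keeping a, top and left in three separate load loops means each loop
+  //  streams from a single global pointer, which is what lets the HLS tools
+  //  infer the long memory bursts mentioned above. The compute stage further
+  //  down then forms, per register tile, a_buffer += left_sub^T * top_sub.)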
+ // Load blocks to local memory +load_a_block: + for (int i = 0; i < block_size / gemm_block_mm; i++) { + for (int ii = 0; ii < gemm_block_mm; ii++) { + for (int j = 0; j < block_size / gemm_block_mm; j++) { +#pragma HLS PIPELINE II=1 + for (int jj = 0; jj < gemm_block_mm; jj++) { + a_buffer[i][j][ii][jj] = + a[block_col * block_size + + (block_row * block_size + i * gemm_block_mm + ii) * block_size * + blocks_per_row + + j * gemm_block_mm + jj]; + } + } + } + } + +load_top_block: + for (int i = 0; i < block_size / gemm_block_mm; i++) { + for (int ii = 0; ii < gemm_block_mm; ii++) { + for (int j = 0; j < block_size / gemm_block_mm; j++) { +#pragma HLS PIPELINE II=1 + for (int jj = 0; jj < gemm_block_mm; jj++) { + top_buffer[i][j][ii][jj] = + top_global_buffer[(i * gemm_block_mm + ii) * block_size + + j * gemm_block_mm + jj]; + } + } + } + } + +load_left_block: + for (int i = 0; i < block_size / gemm_block_mm; i++) { + for (int ii = 0; ii < gemm_block_mm; ii++) { + for (int j = 0; j < block_size / gemm_block_mm; j++) { +#pragma HLS PIPELINE II=1 + for (int jj = 0; jj < gemm_block_mm; jj++) { + left_buffer[i][j][ii][jj] = + left_global_buffer[(i * gemm_block_mm + ii) * block_size + + j * gemm_block_mm + jj]; + } + } + } + } + + // Update whole block +calc_subblocks: + for (int c = 0; + c < (block_size / gemm_block_mm) * (block_size / gemm_block_mm) * + (block_size / gemm_block_mm); + c++) { +#pragma HLS PIPELINE II=1 + + int mcol = + c / ((block_size / gemm_block_mm) * (block_size / gemm_block_mm)); + int row = + (c / (block_size / gemm_block_mm)) % (block_size / gemm_block_mm); + int curr_col = c & ((block_size / gemm_block_mm) - 1); + + DEVICE_DATA_TYPE top_sub[gemm_block_mm][gemm_block_mm]; + DEVICE_DATA_TYPE left_sub[gemm_block_mm][gemm_block_mm]; + +load_top_sb: + for (int i = 0; i < gemm_block_mm; i++) { + for (int j = 0; j < gemm_block_mm; j++) { + top_sub[i][j] = top_buffer[mcol][curr_col][i][j]; + } + } + +load_left_sb: + for (int i = 0; i < gemm_block_mm; i++) { + for (int j = 0; j < gemm_block_mm; j++) { + left_sub[i][j] = left_buffer[mcol][row][i][j]; + } + } + + DEVICE_DATA_TYPE result_sub[gemm_block_mm][gemm_block_mm]; +mmul: + for (int i = 0; i < gemm_block_mm; i++) { + for (int j = 0; j < gemm_block_mm; j++) { + // Calculate sum of whole column and only write it back once + DEVICE_DATA_TYPE sum = 0.0; + for (int k = 0; k < gemm_block_mm; k++) { + sum += left_sub[k][i] * top_sub[k][j]; + } + result_sub[i][j] = sum; + } + } + +add_sb: + for (int i = 0; i < gemm_block_mm; i++) { + for (int j = 0; j < gemm_block_mm; j++) { + a_buffer[row][curr_col][i][j] += result_sub[i][j]; + } + } + } + + // Store block to global memory +store_result: + for (int i = 0; i < block_size / gemm_block_mm; i++) { + for (int ii = 0; ii < gemm_block_mm; ii++) { + for (int j = 0; j < block_size / gemm_block_mm; j++) { + for (int jj = 0; jj < gemm_block_mm; jj++) { + a[block_col * block_size + + (block_row * block_size + i * gemm_block_mm + ii) * block_size * + blocks_per_row + + j * gemm_block_mm + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } +} + +#endif +} From ce8f0404c7311c678265b233719af0bdb9eb561d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 20 May 2022 14:52:23 +0100 Subject: [PATCH 073/318] Extend cmake to allow compiling multiple kernels for one link step --- cmake/kernelTargets.cmake | 71 +++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index b7a237a3..86aeeb1c 100644 --- 
a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -21,7 +21,7 @@ set(file_endings "cl" "cpp" ) function(generate_kernel_targets_xilinx) foreach (kernel_file_name ${ARGN}) string(REGEX MATCH "^custom_.*" is_custom_kernel ${kernel_file_name}) - if (is_custom_kernel) + if (is_custom_kernel) string(REPLACE "custom_" "" base_file_name ${kernel_file_name}) set(base_file_part "src/device/custom/${base_file_name}") else() @@ -48,8 +48,17 @@ function(generate_kernel_targets_xilinx) else() set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_copied_xilinx.${selected_file_ending}") endif() - set(bitstream_compile xilinx_tmp_compile/${kernel_file_name}.xo) - set(bitstream_compile_emulate xilinx_tmp_compile/${kernel_file_name}_emulate.xo) + if (DEFINED XILINX_KERNEL_NAMES) + set(bitstream_compile "") + set(bitstream_compile_emulate "") + foreach (kernel ${XILINX_KERNEL_NAMES}) + list(APPEND bitstream_compile xilinx_tmp_compile/${kernel_file_name}/${kernel}.xo) + list(APPEND bitstream_compile_emulate xilinx_tmp_compile/${kernel_file_name}/${kernel}_emulate.xo) + endforeach() + else() + set(bitstream_compile xilinx_tmp_compile/${kernel_file_name}.xo) + set(bitstream_compile_emulate xilinx_tmp_compile/${kernel_file_name}_emulate.xo) + endif() set(bitstream_emulate_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_emulate.xclbin) set(bitstream_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}.xclbin) @@ -62,7 +71,7 @@ function(generate_kernel_targets_xilinx) set(xilinx_link_settings ${CMAKE_BINARY_DIR}/settings/settings.link.xilinx.${kernel_file_name}.ini) endif() if (USE_ACCL AND is_accl_kernel) - list(APPEND additional_xos ${ACCL_XOS}) + list(APPEND additional_xos ${ACCL_XOS}) endif() set(xilinx_report_folder "${EXECUTABLE_OUTPUT_PATH}/xilinx_reports") set(local_CLFLAGS -DXILINX_FPGA) @@ -106,35 +115,47 @@ function(generate_kernel_targets_xilinx) ) endif() - add_custom_command(OUTPUT ${bitstream_compile_emulate} - COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -DEMULATE -t sw_emu ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} -f ${FPGA_BOARD_NAME} -g -c ${XILINX_COMPILE_FLAGS} -o ${bitstream_compile_emulate} ${source_f} - MAIN_DEPENDENCY ${source_f} - DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} - ) + foreach (kernel ${bitstream_compile_emulate}) + if (DEFINED XILINX_KERNEL_NAMES) + string(REGEX MATCH ".+/(.+)_emulate\.xo" kernel_name ${kernel}) + set(kernel_name_flag -k ${CMAKE_MATCH_1}) + endif() + add_custom_command(OUTPUT ${kernel} + COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -DKERNEL_${CMAKE_MATCH_1} -DEMULATE -t sw_emu ${kernel_name_flag} ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} -f ${FPGA_BOARD_NAME} -g -c ${XILINX_COMPILE_FLAGS} -o ${kernel} ${source_f} + MAIN_DEPENDENCY ${source_f} + DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} + ) + endforeach() add_custom_command(OUTPUT ${bitstream_emulate_f} COMMAND ${Vitis_COMPILER} ${local_CL_FLAGS} ${VPP_FLAGS} -DEMULATE -t sw_emu ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} -f ${FPGA_BOARD_NAME} -g -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_emulate_f} ${bitstream_compile_emulate} - MAIN_DEPENDENCY ${bitstream_compile_emulate} + DEPENDS ${bitstream_compile_emulate} DEPENDS ${xilinx_link_settings} ) - add_custom_command(OUTPUT ${bitstream_compile} - COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -c ${XILINX_COMPILE_FLAGS} -o ${bitstream_compile} ${source_f} - 
MAIN_DEPENDENCY ${source_f} - DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} - ) + foreach (kernel ${bitstream_compile}) + if (DEFINED XILINX_KERNEL_NAMES) + string(REGEX MATCH ".+/(.+)\.xo" kernel_name ${kernel}) + set(kernel_name_flag -k ${CMAKE_MATCH_1}) + endif() + add_custom_command(OUTPUT ${kernel} + COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw -DKERNEL_${CMAKE_MATCH_1} ${kernel_name_flag} ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -c ${XILINX_COMPILE_FLAGS} -o ${kernel} ${source_f} + MAIN_DEPENDENCY ${source_f} + DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} + ) + endforeach() add_custom_command(OUTPUT ${bitstream_f} COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} ${local_harware_only_flags} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} -o ${bitstream_f} ${additional_xos} ${bitstream_compile} - MAIN_DEPENDENCY ${bitstream_compile} + DEPENDS ${bitstream_compile} DEPENDS ${xilinx_link_settings} ) - add_custom_target(${kernel_file_name}_emulate_xilinx - DEPENDS ${bitstream_emulate_f} + add_custom_target(${kernel_file_name}_emulate_xilinx + DEPENDS ${bitstream_emulate_f} DEPENDS ${source_f} ${CMAKE_BINARY_DIR}/src/common/parameters.h ${EXECUTABLE_OUTPUT_PATH}/emconfig.json) add_custom_target(${kernel_file_name}_xilinx - DEPENDS ${bitstream_f} + DEPENDS ${bitstream_f} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) add_custom_target(${kernel_file_name}_report_xilinx - DEPENDS ${bitstream_compile} + DEPENDS ${bitstream_compile} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) if(USE_ACCL AND is_accl_kernel) @@ -153,7 +174,7 @@ endfunction() function(generate_kernel_targets_intel) foreach (kernel_file_name ${ARGN}) string(REGEX MATCH "^custom_.*" is_custom_kernel ${kernel_file_name}) - if (is_custom_kernel) + if (is_custom_kernel) string(REPLACE "custom_" "" base_file_name ${kernel_file_name}) set(base_file_part "src/device/custom/${base_file_name}") else() @@ -192,7 +213,7 @@ function(generate_kernel_targets_intel) DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${report_f} ) add_custom_command(OUTPUT ${EXECUTABLE_OUTPUT_PATH}/${bitstream_f} - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bitstream_f} ${EXECUTABLE_OUTPUT_PATH}/${bitstream_f} + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bitstream_f} ${EXECUTABLE_OUTPUT_PATH}/${bitstream_f} COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_BINARY_DIR}/${kernel_file_name}/reports ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_synth_reports COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${kernel_file_name}/acl_quartus_report.txt ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_synth_reports/acl_quartus_report.txt COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${kernel_file_name}/quartus_sh_compile.log ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_synth_reports/quartus_sh_compile.log @@ -217,11 +238,11 @@ function(generate_kernel_targets_intel) MAIN_DEPENDENCY ${source_f} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) - add_custom_target(${kernel_file_name}_report_intel + add_custom_target(${kernel_file_name}_report_intel DEPENDS ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_reports/report.html) - add_custom_target(${kernel_file_name}_intel + add_custom_target(${kernel_file_name}_intel DEPENDS ${EXECUTABLE_OUTPUT_PATH}/${bitstream_f}) - add_custom_target(${kernel_file_name}_emulate_intel + 
add_custom_target(${kernel_file_name}_emulate_intel DEPENDS ${EXECUTABLE_OUTPUT_PATH}/${bitstream_emulate_f}) list(APPEND kernel_emulation_targets_intel ${kernel_file_name}_emulate_intel) set(kernel_emulation_targets_intel ${kernel_emulation_targets_intel} CACHE INTERNAL "Kernel emulation targets used to define dependencies for the tests for intel devices") From d379d0cb50188ff1b86998b11302afc96cbcf55b Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 20 May 2022 14:52:37 +0100 Subject: [PATCH 074/318] Add ACCL kernel link --- LINPACK/src/device/CMakeLists.txt | 2 +- LINPACK/src/device/hpl_torus_ACCL_buffers.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 120000 LINPACK/src/device/hpl_torus_ACCL_buffers.cpp diff --git a/LINPACK/src/device/CMakeLists.txt b/LINPACK/src/device/CMakeLists.txt index 7a28cc56..2e9431a5 100644 --- a/LINPACK/src/device/CMakeLists.txt +++ b/LINPACK/src/device/CMakeLists.txt @@ -10,7 +10,7 @@ if (INTELFPGAOPENCL_FOUND) endif() if (VITIS_FOUND) - generate_kernel_targets_xilinx(hpl_torus_PCIE) + generate_kernel_targets_xilinx(hpl_torus_PCIE hpl_torus_ACCL_buffers) add_test(NAME test_emulation_xilinx COMMAND Linpack_xilinx -f hpl_torus_PCIE_emulate.xclbin -m 2 -n 1 ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_xilinx ${TEST_HOST_FLAGS} -f hpl_torus_PCIE_emulate.xclbin -m 2 -n 1 diff --git a/LINPACK/src/device/hpl_torus_ACCL_buffers.cpp b/LINPACK/src/device/hpl_torus_ACCL_buffers.cpp new file mode 120000 index 00000000..a11753b1 --- /dev/null +++ b/LINPACK/src/device/hpl_torus_ACCL_buffers.cpp @@ -0,0 +1 @@ +hpl_torus_PCIE.cpp \ No newline at end of file From 019aa15f9246a2b66728c6a85a022a947c3d8942 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 20 May 2022 14:56:56 +0100 Subject: [PATCH 075/318] Update PCIE config for U280 --- ...CCL_buffers.cmake => Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake} | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) rename LINPACK/configs/{Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake => Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake} (86%) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake similarity index 86% rename from LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake rename to LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake index 941a1d78..5ddc6b30 100644 --- a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake @@ -14,11 +14,13 @@ set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) - +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE) +set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) # LINPACK specific options set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be 
replicated" FORCE) set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) From 0eab2932bff646600d7daa623a64607219d08d3a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 20 May 2022 18:20:24 +0100 Subject: [PATCH 076/318] Fix build for XRT host without ACCL --- LINPACK/src/host/execution_types/execution_types.hpp | 2 +- LINPACK/src/host/linpack_benchmark.hpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/LINPACK/src/host/execution_types/execution_types.hpp b/LINPACK/src/host/execution_types/execution_types.hpp index 294115ea..457f4e85 100644 --- a/LINPACK/src/host/execution_types/execution_types.hpp +++ b/LINPACK/src/host/execution_types/execution_types.hpp @@ -27,9 +27,9 @@ SOFTWARE. #include "execution_types/execution_iec.hpp" #endif #ifdef USE_XRT_HOST +#include "execution_types/execution_xrt_pcie.hpp" #ifdef USE_ACCL #include "execution_types/execution_accl_buffers.hpp" -#include "execution_types/execution_xrt_pcie.hpp" #endif #endif #endif \ No newline at end of file diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index dd33c0f0..d1d3093c 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -314,7 +314,9 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmarkexecutionSettings, data); break; +#ifdef USE_ACCL case hpcc_base::CommunicationType::accl : timings = execution::accl_buffers::calculate(*this->executionSettings, data); break; +#endif #endif default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(this->executionSettings->programSettings->communicationType)); } From 6d90f70b21aba131d8b0cc3a54391cb2cf0e5a3c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 20 May 2022 18:20:56 +0100 Subject: [PATCH 077/318] Adjust to changed ACCL interface --- LINPACK/src/host/execution_types/execution_accl_buffers.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp index 29645a51..5e26e267 100644 --- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -86,9 +86,9 @@ std::unique_ptr calculate( } // Create communicators from sub-groups - ACCL::CommunicatorId row_comm = config.accl->create_communicator( + ACCL::communicatorId row_comm = config.accl->create_communicator( row_ranks, config.programSettings->torus_col); - ACCL::CommunicatorId col_comm = config.accl->create_communicator( + ACCL::communicatorId col_comm = config.accl->create_communicator( col_ranks, config.programSettings->torus_row); // Create global memory buffers From 41c26033fac43196877d6da317e3fc313db437f8 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 20 May 2022 18:21:43 +0100 Subject: [PATCH 078/318] Use CMake parameter for ACCL buffer size --- LINPACK/CMakeLists.txt | 5 +++++ LINPACK/src/common/parameters.h.in | 1 + 2 files changed, 6 insertions(+) diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt index fb17db96..a33cc82f 100755 --- a/LINPACK/CMakeLists.txt +++ b/LINPACK/CMakeLists.txt @@ -19,6 +19,11 @@ if (TEST_UNIFORM) set(TEST_HOST_FLAGS "--uniform") endif() +if (USE_ACCL) + math(EXPR calculate_accl_buffer_size "(2^${LOCAL_MEM_BLOCK_LOG})^2 * 8") + set(ACCL_BUFFER_SIZE ${calculate_accl_buffer_size} CACHE STRING 
"Size of ACCL buffers in bytes") +endif() + if (TEST_EMULATION) set(TEST_HOST_FLAGS "--emulation") endif() diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in index 4c036fb9..a5bac5e0 100644 --- a/LINPACK/src/common/parameters.h.in +++ b/LINPACK/src/common/parameters.h.in @@ -30,6 +30,7 @@ #cmakedefine USE_SVM #cmakedefine DISTRIBUTED_VALIDATION +#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@ /* Short description of the program From 26a4dea22424b0a62b6ac67f17a7650c6c0e8562 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 24 May 2022 14:11:45 +0100 Subject: [PATCH 079/318] Add initial ACCL stream implementation --- .../configs/Xilinx_U280_DDR_ACCL_stream.cmake | 28 +++ ...nk.xilinx.transpose_pq_accl_stream.ddr.ini | 76 +++++++ PTRANS/src/device/CMakeLists.txt | 2 +- .../src/device/transpose_PQ_ACCL_stream.cpp | 202 ++++++++++++++++++ 4 files changed, 307 insertions(+), 1 deletion(-) create mode 100644 PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini create mode 100644 PTRANS/src/device/transpose_PQ_ACCL_stream.cpp diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake new file mode 100644 index 00000000..89114c4d --- /dev/null +++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake @@ -0,0 +1,28 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(ACCL_STACK_TYPE "TCP" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini new file mode 100644 index 00000000..3c7fcf31 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini @@ -0,0 +1,76 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_sum:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=transpose_read0:1
+nk=transpose_write0:1
+
+# Kernels Floorplanning
+slr=compression_0_0:SLR0
+slr=compression_0_1:SLR0
+slr=compression_0_2:SLR0
+slr=arith_0:SLR0
+slr=ccl_offload_0:SLR0
+slr=hostctrl_0:SLR0
+slr=networklayer_0:SLR1
+slr=cmac_0:SLR2
+slr=transpose_read0_1:SLR0
+slr=transpose_write0_1:SLR0
+
+sp=ccl_offload_0.m_axi_0:DDR[0:1]
+sp=ccl_offload_0.m_axi_1:DDR[0:1]
+sp=transpose_read0_1.m_axi_gmem0:DDR[0:1]
+sp=transpose_write0_1.m_axi_gmem0:DDR[0]
+sp=transpose_write0_1.m_axi_gmem1:DDR[1]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl
+stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl
+

diff --git a/PTRANS/src/device/CMakeLists.txt b/PTRANS/src/device/CMakeLists.txt
index 21176719..34c47551 100644
--- a/PTRANS/src/device/CMakeLists.txt
+++ b/PTRANS/src/device/CMakeLists.txt
@@ -11,7 +11,7 @@ if (INTELFPGAOPENCL_FOUND)
 endif()

 if (VITIS_FOUND)
-    generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_ACCL_buffers)
+    generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_ACCL_buffers transpose_PQ_ACCL_stream)
     add_test(NAME test_emulation_PQ_PCIE_xilinx COMMAND Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
     add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})

diff --git
a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp
new file mode 100644
index 00000000..223dfd53
--- /dev/null
+++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp
@@ -0,0 +1,202 @@
+/******************************************************************************
+ *  Author: Arjun Ramaswami
+ *
+ *  Edited by Marius Meyer:
+ *  - Adapt to used kernel signature
+ *  - Change to row-column loop structure
+ *****************************************************************************/
+#include "parameters.h"
+#include "hls_stream.h"
+#include "ap_int.h"
+#include "ap_utils.h"
+#include "ap_axi_sdata.h"
+
+const unsigned int block_size = BLOCK_SIZE;
+const unsigned int channel_width = CHANNEL_WIDTH;
+
+extern "C" {
+
+// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
+
+/**
+ * Read blocks of matrix A and transpose them in memory.
+ * Write the block into an external channel.
+ *
+ * Will do the following:
+ *
+ * A -> trans(A) -> ext. ch
+ *
+ * @param A Buffer for matrix A
+ * @param offset_a Offset in blocks that is used to read the current block of A. Since A is read column-wise
+                on the block level, the whole matrix A might be written to global memory and the relevant columns
+                need to be picked using this offset.
+ * @param number_of_blocks The number of blocks that will be processed starting from the block offset
+ * @param width_in_blocks The width of matrix A in blocks
+ * @param height_in_blocks The height of matrix A in blocks
+ * @param krnl2cclo Stream that forwards the transposed blocks to the CCLO
+ */
+void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
+            const unsigned int offset_a,
+            const unsigned int number_of_blocks,
+            const unsigned int width_in_blocks,
+            const unsigned int height_in_blocks,
+            hls::stream<ap_axiu<512, 0, 0, 8> > &krnl2cclo) {
+#pragma HLS INTERFACE axis register both port=krnl2cclo
+#pragma HLS INTERFACE ap_ctrl_none port=return
+
+
+    // local memory double buffer for a matrix block
+    DEVICE_DATA_TYPE a_block[2][block_size * block_size / channel_width][channel_width];
+#pragma HLS ARRAY_PARTITION variable = a_block complete dim = 3
+
+    // transpose the matrix block-wise from global memory
+block_loop:
+    for (unsigned int block = 0; block < number_of_blocks + 1; block++) {
+
+read_A:
+        for (unsigned int row = 0; row < block_size; row++) {
+read_A_line:
+            for (unsigned int col = 0; col < block_size / channel_width; col++) {
+#pragma HLS PIPELINE
+                unsigned long block_row_a = (block + offset_a) / width_in_blocks;
+                unsigned long block_col_a = (block + offset_a) % width_in_blocks;
+                unsigned long ls_address_trans = block_col_a * block_size * block_size * height_in_blocks +
+                                                 block_row_a * block_size +
+                                                 row * block_size * height_in_blocks;
+
+#ifdef EMULATE
+                // This condition is actually required to not read out of bounds
+                // but prevents memory bursts, so for hardware this should be removed
+                // In emulation it prevents segfaults
+                if (block < number_of_blocks) {
+#endif
+                    // read in block of A from global memory and store it in a memory efficient manner for transpose
+                    DEVICE_DATA_TYPE rotate_in[channel_width];
+#pragma HLS ARRAY_PARTITION variable = rotate_in complete dim = 0
+
+                    // Blocks of a will be stored columnwise in global memory
+                    for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                        rotate_in[unroll_count] = A[ls_address_trans + col * channel_width + unroll_count];
+                    }
+
+                    unsigned int chunk = row * (block_size / channel_width) + col;
+
+                    unsigned rot = (row) % (channel_width);
+
+                    // rotate temporary buffer to store data into local buffer
+                    for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                        // every block of (N / channel_width), rotates the index by 1
+                        // store in double buffer
+                        a_block[block & 1][chunk][unroll_count] = rotate_in[(unroll_count + channel_width - rot)
+                                                                            % (channel_width)];
+                    }
+#ifdef EMULATE
+                }
+#endif
+                if (block > 0) {
+                    DEVICE_DATA_TYPE data_chunk[channel_width];
+#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0
+                    DEVICE_DATA_TYPE rotate_out[channel_width];
+#pragma HLS ARRAY_PARTITION variable = rotate_out complete dim = 0
+
+                    unsigned int base = col * block_size;
+                    unsigned int offset = row / channel_width;
+
+                    for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                        unsigned rot = ((channel_width + unroll_count - row) * (block_size / channel_width)) %
+                                       (block_size);
+                        unsigned row_rotate = base + offset + rot;
+                        rotate_out[unroll_count] = a_block[(block - 1) & 1][row_rotate][unroll_count];
+                    }
+
+                    unsigned rot_out = row % (channel_width);
+
+                    // undo the rotation to restore the original element order
+                    for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                        data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % (channel_width)];
+                    }
+
+                    ap_uint<512> data = 0;
+
+                    // pack the transposed chunk into a single 512 bit stream word
+                    for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                        data |= ((ap_uint<512>)((ap_uint<8*sizeof(DEVICE_DATA_TYPE)>)data_chunk[unroll_count])) << (unroll_count * 8 * sizeof(DEVICE_DATA_TYPE));
+                    }
+
+                    ap_axiu<512, 0, 0, 8> tmp;
+                    tmp.data = data;
+                    tmp.dest = 0;
+                    tmp.keep = -1;
+                    krnl2cclo.write(tmp);
+                }
+            }
+        }
+    }
+}
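A short aside on the rotation scheme used by transpose_read above (illustrative, not part of the kernel source): each chunk is rotated by row % channel_width before it is stored, so consecutive output rows never read from the same local-memory partition, and the inverse rotation on the way out restores the element order. A scaled-down, host-only sketch of the two rotations cancelling, with channel_width reduced to 4:

// Host-only sketch of the rotate-on-store / rotate-on-load pattern above.
// cw and rot are stand-ins for CHANNEL_WIDTH and row % channel_width.
#include <array>
#include <cassert>

int main() {
    constexpr unsigned cw = 4;
    std::array<int, cw> in{10, 11, 12, 13}, stored{}, out{};
    const unsigned rot = 3;
    for (unsigned u = 0; u < cw; u++)        // store path, as in transpose_read
        stored[u] = in[(u + cw - rot) % cw];
    for (unsigned u = 0; u < cw; u++)        // load path: inverse rotation
        out[u] = stored[(u + rot) % cw];
    assert(out == in);                       // the rotation is undone
    return 0;
}
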
+/**
+ * Read transposed blocks of A from the external channel, add the matching
+ * blocks of B and write the result to A_out:
+ *
+ * ext. channel -> trans(A) + B -> A_out
+ *
+ * @param B Buffer for matrix B
+ * @param A_out Buffer for result matrix
+ * @param offset_b Offset in blocks that is used to read the current block of B. Since A is read column-wise
+                on the block level, the whole matrix A might be written to global memory and the relevant columns
+                need to be picked using this offset.
+ * @param number_of_blocks The number of blocks that will be processed starting from the block offset
+ * @param width_in_blocks The width of matrix A in blocks
+ * @param height_in_blocks The height of matrix A in blocks
+ * @param cclo2krnl Stream of transposed blocks of A arriving from the CCLO
+ */
+void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B,
+            DEVICE_DATA_TYPE *A_out,
+            const unsigned int offset_b,
+            const unsigned int number_of_blocks,
+            const unsigned int width_in_blocks,
+            const unsigned int height_in_blocks,
+            hls::stream<ap_axiu<512, 0, 0, 8> > &cclo2krnl) {
+#pragma HLS INTERFACE axis register both port=cclo2krnl
+#pragma HLS INTERFACE ap_ctrl_none port=return
+
+    // transpose the matrix block-wise from global memory
+block_loop:
+    for (unsigned int block = 0; block < number_of_blocks; block++) {
+
+        // Read transposed A from local memory and add B
+read_B:
+        for (unsigned int row = 0; row < block_size; row++) {
+read_B_line:
+            for (unsigned int col = 0; col < block_size / channel_width; col++) {
+                unsigned long block_row = (block + offset_b) / width_in_blocks;
+                unsigned long block_col = (block + offset_b) % width_in_blocks;
+                unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks +
+                                               block_col * block_size +
+                                               row * block_size * width_in_blocks;
+                unsigned int chunk = row * (block_size / channel_width) + col;
+
+                DEVICE_DATA_TYPE data_chunk[channel_width];
+#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0
+
+                ap_axiu<512, 0, 0, 8> tmp = cclo2krnl.read();
+
+                // unpack the received 512 bit stream word into single values
+                for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                    data_chunk[unroll_count] = (DEVICE_DATA_TYPE)(ap_uint<8*sizeof(DEVICE_DATA_TYPE)>)(tmp.data >> (unroll_count * 8 * sizeof(DEVICE_DATA_TYPE)));
+                }
+
+                // add the matching chunk of B to the received transposed chunk of A
+                for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                    data_chunk[unroll_count] += B[ls_address_row + col * channel_width + unroll_count];
+                }
+
+                for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                    A_out[ls_address_row + col * channel_width + unroll_count] = data_chunk[unroll_count];
+                }
+            }
+        }
+    }
+}
+
+// PY_CODE_GEN block_end
+
+}
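The two kernels of this patch exchange data as 512 bit ap_axiu words into which channel_width values are packed, each at a bit offset of unroll_count * 8 * sizeof(DEVICE_DATA_TYPE). A minimal host-only sketch of the same pack/unpack arithmetic, scaled down to a 64-bit word holding two 32-bit values so it compiles without the HLS headers:

// Illustrative only: the bit-offset packing used by the stream kernels above.
#include <cassert>
#include <cstdint>

int main() {
    const std::uint32_t chunk[2] = {0xAABBCCDDu, 0x11223344u};
    std::uint64_t word = 0;
    for (unsigned u = 0; u < 2; u++)   // pack, as on the krnl2cclo side
        word |= static_cast<std::uint64_t>(chunk[u]) << (u * 8 * sizeof(std::uint32_t));
    for (unsigned u = 0; u < 2; u++) { // unpack, as on the cclo2krnl side
        std::uint32_t v = static_cast<std::uint32_t>(word >> (u * 8 * sizeof(std::uint32_t)));
        assert(v == chunk[u]);
    }
    return 0;
}
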
From 6c12ead036eb8926dcf2ef3d76208b58469d2744 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 24 May 2022 15:08:48 +0100
Subject: [PATCH 080/318] Adding tripcount for better reports

---
 PTRANS/src/device/transpose_PQ_ACCL_stream.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp
index 223dfd53..2cda216f 100644
--- a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp
+++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp
@@ -53,6 +53,7 @@ void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
     // transpose the matrix block-wise from global memory
 block_loop:
     for (unsigned int block = 0; block < number_of_blocks + 1; block++) {
+#pragma HLS loop_tripcount min=1 max=1024 avg=1

 read_A:
         for (unsigned int row = 0; row < block_size; row++) {
@@ -161,7 +162,7 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B,
     // transpose the matrix block-wise from global memory
 block_loop:
     for (unsigned int block = 0; block < number_of_blocks; block++) {
-
+#pragma HLS loop_tripcount min=1 max=1024 avg=1
         // Read transposed A from local memory and add B
 read_B:
         for (unsigned int row = 0; row < block_size; row++) {

From e1bf6e86341a6006bb4b3c818e2e572bc0d7ea0e Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 3 Jun 2022 14:59:18 +0100
Subject: [PATCH 081/318] Add support for multiple CMAC kernels for UDP --- cmake/accl.cmake | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index 2875657d..cdf23b0b 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -1,7 +1,7 @@ # General definitions set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL") -set(ACCL_UDP_ETH_IF 0 CACHE STRING "Ethernet interface used. On ETHZ: 0 = switch, 1 = direct") +set(ACCL_UDP_ETH_IFS 1 CACHE STRING "Number of Ethernet interfaces to synthesize for UDP stack") set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform") set(ACCL_BUFFER_SIZE 8192 CACHE STRING "Size of ACCL buffers in bytes") set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware) @@ -10,7 +10,6 @@ set(ACCL_CCLO_BUILD_ARGS ${ACCL_CCLO_ADDITIONAL_BUILD_ARGS}) # UDP related definitions set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/) set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core) -set(ACCL_UDP_MAC_XO ${ACCL_VNX_DIR}/Ethernet/_x.${FPGA_BOARD_NAME}/cmac_${ACCL_UDP_ETH_IF}.xo) set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo) set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HMB) if (ACCL_STACK_TYPE STREQUAL "UDP") @@ -19,10 +18,17 @@ if (ACCL_STACK_TYPE STREQUAL "UDP") list(APPEND ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE}) endif() -add_custom_command( - OUTPUT ${ACCL_UDP_MAC_XO} - COMMAND make -C ${ACCL_VNX_DIR}/Ethernet DEVICE=${FPGA_BOARD_NAME} INTERFACE=${ACCL_UDP_ETH_IF} all - WORKING_DIRECTORY ${ACCL_HARDWARE_DIR}) +set(ACCL_UDP_MAC_XOS "") + +math(EXPR loopend "${ACCL_UDP_ETH_IFS} - 1") +foreach(i RANGE ${loopend}) + set(CURRENT_MAC_XO ${ACCL_VNX_DIR}/Ethernet/_x.${FPGA_BOARD_NAME}/cmac_${i}.xo) + add_custom_command( + OUTPUT ${CURRENT_MAC_XO} + COMMAND make -C ${ACCL_VNX_DIR}/Ethernet DEVICE=${FPGA_BOARD_NAME} INTERFACE=${i} all + WORKING_DIRECTORY ${ACCL_HARDWARE_DIR}) + list(APPEND ACCL_UDP_MAC_XOS ${CURRENT_MAC_XO}) +endforeach() add_custom_command( OUTPUT ${ACCL_UDP_NET_XO} @@ -31,7 +37,7 @@ add_custom_command( add_custom_target( accl_udp_stack - DEPENDS ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO}) + DEPENDS ${ACCL_UDP_MAC_XOS} ${ACCL_UDP_NET_XO}) # TCP related definitions set(ACCL_TCP_BASE_DIR ${ACCL_HARDWARE_DIR}/Vitis_with_100Gbps_TCP-IP) @@ -114,7 +120,7 @@ add_custom_target( ${ACCL_PLUGINS_COMPRESSION}) set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} - ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL with UDP") + ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XOS} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL with UDP") set(ACCL_TCP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_TCP_CMAC_XO} ${ACCL_TCP_XO} CACHE INTERNAL "Object files required for ACCL with TCP") From 75ecdb73744149e6e05d69cafa813f324c10706a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 5 Jul 2022 11:00:25 +0200 Subject: [PATCH 082/318] Add ACCL buffer size to b_eff --- b_eff/CMakeLists.txt | 5 +++++ b_eff/src/common/parameters.h.in | 1 + b_eff/src/host/CMakeLists.txt | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/b_eff/CMakeLists.txt 
b/b_eff/CMakeLists.txt
index 13d93b1b..f150bcc9 100755
--- a/b_eff/CMakeLists.txt
+++ b/b_eff/CMakeLists.txt
@@ -19,6 +19,11 @@ set(USE_DEPRECATED_HPP_HEADER No)

 set(COMMUNICATION_TYPE_SUPPORT_ENABLED Yes)

+if (USE_ACCL)
+    math(EXPR calculate_accl_buffer_size "(1 << ${DEFAULT_MAX_MESSAGE_SIZE}) * 4")
+    set(ACCL_BUFFER_SIZE ${calculate_accl_buffer_size} CACHE STRING "Size of ACCL buffers in bytes")
+endif()
+
 set(DATA_TYPE char)
 include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake)
 unset(DATA_TYPE CACHE)

diff --git a/b_eff/src/common/parameters.h.in b/b_eff/src/common/parameters.h.in
index d404bfd7..5c823610 100644
--- a/b_eff/src/common/parameters.h.in
+++ b/b_eff/src/common/parameters.h.in
@@ -23,6 +23,7 @@
 #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@

 #cmakedefine HOST_EMULATION_REORDER
+#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@

 /*
 Short description of the program.

diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt
index d0be57ba..5e22b54a 100755
--- a/b_eff/src/host/CMakeLists.txt
+++ b/b_eff/src/host/CMakeLists.txt
@@ -27,7 +27,7 @@ if (Vitis_FOUND)
     target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base)
     target_link_libraries(${LIB_NAME}_xilinx accl)
     target_include_directories(${LIB_NAME}_xilinx PRIVATE ${ACCL_INCLUDE_PATH})
-    target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx)
+    target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx zmqpp)
     target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA)
     target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA)
     target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")

From 1ac78f8a31788e1cd8dec0cf5ef9473ce9080b85 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 5 Jul 2022 11:00:41 +0200
Subject: [PATCH 083/318] Fix b_eff accl calls

---
 b_eff/src/host/execution_types/execution_accl.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
index 81673835..c4686b29 100644
--- a/b_eff/src/host/execution_types/execution_accl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -79,8 +79,8 @@ namespace network::execution_types::accl {
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
             for (int l = 0; l < looplength; l++) {
-                config.accl->send(0, *acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
-                config.accl->recv(0, *acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                config.accl->send(*acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                config.accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
             }
             auto endCalculation = std::chrono::high_resolution_clock::now();
             calculationTime += std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation - startCalculation).count();

From 3ad2069ba9404646ebd45d8072c447c8572ce75f Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 5 Jul 2022 11:01:02 +0200
Subject: [PATCH 084/318] Switch default accl branch to main

---
 extern/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt
index 341f73cd..5587a0e1 100644
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -62,7 +62,7
@@ FetchContent_Declare( extern_accl GIT_REPOSITORY https://github.com/Xilinx/ACCL.git - GIT_TAG dev) + GIT_TAG main) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) From 67f6967d1a7828a644b0fb9cec7f3a84a967c605 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 5 Jul 2022 11:01:24 +0200 Subject: [PATCH 085/318] Link PTRANS host with zmqpp --- PTRANS/src/host/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 2404394f..5bb10e54 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -39,7 +39,7 @@ if (Vitis_FOUND) add_executable(${HOST_EXE_NAME}_xilinx main.cpp) target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) - target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) + target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx zmqpp) target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") From 5aa44d045b36d57f65dc11dc91681da6366de3a8 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 5 Jul 2022 11:03:14 +0200 Subject: [PATCH 086/318] Add copy-a option to PTRANS --- PTRANS/src/host/transpose_benchmark.hpp | 40 ++++++++++++++++++++++++- PTRANS/src/host/transpose_data.cpp | 2 +- PTRANS/src/host/transpose_data.hpp | 6 ++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 392789c8..585e60be 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -81,7 +81,8 @@ public hpcc_base::HpccFpgaBenchmark()->default_value(std::to_string(DEFAULT_P_VALUE))) ("distribute-buffers", "Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. 
This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs.") ("handler", "Specify the used data handler that distributes the data over devices and memory banks", - cxxopts::value()->default_value(DEFAULT_DIST_TYPE)); + cxxopts::value()->default_value(DEFAULT_DIST_TYPE)) + ("copy-a", "Create a copy of matrix A for each kernel replication"); } std::unique_ptr> dataHandler; @@ -164,6 +165,11 @@ public hpcc_base::HpccFpgaBenchmarkdataHandler->exchangeData(data); +#ifndef NDEBUG + std::vector oldA(this->executionSettings->programSettings->blockSize * this->executionSettings->programSettings->blockSize * data.numBlocks); + std::copy(data.A, data.A + oldA.size(), oldA.data()); +#endif + this->dataHandler->reference_transpose(data); double max_error = 0.0; @@ -175,6 +181,38 @@ public hpcc_base::HpccFpgaBenchmark*>(this->dataHandler.get())->getHeightforRank(); + long width_per_rank = reinterpret_cast*>(this->dataHandler.get())->getWidthforRank(); + if (error_count > 0) { + std::cout << "A:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << oldA[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << "B:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.B[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << "Transposed A:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.A[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + +#endif + double global_max_error = 0; int global_error_count = 0; MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); diff --git a/PTRANS/src/host/transpose_data.cpp b/PTRANS/src/host/transpose_data.cpp index 20d6560f..e8a7c8f0 100644 --- a/PTRANS/src/host/transpose_data.cpp +++ b/PTRANS/src/host/transpose_data.cpp @@ -7,7 +7,7 @@ transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), matrixSize(results["m"].as() * results["b"].as()), blockSize(results["b"].as()), dataHandlerIdentifier(transpose::data_handler::stringToHandler(results["handler"].as())), - distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()) { + distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()), copyA(results["copy-a"].count() > 0) { // auto detect data distribution type if required if (dataHandlerIdentifier == transpose::data_handler::DataHandlerType::automatic) { diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index c73a9959..fed4eff6 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -74,6 +74,12 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { */ bool distributeBuffers; + /** + * @brief If true, create a copy of matrix A for each kernel replication + * + */ + bool copyA; + /** * @brief Construct a new Transpose Program Settings object * From c373e0249523a8813fce693fdfbe17d39976bd30 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 5 Jul 2022 11:03:48 +0200 
Subject: [PATCH 087/318] Implement copy-a for accl --- .../execution_types/execution_xrt_accl_pq.hpp | 54 ++++++++----------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp index 8d3edac5..3fdaeb1f 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -62,7 +62,6 @@ void accl_exchangeData( acclBuffersA.push_back(accl.create_buffer( bo, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); - acclBuffersA.back()->sync_from_device(); } if (pq_width == pq_height) { @@ -82,33 +81,24 @@ void accl_exchangeData( auto acclBufferA_recv = accl.create_buffer( data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); - acclBufferA_recv->sync_to_device(); // Send and receive matrix A using ACCL directly on FPGA - if (mpi_comm_rank < pair_rank) { - for (int block_num = 0; block_num < data.numBlocks; block_num++) { - accl.send(0, - *acclBuffersA[0]->slice( + for (int block_chunk = 0; block_chunk < data.numBlocks; block_chunk+= 16) { + for (int block_num = block_chunk; block_num < std::min(data.numBlocks, block_chunk + 16); block_num++) { + accl.send(*acclBuffersA[0]->slice( data.blockSize * data.blockSize * block_num, data.blockSize * data.blockSize * (block_num + 1)), - data.blockSize * data.blockSize, pair_rank, 0, true, + data.blockSize * data.blockSize, pair_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM); } - accl.recv(0, *acclBufferA_recv, - data.blockSize * data.blockSize * data.numBlocks, pair_rank, - 1, true, ACCL::streamFlags::NO_STREAM); - } else { - accl.recv(0, *acclBufferA_recv, - data.blockSize * data.blockSize * data.numBlocks, pair_rank, - 0, true, ACCL::streamFlags::NO_STREAM); - for (int block_num = 0; block_num < data.numBlocks; block_num++) { - accl.send(0, - *acclBuffersA[0]->slice( + for (int block_num = block_chunk; block_num < std::min(data.numBlocks, block_chunk + 16); block_num++) { + accl.recv(*acclBufferA_recv->slice( data.blockSize * data.blockSize * block_num, data.blockSize * data.blockSize * (block_num + 1)), - data.blockSize * data.blockSize, pair_rank, 1, true, - ACCL::streamFlags::NO_STREAM); + data.blockSize * data.blockSize, pair_rank, + 1, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM); } } + accl.copy(*acclBufferA_recv, *acclBuffersA[0], data.blockSize * data.blockSize * data.numBlocks, true, true); } @@ -275,12 +265,12 @@ void accl_exchangeData( << std::flush; #endif accl_requests[current_parallel_execution] = (accl.send( - 0, *send_buffers[current_parallel_execution], sending_size, - send_rank, 0, true, ACCL::streamFlags::NO_STREAM, + *send_buffers[current_parallel_execution], sending_size, + send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true)); accl_requests[current_parallel_execution + gcd] = (accl.recv( - 0, *recv_buffers[current_parallel_execution], sending_size, - send_rank, 0, true, ACCL::streamFlags::NO_STREAM, + *recv_buffers[current_parallel_execution], sending_size, + send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true)); // Increase the counter for parallel executions current_parallel_execution = (current_parallel_execution + 1) % gcd; @@ -458,10 +448,13 @@ static std::unique_ptr calculate( *config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + 
"}").c_str()); - xrt::bo bufferA(*config.device, data.A, + if (r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); + bufferListA.push_back(bufferA); + } xrt::bo bufferB( *config.device, &data.B[bufferStartList[r] * data.blockSize * data.blockSize], @@ -469,7 +462,6 @@ static std::unique_ptr calculate( xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); - bufferListA.push_back(bufferA); bufferListB.push_back(bufferB); bufferListA_out.push_back(bufferA_out); transposeKernelList.push_back(transposeKernel); @@ -487,7 +479,9 @@ static std::unique_ptr calculate( auto startTransfer = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + if (r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); } auto endTransfer = std::chrono::high_resolution_clock::now(); @@ -501,12 +495,6 @@ static std::unique_ptr calculate( auto startCalculation = std::chrono::high_resolution_clock::now(); // Exchange A data via ACCL - if (bufferListA.size() > 1) { - std::cerr << "WARNING: Only the matrix A of the first kernel replication " - "will be exchanged " - "via ACCL!" - << std::endl; - } #ifndef NDEBUG std::cout << "Start data exchange with ACCL" << std::endl; #endif @@ -519,7 +507,7 @@ static std::unique_ptr calculate( auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { runs.push_back(transposeKernelList[r]( - bufferListA[r], bufferListB[r], bufferListA_out[r], + (config.programSettings->copyA ? 
bufferListA[r] : bufferListA[0]), bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]), static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), From e87731de58a225e8f2dbae59c5e3db12af480ff4 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 5 Jul 2022 11:04:00 +0200 Subject: [PATCH 088/318] Implement copy-a for xrt --- .../execution_types/execution_xrt_pcie_pq.hpp | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index b5788fed..f0d4eeed 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -122,10 +122,13 @@ static std::unique_ptr calculate( xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); - xrt::bo bufferA(*config.device, data.A, + if ( r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); + bufferListA.push_back(bufferA); + } xrt::bo bufferB( *config.device, &data.B[bufferStartList[r] * data.blockSize * data.blockSize], @@ -138,7 +141,6 @@ static std::unique_ptr calculate( xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); - bufferListA.push_back(bufferA); bufferListB.push_back(bufferB); bufferListA_out.push_back(bufferA_out); transposeKernelList.push_back(transposeKernel); @@ -152,7 +154,9 @@ static std::unique_ptr calculate( auto startTransfer = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); } auto endTransfer = std::chrono::high_resolution_clock::now(); @@ -168,7 +172,9 @@ static std::unique_ptr calculate( if (mpi_size > 1) { for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } } // Exchange A data via PCIe and MPI @@ -177,7 +183,9 @@ static std::unique_ptr calculate( std::copy(data.A, data.A + data.numBlocks * data.blockSize * data.blockSize, data.exchange); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } } } @@ -185,7 +193,7 @@ static std::unique_ptr calculate( auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { runs.push_back(transposeKernelList[r]( - bufferListA[r], bufferListB[r], bufferListA_out[r], + (config.programSettings->copyA ? 
bufferListA[r] : bufferListA[0]), bufferListB[r], bufferListA_out[r], static_cast(bufferStartList[r] + bufferOffsetList[r]), static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), From 50be2cf53a85387f387525a784615262bc5f3f83 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 8 Aug 2022 17:45:20 +0200 Subject: [PATCH 089/318] Add proper ACCL UDP support on hardware --- .../execution_types/execution_xrt_accl_pq.hpp | 54 +++++++---------- .../execution_types/execution_xrt_pcie_pq.hpp | 20 +++++-- PTRANS/src/host/transpose_benchmark.hpp | 40 ++++++++++++- PTRANS/src/host/transpose_data.cpp | 2 +- PTRANS/src/host/transpose_data.hpp | 6 ++ shared/CMakeLists.txt | 6 +- shared/setup/fpga_setup_accl.cpp | 58 ++++++++++++++++++- shared/setup/fpga_setup_xrt.cpp | 5 +- 8 files changed, 145 insertions(+), 46 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp index 8d3edac5..3fdaeb1f 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -62,7 +62,6 @@ void accl_exchangeData( acclBuffersA.push_back(accl.create_buffer( bo, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); - acclBuffersA.back()->sync_from_device(); } if (pq_width == pq_height) { @@ -82,33 +81,24 @@ void accl_exchangeData( auto acclBufferA_recv = accl.create_buffer( data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); - acclBufferA_recv->sync_to_device(); // Send and receive matrix A using ACCL directly on FPGA - if (mpi_comm_rank < pair_rank) { - for (int block_num = 0; block_num < data.numBlocks; block_num++) { - accl.send(0, - *acclBuffersA[0]->slice( + for (int block_chunk = 0; block_chunk < data.numBlocks; block_chunk+= 16) { + for (int block_num = block_chunk; block_num < std::min(data.numBlocks, block_chunk + 16); block_num++) { + accl.send(*acclBuffersA[0]->slice( data.blockSize * data.blockSize * block_num, data.blockSize * data.blockSize * (block_num + 1)), - data.blockSize * data.blockSize, pair_rank, 0, true, + data.blockSize * data.blockSize, pair_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM); } - accl.recv(0, *acclBufferA_recv, - data.blockSize * data.blockSize * data.numBlocks, pair_rank, - 1, true, ACCL::streamFlags::NO_STREAM); - } else { - accl.recv(0, *acclBufferA_recv, - data.blockSize * data.blockSize * data.numBlocks, pair_rank, - 0, true, ACCL::streamFlags::NO_STREAM); - for (int block_num = 0; block_num < data.numBlocks; block_num++) { - accl.send(0, - *acclBuffersA[0]->slice( + for (int block_num = block_chunk; block_num < std::min(data.numBlocks, block_chunk + 16); block_num++) { + accl.recv(*acclBufferA_recv->slice( data.blockSize * data.blockSize * block_num, data.blockSize * data.blockSize * (block_num + 1)), - data.blockSize * data.blockSize, pair_rank, 1, true, - ACCL::streamFlags::NO_STREAM); + data.blockSize * data.blockSize, pair_rank, + 1, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM); } } + accl.copy(*acclBufferA_recv, *acclBuffersA[0], data.blockSize * data.blockSize * data.numBlocks, true, true); } @@ -275,12 +265,12 @@ void accl_exchangeData( << std::flush; #endif accl_requests[current_parallel_execution] = (accl.send( - 0, *send_buffers[current_parallel_execution], sending_size, - send_rank, 0, true, ACCL::streamFlags::NO_STREAM, + *send_buffers[current_parallel_execution], sending_size, + 
send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true)); accl_requests[current_parallel_execution + gcd] = (accl.recv( - 0, *recv_buffers[current_parallel_execution], sending_size, - send_rank, 0, true, ACCL::streamFlags::NO_STREAM, + *recv_buffers[current_parallel_execution], sending_size, + send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true)); // Increase the counter for parallel executions current_parallel_execution = (current_parallel_execution + 1) % gcd; @@ -458,10 +448,13 @@ static std::unique_ptr calculate( *config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); - xrt::bo bufferA(*config.device, data.A, + if (r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); + bufferListA.push_back(bufferA); + } xrt::bo bufferB( *config.device, &data.B[bufferStartList[r] * data.blockSize * data.blockSize], @@ -469,7 +462,6 @@ static std::unique_ptr calculate( xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); - bufferListA.push_back(bufferA); bufferListB.push_back(bufferB); bufferListA_out.push_back(bufferA_out); transposeKernelList.push_back(transposeKernel); @@ -487,7 +479,9 @@ static std::unique_ptr calculate( auto startTransfer = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + if (r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); } auto endTransfer = std::chrono::high_resolution_clock::now(); @@ -501,12 +495,6 @@ static std::unique_ptr calculate( auto startCalculation = std::chrono::high_resolution_clock::now(); // Exchange A data via ACCL - if (bufferListA.size() > 1) { - std::cerr << "WARNING: Only the matrix A of the first kernel replication " - "will be exchanged " - "via ACCL!" - << std::endl; - } #ifndef NDEBUG std::cout << "Start data exchange with ACCL" << std::endl; #endif @@ -519,7 +507,7 @@ static std::unique_ptr calculate( auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { runs.push_back(transposeKernelList[r]( - bufferListA[r], bufferListB[r], bufferListA_out[r], + (config.programSettings->copyA ? 
bufferListA[r] : bufferListA[0]), bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]), static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index b5788fed..f0d4eeed 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -122,10 +122,13 @@ static std::unique_ptr calculate( xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); - xrt::bo bufferA(*config.device, data.A, + if ( r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); + bufferListA.push_back(bufferA); + } xrt::bo bufferB( *config.device, &data.B[bufferStartList[r] * data.blockSize * data.blockSize], @@ -138,7 +141,6 @@ static std::unique_ptr calculate( xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); - bufferListA.push_back(bufferA); bufferListB.push_back(bufferB); bufferListA_out.push_back(bufferA_out); transposeKernelList.push_back(transposeKernel); @@ -152,7 +154,9 @@ static std::unique_ptr calculate( auto startTransfer = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); } auto endTransfer = std::chrono::high_resolution_clock::now(); @@ -168,7 +172,9 @@ static std::unique_ptr calculate( if (mpi_size > 1) { for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } } // Exchange A data via PCIe and MPI @@ -177,7 +183,9 @@ static std::unique_ptr calculate( std::copy(data.A, data.A + data.numBlocks * data.blockSize * data.blockSize, data.exchange); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } } } @@ -185,7 +193,7 @@ static std::unique_ptr calculate( auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { runs.push_back(transposeKernelList[r]( - bufferListA[r], bufferListB[r], bufferListA_out[r], + (config.programSettings->copyA ? bufferListA[r] : bufferListA[0]), bufferListB[r], bufferListA_out[r], static_cast(bufferStartList[r] + bufferOffsetList[r]), static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 392789c8..585e60be 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -81,7 +81,8 @@ public hpcc_base::HpccFpgaBenchmark()->default_value(std::to_string(DEFAULT_P_VALUE))) ("distribute-buffers", "Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. 
This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs.") ("handler", "Specify the used data handler that distributes the data over devices and memory banks", - cxxopts::value()->default_value(DEFAULT_DIST_TYPE)); + cxxopts::value()->default_value(DEFAULT_DIST_TYPE)) + ("copy-a", "Create a copy of matrix A for each kernel replication"); } std::unique_ptr> dataHandler; @@ -164,6 +165,11 @@ public hpcc_base::HpccFpgaBenchmarkdataHandler->exchangeData(data); +#ifndef NDEBUG + std::vector oldA(this->executionSettings->programSettings->blockSize * this->executionSettings->programSettings->blockSize * data.numBlocks); + std::copy(data.A, data.A + oldA.size(), oldA.data()); +#endif + this->dataHandler->reference_transpose(data); double max_error = 0.0; @@ -175,6 +181,38 @@ public hpcc_base::HpccFpgaBenchmark*>(this->dataHandler.get())->getHeightforRank(); + long width_per_rank = reinterpret_cast*>(this->dataHandler.get())->getWidthforRank(); + if (error_count > 0) { + std::cout << "A:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << oldA[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << "B:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.B[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << "Transposed A:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.A[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + +#endif + double global_max_error = 0; int global_error_count = 0; MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); diff --git a/PTRANS/src/host/transpose_data.cpp b/PTRANS/src/host/transpose_data.cpp index 20d6560f..e8a7c8f0 100644 --- a/PTRANS/src/host/transpose_data.cpp +++ b/PTRANS/src/host/transpose_data.cpp @@ -7,7 +7,7 @@ transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), matrixSize(results["m"].as() * results["b"].as()), blockSize(results["b"].as()), dataHandlerIdentifier(transpose::data_handler::stringToHandler(results["handler"].as())), - distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()) { + distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()), copyA(results["copy-a"].count() > 0) { // auto detect data distribution type if required if (dataHandlerIdentifier == transpose::data_handler::DataHandlerType::automatic) { diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index c73a9959..fed4eff6 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -74,6 +74,12 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { */ bool distributeBuffers; + /** + * @brief If true, create a copy of matrix A for each kernel replication + * + */ + bool copyA; + /** * @brief Construct a new Transpose Program Settings object * diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 43749c0a..43731ce8 100644 --- a/shared/CMakeLists.txt +++ 
b/shared/CMakeLists.txt
@@ -12,8 +12,10 @@ endif()
 list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp)
 add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES})
 if (USE_ACCL)
-    target_include_directories(hpcc_fpga_base PRIVATE ${ACCL_INCLUDE_PATH})
-    target_link_libraries(hpcc_fpga_base accl)
+    add_subdirectory(${extern_accl_SOURCE_DIR}/test/hardware/xup_vitis_network_example/xrt_host_api
+                     ${CMAKE_BINARY_DIR}/libs/xrt_host_api)
+    target_include_directories(hpcc_fpga_base PRIVATE ${VNX_INCLUDE_PATH} ${ACCL_INCLUDE_PATH})
+    target_link_libraries(hpcc_fpga_base accl vnx)
 endif()
 if (USE_XRT_HOST)
     target_link_directories(hpcc_fpga_base PUBLIC ${XRT_SEARCH_PATH})

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index 5ce08a41..1e41b3d6 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -14,13 +14,58 @@
 /* External libraries */
 #include "experimental/xrt_ip.h"
 #include "parameters.h"
+#include <vnx/cmac.hpp>
+#include <vnx/networklayer.hpp>
 #include "xrt/xrt_kernel.h"
 #ifdef _USE_MPI_
 #include "mpi.h"
 #endif

+using namespace vnx;
+
 namespace fpga_setup {

+void configure_vnx(CMAC &cmac, Networklayer &network_layer,
+                   std::vector<ACCL::rank_t> &ranks, int rank) {
+  if (ranks.size() > max_sockets_size) {
+    throw std::runtime_error("Too many ranks. VNX supports up to " +
+                             std::to_string(max_sockets_size) + " sockets.");
+  }
+
+  const auto link_status = cmac.link_status();
+
+  if (link_status.at("rx_status")) {
+    std::cout << "Link successful!" << std::endl;
+  } else {
+    std::cout << "No link found." << std::endl;
+  }
+
+  if (!link_status.at("rx_status")) {
+    // Give time for other ranks to setup link.
+    std::this_thread::sleep_for(std::chrono::seconds(3));
+    exit(1);
+  }
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  network_layer.update_ip_address(ranks[rank].ip);
+  for (size_t i = 0; i < ranks.size(); ++i) {
+    if (i == static_cast<size_t>(rank)) {
+      continue;
+    }
+
+    network_layer.configure_socket(i, ranks[i].ip, ranks[i].port,
+                                   ranks[rank].port, true);
+  }
+
+  network_layer.populate_socket_table();
+
+  std::this_thread::sleep_for(std::chrono::seconds(4));
+  network_layer.arp_discovery();
+  std::this_thread::sleep_for(std::chrono::seconds(2));
+  network_layer.arp_discovery();
+}
+
 std::unique_ptr<ACCL::ACCL> fpgaSetupACCL(xrt::device &device, xrt::uuid &progra
     std::vector<ACCL::rank_t> ranks = {};
     for (int i = 0; i < current_size; ++i) {
         // TODO: Replace the ip addresses and ports here for execution of real hardware?
-        ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, ACCL_BUFFER_SIZE};
+        ACCL::rank_t new_rank = {"10.10.10." + std::to_string(i), 5500 + i, i, ACCL_BUFFER_SIZE};
        ranks.emplace_back(new_rank);
     }
     if (!useAcclEmulation) {
+        std::cout << "Create cclo ip" << std::endl;
         auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}");
+        std::cout << "Create hostctrl" << std::endl;
         auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}",
                                        xrt::kernel::cu_access_mode::exclusive);
+
+        auto cmac = CMAC(xrt::ip(device, program, "cmac_0:{cmac_0}"));
+        auto network_layer = Networklayer(
+            xrt::ip(device, program, "networklayer:{networklayer_0}"));
+        configure_vnx(cmac, network_layer, ranks, current_rank);
+
         std::vector<int> mem(1, 0);
+        std::cout << "Create ACCL" << std::endl;
         return std::unique_ptr<ACCL::ACCL>(
-            new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, 0));
+            new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, 0, ACCL::networkProtocol::UDP));
     } else {
         // TODO: Add start port here. Currently hardcoded!
         return std::unique_ptr<ACCL::ACCL>(

diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp
index 0410fd1b..103eda17 100644
--- a/shared/setup/fpga_setup_xrt.cpp
+++ b/shared/setup/fpga_setup_xrt.cpp
@@ -35,6 +35,9 @@ namespace fpga_setup {

 std::unique_ptr<xrt::device> selectFPGADevice(int defaultDevice) {
-    return std::unique_ptr<xrt::device>(new xrt::device(defaultDevice));
+    int current_rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &current_rank);
+
+    return std::unique_ptr<xrt::device>(new xrt::device(current_rank));
 }

 } // namespace fpga_setup
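The setup code of this patch builds one ACCL::rank_t entry per MPI rank and passes the table to configure_vnx, which programs the node's own IP into the network layer and opens a socket to every peer. A simplified, host-only sketch of the rank-table construction follows; rank_t here is a stand-in for ACCL::rank_t, whose ip member is a std::string, so the rank index has to be appended with std::to_string rather than added to the string literal:

// Illustrative only: builds the per-rank address table as fpgaSetupACCL does.
#include <iostream>
#include <string>
#include <vector>

struct rank_t {               // simplified stand-in for ACCL::rank_t
    std::string ip;
    int port;
    int session_id;
    unsigned max_segment_size;
};

int main() {
    const unsigned buffer_size = 8192;  // stands in for ACCL_BUFFER_SIZE
    std::vector<rank_t> ranks;
    for (int i = 0; i < 4; ++i) {
        // "10.10.10." + i would move the char pointer instead of concatenating
        ranks.push_back({"10.10.10." + std::to_string(i), 5500 + i, i, buffer_size});
    }
    for (const auto &r : ranks)
        std::cout << r.ip << ":" << r.port << "\n";
    return 0;
}
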
From 4dbc06eaed24b08727679d39a526869691e47896 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 8 Aug 2022 17:47:53 +0200
Subject: [PATCH 090/318] Add ACCL support to b_eff host code

---
 b_eff/CMakeLists.txt                              | 5 +++++
 b_eff/src/common/parameters.h.in                  | 1 +
 b_eff/src/host/execution_types/execution_accl.hpp | 4 ++--
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt
index 13d93b1b..f150bcc9 100755
--- a/b_eff/CMakeLists.txt
+++ b/b_eff/CMakeLists.txt
@@ -19,6 +19,11 @@ set(USE_DEPRECATED_HPP_HEADER No)

 set(COMMUNICATION_TYPE_SUPPORT_ENABLED Yes)

+if (USE_ACCL)
+    math(EXPR calculate_accl_buffer_size "(1 << ${DEFAULT_MAX_MESSAGE_SIZE}) * 4")
+    set(ACCL_BUFFER_SIZE ${calculate_accl_buffer_size} CACHE STRING "Size of ACCL buffers in bytes")
+endif()
+
 set(DATA_TYPE char)
 include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake)
 unset(DATA_TYPE CACHE)

diff --git a/b_eff/src/common/parameters.h.in b/b_eff/src/common/parameters.h.in
index d404bfd7..5c823610 100644
--- a/b_eff/src/common/parameters.h.in
+++ b/b_eff/src/common/parameters.h.in
@@ -23,6 +23,7 @@
 #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@

 #cmakedefine HOST_EMULATION_REORDER
+#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@

 /*
 Short description of the program.

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
index 81673835..c4686b29 100644
--- a/b_eff/src/host/execution_types/execution_accl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -79,8 +79,8 @@ namespace network::execution_types::accl {
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
             for (int l = 0; l < looplength; l++) {
-                config.accl->send(0, *acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
-                config.accl->recv(0, *acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                config.accl->send(*acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                config.accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
             }
             auto endCalculation = std::chrono::high_resolution_clock::now();
             calculationTime += std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation - startCalculation).count();
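The send/recv calls fixed above pair up the ranks of a ring: depending on the parity of (current_rank + i), a rank exchanges messages with either its left or its right neighbor, so every send is matched by a receive on the partner and the pattern cannot deadlock. A small sketch that only evaluates the partner expression from the loop:

// Illustrative only: the neighbor computation used by the b_eff exchange loop.
#include <iostream>

int partner(int rank, int size, int i) {
    return (rank - 1 + 2 * ((rank + i) % 2) + size) % size;
}

int main() {
    const int size = 4;
    for (int i = 0; i < 2; i++)      // the loop index i alternates the pairing
        for (int rank = 0; rank < size; rank++)
            std::cout << "i=" << i << ": rank " << rank
                      << " <-> rank " << partner(rank, size, i) << "\n";
    return 0;                        // e.g. i=0 pairs 0<->3 and 1<->2
}
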
From 8e825ae5a4cfe950aa7e25e7a5a8cc4952afed1f Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 19 Sep 2022 17:54:37 +0100
Subject: [PATCH 091/318] Use UDP for ACCL communication

---
 shared/setup/fpga_setup_accl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index 1e41b3d6..2f78366d 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -99,7 +99,7 @@ std::unique_ptr<ACCL::ACCL> fpgaSetupACCL(xrt::device &device, xrt::uuid &progra
     } else {
         // TODO: Add start port here. Currently hardcoded!
         return std::unique_ptr<ACCL::ACCL>(
-            new ACCL::ACCL(ranks, current_rank, 5500, device, ACCL::networkProtocol::TCP, 16, ACCL_BUFFER_SIZE));
+            new ACCL::ACCL(ranks, current_rank, 5500, device, ACCL::networkProtocol::UDP, 16, ACCL_BUFFER_SIZE));
     }
 }

From b8b24d0fc40121c58e2346409ae6309b338a4654 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 19 Sep 2022 17:56:50 +0100
Subject: [PATCH 092/318] Change repo name for VNx

---
 cmake/accl.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/accl.cmake b/cmake/accl.cmake
index cdf23b0b..6b064d95 100644
--- a/cmake/accl.cmake
+++ b/cmake/accl.cmake
@@ -11,7 +11,7 @@ set(ACCL_CCLO_BUILD_ARGS ${ACCL_CCLO_ADDITIONAL_BUILD_ARGS})
 set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/)
 set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core)
 set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo)
-set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HMB)
+set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HBM)
 if (ACCL_STACK_TYPE STREQUAL "UDP")
     list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_VNX_DIR}/Ethernet/post_sys_link.tcl)
     list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_HLS_IP_FOLDER})

From 32423b3e0b8b428424af7857b61e93b543047cd7 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 19 Sep 2022 17:57:26 +0100
Subject: [PATCH 093/318] Add config and settings for U55c

---
 .../Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake    | 25 +++
 ...ings.link.xilinx.accl_buffers.u55c.hbm.ini | 71 +++++++
 2 files changed, 96 insertions(+)
 create mode 100644 b_eff/configs/Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake
 create mode 100644 b_eff/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini

diff --git a/b_eff/configs/Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake b/b_eff/configs/Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake
new file mode 100644
index 00000000..ed6ec1f9
--- /dev/null
+++ b/b_eff/configs/Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake
@@ -0,0 +1,25 @@
+# This file contains the default configuration for the Xilinx U55C board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+#     cmake [...]
-DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# b_eff specific options
+set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)

diff --git a/b_eff/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini b/b_eff/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini
new file mode 100644
index 00000000..61850b2a
--- /dev/null
+++ b/b_eff/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini
@@ -0,0 +1,71 @@
+# /*******************************************************************************
+#  Copyright (C) 2021 Xilinx, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=lb_user_krnl:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 + +sp=ccl_offload_0.m_axi_0:HBM[0:5] +sp=ccl_offload_0.m_axi_1:HBM[0:5] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + From 3ab470b5e2771a7bc803b9ac35e5a419a0451f6d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 19 Sep 2022 18:00:27 +0100 Subject: [PATCH 094/318] Add U55c configs for LINPACK --- .../configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake | 29 ++++++ ...nk.xilinx.hpl_torus_accl.hbm.generator.ini | 89 +++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake new file mode 100644 index 00000000..15e2edeb --- /dev/null +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake @@ -0,0 +1,29 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) + +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) + diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini new file mode 100644 index 00000000..1e75a2eb --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini @@ -0,0 +1,89 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +slr=inner_update_mm0_1:SLR0 +slr=inner_update_mm0_2:SLR2 + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:4] +sp=lu_1.m_axi_gmem1:HBM[5:6] +sp=lu_1.m_axi_gmem2:HBM[5:6] + +sp=top_update_1.m_axi_gmem0:HBM[0:4] +sp=top_update_1.m_axi_gmem1:HBM[5:6] +sp=top_update_1.m_axi_gmem2:HBM[5:6] + +sp=left_update_1.m_axi_gmem0:HBM[0:4] +sp=left_update_1.m_axi_gmem1:HBM[5:6] +sp=left_update_1.m_axi_gmem2:HBM[5:6] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:4] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[5:6] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[5:6] +# PY_CODE_GEN block_end + +#ACCL +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 + +sp=ccl_offload_0.m_axi_0:HBM[5:6] +sp=ccl_offload_0.m_axi_1:HBM[5:6] + + + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl From 0321648a59653d3922923aa4716d966c096d4a2f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 20 Sep 2022 15:30:08 +0100 Subject: [PATCH 095/318] Add b_eff PL scheduler --- b_eff/src/device/CMakeLists.txt | 2 +- b_eff/src/device/communication_ACCL_pl.cpp | 35 ++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 b_eff/src/device/communication_ACCL_pl.cpp diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index e5939572..c5af8b66 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -16,5 +16,5 @@ if (INTELFPGAOPENCL_FOUND) endif() if (VITIS_FOUND) - generate_kernel_targets_xilinx(communication_ACCL) + generate_kernel_targets_xilinx(communication_ACCL 
communication_ACCL_pl) endif() diff --git a/b_eff/src/device/communication_ACCL_pl.cpp b/b_eff/src/device/communication_ACCL_pl.cpp new file mode 100644 index 00000000..528c69cb --- /dev/null +++ b/b_eff/src/device/communication_ACCL_pl.cpp @@ -0,0 +1,35 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#include "accl_hls.h" + +extern "C" { + +void send_recv(char *read_buffer,char *write_buffer, unsigned int size_in_bytes, unsigned int num_iterations, + unsigned int neighbor_rank, addr_t communicator_addr, + hls::stream > &cmd, hls::stream > &sts) { + for (int i = 0; i < num_iterations; i++) { + ACCLCommand accl_cmd(cmd, sts, communicator_addr, 0,0,0); + accl_cmd.send(size_in_bytes, 0, neighbor_rank, read_buffer); + accl_cmd.recv(size_in_bytes, 0, neighbor_rank, write_buffer); + } +} +} \ No newline at end of file From a533fc53d0ab0da7124773aaa3f56be42b7e96ce Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 20 Sep 2022 15:31:38 +0100 Subject: [PATCH 096/318] Add host support for PL scheduler --- b_eff/src/host/CMakeLists.txt | 5 +- b_eff/src/host/execution_types/execution.hpp | 1 + .../execution_types/execution_accl_pl.hpp | 116 ++++++++++++++++++ b_eff/src/host/network_benchmark.cpp | 17 ++- b_eff/src/host/network_benchmark.hpp | 6 + 5 files changed, 140 insertions(+), 5 deletions(-) create mode 100644 b_eff/src/host/execution_types/execution_accl_pl.hpp diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index 5e22b54a..adaa8348 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -25,9 +25,12 @@ if (Vitis_FOUND) add_executable(${HOST_EXE_NAME}_xilinx main.cpp) target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) +if (USE_ACCL) target_link_libraries(${LIB_NAME}_xilinx accl) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${ACCL_INCLUDE_PATH}) - target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx zmqpp) + target_link_libraries(${HOST_EXE_NAME}_xilinx zmqpp) +endif() + target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp index 118f0ebc..e9bea3be 100644 --- 
a/b_eff/src/host/execution_types/execution.hpp +++ b/b_eff/src/host/execution_types/execution.hpp @@ -28,4 +28,5 @@ SOFTWARE. #endif #else #include "execution_types/execution_accl.hpp" +#include "execution_types/execution_accl_pl.hpp" #endif diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp new file mode 100644 index 00000000..e82fa29a --- /dev/null +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -0,0 +1,116 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_PL_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_PL_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" +#include "accl.hpp" + +/* Project's headers */ + +namespace network::execution_types::accl_pl { + + /* + Implementation for the single kernel. + @copydoc bm_execution::calculate() + */ + template + std::shared_ptr + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + dummyBufferContents.clear(); + recvBufferContents.clear(); + acclSendBuffers.clear(); + acclRecvBuffers.clear(); + int size_in_values = (size_in_bytes + 3) / 4; + // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); + acclSendBuffers.push_back(config.accl->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclRecvBuffers.push_back(config.accl->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclSendBuffers.back()->sync_to_device(); + acclRecvBuffers.back()->sync_to_device(); + } + + xrt::kernel sendrecvKernel(*config.device, *config.program, "sendrecv"); + + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + auto run = sendrecvKernel(acclSendBuffers[i]->bo(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.accl->get_communicator_adr()); + run.wait(); + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + acclRecvBuffers.back()->sync_from_device(); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); + } + std::shared_ptr result(new network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }); + return result; + } + +} // namespace bm_execution + +#endif diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 09872106..2eef9621 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -36,8 +36,11 @@ SOFTWARE. 
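// A condensed sketch of the XRT sequence the accl_pl executor above boils down
// to; the device index, xclbin path and helper function are placeholders for
// illustration, not values taken from the patch:
#include <cstdint>
#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

void run_pl_sendrecv(xrt::bo &send_bo, xrt::bo &recv_bo, unsigned size_in_values,
                     unsigned looplength, unsigned neighbor_rank,
                     uint64_t communicator_addr) {
    xrt::device dev{0};                           // first FPGA in the node
    auto uuid = dev.load_xclbin("b_eff.xclbin");  // placeholder bitstream path
    xrt::kernel k{dev, uuid, "sendrecv"};         // same instance name as in the host code above
    // operator() launches the kernel asynchronously; wait() blocks until the PL loop finished
    auto run = k(send_bo, recv_bo, size_in_values, looplength, neighbor_rank,
                 communicator_addr);
    run.wait();
}
// In the benchmark itself, config.device and config.program supply dev/uuid,
// and the ACCL buffers' bo() handles are what get passed as send_bo/recv_bo.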
network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), maxLoopLength(results["u"].as()), minLoopLength(results["l"].as()), maxMessageSize(results["m"].as()), - minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()) { - + minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()) +#ifdef USE_ACCL + , accl_from_programable_logic(results["accl-pl"].count()) +#endif +{ } std::map @@ -86,7 +89,11 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options) ("o", "Offset used before reducing repetitions", cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_OFFSET))) ("d", "Number os steps the repetitions are decreased to its minimum", - cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE))); + cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE))) +#ifdef USE_ACCL + ("accl-pl", "Use second ACCL command kernel to schedule sends and recevs from PL") +#endif +; } std::unique_ptr @@ -113,8 +120,10 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { #ifdef INTEL_FPGA case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; #endif +#else + case hpcc_base::CommunicationType::accl: if (!executionSettings->programSettings->accl_from_programable_logic) { timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); + } else { timing = execution_types::accl_pl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);} break; #endif - case hpcc_base::CommunicationType::accl: timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType)); } timing_results.push_back(timing); diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 8e9e2fc1..964ec5ca 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -140,6 +140,12 @@ class NetworkProgramSettings : public hpcc_base::BaseSettings { */ uint llDecrease; + /** + * @brief Use the second command kernel to schedule sends and receives directly from PL + * + */ + bool accl_from_programable_logic; + /** * @brief Construct a new Network Program Settings object * From e2f20829b2be3982d3fd7c3d2a09500f030fda40 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 20 Sep 2022 15:32:30 +0100 Subject: [PATCH 097/318] Add config for PL scheduler --- .../configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake diff --git a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake new file mode 100644 index 00000000..472957c1 --- /dev/null +++ b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake @@ -0,0 +1,25 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES "sendrecv" CACHE STRING "" FORCE) +# STREAM specific options +# Defaults to a total of ~12GB data +set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) From 8f87e16a883d77c18b201fe08ad0a7ce691857e2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 20 Sep 2022 15:35:06 +0100 Subject: [PATCH 098/318] Set temporary branch for ACCL with recent features (Pl schedule and UDP fix) --- extern/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 5587a0e1..141899f7 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -61,8 +61,8 @@ if (DEFINED USE_ACCL) FetchContent_Declare( extern_accl - GIT_REPOSITORY https://github.com/Xilinx/ACCL.git - GIT_TAG main) + GIT_REPOSITORY https://github.com/Mellich/ACCL.git + GIT_TAG udp_address_fix_and_new_tcp) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) From d202beb42b05d34ddb1c7b6e25029f8e7d94b214 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 20 Sep 2022 16:52:52 +0100 Subject: [PATCH 099/318] Add support for client arbiter --- .../configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake | 3 +- .../settings.link.xilinx.accl_pl.u55c.hbm.ini | 81 +++++++++++++++++++ cmake/accl.cmake | 16 +++- 3 files changed, 96 insertions(+), 4 deletions(-) create mode 100644 b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini diff --git a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake index 472957c1..65516d5b 100644 --- a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake +++ b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake @@ -13,10 +13,11 @@ set(USE_ACCL Yes CACHE BOOL "" FORCE) set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) -set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini CACHE FILEPATH "" FORCE) set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) set(XILINX_KERNEL_NAMES "sendrecv" CACHE STRING "" FORCE) +set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to connect multiple kernels to the CCLO cmd stream" FORCE) # STREAM specific options # Defaults to a total of ~12GB data 
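# (How the arbiter enabled above fits together: with both the host-facing
#  hostctrl_0 and the PL send_recv kernel issuing CCLO calls, the new
#  client_arbiter serializes them onto the offload engine's single command
#  interface. Schematically, per the stream_connect lines in the new link
#  settings file below:
#
#      hostctrl_0.cmd -> client_arbiter.cmd_clients_0
#      sendrecv.cmd   -> client_arbiter.cmd_clients_1
#      client_arbiter.cmd_cclo -> ccl_offload_0.s_axis_call_req
#      ccl_offload_0.m_axis_call_ack -> client_arbiter.ack_cclo
#      client_arbiter.ack_clients_0/1 -> hostctrl_0.sts / sendrecv.sts
#
#  so the CCLO still sees one ordered command/acknowledge stream pair.)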
set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE) diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini new file mode 100644 index 00000000..f344c4bb --- /dev/null +++ b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini @@ -0,0 +1,81 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl +nk=client_arbiter:1:client_arbiter +nk=sendrecv:1:sendrecv + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=lb_user_krnl:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 +slr=client_arbiter:SLR0 +slr=sendrecv:SLR0 + +sp=ccl_offload_0.m_axi_0:HBM[0:5] +sp=ccl_offload_0.m_axi_1:HBM[0:5] +sp=sendrecv.m_axi_gmem:HBM[0:5] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:client_arbiter.cmd_clients_0 +stream_connect=client_arbiter.ack_clients_0:hostctrl_0.sts +stream_connect=sendrecv.cmd:client_arbiter.cmd_clients_1 +stream_connect=client_arbiter.ack_clients_1:sendrecv.sts +stream_connect=client_arbiter.cmd_cclo:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:client_arbiter.ack_cclo + + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + diff --git a/cmake/accl.cmake b/cmake/accl.cmake index 
6b064d95..097d4094 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -93,9 +93,10 @@ add_custom_target( # Build the ACCL Plugins set(ACCL_PLUGINS_DIR ${extern_accl_SOURCE_DIR}/kernels/plugins) set(ACCL_PLUGINS_HOSTCTRL ${ACCL_PLUGINS_DIR}/hostctrl/hostctrl.xo) -set(ACCL_PLUGINS_SUM ${ACCL_PLUGINS_DIR}/reduce_sum/reduce_sum.xo) +set(ACCL_PLUGINS_SUM ${ACCL_PLUGINS_DIR}/reduce_ops/reduce_ops.xo) set(ACCL_PLUGINS_COMPRESSION ${ACCL_PLUGINS_DIR}/hp_compression/hp_compression.xo) set(ACCL_PLUGINS_LOOPBACK ${ACCL_PLUGINS_DIR}/loopback/loopback.xo) +set(ACCL_PLUGINS_ARBITER ${ACCL_PLUGINS_DIR}/client_arbiter/client_arbiter.xo) add_custom_command( OUTPUT ${ACCL_PLUGINS_HOSTCTRL} @@ -104,7 +105,7 @@ add_custom_command( add_custom_command( OUTPUT ${ACCL_PLUGINS_SUM} COMMAND vitis_hls build.tcl -tclargs ip ${ACCL_DEVICE_NAME} - WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/reduce_sum ) + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/reduce_ops ) add_custom_command( OUTPUT ${ACCL_PLUGINS_COMPRESSION} COMMAND vitis_hls build.tcl -tclargs ip ${ACCL_DEVICE_NAME} @@ -113,11 +114,16 @@ add_custom_command( OUTPUT ${ACCL_PLUGINS_LOOPBACK} COMMAND vitis_hls build_loopback.tcl -tclargs ip ${ACCL_DEVICE_NAME} WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/loopback ) +add_custom_command( + OUTPUT ${ACCL_PLUGINS_ARBITER} + COMMAND vitis_hls build_client_arbiter.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/client_arbiter ) + add_custom_target( accl_plugins DEPENDS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} - ${ACCL_PLUGINS_COMPRESSION}) + ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_ARBITER}) set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XOS} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL with UDP") @@ -125,6 +131,10 @@ set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLU set(ACCL_TCP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_TCP_CMAC_XO} ${ACCL_TCP_XO} CACHE INTERNAL "Object files required for ACCL with TCP") +if (DEFINED USE_ACCL_CLIENT_ARBITER) + list(APPEND ${ACCL_UDP_XOS} ${ACCL_PLUGINS_ARBITER}) + list(APPEND ${ACCL_TCP_XOS} ${ACCL_PLUGINS_ARBITER}) +endif() if (ACCL_STACK_TYPE STREQUAL "UDP") set(ACCL_XOS ${ACCL_UDP_XOS} CACHE INTERNAL "Object files required for ACCL") else() From 150fac323fb952eab7de119bbf92bf9766dbaf8c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 21 Sep 2022 13:34:18 +0100 Subject: [PATCH 100/318] Fix build for b_eff with PL --- b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake | 2 +- .../settings.link.xilinx.accl_pl.u55c.hbm.ini | 4 ++-- b_eff/src/device/communication_ACCL_pl.cpp | 14 ++++++-------- .../src/host/execution_types/execution_accl_pl.hpp | 3 ++- cmake/accl.cmake | 6 ++++-- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake index 65516d5b..45e2b5d7 100644 --- a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake +++ b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake @@ -16,7 +16,7 @@ set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini CACHE FILEPATH "" FORCE) set(XILINX_COMPILE_SETTINGS_FILE 
${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) -set(XILINX_KERNEL_NAMES "sendrecv" CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES "send_recv" CACHE STRING "" FORCE) set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to connect multiple kernels to the CCLO cmd stream" FORCE) # STREAM specific options # Defaults to a total of ~12GB data diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini index f344c4bb..a59018d2 100644 --- a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini +++ b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini @@ -20,11 +20,11 @@ nk=networklayer:1:networklayer_0 nk=ccl_offload:1:ccl_offload_0 nk=hostctrl:1:hostctrl_0 nk=cmac_0:1:cmac_0 -nk=reduce_sum:1:arith_0 +nk=reduce_ops:1:arith_0 nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 nk=loopback:1:lb_user_krnl nk=client_arbiter:1:client_arbiter -nk=sendrecv:1:sendrecv +nk=send_recv:1:sendrecv # Kernels Foorplaning slr=compression_0_0:SLR0 diff --git a/b_eff/src/device/communication_ACCL_pl.cpp b/b_eff/src/device/communication_ACCL_pl.cpp index 528c69cb..58fc7fed 100644 --- a/b_eff/src/device/communication_ACCL_pl.cpp +++ b/b_eff/src/device/communication_ACCL_pl.cpp @@ -21,15 +21,13 @@ SOFTWARE. */ #include "accl_hls.h" -extern "C" { -void send_recv(char *read_buffer,char *write_buffer, unsigned int size_in_bytes, unsigned int num_iterations, - unsigned int neighbor_rank, addr_t communicator_addr, - hls::stream > &cmd, hls::stream > &sts) { +void send_recv(float *read_buffer,float *write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + hls::stream &cmd, hls::stream &sts) { + accl_hls::ACCLCommand accl_cmd(cmd, sts, communicator_addr, datapath_cfg,0,0); for (int i = 0; i < num_iterations; i++) { - ACCLCommand accl_cmd(cmd, sts, communicator_addr, 0,0,0); - accl_cmd.send(size_in_bytes, 0, neighbor_rank, read_buffer); - accl_cmd.recv(size_in_bytes, 0, neighbor_rank, write_buffer); + accl_cmd.send(size, 0, neighbor_rank, (ap_uint<64>)read_buffer); + accl_cmd.recv(size, 0, neighbor_rank, (ap_uint<64>)write_buffer); } } -} \ No newline at end of file diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index e82fa29a..3f39cfab 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -80,7 +80,8 @@ namespace network::execution_types::accl_pl { for (int i = 0; i < config.programSettings->kernelReplications; i++) { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); - auto run = sendrecvKernel(acclSendBuffers[i]->bo(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.accl->get_communicator_adr()); + auto run = sendrecvKernel(acclSendBuffers[i]->bo(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.accl->get_communicator_adr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); run.wait(); auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += 
std::chrono::duration_cast>(endCalculation - startCalculation).count(); diff --git a/cmake/accl.cmake b/cmake/accl.cmake index 097d4094..7c3d1f08 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -18,6 +18,8 @@ if (ACCL_STACK_TYPE STREQUAL "UDP") list(APPEND ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE}) endif() +list(APPEND XILINX_ADDITIONAL_COMPILE_FLAGS "-I${extern_accl_SOURCE_DIR}/driver/hls" "-DACCL_SYNTHESIS") + set(ACCL_UDP_MAC_XOS "") math(EXPR loopend "${ACCL_UDP_ETH_IFS} - 1") @@ -132,8 +134,8 @@ set(ACCL_TCP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLU ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_TCP_CMAC_XO} ${ACCL_TCP_XO} CACHE INTERNAL "Object files required for ACCL with TCP") if (DEFINED USE_ACCL_CLIENT_ARBITER) - list(APPEND ${ACCL_UDP_XOS} ${ACCL_PLUGINS_ARBITER}) - list(APPEND ${ACCL_TCP_XOS} ${ACCL_PLUGINS_ARBITER}) + list(APPEND ACCL_UDP_XOS ${ACCL_PLUGINS_ARBITER}) + list(APPEND ACCL_TCP_XOS ${ACCL_PLUGINS_ARBITER}) endif() if (ACCL_STACK_TYPE STREQUAL "UDP") set(ACCL_XOS ${ACCL_UDP_XOS} CACHE INTERNAL "Object files required for ACCL") From 5b2748b09fa39b1e9639d736399723c35586cc5d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 21 Sep 2022 16:54:14 +0100 Subject: [PATCH 101/318] Update settings to use reduce_ops --- .../settings.link.xilinx.hpl_torus_accl.hbm.generator.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini index 1e75a2eb..ec8cbfa6 100644 --- a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini @@ -37,7 +37,7 @@ nk=networklayer:1:networklayer_0 nk=ccl_offload:1:ccl_offload_0 nk=hostctrl:1:hostctrl_0 nk=cmac_0:1:cmac_0 -nk=reduce_sum:1:arith_0 +nk=reduce_ops:1:arith_0 nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 nk=loopback:1:lb_user_krnl From 695912dcfc8616c1bbf14d15d71550d32f7d93bf Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 22 Sep 2022 16:15:37 +0100 Subject: [PATCH 102/318] Update PTRANS stream approach --- .../configs/Xilinx_U55C_HBM_ACCL_stream.cmake | 28 +++++++ ...nk.xilinx.transpose_pq_accl_stream.hbm.ini | 76 +++++++++++++++++++ .../src/device/transpose_PQ_ACCL_stream.cpp | 7 +- 3 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini diff --git a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake new file mode 100644 index 00000000..e75d0ff7 --- /dev/null +++ b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake @@ -0,0 +1,28 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini new file mode 100644 index 00000000..d4c5567e --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini @@ -0,0 +1,76 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
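// Why the transpose_PQ_ACCL_stream.cpp hunk further below switches from
// shift-and-OR packing to ap_uint range assignment: the old code shifted by
// unroll_count * sizeof(DEVICE_DATA_TYPE) -- a byte count, not a bit count --
// and the unpack masked with (1 << 32) - 1, which overflows a 32-bit int.
// Range selection spells the bit positions out. A minimal stand-alone sketch,
// assuming Vitis HLS ap_int.h, 32-bit lanes and the word(high, low) selection
// convention; the function names are invented for the example:
#include <cstdint>
#include "ap_int.h"

// pack 16 32-bit lanes into one 512-bit stream word
ap_uint<512> pack512(const uint32_t lane[16]) {
    ap_uint<512> word;
    for (unsigned i = 0; i < 16; i++) {
        word(32 * i + 31, 32 * i) = lane[i];  // bits [32i+31 : 32i] hold lane i
    }
    return word;
}

// unpack the word again; the range select yields the 32-bit slice directly
void unpack512(ap_uint<512> word, uint32_t lane[16]) {
    for (unsigned i = 0; i < 16; i++) {
        lane[i] = word(32 * i + 31, 32 * i);
    }
}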
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=transpose_read0:1 +nk=transpose_write0:1 + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR2 +slr=transpose_read0_1:SLR0 +slr=transpose_write0_1:SLR0 + +sp=ccl_offload_0.m_axi_0:HBM[31] +sp=ccl_offload_0.m_axi_1:HBM[31] +sp=transpose_read0_1.m_axi_gmem0:HBM[0:7] +sp=transpose_write0_1.m_axi_gmem0:HBM[8:15] +sp=transpose_write0_1.m_axi_gmem1:HBM[16:23] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl +stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl + diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp index 2cda216f..1c136dc8 100644 --- a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp @@ -118,11 +118,12 @@ void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % (channel_width)]; } - ap_uint<512> data = 0; + ap_uint<512> data; // load tranposed A from global memory for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { - data |= ((ap_uint<8*sizeof(DEVICE_DATA_TYPE)>)data_chunk[unroll_count]) << (unroll_count * sizeof(DEVICE_DATA_TYPE)); + data(unroll_count * sizeof(DEVICE_DATA_TYPE)*8, unroll_count * sizeof(DEVICE_DATA_TYPE)*8 + sizeof(DEVICE_DATA_TYPE) * 8 - 1) + = data_chunk[unroll_count]; } ap_axiu<512, 0, 0, 8> tmp; @@ -182,7 +183,7 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, // rotate temporary buffer to store data into local buffer for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { - data_chunk[unroll_count] = (DEVICE_DATA_TYPE)((tmp.data >> (unroll_count * sizeof(DEVICE_DATA_TYPE))) & ((1 << 32) - 1)); + data_chunk[unroll_count] = tmp.data(unroll_count * 
sizeof(DEVICE_DATA_TYPE)*8, unroll_count * sizeof(DEVICE_DATA_TYPE)*8 + sizeof(DEVICE_DATA_TYPE) * 8 - 1); } // load tranposed A from global memory From 8f4d24cf74ca4b8c9af59107c9ae49774226d734 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 22 Sep 2022 16:23:00 +0100 Subject: [PATCH 103/318] Add configs for HPL U55c --- ...cmake => Xilinx_U55C_B8_SB3_R1_ACCL.cmake} | 2 +- .../Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake | 30 ++++++++++++++++ ...nk.xilinx.hpl_torus_pcie.hbm.generator.ini | 34 +++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) rename LINPACK/configs/{Xilinx_U55C_B8_SB3_R2_ACCL.cmake => Xilinx_U55C_B8_SB3_R1_ACCL.cmake} (96%) create mode 100644 LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake similarity index 96% rename from LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake rename to LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake index 15e2edeb..ec9d153b 100644 --- a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake @@ -18,7 +18,7 @@ set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) -set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE) set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake new file mode 100644 index 00000000..bbf80c86 --- /dev/null +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake @@ -0,0 +1,30 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE) +set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini new file mode 100644 index 00000000..df381966 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini @@ -0,0 +1,34 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +1) % 3$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:5] +sp=lu_1.m_axi_gmem1:HBM[6] +sp=lu_1.m_axi_gmem2:HBM[7] + +sp=top_update_1.m_axi_gmem0:HBM[0:5] +sp=top_update_1.m_axi_gmem1:HBM[6] +sp=top_update_1.m_axi_gmem2:HBM[8] + +sp=left_update_1.m_axi_gmem0:HBM[0:5] +sp=left_update_1.m_axi_gmem1:HBM[7] +sp=left_update_1.m_axi_gmem2:HBM[9] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] +# PY_CODE_GEN block_end + From 2acae30e61b5aee71354a1b930733408ad1fc414 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 22 Sep 2022 16:32:13 +0100 Subject: [PATCH 104/318] Update the SLR mapping and ACCL kernel names --- ...ink.xilinx.transpose_pq_accl_stream.hbm.ini | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini index d4c5567e..2dadc525 100644 --- a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini @@ -20,21 +20,21 @@ nk=networklayer:1:networklayer_0 nk=ccl_offload:1:ccl_offload_0 nk=hostctrl:1:hostctrl_0 nk=cmac_0:1:cmac_0 -nk=reduce_sum:1:arith_0 +nk=reduce_ops:1:arith_0 nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 nk=transpose_read0:1 nk=transpose_write0:1 # Kernels Foorplaning -slr=compression_0_0:SLR0 -slr=compression_0_1:SLR0 -slr=compression_0_2:SLR0 -slr=arith_0:SLR0 -slr=ccl_offload_0:SLR0 -slr=hostctrl_0:SLR0 +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 slr=networklayer_0:SLR1 -slr=cmac_0:SLR2 -slr=transpose_read0_1:SLR0 +slr=cmac_0:SLR1 +slr=transpose_read0_1:SLR2 slr=transpose_write0_1:SLR0 sp=ccl_offload_0.m_axi_0:HBM[31] From 9c7c5ee5d9875574355467a34ef430df05d279df Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 22 Sep 2022 16:44:28 +0100 Subject: [PATCH 105/318] Add PTRANS baseline configs for U55c --- PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake | 23 +++++++++++++++++++ ...s.compile.xilinx.transpose_pq_pcie.hbm.ini | 3 --- ...xilinx.transpose_pq_pcie.hbm.generator.ini | 17 ++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) create mode 100644 PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini diff --git a/PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake b/PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake new file mode 100644 index 00000000..c2f3cb4d --- /dev/null +++ b/PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake @@ -0,0 +1,23 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini index 7e52533c..8b137891 100644 --- a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini +++ b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini @@ -1,4 +1 @@ -kernel_frequency=450 -[hls] -max_memory_ports=all diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini new file mode 100644 index 00000000..e6f72be5 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini @@ -0,0 +1,17 @@ + +# Set number of available SLRs +# PY_CODE_GEN num_slrs = 3 +# PY_CODE_GEN num_ddrs = 2 + +[connectivity] +nk=transpose0:$PY_CODE_GEN num_replications$ + +# Assign kernels to the SLRs +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=transpose0_$PY_CODE_GEN i + 1$:SLR$PY_CODE_GEN i % num_slrs$ +# PY_CODE_GEN block_end + +# Assign the kernels to the memory ports +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem:HBM[$PY_CODE_GEN i*8$:$PY_CODE_GEN (i+1)*8$] +# PY_CODE_GEN block_end From ef2da9ee51ffbc9fdc2f5301bb7186a1497e9c26 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 28 Sep 2022 09:44:45 +0200 Subject: [PATCH 106/318] Update CI script for Noctua2 --- .gitlab-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a8b41ac6..40ca7a1f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,13 +3,14 @@ stages: - test variables: - SCHEDULER_PARAMETERS: "-A pc2-mitarbeiter -p short" + SCHEDULER_PARAMETERS: "-A pc2-mitarbeiter -p normal -q cont -t 00:30:00 -n 2 -N 1" default: tags: - jacamar before_script: - - module load intelFPGA_pro/21.2.0 bittware_520n/20.4.0_max toolchain/foss/2021a devel/CMake/3.20.1-GCCcore-10.3.0 + - module load fpga/intel/opencl_sdk/21.2.0 fpga/bittware/520n/20.4.0_max toolchain/foss/2021a devel/CMake/3.20.1-GCCcore-10.3.0 lang/Python/3.9.5-GCCcore-10.3.0 + - python -m pip install pandas ### # From 4bb0862cd9126ef3b5a01150a5b8559f3dba72ec Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 29 Sep 2022 13:56:28 +0100 Subject: [PATCH 107/318] Add ACCL to RPATH for convenience (PTRANS) --- 
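The RPATH handling below exists because ACCL's XRT driver is now pulled in via
add_subdirectory() and, judging by the RPATH entries, linked as a shared library
placed under ${CMAKE_BINARY_DIR}/lib/accl/lib; embedding that directory in the
host binaries saves an LD_LIBRARY_PATH export at run time. A quick way to
confirm the path was baked in (the binary name is illustrative):

    readelf -d bin/Transpose_xilinx | grep -E 'RPATH|RUNPATH'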
PTRANS/src/host/CMakeLists.txt | 5 +++++ shared/CMakeLists.txt | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 5bb10e54..1ad17d14 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -33,6 +33,11 @@ if (INTELFPGAOPENCL_FOUND) endif() if (Vitis_FOUND) + if (USE_ACCL) + set(CMAKE_SKIP_BUILD_RPATH No) + set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) + list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) + endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 43731ce8..22b2e1f4 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -3,7 +3,7 @@ project(HPCCBaseLibrary VERSION 1.0.1) set(HPCC_BASE_SOURCES "") if (USE_ACCL) - include(${extern_accl_SOURCE_DIR}/driver/xrt/CMakeLists.txt) + add_subdirectory(${extern_accl_SOURCE_DIR}/driver/xrt ${CMAKE_BINARY_DIR}/lib/accl) list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp) endif() if (USE_XRT_HOST) From a786b0919154118ff908cf428232da2c82b1c756 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Sun, 4 Sep 2022 21:16:18 +0200 Subject: [PATCH 108/318] Add platform_str parameter add option for passing platform string insted of platform index overwrites all index options when used necessary as order of platforms is not deterministic anymore --- shared/include/hpcc_benchmark.hpp | 11 ++++++++++- shared/include/setup/fpga_setup.hpp | 2 +- shared/setup/fpga_setup.cpp | 16 ++++++++++++++-- shared/tests/hpcc_base_benchmark_test.cpp | 13 ++++++++++--- 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 17e17bb9..aed3f901 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -89,6 +89,12 @@ class BaseSettings { */ int defaultPlatform; + /** + * @brief The platform string of the platform that should be used + * + */ + std::string platformString; + /** * @brief The default device that should be used for execution. * A number representing the index in the list of available devices @@ -134,6 +140,7 @@ class BaseSettings { skipValidation(static_cast(results.count("skip-validation"))), defaultPlatform(results["platform"].as()), defaultDevice(results["device"].as()), + platformString(results["platform_str"].as()), kernelFileName(results["f"].as()), #ifdef NUM_REPLICATIONS kernelReplications(results.count("r") > 0 ? 
results["r"].as() : NUM_REPLICATIONS), @@ -380,6 +387,7 @@ class HpccFpgaBenchmark { "you will be asked which platform to use if there are multiple "\ "platforms available.", cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) + ("platform_str", "Name of the platform that has to be used", cxxopts::value()->default_value(std::string())) #ifdef NUM_REPLICATIONS ("r", "Number of used kernel replications", cxxopts::value()->default_value(std::to_string(NUM_REPLICATIONS))) @@ -478,7 +486,8 @@ class HpccFpgaBenchmark { if (!programSettings->testOnly) { usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); + programSettings->defaultDevice, + programSettings->platformString); context = std::unique_ptr(new cl::Context(*usedDevice)); program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, diff --git a/shared/include/setup/fpga_setup.hpp b/shared/include/setup/fpga_setup.hpp index 0799900c..7f88f8b1 100644 --- a/shared/include/setup/fpga_setup.hpp +++ b/shared/include/setup/fpga_setup.hpp @@ -157,7 +157,7 @@ choose a device. @return A list containing a single selected device */ std::unique_ptr - selectFPGADevice(int defaultPlatform, int defaultDevice); + selectFPGADevice(int defaultPlatform, int defaultDevice, std::string platformString); } // namespace fpga_setup #endif // SRC_HOST_FPGA_SETUP_H_ diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index dd1ddd28..70125df0 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -224,7 +224,7 @@ choose a device. @return A list containing a single selected device */ std::unique_ptr - selectFPGADevice(int defaultPlatform, int defaultDevice) { + selectFPGADevice(int defaultPlatform, int defaultDevice, std::string platformString) { // Integer used to store return codes of OpenCL library calls int err; @@ -243,7 +243,19 @@ choose a device. 
// Choose the target platform long unsigned int chosenPlatformId = 0; if (defaultPlatform >= 0) { - if (defaultPlatform < static_cast(platformList.size())) { + if (platformString.size() > 0) { + bool found = false; + for (int i = 0; i < platformList.size(); i++) { + if (platformList[i].getInfo() == platformString) { + chosenPlatformId = i; + found = true; + break; + } + } + if (!found) { + throw FpgaSetupException("Invalid platform string specified: " + platformString); + } + } else if (defaultPlatform < static_cast(platformList.size())) { chosenPlatformId = defaultPlatform; } else { std::cerr << "Default platform " << defaultPlatform diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp index a93a2a69..1c491b49 100644 --- a/shared/tests/hpcc_base_benchmark_test.cpp +++ b/shared/tests/hpcc_base_benchmark_test.cpp @@ -170,21 +170,28 @@ TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenTestOnlyAndSetupSuccess) { * Checks if using default platform and device is successful */ TEST_F(BaseHpccBenchmarkTest, SuccessUseDefaultPlatform) { - EXPECT_NE(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, bm->getExecutionSettings().programSettings->defaultDevice).get(), nullptr); + EXPECT_NE(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, bm->getExecutionSettings().programSettings->defaultDevice, bm->getExecutionSettings().programSettings->platformString).get(), nullptr); } /** * Checks if non existing platform leads to an error */ TEST_F(BaseHpccBenchmarkTest, FindNonExistingPlatform) { - ASSERT_THROW(fpga_setup::selectFPGADevice(100, bm->getExecutionSettings().programSettings->defaultDevice).get(), fpga_setup::FpgaSetupException); + ASSERT_THROW(fpga_setup::selectFPGADevice(100, bm->getExecutionSettings().programSettings->defaultDevice, bm->getExecutionSettings().programSettings->platformString).get(), fpga_setup::FpgaSetupException); } /** * Checks if non existing device leads to an error */ TEST_F(BaseHpccBenchmarkTest, FindNonExistingDevice) { - ASSERT_THROW(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, 100).get(), fpga_setup::FpgaSetupException); + ASSERT_THROW(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, 100, bm->getExecutionSettings().programSettings->platformString).get(), fpga_setup::FpgaSetupException); +} + +/* + * Check if wrong platform string leads to an error + */ +TEST_F(BaseHpccBenchmarkTest, FindNonExistingPlatformString) { + ASSERT_THROW(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, bm->getExecutionSettings().programSettings->defaultDevice, "This is not a platform").get(), fpga_setup::FpgaSetupException); } /** From a39430c8de9f60163a4e66bfd45bebb88f5f693c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 4 Oct 2022 08:47:34 +0100 Subject: [PATCH 109/318] Reduce replications for PTRANS via streams --- PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake index e75d0ff7..e7a5a22e 100644 --- a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake +++ b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake @@ -23,6 +23,6 @@ set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" 
FORCE) set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) -set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) From 1b58508270a4df0d744b1871e14aea7c24828b6a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 4 Oct 2022 08:50:16 +0100 Subject: [PATCH 110/318] Update PTRANS for stream_put transpose --- .../src/device/transpose_PQ_ACCL_stream.cpp | 25 +- PTRANS/src/host/CMakeLists.txt | 2 + .../execution_xrt_accl_stream_pq.hpp | 366 ++++++++++++++++++ PTRANS/src/host/transpose_benchmark.hpp | 5 +- shared/CMakeLists.txt | 9 +- shared/include/hpcc_benchmark.hpp | 4 +- shared/setup/fpga_setup_accl.cpp | 2 +- 7 files changed, 390 insertions(+), 23 deletions(-) create mode 100644 PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp index 1c136dc8..72d3d0cb 100644 --- a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp @@ -6,16 +6,14 @@ * - Change to row-column loop structure *****************************************************************************/ #include "parameters.h" -#include "hls_stream.h" #include "ap_int.h" #include "ap_utils.h" #include "ap_axi_sdata.h" +#include "accl_hls.h" const unsigned int block_size = BLOCK_SIZE; const unsigned int channel_width = CHANNEL_WIDTH; -extern "C" { - // PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] /** @@ -41,10 +39,8 @@ void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, const unsigned int number_of_blocks, const unsigned int width_in_blocks, const unsigned int height_in_blocks, - hls::stream > &krnl2cclo) { + STREAM &krnl2cclo) { #pragma HLS INTERFACE axis register both port=krnl2cclo -#pragma HLS INTERFACE ap_ctrl_none port=return - // local memory double buffer for a matrix block DEVICE_DATA_TYPE a_block[2][block_size * block_size / channel_width][channel_width]; @@ -118,19 +114,16 @@ void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % (channel_width)]; } - ap_uint<512> data; + stream_word tmp; // load tranposed A from global memory for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { - data(unroll_count * sizeof(DEVICE_DATA_TYPE)*8, unroll_count * sizeof(DEVICE_DATA_TYPE)*8 + sizeof(DEVICE_DATA_TYPE) * 8 - 1) + tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8) = data_chunk[unroll_count]; } - - ap_axiu<512, 0, 0, 8> tmp; - tmp.data = data; tmp.dest = 0; tmp.keep = -1; - krnl2cclo.write(tmp); + STREAM_WRITE(krnl2cclo,tmp); } } } @@ -156,9 +149,8 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, const unsigned int number_of_blocks, const unsigned int width_in_blocks, const unsigned int height_in_blocks, - hls::stream > &cclo2krnl) { + STREAM &cclo2krnl) { #pragma HLS INTERFACE axis register both port=cclo2krnl -#pragma HLS INTERFACE ap_ctrl_none port=return // transpose the matrix block-wise from global memory block_loop: 
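
The hunks in this file move the kernel I/O from raw ap_axiu words to ACCL's stream_word type and pack CHANNEL_WIDTH values into each 512-bit word through ap_uint bit slicing. The following sketch shows that packing and its inverse in isolation, assuming a 32-bit DEVICE_DATA_TYPE and the Vitis HLS ap_int types; pack_word and unpack_word are illustrative names, not part of the benchmark sources:

    #include "ap_int.h"

    typedef float DEVICE_DATA_TYPE;     // assumed 32-bit element type
    const unsigned channel_width = 16;  // 16 x 32 bit = one 512-bit stream word

    // Pack one chunk of values into a 512-bit word, preserving the bit pattern.
    ap_uint<512> pack_word(const DEVICE_DATA_TYPE chunk[channel_width]) {
        ap_uint<512> word;
        for (unsigned i = 0; i < channel_width; i++) {
            DEVICE_DATA_TYPE v = chunk[i];
            // reinterpret_cast copies the raw float bits; a plain assignment
            // to the bit range would convert the value to an integer instead
            word((i + 1) * 32 - 1, i * 32) = *reinterpret_cast<ap_uint<32> *>(&v);
        }
        return word;
    }

    // Unpack a 512-bit word back into individual values.
    void unpack_word(ap_uint<512> word, DEVICE_DATA_TYPE chunk[channel_width]) {
        for (unsigned i = 0; i < channel_width; i++) {
            ap_uint<32> v = word((i + 1) * 32 - 1, i * 32);
            chunk[i] = *reinterpret_cast<DEVICE_DATA_TYPE *>(&v);
        }
    }

Note that the hunk above still assigns data_chunk[unroll_count] directly to the bit range, which performs a float-to-integer value conversion; patch 113 below replaces this with the bit-preserving reinterpret_cast round trip sketched here.
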
@@ -179,11 +171,11 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, DEVICE_DATA_TYPE data_chunk[channel_width]; #pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 - ap_axiu<512, 0, 0, 8> tmp = cclo2krnl.read(); + stream_word tmp = STREAM_READ(cclo2krnl); // rotate temporary buffer to store data into local buffer for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { - data_chunk[unroll_count] = tmp.data(unroll_count * sizeof(DEVICE_DATA_TYPE)*8, unroll_count * sizeof(DEVICE_DATA_TYPE)*8 + sizeof(DEVICE_DATA_TYPE) * 8 - 1); + data_chunk[unroll_count] = tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8); } // load tranposed A from global memory @@ -201,4 +193,3 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, // PY_CODE_GEN block_end -} diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 1ad17d14..fe7214c4 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -37,10 +37,12 @@ if (Vitis_FOUND) set(CMAKE_SKIP_BUILD_RPATH No) set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) + list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/transpose_PQ_ACCL_stream.cpp) endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + target_include_directories(${LIB_NAME}_xilinx PRIVATE ${extern_accl_SOURCE_DIR}/hlslib/include/xilinx) add_executable(${HOST_EXE_NAME}_xilinx main.cpp) target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp new file mode 100644 index 00000000..4bc406e1 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -0,0 +1,366 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef SRC_HOST_ACCL_STREAM_PQ_EXECUTION_H_ +#define SRC_HOST_ACCL_STREAM_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "buffer.hpp" +#include "cclo.hpp" +#include "constants.hpp" +#include "data_handlers/data_handler_types.h" +#include "data_handlers/pq.hpp" +#include "fpgabuffer.hpp" +#include "transpose_data.hpp" +#include "cclo_bfm.h" +#include "Simulation.h" +#include "dummybuffer.hpp" + +extern void transpose_write(const DEVICE_DATA_TYPE *B, + DEVICE_DATA_TYPE *A_out, + const unsigned int offset_b, + const unsigned int number_of_blocks, + const unsigned int width_in_blocks, + const unsigned int height_in_blocks, + hlslib::Stream &cclo2krnl); + +extern void transpose_read( const DEVICE_DATA_TYPE *A, + const unsigned int offset_a, + const unsigned int number_of_blocks, + const unsigned int width_in_blocks, + const unsigned int height_in_blocks, + hlslib::Stream &krnl2cclo); + +namespace transpose { +namespace fpga_execution { +namespace accl_stream_pq { + +/** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ + * distribution and PCIe+MPI over the host for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on + * the FPGA + * @param handler data handler instance that should be used to exchange data + * between hosts + * @return std::unique_ptr The measured + * execution times + */ +static std::unique_ptr calculate( + const hpcc_base::ExecutionSettings &config, + transpose::TransposeData &data, + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, bool, xrt::uuid> &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != + transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error( + "Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation " + "of this communication method"); +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + throw new std::runtime_error( + "Using the Write Rect method is not supported in this host " + "implementation of this communication method"); +#endif + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeReadKernelList; + std::vector transposeWriteKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = + local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; +#ifndef NDEBUG + std::cout << "Start kernel creation" << std::endl; +#endif + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to + // process. 
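+    // Illustrative numbers (assumed, not taken from a real run): with 7 blocks
+    // and 3 kernel replications, 7 / 3 = 2 and 7 % 3 = 1, so replication 0
+    // processes 3 blocks while replications 1 and 2 process 2 blocks each.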
+ size_t blocks_per_replication = + (local_matrix_height * local_matrix_width / + config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % + config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the + // number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / + local_matrix_width * local_matrix_width * + data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / + local_matrix_width * local_matrix_width; + + if (!config.programSettings->useAcclEmulation) { + // create the kernels + xrt::kernel transposeReadKernel( + *config.device, *config.program, + ("transpose_read0:{transpose_read0_" + std::to_string(r + 1) + "}").c_str()); + xrt::kernel transposeWriteKernel( + *config.device, *config.program, + ("transpose_write0:{transpose_write0_" + std::to_string(r + 1) + "}").c_str()); + + if (r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, + data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), + transposeReadKernel.group_id(0)); + bufferListA.push_back(bufferA); + } + xrt::bo bufferB( + *config.device, + &data.B[bufferStartList[r] * data.blockSize * data.blockSize], + buffer_size * sizeof(HOST_DATA_TYPE), transposeWriteKernel.group_id(0)); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + transposeWriteKernel.group_id(1)); + + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeReadKernelList.push_back(transposeReadKernel); + transposeWriteKernelList.push_back(transposeWriteKernel); + } + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; + repetition++) { + +#ifndef NDEBUG + std::cout << "Start data transfer" << std::endl; +#endif + auto startTransfer = std::chrono::high_resolution_clock::now(); + + if (!config.programSettings->useAcclEmulation) { + for (int r = 0; r < transposeReadKernelList.size(); r++) { + if (r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast>( + endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + +#ifndef NDEBUG + std::cout << "Start BFM" << std::endl; +#endif + + HLSLIB_DATAFLOW_INIT(); + hlslib::Stream cclo2krnl, krnl2cclo; + hlslib::Stream cmd, sts; + + int pq_width = handler.getP(); + + int mpi_comm_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); + int mpi_comm_size; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size); + int pq_row = mpi_comm_rank / pq_width; + int pq_col = mpi_comm_rank % pq_width; + + int pair_rank = pq_width * pq_col + pq_row; + std::vector dest = {0,9, 18}; + CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + cclo.run(); + MPI_Barrier(MPI_COMM_WORLD); + + auto 
startCalculation = std::chrono::high_resolution_clock::now(); + +#ifndef NDEBUG + std::cout << "Start kernel execution" << std::endl; +#endif + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + if (!config.programSettings->useAcclEmulation) { + runs.push_back(transposeReadKernelList[r]( + (config.programSettings->copyA ? bufferListA[r] : bufferListA[0]), + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + runs.push_back(transposeWriteKernelList[r]( + bufferListB[r], bufferListA_out[r], + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + } else { + HLSLIB_DATAFLOW_FUNCTION(transpose_read, + (config.programSettings->copyA ? data.A : data.A), + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)), + krnl2cclo); + HLSLIB_DATAFLOW_FUNCTION(transpose_write, + data.B, data.result, + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)), + cclo2krnl); + } + } + auto dbuffer = config.accl->create_buffer(1,ACCL::dataType::float32); + // Exchange A data via ACCL + config.accl->stream_put(*dbuffer, data.blockSize * data.blockSize * data.numBlocks, + pair_rank, 9, ACCL::GLOBAL_COMM, + false, ACCL::streamFlags::OP0_STREAM); + // config.accl->send(*dbuffer, data.blockSize * data.blockSize * data.numBlocks, + // pair_rank, 9, ACCL::GLOBAL_COMM, + // false, ACCL::streamFlags::OP0_STREAM | ACCL::streamFlags::RES_STREAM ); +#ifndef NDEBUG + std::cout << "Wait for kernels to complete" << std::endl; +#endif + for (int r = 0; r < runs.size(); r++) { + runs[r].wait(); + } + cclo.stop(); + HLSLIB_DATAFLOW_FINALIZE(); + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " + << std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() + << "s (" + << ((config.programSettings->matrixSize * + config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * + 3) / + std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() * + 1.0e-9) + << " GB/s)" << std::endl; +#endif + + std::chrono::duration calculationTime = + std::chrono::duration_cast>( + endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer( + local_matrix_height * local_matrix_width * data.blockSize * + data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + if (!config.programSettings->useAcclEmulation) { + for (int r = 0; r < transposeReadKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + 
bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; + col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + + col]; + } + } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + + local_matrix_width * data.blockSize * data.blockSize, + tmp_write_buffer.begin() + bufferSizeList[r], + &data.result[(bufferStartList[r] + local_matrix_width) * + data.blockSize * data.blockSize]); + } else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read( + &data.result[bufferStartList[r] * data.blockSize * data.blockSize]); + } + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + + transferTime += std::chrono::duration_cast>( + endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result( + new transpose::TransposeExecutionTimings{transferTimings, + calculationTimings}); + + return result; +} + +} // namespace accl_pq +} // namespace fpga_execution +} // namespace transpose + +#endif diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 585e60be..f2b06965 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -41,6 +41,7 @@ SOFTWARE. #include "execution_types/execution_xrt_pcie_pq.hpp" #ifdef USE_ACCL #include "execution_types/execution_xrt_accl_pq.hpp" +#include "execution_types/execution_xrt_accl_stream_pq.hpp" #endif #endif #include "execution_types/execution_cpu.hpp" @@ -141,8 +142,10 @@ public hpcc_base::HpccFpgaBenchmarkexecutionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; #ifdef USE_ACCL + // case hpcc_base::CommunicationType::accl: + // return transpose::fpga_execution::accl_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; case hpcc_base::CommunicationType::accl: - return transpose::fpga_execution::accl_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; + return transpose::fpga_execution::accl_stream_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; #endif #endif #ifdef MKL_FOUND diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 22b2e1f4..19ab7ff2 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -5,16 +5,19 @@ set(HPCC_BASE_SOURCES "") if (USE_ACCL) add_subdirectory(${extern_accl_SOURCE_DIR}/driver/xrt ${CMAKE_BINARY_DIR}/lib/accl) list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp) + if (CMAKE_BUILD_TYPE EQUAL "Debug") + set(ACCL_DEBUG Yes) + endif() endif() if (USE_XRT_HOST) - list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp) + list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp ${extern_accl_SOURCE_DIR}/test/model/bfm/cclo_bfm.cpp) endif() list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES}) if (USE_ACCL) add_subdirectory(${extern_accl_SOURCE_DIR}/test/hardware/xup_vitis_network_example/xrt_host_api - ${CMAKE_BINARY_DIR}/libs/xrt_host_api) - target_include_directories(hpcc_fpga_base PRIVATE ${VNX_INCLUDE_PATH} ${ACCL_INCLUDE_PATH}) + 
${CMAKE_BINARY_DIR}/lib/xrt_host_api) + target_include_directories(hpcc_fpga_base PUBLIC ${VNX_INCLUDE_PATH} ${ACCL_INCLUDE_PATH} ${extern_accl_SOURCE_DIR}/test/model/bfm ${extern_accl_SOURCE_DIR}/driver/hls ${extern_hlslib_SOURCE_DIR}/include/hlslib/xilinx) target_link_libraries(hpcc_fpga_base accl vnx) endif() if (USE_XRT_HOST) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index bece837c..7ff91bae 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -517,7 +517,9 @@ class HpccFpgaBenchmark { #ifdef USE_XRT_HOST usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultDevice); context = std::unique_ptr(new bool(false)); - program = fpga_setup::fpgaSetup(*usedDevice, programSettings->kernelFileName); + if (!programSettings->useAcclEmulation) { + program = fpga_setup::fpgaSetup(*usedDevice, programSettings->kernelFileName); + } #endif #ifdef USE_OCL_HOST usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 2f78366d..51c7f87e 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -99,7 +99,7 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra } else { // TODO: Add start port here. Currenty hardcoded! return std::unique_ptr( - new ACCL::ACCL(ranks, current_rank, 5500, device, ACCL::networkProtocol::UDP, 16, ACCL_BUFFER_SIZE)); + new ACCL::ACCL(ranks, current_rank, 6000, device, ACCL::networkProtocol::UDP, 16, ACCL_BUFFER_SIZE)); } } From 610071e65de239d4e44de42ffcd58999355cff6f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 4 Oct 2022 08:50:37 +0100 Subject: [PATCH 111/318] Update ACCL source to dev branch --- extern/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 141899f7..341f73cd 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -61,8 +61,8 @@ if (DEFINED USE_ACCL) FetchContent_Declare( extern_accl - GIT_REPOSITORY https://github.com/Mellich/ACCL.git - GIT_TAG udp_address_fix_and_new_tcp) + GIT_REPOSITORY https://github.com/Xilinx/ACCL.git + GIT_TAG dev) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) From ff89927672206f974982707d00fbffe15b171bab Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 4 Oct 2022 10:44:06 +0100 Subject: [PATCH 112/318] Fix settings for PTRANS ACCL stream --- PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake | 2 +- .../settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini | 3 ++- .../settings.link.xilinx.transpose_pq_accl_stream.hbm.ini | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake index e7a5a22e..f84f73a2 100644 --- a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake +++ b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake @@ -15,7 +15,7 @@ set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE) set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini CACHE FILEPATH "" FORCE) -set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE 
${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE) set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) # STREAM specific options diff --git a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini index 8b137891..d259f88f 100644 --- a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini +++ b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini @@ -1 +1,2 @@ - +[hls] +max_memory_ports=all diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini index 2dadc525..f9be4bec 100644 --- a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini @@ -22,6 +22,7 @@ nk=hostctrl:1:hostctrl_0 nk=cmac_0:1:cmac_0 nk=reduce_ops:1:arith_0 nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:0 nk=transpose_read0:1 nk=transpose_write0:1 From f475d4cf1a96ef23b04abf4b95e3f8d1b06e5175 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 5 Oct 2022 13:44:20 +0100 Subject: [PATCH 113/318] Fix PTRANS emulation results --- PTRANS/src/device/transpose_PQ_ACCL_stream.cpp | 11 +++++++---- .../execution_types/execution_xrt_accl_stream_pq.hpp | 10 ++++------ shared/setup/fpga_setup_accl.cpp | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp index 72d3d0cb..739792e0 100644 --- a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp @@ -118,12 +118,14 @@ void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, // load tranposed A from global memory for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + DEVICE_DATA_TYPE v = data_chunk[unroll_count]; tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8) - = data_chunk[unroll_count]; + = *reinterpret_cast*>(&v); } - tmp.dest = 0; + tmp.dest = 9; + tmp.last = 1; tmp.keep = -1; - STREAM_WRITE(krnl2cclo,tmp); + STREAM_WRITE(krnl2cclo,tmp); } } } @@ -175,7 +177,8 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, // rotate temporary buffer to store data into local buffer for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { - data_chunk[unroll_count] = tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8); + ap_uint v = tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8); + data_chunk[unroll_count] = *reinterpret_cast(&v); } // load tranposed A from global memory diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp index 4bc406e1..8fac597c 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -206,7 +206,7 @@ static std::unique_ptr calculate( #endif HLSLIB_DATAFLOW_INIT(); - hlslib::Stream cclo2krnl, krnl2cclo; + hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); hlslib::Stream cmd, sts; int pq_width = handler.getP(); @@ -219,7 +219,7 @@ static std::unique_ptr calculate( int pq_col = mpi_comm_rank 
% pq_width; int pair_rank = pq_width * pq_col + pq_row; - std::vector dest = {0,9, 18}; + std::vector dest = {0}; CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); cclo.run(); MPI_Barrier(MPI_COMM_WORLD); @@ -275,17 +275,15 @@ static std::unique_ptr calculate( config.accl->stream_put(*dbuffer, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 9, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::OP0_STREAM); - // config.accl->send(*dbuffer, data.blockSize * data.blockSize * data.numBlocks, - // pair_rank, 9, ACCL::GLOBAL_COMM, - // false, ACCL::streamFlags::OP0_STREAM | ACCL::streamFlags::RES_STREAM ); #ifndef NDEBUG std::cout << "Wait for kernels to complete" << std::endl; #endif for (int r = 0; r < runs.size(); r++) { runs[r].wait(); } - cclo.stop(); + MPI_Barrier(MPI_COMM_WORLD); HLSLIB_DATAFLOW_FINALIZE(); + cclo.stop(); auto endCalculation = std::chrono::high_resolution_clock::now(); #ifndef NDEBUG int mpi_rank; diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 51c7f87e..58fc3f67 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -99,7 +99,7 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra } else { // TODO: Add start port here. Currenty hardcoded! return std::unique_ptr( - new ACCL::ACCL(ranks, current_rank, 6000, device, ACCL::networkProtocol::UDP, 16, ACCL_BUFFER_SIZE)); + new ACCL::ACCL(ranks, current_rank, 6000, device, ACCL::networkProtocol::TCP, 16, ACCL_BUFFER_SIZE)); } } From 9828b49fd9651538ea59d06c107c80dd7d42cb37 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 5 Oct 2022 13:48:40 +0100 Subject: [PATCH 114/318] Make start/stop BFM optional --- .../host/execution_types/execution_xrt_accl_stream_pq.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp index 8fac597c..9a43fef9 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -221,7 +221,9 @@ static std::unique_ptr calculate( int pair_rank = pq_width * pq_col + pq_row; std::vector dest = {0}; CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); - cclo.run(); + if (config.programSettings->useAcclEmulation) { + cclo.run(); + } MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); @@ -283,7 +285,9 @@ static std::unique_ptr calculate( } MPI_Barrier(MPI_COMM_WORLD); HLSLIB_DATAFLOW_FINALIZE(); - cclo.stop(); + if (config.programSettings->useAcclEmulation) { + cclo.stop(); + } auto endCalculation = std::chrono::high_resolution_clock::now(); #ifndef NDEBUG int mpi_rank; From f046192909eced188d141c5264683f6c68301ab2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 6 Oct 2022 10:54:42 +0100 Subject: [PATCH 115/318] Add send/recv implementation for PTRANS with ACCL --- .../transpose_PQ_ACCL_stream_sendrecv.cpp | 252 ++++++++++ .../execution_xrt_accl_stream_pq_sendrecv.hpp | 456 ++++++++++++++++++ 2 files changed, 708 insertions(+) create mode 100644 PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp create mode 100644 PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp new file mode 100644 index 
00000000..4c9452b2 --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp @@ -0,0 +1,252 @@ +/****************************************************************************** + * Author: Arjun Ramaswami + * + * Edited by Marius Meyer: + * - Adapt to used kernel signature + * - Change to row-column loop structure + *****************************************************************************/ +#include "parameters.h" +#include "ap_int.h" +#include "ap_utils.h" +#include "ap_axi_sdata.h" +#include "accl_hls.h" + + +const int block_size = BLOCK_SIZE; +const int channel_width = CHANNEL_WIDTH; + +/** + * @brief Modulo operation that always produces positive values in range [0,op-1]. This is required for the PQ transpose algorithm and is different from the usual remainder calculation done with %! + * + * @tparam T Data type used for the modulo operation. + * @param number Number the modulo is calculated from + * @param op Modulo operator + * @return T number mod op + */ +template +T mod(T number, T op) { + T result = number % op; + // result >= op required for unsinged data types + return (result < 0 || result >= op) ? op + result : result; +} + + +void transpose_block_transpose(const DEVICE_DATA_TYPE *A, + DEVICE_DATA_TYPE a_block[][channel_width], + const unsigned int offset_a, + const unsigned int width_in_blocks, + const unsigned int height_in_blocks) { + +#pragma HLS INTERFACE axis register both port=krnl2cclo + + // transpose the matrix block-wise from global memory +read_A: + for (unsigned int row = 0; row < block_size; row++) { +read_A_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { +#pragma HLS PIPELINE + unsigned long block_row_a = (offset_a) / width_in_blocks; + unsigned long block_col_a = (offset_a) % width_in_blocks; + unsigned long ls_address_trans = block_col_a * block_size * block_size * height_in_blocks + + block_row_a * block_size + + row * block_size * height_in_blocks; + + + // read in block of A from global memory and store it in a memory efficient manner for transpose + DEVICE_DATA_TYPE rotate_in[channel_width]; +#pragma HLS ARRAY_PARTITION variable = rotate_in complete dim = 0 + + // Blocks of a will be stored columnwise in global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + rotate_in[unroll_count] = A[ls_address_trans + col * channel_width + unroll_count]; + } + + unsigned int chunk = row * (block_size / channel_width) + col; + + unsigned rot = (row) % (channel_width); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + // every block of (N / channel_width), rotates the index by 1 + // store in double buffer + a_block[chunk][unroll_count] = rotate_in[(unroll_count + channel_width - rot) + % (channel_width)]; + } + } + } +} + +void transpose_block_forward(DEVICE_DATA_TYPE a_block[][channel_width], + STREAM &krnl2cclo) { + +read_A: + for (unsigned int row = 0; row < block_size; row++) { +read_A_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { + DEVICE_DATA_TYPE data_chunk[channel_width]; +#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 + DEVICE_DATA_TYPE rotate_out[channel_width]; +#pragma HLS ARRAY_PARTITION variable = rotate_out complete dim = 0 + + unsigned int base = col * block_size; + unsigned int offset = row / channel_width; + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + unsigned rot = 
((channel_width + unroll_count - row) * (block_size / channel_width)) % + (block_size); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = a_block[row_rotate][unroll_count]; + } + + unsigned rot_out = row % (channel_width); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % (channel_width)]; + } + + stream_word tmp; + + // load tranposed A from global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + DEVICE_DATA_TYPE v = data_chunk[unroll_count]; + tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8) + = *reinterpret_cast*>(&v); + } + tmp.dest = 9; + tmp.last = 1; + tmp.keep = -1; + STREAM_WRITE(krnl2cclo,tmp); + } + } +} + +/** + * + * ext. channel -> trans(A) + B -> A_out + * + * @param B Buffer for matrix B + * @param A_out Buffer for result matrix + * @param offset Offset in blocks that is used to read the current block of A. Since A is read column-wise + on the block level, the whole matrix A might be written to global memory and the relevant columns + need to be picked using this offset. + * @param number_of_blocks The number of blocks that will be processed starting from the block offset + * @param width_in_blocks The with of matrix A in blocks + * @param height_in_blocks The height of matix A in blocks + */ +void transpose_block_receive(const DEVICE_DATA_TYPE *B, + DEVICE_DATA_TYPE *A_out, + const unsigned int offset_b, + const unsigned int width_in_blocks, + STREAM &cclo2krnl) { +#pragma HLS INTERFACE axis register both port=cclo2krnl + + // transpose the matrix block-wise from global memory +#pragma HLS loop_tripcount min=1 max=1024 avg=1 + // Read transposed A from local memory and add B +read_B: + for (unsigned int row = 0; row < block_size; row++) { +read_B_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { + unsigned long block_row = (offset_b) / width_in_blocks; + unsigned long block_col = (offset_b) % width_in_blocks; + unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks + + block_col * block_size + + row * block_size * width_in_blocks; + unsigned int chunk = row * (block_size / channel_width) + col; + + DEVICE_DATA_TYPE data_chunk[channel_width]; +#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 + + stream_word tmp = STREAM_READ(cclo2krnl); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + ap_uint v = tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8); + data_chunk[unroll_count] = *reinterpret_cast(&v); + } + + // load tranposed A from global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] += B[ls_address_row + col * channel_width + unroll_count]; + } + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + A_out[ls_address_row + col * channel_width + unroll_count] = data_chunk[unroll_count]; + } + } + } +} + +void transpose_read(const DEVICE_DATA_TYPE* A, + const int* target_list, + int pq_row, int pq_col, + int pq_width, int pq_height, + int gcd, int least_common_multiple, + int height_per_rank, + int width_per_rank, + STREAM &krnl2cclo) { + + // Begin algorithm from Figure 14 
for general case + int g = mod(pq_row - pq_col, gcd); + int p = mod(pq_col + g, pq_width); + int q = mod(pq_row - g, pq_height); + + for (int j = 0; j < least_common_multiple/pq_width; j++) { + for (int i = 0; i < least_common_multiple/pq_height; i++) { + // Determine sender and receiver rank of current rank for current communication step + int send_rank = mod(p + i * gcd, pq_width) + mod(q - j * gcd, pq_height) * pq_width; + + for (int col = 0; col < least_common_multiple/pq_width; col++) { + for (int row = 0; row < least_common_multiple/pq_height; row++) { + if (target_list[row * least_common_multiple/pq_width + col] == send_rank) { + for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { + for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) { + unsigned int matrix_buffer_offset = (col + lcm_col * least_common_multiple/pq_width) + (row + lcm_row * least_common_multiple/pq_height) * width_per_rank; + DEVICE_DATA_TYPE a_block[block_size * block_size / channel_width][channel_width]; + transpose_block_transpose(A, a_block, matrix_buffer_offset, width_per_rank, height_per_rank); + transpose_block_forward(a_block, krnl2cclo); + } + } + } + } + } + } + } +} + +void transpose_write(const DEVICE_DATA_TYPE* B, + DEVICE_DATA_TYPE* C, + const int* target_list, + int pq_row, int pq_col, + int pq_width, int pq_height, + int gcd, int least_common_multiple, + int height_per_rank, + int width_per_rank, + STREAM &cclo2krnl) { + + // Begin algorithm from Figure 14 for general case + int g = mod(pq_row - pq_col, gcd); + int p = mod(pq_col + g, pq_width); + int q = mod(pq_row - g, pq_height); + for (int j = 0; j < least_common_multiple/pq_width; j++) { + for (int i = 0; i < least_common_multiple/pq_height; i++) { + + int recv_rank = mod(p - i * gcd, pq_width) + mod(q + j * gcd, pq_height) * pq_width; + + for (int col = 0; col < least_common_multiple/pq_width; col++) { + for (int row = 0; row < least_common_multiple/pq_height; row++) { + if (target_list[row * least_common_multiple/pq_width + col] == recv_rank) { + for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { + for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) { + unsigned int matrix_buffer_offset = (col + lcm_col * least_common_multiple/pq_width) + (row + lcm_row * least_common_multiple/pq_height) * width_per_rank; + transpose_block_receive(B,C,matrix_buffer_offset,width_per_rank, cclo2krnl); + } + } + } + } + } + } + } +} + diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp new file mode 100644 index 00000000..b51d8120 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp @@ -0,0 +1,456 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_ACCL_STREAM_PQ_SENDRECV_EXECUTION_H_ +#define SRC_HOST_ACCL_STREAM_PQ_SENDRECV_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "buffer.hpp" +#include "cclo.hpp" +#include "constants.hpp" +#include "data_handlers/data_handler_types.h" +#include "data_handlers/pq.hpp" +#include "fpgabuffer.hpp" +#include "transpose_data.hpp" +#include "cclo_bfm.h" +#include "Simulation.h" +#include "dummybuffer.hpp" + +extern void transpose_write(const DEVICE_DATA_TYPE* B, + DEVICE_DATA_TYPE* C, + const int* target_list, + int pq_row, int pq_col, + int pq_width, int pq_height, + int gcd, int least_common_multiple, + int height_per_rank, + int width_per_rank, + STREAM &cclo2krnl); + +extern void transpose_read(const DEVICE_DATA_TYPE* A, + const int* target_list, + int pq_row, int pq_col, + int pq_width, int pq_height, + int gcd, int least_common_multiple, + int height_per_rank, + int width_per_rank, + STREAM &krnl2cclo); + +namespace transpose { +namespace fpga_execution { +namespace accl_stream_pq { + +/** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ + * distribution and PCIe+MPI over the host for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on + * the FPGA + * @param handler data handler instance that should be used to exchange data + * between hosts + * @return std::unique_ptr The measured + * execution times + */ +static std::unique_ptr calculate( + const hpcc_base::ExecutionSettings &config, + transpose::TransposeData &data, + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, bool, xrt::uuid> &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != + transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error( + "Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation " + "of this communication method"); +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + throw new std::runtime_error( + "Using the Write Rect method is not supported in this host " + "implementation of this communication method"); +#endif + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector>> bufferListTargets; + std::vector transposeReadKernelList; + std::vector transposeWriteKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = + local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; + + // Algorithm defines + int pq_width = handler.getP(); + int pq_height = handler.getQ(); + + int mpi_comm_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); + int mpi_comm_size; + 
MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size); + int pq_row = mpi_comm_rank / pq_width; + int pq_col = mpi_comm_rank % pq_width; + + int gcd = std::__gcd(pq_height, pq_width); + int least_common_multiple = pq_height * pq_width / gcd; + +#ifndef NDEBUG + std::cout << "Start kernel creation" << std::endl; +#endif + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to + // process. + size_t blocks_per_replication = + (local_matrix_height * local_matrix_width / + config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % + config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the + // number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / + local_matrix_width * local_matrix_width * + data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / + local_matrix_width * local_matrix_width; + + // Pre-calculate target ranks in LCM block + // The vector list variable can be interpreted as 2D matrix. Every entry + // represents the target rank of the sub-block Since the LCM block will + // repeat, we only need to store this small amount of data! 
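+    // Illustrative numbers (assumed): for pq_width = 4 and pq_height = 2,
+    // gcd = 2 and least_common_multiple = 4, so the list below holds
+    // 2 x 1 entries per rank. Rank 0 (pq_row = 0, pq_col = 0) computes
+    // (0 % 2) * 4 + (0 % 4) = 0 for row 0 and (0 % 2) * 4 + (2 % 4) = 2 for
+    // row 1, i.e. its sub-blocks alternate between target ranks 0 and 2.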
+ auto target_list = config.accl->create_buffer(least_common_multiple / pq_height * + least_common_multiple / pq_width, ACCL::dataType::int32); + for (int row = 0; row < least_common_multiple / pq_height; row++) { + for (int col = 0; col < least_common_multiple / pq_width; col++) { + int global_block_col = pq_col + col * pq_width; + int global_block_row = pq_row + row * pq_height; + int destination_rank = (global_block_col % pq_height) * pq_width + + (global_block_row % pq_width); + target_list->buffer()[row * least_common_multiple / pq_width + col] = + destination_rank; + } + } + target_list->sync_to_device(); + bufferListTargets.push_back(std::move(target_list)); + + if (!config.programSettings->useAcclEmulation) { + // create the kernels + xrt::kernel transposeReadKernel( + *config.device, *config.program, + ("transpose_read0:{transpose_read0_" + std::to_string(r + 1) + "}").c_str()); + xrt::kernel transposeWriteKernel( + *config.device, *config.program, + ("transpose_write0:{transpose_write0_" + std::to_string(r + 1) + "}").c_str()); + + if (r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, + data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), + transposeReadKernel.group_id(0)); + bufferListA.push_back(bufferA); + } + + xrt::bo bufferB( + *config.device, + &data.B[bufferStartList[r] * data.blockSize * data.blockSize], + buffer_size * sizeof(HOST_DATA_TYPE), transposeWriteKernel.group_id(0)); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + transposeWriteKernel.group_id(1)); + + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeReadKernelList.push_back(transposeReadKernel); + transposeWriteKernelList.push_back(transposeWriteKernel); + } + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; + repetition++) { + +#ifndef NDEBUG + std::cout << "Start data transfer" << std::endl; +#endif + auto startTransfer = std::chrono::high_resolution_clock::now(); + + if (!config.programSettings->useAcclEmulation) { + for (int r = 0; r < transposeReadKernelList.size(); r++) { + if (r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast>( + endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + +#ifndef NDEBUG + std::cout << "Start BFM" << std::endl; +#endif + + HLSLIB_DATAFLOW_INIT(); + hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); + hlslib::Stream cmd, sts; + + std::vector dest = {0}; + CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + if (config.programSettings->useAcclEmulation) { + cclo.run(); + } + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + +#ifndef NDEBUG + std::cout << "Start kernel execution" << std::endl; + std::cout << bufferListTargets[0]->buffer()[0] << std::endl; +#endif + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + if (!config.programSettings->useAcclEmulation) { + runs.push_back(transposeReadKernelList[r]( + (config.programSettings->copyA ? 
bufferListA[r] : bufferListA[0]), + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + runs.push_back(transposeWriteKernelList[r]( + bufferListB[r], bufferListA_out[r], + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + } else { + HLSLIB_DATAFLOW_FUNCTION(transpose_read, + (config.programSettings->copyA ? data.A : data.A), + bufferListTargets[r]->buffer(), + pq_row, pq_col, pq_width, pq_height, + gcd, least_common_multiple, + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)), + krnl2cclo); + HLSLIB_DATAFLOW_FUNCTION(transpose_write, + data.B, data.result, + bufferListTargets[r]->buffer(), + pq_row, pq_col, pq_width, pq_height, + gcd, least_common_multiple, + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)), + cclo2krnl); + } + } +#ifndef NDEBUG + std::cout << "Start ACCL send/recv" << std::endl; +#endif + auto dbuffer = config.accl->create_buffer(1,ACCL::dataType::float32); + int g = transpose::data_handler::mod(pq_row - pq_col, gcd); + int p = transpose::data_handler::mod(pq_col + g, pq_width); + int q = transpose::data_handler::mod(pq_row - g, pq_height); + // Exchange A data via ACCL + for (int k=0; k < 2; k++) { + for (int j = 0; j < least_common_multiple/pq_width; j++) { + for (int i = 0; i < least_common_multiple/pq_height; i++) { + // Determine sender and receiver rank of current rank for current communication step + int send_rank = transpose::data_handler::mod(p + i * gcd, pq_width) + transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width; + int recv_rank = transpose::data_handler::mod(p - i * gcd, pq_width) + transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width; + + // Also count receiving buffer size because sending and receiving buffer size may differ in certain scenarios! + int receiving_size = 0; + int sending_size = 0; + + std::vector send_rows; + std::vector send_cols; + // Look up which blocks are affected by the current rank + for (int row = 0; row < least_common_multiple/pq_height; row++) { + for (int col = 0; col < least_common_multiple/pq_width; col++) { +#ifndef NDEBUG + std::cout << "Check" << row * least_common_multiple/pq_width + col << std::endl; +#endif + if (bufferListTargets[0]->buffer()[row * least_common_multiple/pq_width + col] == send_rank) { + send_rows.push_back(row); + send_cols.push_back(col); + sending_size += data.blockSize * data.blockSize; + } + if (bufferListTargets[0]->buffer()[row * least_common_multiple/pq_width + col] == recv_rank) { + receiving_size += data.blockSize * data.blockSize; + } + } + } + receiving_size *= (local_matrix_height)/(least_common_multiple/pq_height) * ((local_matrix_width)/(least_common_multiple/pq_width)); + sending_size *= (local_matrix_height)/(least_common_multiple/pq_height) * ((local_matrix_width)/(least_common_multiple/pq_width)); + + // Do actual MPI communication + if (k==0) { + // First schedule all sends, then all receives. This works if communication rounds <= ACCL buffers. 
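+          // Illustration (assumed values): on a 2 x 2 rank grid, gcd = 2 and
+          // lcm = 2, so each phase issues a single communication round, well
+          // below the 16 buffers requested from the ACCL constructor in
+          // fpga_setup_accl.cpp (assuming that argument is the buffer count).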
+ // Non-blocking communication would not offer many benefits, because the CCLO can only execute send OR recv +#ifndef NDEBUG + std::cout << "Send blocks " << sending_size / (data.blockSize * data.blockSize) << " to " << send_rank << std::endl << std::flush; +#endif + config.accl->send(*dbuffer, sending_size, send_rank, ACCL::TAG_ANY, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::OP0_STREAM); + // TODO Use stream_put to simulate this implementation approach on single FPGA since send/recv to same rank is not working! + // config.accl->stream_put(*dbuffer, sending_size, send_rank, 9, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::OP0_STREAM); + } else { + #ifndef NDEBUG + std::cout << "Recv blocks " << receiving_size / (data.blockSize * data.blockSize) << " from " << recv_rank << std::endl << std::flush; + #endif + config.accl->recv(*dbuffer, receiving_size, recv_rank, ACCL::TAG_ANY, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::RES_STREAM); + } + } + } + } + +#ifndef NDEBUG + std::cout << "Wait for kernels to complete" << std::endl; +#endif + for (int r = 0; r < runs.size(); r++) { + runs[r].wait(); + } + MPI_Barrier(MPI_COMM_WORLD); + HLSLIB_DATAFLOW_FINALIZE(); + if (config.programSettings->useAcclEmulation) { + cclo.stop(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " + << std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() + << "s (" + << ((config.programSettings->matrixSize * + config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * + 3) / + std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() * + 1.0e-9) + << " GB/s)" << std::endl; +#endif + + std::chrono::duration calculationTime = + std::chrono::duration_cast>( + endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer( + local_matrix_height * local_matrix_width * data.blockSize * + data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + if (!config.programSettings->useAcclEmulation) { + for (int r = 0; r < transposeReadKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; + col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + + col]; + } + } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + + local_matrix_width * data.blockSize * data.blockSize, + tmp_write_buffer.begin() + bufferSizeList[r], + &data.result[(bufferStartList[r] + local_matrix_width) * + data.blockSize * data.blockSize]); + } else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read( + &data.result[bufferStartList[r] * data.blockSize * data.blockSize]); + } + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + + transferTime += std::chrono::duration_cast>( + endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result( + new 
transpose::TransposeExecutionTimings{transferTimings, + calculationTimings}); + + return result; +} + +} // namespace accl_pq +} // namespace fpga_execution +} // namespace transpose + +#endif From b782ebad4cca5d4c410c7576ecc295ac7888bbfd Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 6 Oct 2022 12:07:57 +0100 Subject: [PATCH 116/318] ACCL emulation for b_eff --- b_eff/src/device/communication_ACCL_pl.cpp | 4 +- b_eff/src/host/CMakeLists.txt | 6 +++ .../execution_types/execution_accl_pl.hpp | 43 ++++++++++++++++--- shared/setup/fpga_setup_accl.cpp | 2 +- 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl.cpp b/b_eff/src/device/communication_ACCL_pl.cpp index 58fc7fed..97a21907 100644 --- a/b_eff/src/device/communication_ACCL_pl.cpp +++ b/b_eff/src/device/communication_ACCL_pl.cpp @@ -22,9 +22,9 @@ SOFTWARE. #include "accl_hls.h" -void send_recv(float *read_buffer,float *write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +void send_recv(const float *read_buffer,float *write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, - hls::stream &cmd, hls::stream &sts) { + STREAM &cmd, STREAM &sts) { accl_hls::ACCLCommand accl_cmd(cmd, sts, communicator_addr, datapath_cfg,0,0); for (int i = 0; i < num_iterations; i++) { accl_cmd.send(size, 0, neighbor_rank, (ap_uint<64>)read_buffer); diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index adaa8348..e5e09aed 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -19,6 +19,12 @@ if (INTELFPGAOPENCL_FOUND) add_test(NAME test_intel_host_executable COMMAND $ -h) endif() if (Vitis_FOUND) +if (USE_ACCL) + set(CMAKE_SKIP_BUILD_RPATH No) + set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) + list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) + list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl.cpp) +endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index 3f39cfab..5bbda303 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -30,11 +30,18 @@ SOFTWARE. /* External library headers */ #include "mpi.h" #include "accl.hpp" +#include "cclo_bfm.h" +#include "accl_hls.h" /* Project's headers */ +extern void send_recv(const float *read_buffer,float *write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &cmd, STREAM &sts); + namespace network::execution_types::accl_pl { + /* Implementation for the single kernel. 
@copydoc bm_execution::calculate() @@ -57,13 +64,23 @@ namespace network::execution_types::accl_pl { int current_size; MPI_Comm_size(MPI_COMM_WORLD, & current_size); + hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); + hlslib::Stream cmd, sts; + + std::vector dest = {0}; + CCLO_BFM cclo(6000, current_rank, current_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + if (config.programSettings->useAcclEmulation) { + cclo.run(); + } + MPI_Barrier(MPI_COMM_WORLD); + std::vector calculationTimings; for (uint r =0; r < config.programSettings->numRepetitions; r++) { dummyBufferContents.clear(); - recvBufferContents.clear(); - acclSendBuffers.clear(); - acclRecvBuffers.clear(); - int size_in_values = (size_in_bytes + 3) / 4; + recvBufferContents.clear(); + acclSendBuffers.clear(); + acclRecvBuffers.clear(); + int size_in_values = (size_in_bytes + 3) / 4; // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); @@ -74,15 +91,21 @@ namespace network::execution_types::accl_pl { acclRecvBuffers.back()->sync_to_device(); } - xrt::kernel sendrecvKernel(*config.device, *config.program, "sendrecv"); + xrt::kernel sendrecvKernel(*config.device, *config.program, "send_recv"); double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); + if (!config.programSettings->useAcclEmulation) { auto run = sendrecvKernel(acclSendBuffers[i]->bo(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, - config.accl->get_communicator_adr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); + config.accl->get_communicator_addr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); run.wait(); + } else { + send_recv(reinterpret_cast(acclSendBuffers[i]->buffer()), reinterpret_cast(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.accl->get_communicator_addr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32}), + cmd, sts); + } auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); #ifndef NDEBUG @@ -98,10 +121,16 @@ namespace network::execution_types::accl_pl { std::cout << "Rank " << current_rank << ": Done " << r << std::endl; #endif } + + if (config.programSettings->useAcclEmulation) { + cclo.stop(); + } // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! 
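+        // In emulation the received data already resides in the host-side buffers, so the device sync below is only needed on real hardware.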
for (int r = 0; r < config.programSettings->kernelReplications; r++) { - acclRecvBuffers.back()->sync_from_device(); + if (!config.programSettings->useAcclEmulation) { + acclRecvBuffers.back()->sync_from_device(); + } std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); } std::shared_ptr result(new network::ExecutionTimings{ diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 58fc3f67..51c7f87e 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -99,7 +99,7 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra } else { // TODO: Add start port here. Currenty hardcoded! return std::unique_ptr( - new ACCL::ACCL(ranks, current_rank, 6000, device, ACCL::networkProtocol::TCP, 16, ACCL_BUFFER_SIZE)); + new ACCL::ACCL(ranks, current_rank, 6000, device, ACCL::networkProtocol::UDP, 16, ACCL_BUFFER_SIZE)); } } From e22128ab3d10d6ba70d9623f4fd6ac247a744cac Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 7 Oct 2022 16:08:00 +0100 Subject: [PATCH 117/318] Update LINPACK --- .../settings.link.xilinx.hpl_torus_accl.hbm.generator.ini | 3 +-- LINPACK/src/host/CMakeLists.txt | 5 +++++ .../host/execution_types/execution_xrt_accl_stream_pq.hpp | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini index ec8cbfa6..4783d320 100644 --- a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini @@ -9,8 +9,7 @@ nk=inner_update_mm0:$PY_CODE_GEN num_replications$ slr=lu_1:SLR0 slr=left_update_1:SLR0 slr=top_update_1:SLR0 -slr=inner_update_mm0_1:SLR0 -slr=inner_update_mm0_2:SLR2 +slr=inner_update_mm0_1:SLR2 # matrix ports sp=lu_1.m_axi_gmem0:HBM[0:4] diff --git a/LINPACK/src/host/CMakeLists.txt b/LINPACK/src/host/CMakeLists.txt index 5422f31f..72abdf1c 100755 --- a/LINPACK/src/host/CMakeLists.txt +++ b/LINPACK/src/host/CMakeLists.txt @@ -23,6 +23,11 @@ if (INTELFPGAOPENCL_FOUND) endif() if (Vitis_FOUND) + if (USE_ACCL) + set(CMAKE_SKIP_BUILD_RPATH No) + set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) + list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) + endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp index 9a43fef9..9bf1aaf6 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -219,7 +219,7 @@ static std::unique_ptr calculate( int pq_col = mpi_comm_rank % pq_width; int pair_rank = pq_width * pq_col + pq_row; - std::vector dest = {0}; + std::vector dest = {0, 9}; CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); if (config.programSettings->useAcclEmulation) { cclo.run(); @@ -283,8 +283,8 @@ static std::unique_ptr calculate( for (int r = 0; r < runs.size(); r++) { runs[r].wait(); 
} - MPI_Barrier(MPI_COMM_WORLD); HLSLIB_DATAFLOW_FINALIZE(); + MPI_Barrier(MPI_COMM_WORLD); if (config.programSettings->useAcclEmulation) { cclo.stop(); } From d8be6afbb852ab12c4d5cdf763343735ad8c0ca0 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 17 Oct 2022 13:54:46 +0100 Subject: [PATCH 118/318] Update PTRANS stream settings --- PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake | 4 ++-- ...gs.link.xilinx.transpose_pq_accl_stream.hbm.ini | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake index 89114c4d..6b196634 100644 --- a/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake +++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake @@ -14,7 +14,7 @@ set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(ACCL_STACK_TYPE "TCP" CACHE STRING "" FORCE) set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) -set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini CACHE FILEPATH "" FORCE) set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) @@ -23,6 +23,6 @@ set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. 
Also specifies the width of memory" FORCE) -set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini index f9be4bec..559ff34f 100644 --- a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini @@ -27,12 +27,12 @@ nk=transpose_read0:1 nk=transpose_write0:1 # Kernels Foorplaning -slr=compression_0_0:SLR1 -slr=compression_0_1:SLR1 -slr=compression_0_2:SLR1 -slr=arith_0:SLR1 -slr=ccl_offload_0:SLR1 -slr=hostctrl_0:SLR1 +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 slr=networklayer_0:SLR1 slr=cmac_0:SLR1 slr=transpose_read0_1:SLR2 @@ -73,5 +73,5 @@ stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 # Tie off user kernel interface stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl -stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl +stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:32 From 1d70455c52019775b244512594ec97dfb54ceca6 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Oct 2022 11:34:40 +0100 Subject: [PATCH 119/318] Fix ACCL and XRT setup --- shared/setup/fpga_setup_accl.cpp | 2 +- shared/setup/fpga_setup_xrt.cpp | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 51c7f87e..54b78c5c 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -77,7 +77,7 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra std::vector ranks = {}; for (int i = 0; i < current_size; ++i) { // TODO: Replace the ip addresses and ports here for execution of real hardware? - ACCL::rank_t new_rank = {"10.10.10." + current_rank, 5500 + i, i, ACCL_BUFFER_SIZE}; + ACCL::rank_t new_rank = {"10.10.10." 
+ std::to_string(current_rank), 5500 + i, i, ACCL_BUFFER_SIZE};
         ranks.emplace_back(new_rank);
     }
     if (!useAcclEmulation) {
diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp
index 103eda17..eae39fe8 100644
--- a/shared/setup/fpga_setup_xrt.cpp
+++ b/shared/setup/fpga_setup_xrt.cpp
@@ -35,9 +35,11 @@ namespace fpga_setup {
 std::unique_ptr
 selectFPGADevice(int defaultDevice) {
-    int current_rank;
-    MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
-
-    return std::unique_ptr(new xrt::device(current_rank));
+    int current_device;
+    MPI_Comm_rank(MPI_COMM_WORLD, & current_device);
+    if (defaultDevice >= 0) {
+        current_device = defaultDevice;
+    }
+    return std::unique_ptr(new xrt::device(current_device));
 }
 } // namespace fpga_setup

From fb6895655415d30e3daa96e1ef1c5649d61eab51 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 21 Oct 2022 17:29:02 +0100
Subject: [PATCH 120/318] Add ACCL stream flag to PTRANS

---
 PTRANS/src/host/transpose_benchmark.hpp | 54 ++++++++++++++++---------
 PTRANS/src/host/transpose_data.cpp      |  3 +-
 PTRANS/src/host/transpose_data.hpp      |  5 +++
 3 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp
index f2b06965..553ce002 100644
--- a/PTRANS/src/host/transpose_benchmark.hpp
+++ b/PTRANS/src/host/transpose_benchmark.hpp
@@ -41,6 +41,7 @@ SOFTWARE.
 #include "execution_types/execution_xrt_pcie_pq.hpp"
 #ifdef USE_ACCL
 #include "execution_types/execution_xrt_accl_pq.hpp"
+#include "execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp"
 #include "execution_types/execution_xrt_accl_stream_pq.hpp"
 #endif
 #endif
@@ -83,7 +84,8 @@ public hpcc_base::HpccFpgaBenchmark()->default_value(DEFAULT_DIST_TYPE))
-            ("copy-a", "Create a copy of matrix A for each kernel replication");
+            ("copy-a", "Create a copy of matrix A for each kernel replication")
+            ("accl-stream", "Use design with user kernels directly connected to CCLO");
     }

     std::unique_ptr> dataHandler;
@@ -144,8 +146,18 @@ public hpcc_base::HpccFpgaBenchmarkexecutionSettings), data, reinterpret_cast&>(*this->dataHandler)); break;
-            case hpcc_base::CommunicationType::accl:
-                return transpose::fpga_execution::accl_stream_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break;
+            case hpcc_base::CommunicationType::accl:
+                if (this->executionSettings->programSettings->useAcclStreams) {
+                    auto h = reinterpret_cast&>(*this->dataHandler);
+                    if (h.getP() != h.getQ()) {
+                        return transpose::fpga_execution::accl_stream_sendrecv_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler));
+                    }
+                    else {
+                        return transpose::fpga_execution::accl_stream_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler));
+                    }
+                } else {
+                    return transpose::fpga_execution::accl_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler));
+                } break;
 #endif
 #endif
 #ifdef MKL_FOUND
@@ -188,30 +200,32 @@ public hpcc_base::HpccFpgaBenchmark*>(this->dataHandler.get())->getHeightforRank();
         long width_per_rank = reinterpret_cast*>(this->dataHandler.get())->getWidthforRank();
         if (error_count > 0) {
-            std::cout << "A:" << std::endl;
-            for (size_t j = 0; j < height_per_rank * data.blockSize; j++) {
-                for (size_t i = 0; i < width_per_rank * data.blockSize; i++) {
-                    std::cout << oldA[j * width_per_rank * data.blockSize + i] << ", ";
+            if (this->mpi_comm_rank == 0) {
+                std::cout << "A:" << std::endl;
+                for (size_t j = 0; j <
height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << oldA[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; } std::cout << std::endl; - } - std::cout << std::endl; - std::cout << "B:" << std::endl; - for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { - for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { - std::cout << data.B[j * width_per_rank * data.blockSize + i] << ", "; + std::cout << "B:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.B[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; } std::cout << std::endl; - } - std::cout << std::endl; - std::cout << "Transposed A:" << std::endl; - for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { - for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { - std::cout << data.A[j * width_per_rank * data.blockSize + i] << ", "; + std::cout << "Transposed A:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.A[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; } std::cout << std::endl; } - std::cout << std::endl; } #endif diff --git a/PTRANS/src/host/transpose_data.cpp b/PTRANS/src/host/transpose_data.cpp index e8a7c8f0..36979413 100644 --- a/PTRANS/src/host/transpose_data.cpp +++ b/PTRANS/src/host/transpose_data.cpp @@ -7,7 +7,8 @@ transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), matrixSize(results["m"].as() * results["b"].as()), blockSize(results["b"].as()), dataHandlerIdentifier(transpose::data_handler::stringToHandler(results["handler"].as())), - distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()), copyA(results["copy-a"].count() > 0) { + distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()), copyA(results["copy-a"].count() > 0), + useAcclStreams(results["accl-stream"].count() > 0) { // auto detect data distribution type if required if (dataHandlerIdentifier == transpose::data_handler::DataHandlerType::automatic) { diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index fed4eff6..cd9020e4 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -80,6 +80,11 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { */ bool copyA; + /** + * @brief Indicate, if a design is used where the user kernels are directly connected to the ACCL CCLO + */ + bool useAcclStreams; + /** * @brief Construct a new Transpose Program Settings object * From 3a1cc5f32ed107974489f3fd2992bafd58ba1556 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Oct 2022 17:31:27 +0100 Subject: [PATCH 121/318] Apply ACCL API changes to PTRANS --- .../execution_types/execution_xrt_accl_pq.hpp | 14 ++--- .../execution_xrt_accl_stream_pq.hpp | 26 ++++----- .../execution_xrt_accl_stream_pq_sendrecv.hpp | 55 ++++++++++++------- 3 files changed, 49 insertions(+), 46 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp index 3fdaeb1f..8e6c0f5b 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp +++ 
b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -28,12 +28,9 @@ SOFTWARE. #include /* Project's headers */ -#include "buffer.hpp" -#include "cclo.hpp" -#include "constants.hpp" +#include "accl.hpp" #include "data_handlers/data_handler_types.h" #include "data_handlers/pq.hpp" -#include "fpgabuffer.hpp" #include "transpose_data.hpp" namespace transpose { @@ -87,15 +84,14 @@ void accl_exchangeData( accl.send(*acclBuffersA[0]->slice( data.blockSize * data.blockSize * block_num, data.blockSize * data.blockSize * (block_num + 1)), - data.blockSize * data.blockSize, pair_rank, 0, ACCL::GLOBAL_COMM, true, - ACCL::streamFlags::NO_STREAM); + data.blockSize * data.blockSize, pair_rank, 0, ACCL::GLOBAL_COMM, true); } for (int block_num = block_chunk; block_num < std::min(data.numBlocks, block_chunk + 16); block_num++) { accl.recv(*acclBufferA_recv->slice( data.blockSize * data.blockSize * block_num, data.blockSize * data.blockSize * (block_num + 1)), data.blockSize * data.blockSize, pair_rank, - 1, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM); + 1, ACCL::GLOBAL_COMM, true); } } @@ -266,11 +262,11 @@ void accl_exchangeData( #endif accl_requests[current_parallel_execution] = (accl.send( *send_buffers[current_parallel_execution], sending_size, - send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM, + send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::dataType::none, true)); accl_requests[current_parallel_execution + gcd] = (accl.recv( *recv_buffers[current_parallel_execution], sending_size, - send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM, + send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::dataType::none, true)); // Increase the counter for parallel executions current_parallel_execution = (current_parallel_execution + 1) % gcd; diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp index 9bf1aaf6..50a07998 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -28,16 +28,12 @@ SOFTWARE. 
#include /* Project's headers */ -#include "buffer.hpp" -#include "cclo.hpp" -#include "constants.hpp" #include "data_handlers/data_handler_types.h" #include "data_handlers/pq.hpp" -#include "fpgabuffer.hpp" #include "transpose_data.hpp" #include "cclo_bfm.h" #include "Simulation.h" -#include "dummybuffer.hpp" +#include "accl.hpp" extern void transpose_write(const DEVICE_DATA_TYPE *B, DEVICE_DATA_TYPE *A_out, @@ -201,10 +197,6 @@ static std::unique_ptr calculate( MPI_Barrier(MPI_COMM_WORLD); -#ifndef NDEBUG - std::cout << "Start BFM" << std::endl; -#endif - HLSLIB_DATAFLOW_INIT(); hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); hlslib::Stream cmd, sts; @@ -220,9 +212,13 @@ static std::unique_ptr calculate( int pair_rank = pq_width * pq_col + pq_row; std::vector dest = {0, 9}; - CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + std::unique_ptr cclo; if (config.programSettings->useAcclEmulation) { - cclo.run(); +#ifndef NDEBUG + std::cout << "Start BFM" << std::endl; +#endif + cclo = std::make_unique(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + cclo->run(); } MPI_Barrier(MPI_COMM_WORLD); @@ -272,11 +268,9 @@ static std::unique_ptr calculate( cclo2krnl); } } - auto dbuffer = config.accl->create_buffer(1,ACCL::dataType::float32); // Exchange A data via ACCL - config.accl->stream_put(*dbuffer, data.blockSize * data.blockSize * data.numBlocks, - pair_rank, 9, ACCL::GLOBAL_COMM, - false, ACCL::streamFlags::OP0_STREAM); + config.accl->stream_put(ACCL::dataType::float32, data.blockSize * data.blockSize * data.numBlocks, + pair_rank, 0); #ifndef NDEBUG std::cout << "Wait for kernels to complete" << std::endl; #endif @@ -286,7 +280,7 @@ static std::unique_ptr calculate( HLSLIB_DATAFLOW_FINALIZE(); MPI_Barrier(MPI_COMM_WORLD); if (config.programSettings->useAcclEmulation) { - cclo.stop(); + cclo->stop(); } auto endCalculation = std::chrono::high_resolution_clock::now(); #ifndef NDEBUG diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp index b51d8120..c01bab4c 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp @@ -28,18 +28,14 @@ SOFTWARE. 
#include /* Project's headers */ -#include "buffer.hpp" -#include "cclo.hpp" -#include "constants.hpp" #include "data_handlers/data_handler_types.h" #include "data_handlers/pq.hpp" -#include "fpgabuffer.hpp" #include "transpose_data.hpp" #include "cclo_bfm.h" #include "Simulation.h" -#include "dummybuffer.hpp" +#include "accl.hpp" -extern void transpose_write(const DEVICE_DATA_TYPE* B, +void transpose_write_sendrecv(const DEVICE_DATA_TYPE* B, DEVICE_DATA_TYPE* C, const int* target_list, int pq_row, int pq_col, @@ -49,7 +45,7 @@ extern void transpose_write(const DEVICE_DATA_TYPE* B, int width_per_rank, STREAM &cclo2krnl); -extern void transpose_read(const DEVICE_DATA_TYPE* A, +void transpose_read_sendrecv(const DEVICE_DATA_TYPE* A, const int* target_list, int pq_row, int pq_col, int pq_width, int pq_height, @@ -60,7 +56,7 @@ extern void transpose_read(const DEVICE_DATA_TYPE* A, namespace transpose { namespace fpga_execution { -namespace accl_stream_pq { +namespace accl_stream_sendrecv_pq { /** * @brief Transpose and add the matrices using the OpenCL kernel using a PQ @@ -104,6 +100,7 @@ static std::unique_ptr calculate( std::vector bufferListB; std::vector bufferListA_out; std::vector>> bufferListTargets; + std::vector>> bufferListCopy; std::vector transposeReadKernelList; std::vector transposeWriteKernelList; std::vector blocksPerReplication; @@ -159,6 +156,10 @@ static std::unique_ptr calculate( bufferStartList.push_back(total_offset); bufferOffsetList.push_back(row_offset); +#ifndef NDEBUG + std::cout << "Blocks per replication: " << blocks_per_replication << std::endl; +#endif + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; total_offset += (bufferOffsetList.back() + blocks_per_replication) / @@ -170,6 +171,7 @@ static std::unique_ptr calculate( // repeat, we only need to store this small amount of data! 
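+    // target_list maps each block position of the least-common-multiple tile to the rank that receives this block during the exchange.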
auto target_list = config.accl->create_buffer(least_common_multiple / pq_height * least_common_multiple / pq_width, ACCL::dataType::int32); + bufferListCopy.push_back(config.accl->create_buffer(buffer_size, ACCL::dataType::float32)); for (int row = 0; row < least_common_multiple / pq_height; row++) { for (int col = 0; col < least_common_multiple / pq_width; col++) { int global_block_col = pq_col + col * pq_width; @@ -187,10 +189,10 @@ static std::unique_ptr calculate( // create the kernels xrt::kernel transposeReadKernel( *config.device, *config.program, - ("transpose_read0:{transpose_read0_" + std::to_string(r + 1) + "}").c_str()); + ("transpose_read_sendrecv0:{transpose_read_sendrecv0_" + std::to_string(r + 1) + "}").c_str()); xrt::kernel transposeWriteKernel( *config.device, *config.program, - ("transpose_write0:{transpose_write0_" + std::to_string(r + 1) + "}").c_str()); + ("transpose_write_sendrecv0:{transpose_write_sendrecv0_" + std::to_string(r + 1) + "}").c_str()); if (r == 0 || config.programSettings->copyA) { xrt::bo bufferA(*config.device, data.A, @@ -249,10 +251,14 @@ static std::unique_ptr calculate( hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); hlslib::Stream cmd, sts; - std::vector dest = {0}; - CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + std::vector dest = {0, 9}; + std::unique_ptr cclo; if (config.programSettings->useAcclEmulation) { - cclo.run(); +#ifndef NDEBUG + std::cout << "Start BFM" << std::endl; +#endif + cclo = std::make_unique(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + cclo->run(); } MPI_Barrier(MPI_COMM_WORLD); @@ -260,7 +266,6 @@ static std::unique_ptr calculate( #ifndef NDEBUG std::cout << "Start kernel execution" << std::endl; - std::cout << bufferListTargets[0]->buffer()[0] << std::endl; #endif std::vector runs; auto startKernelCalculation = std::chrono::high_resolution_clock::now(); @@ -283,7 +288,7 @@ static std::unique_ptr calculate( (bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); } else { - HLSLIB_DATAFLOW_FUNCTION(transpose_read, + HLSLIB_DATAFLOW_FUNCTION(transpose_read_sendrecv, (config.programSettings->copyA ? data.A : data.A), bufferListTargets[r]->buffer(), pq_row, pq_col, pq_width, pq_height, @@ -293,7 +298,7 @@ static std::unique_ptr calculate( (bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)), krnl2cclo); - HLSLIB_DATAFLOW_FUNCTION(transpose_write, + HLSLIB_DATAFLOW_FUNCTION(transpose_write_sendrecv, data.B, data.result, bufferListTargets[r]->buffer(), pq_row, pq_col, pq_width, pq_height, @@ -352,14 +357,22 @@ static std::unique_ptr calculate( #ifndef NDEBUG std::cout << "Send blocks " << sending_size / (data.blockSize * data.blockSize) << " to " << send_rank << std::endl << std::flush; #endif - config.accl->send(*dbuffer, sending_size, send_rank, ACCL::TAG_ANY, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::OP0_STREAM); - // TODO Use stream_put to simulate this implementation approach on single FPGA since send/recv to same rank is not working! 
-                            // config.accl->stream_put(*dbuffer, sending_size, send_rank, 9, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::OP0_STREAM);
+                            if (send_rank == mpi_comm_rank) {
+                                //TODO copy from and to stream not implemented in driver yet
+                                // config.accl->copy_from_stream(*bufferListCopy[0], sending_size);
+                            } else {
+                                config.accl->send(ACCL::dataType::float32, sending_size, send_rank, 0);
+                            }
                         } else {
 #ifndef NDEBUG
                             std::cout << "Recv blocks " << receiving_size / (data.blockSize * data.blockSize) << " from " << recv_rank << std::endl << std::flush;
 #endif
-                            config.accl->recv(*dbuffer, receiving_size, recv_rank, ACCL::TAG_ANY, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::RES_STREAM);
+                            if (recv_rank == mpi_comm_rank) {
+                                //TODO copy from and to stream not implemented in driver yet
+                                // config.accl->copy_to_stream(*bufferListCopy[0], receiving_size);
+                            } else {
+                                config.accl->recv(ACCL::dataType::float32, receiving_size, recv_rank, 0);
+                            }
                         }
                     }
                 }
@@ -374,7 +387,7 @@ static std::unique_ptr calculate(
     MPI_Barrier(MPI_COMM_WORLD);
     HLSLIB_DATAFLOW_FINALIZE();
     if (config.programSettings->useAcclEmulation) {
-        cclo.stop();
+        cclo->stop();
     }
     auto endCalculation = std::chrono::high_resolution_clock::now();
 #ifndef NDEBUG

From 539047ca17044c9b6cd0b568c2025ab7c36153cd Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 21 Oct 2022 17:31:52 +0100
Subject: [PATCH 122/318] Add more ACCL debug output

---
 shared/setup/fpga_setup_accl.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index 54b78c5c..8a5f685c 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -86,10 +86,12 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra
         std::cout << "Create hostctrl" << std::endl;
         auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}",
                                        xrt::kernel::cu_access_mode::exclusive);
-
+        std::cout << "Create CMAC" << std::endl;
         auto cmac = CMAC(xrt::ip(device, program, "cmac_0:{cmac_0}"));
+        std::cout << "Create Network Layer" << std::endl;
         auto network_layer = Networklayer(
             xrt::ip(device, program, "networklayer:{networklayer_0}"));
+        std::cout << "Configure VNX" << std::endl;
         configure_vnx(cmac, network_layer, ranks, current_rank);

         std::vector mem(1, 0);

From 599228b700a84a7277118bf9fe70591a9c20f087 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 21 Oct 2022 17:32:46 +0100
Subject: [PATCH 123/318] Add sendrecv PTRANS kernel to build

---
 PTRANS/src/device/CMakeLists.txt | 2 +-
 PTRANS/src/host/CMakeLists.txt   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/PTRANS/src/device/CMakeLists.txt b/PTRANS/src/device/CMakeLists.txt
index 34c47551..bbee1bff 100644
--- a/PTRANS/src/device/CMakeLists.txt
+++ b/PTRANS/src/device/CMakeLists.txt
@@ -11,7 +11,7 @@ if (INTELFPGAOPENCL_FOUND)
 endif()

 if (VITIS_FOUND)
-    generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_ACCL_buffers transpose_PQ_ACCL_stream)
+    generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_ACCL_buffers transpose_PQ_ACCL_stream transpose_PQ_ACCL_stream_sendrecv)
     add_test(NAME test_emulation_PQ_PCIE_xilinx COMMAND Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
     add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY
${EXECUTABLE_OUTPUT_PATH}) diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index fe7214c4..e162b809 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -37,7 +37,7 @@ if (Vitis_FOUND) set(CMAKE_SKIP_BUILD_RPATH No) set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) - list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/transpose_PQ_ACCL_stream.cpp) + list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp ${CMAKE_SOURCE_DIR}/src/device/transpose_PQ_ACCL_stream.cpp) endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) From 1a4a69f55a4ba888c61b88d584a484498f6d9183 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Oct 2022 17:33:31 +0100 Subject: [PATCH 124/318] Rename sendrecv PTRANS kernels --- PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp index 4c9452b2..13e1c300 100644 --- a/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp @@ -178,7 +178,7 @@ void transpose_block_receive(const DEVICE_DATA_TYPE *B, } } -void transpose_read(const DEVICE_DATA_TYPE* A, +void transpose_read_sendrecv(const DEVICE_DATA_TYPE* A, const int* target_list, int pq_row, int pq_col, int pq_width, int pq_height, @@ -215,7 +215,7 @@ void transpose_read(const DEVICE_DATA_TYPE* A, } } -void transpose_write(const DEVICE_DATA_TYPE* B, +void transpose_write_sendrecv(const DEVICE_DATA_TYPE* B, DEVICE_DATA_TYPE* C, const int* target_list, int pq_row, int pq_col, From 2e1405ed27fb29dbe1d4e3cbbe29d7cca1939266 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Oct 2022 17:46:38 +0100 Subject: [PATCH 125/318] Attempt to fix PTRANS sendrecv kernels --- .../src/device/transpose_PQ_ACCL_stream_sendrecv.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp index 13e1c300..c43736d9 100644 --- a/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp @@ -200,9 +200,9 @@ void transpose_read_sendrecv(const DEVICE_DATA_TYPE* A, for (int col = 0; col < least_common_multiple/pq_width; col++) { for (int row = 0; row < least_common_multiple/pq_height; row++) { if (target_list[row * least_common_multiple/pq_width + col] == send_rank) { - for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { - for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) { - unsigned int matrix_buffer_offset = (col + lcm_col * least_common_multiple/pq_width) + (row + lcm_row * least_common_multiple/pq_height) * width_per_rank; + for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_height); lcm_col++) { + for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_width); lcm_row++) { + unsigned int matrix_buffer_offset = (row + lcm_col * least_common_multiple/pq_height) + (col + lcm_row * least_common_multiple/pq_width) * width_per_rank; DEVICE_DATA_TYPE a_block[block_size * 
block_size / channel_width][channel_width];
                             transpose_block_transpose(A, a_block, matrix_buffer_offset, width_per_rank, height_per_rank);
                             transpose_block_forward(a_block, krnl2cclo);
@@ -237,9 +237,9 @@ void transpose_write_sendrecv(const DEVICE_DATA_TYPE* B,
     for (int col = 0; col < least_common_multiple/pq_width; col++) {
         for (int row = 0; row < least_common_multiple/pq_height; row++) {
             if (target_list[row * least_common_multiple/pq_width + col] == recv_rank) {
-                for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) {
-                    for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) {
-                        unsigned int matrix_buffer_offset = (col + lcm_col * least_common_multiple/pq_width) + (row + lcm_row * least_common_multiple/pq_height) * width_per_rank;
+                for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_width); lcm_row++) {
+                    for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_height); lcm_col++) {
+                        unsigned int matrix_buffer_offset = (row + lcm_col * least_common_multiple/pq_height) + (col + lcm_row * least_common_multiple/pq_width) * width_per_rank;
                         transpose_block_receive(B,C,matrix_buffer_offset,width_per_rank, cclo2krnl);
                     }
                 }

From 9a03cf2c59dfcee7419758d32823e0044f15eda9 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 21 Oct 2022 18:08:55 +0100
Subject: [PATCH 126/318] Extend memory alignment for b_eff

---
 b_eff/src/host/network_benchmark.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index 964ec5ca..472ab15d 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -42,7 +42,7 @@ struct aligned_allocator {
     pointer allocate(size_t pCount, const_pointer = 0){
         T* mem = 0;
-        if (posix_memalign(reinterpret_cast(&mem), 1024 , sizeof(T) * pCount) != 0) {
+        if (posix_memalign(reinterpret_cast(&mem), 4096, sizeof(T) * pCount) != 0) {
             throw std::bad_alloc();
         }
         return mem;

From 8455a7d7082b0a8bab70e8d644f9df3223dad726 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 21 Oct 2022 18:26:36 +0100
Subject: [PATCH 127/318] Add profiling config for b_eff

---
 .../Xilinx_U55C_HBM_ACCL_pl_profile.cmake     | 27 ++++++
 ...s.link.xilinx.accl_pl.u55c.hbm.profile.ini | 88 +++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644 b_eff/configs/Xilinx_U55C_HBM_ACCL_pl_profile.cmake
 create mode 100644 b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini

diff --git a/b_eff/configs/Xilinx_U55C_HBM_ACCL_pl_profile.cmake b/b_eff/configs/Xilinx_U55C_HBM_ACCL_pl_profile.cmake
new file mode 100644
index 00000000..5cd3ed0a
--- /dev/null
+++ b/b_eff/configs/Xilinx_U55C_HBM_ACCL_pl_profile.cmake
@@ -0,0 +1,27 @@
+# This file contains the default configuration for the Xilinx Alveo U55C board
+# for use with single-precision floating-point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...]
-DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE)
+set(XILINX_KERNEL_NAMES "send_recv" CACHE STRING "" FORCE)
+set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to connect multiple kernels to the CCLO cmd stream" FORCE)
+# STREAM specific options
+# Defaults to a total of ~12GB data
+set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini
new file mode 100644
index 00000000..96e13497
--- /dev/null
+++ b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini
@@ -0,0 +1,88 @@
+# /*******************************************************************************
+# Copyright (C) 2021 Xilinx, Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+nk=client_arbiter:1:client_arbiter
+nk=send_recv:1:sendrecv
+
+# Kernels Floorplanning
+slr=compression_0_0:SLR0
+slr=compression_0_1:SLR0
+slr=compression_0_2:SLR0
+slr=lb_user_krnl:SLR0
+slr=arith_0:SLR0
+slr=ccl_offload_0:SLR0
+slr=hostctrl_0:SLR0
+slr=networklayer_0:SLR1
+slr=cmac_0:SLR1
+slr=client_arbiter:SLR0
+slr=sendrecv:SLR0
+
+sp=ccl_offload_0.m_axi_0:HBM[0:5]
+sp=ccl_offload_0.m_axi_1:HBM[0:5]
+sp=sendrecv.m_axi_gmem:HBM[0:5]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:client_arbiter.cmd_clients_0
+stream_connect=client_arbiter.ack_clients_0:hostctrl_0.sts
+stream_connect=sendrecv.cmd:client_arbiter.cmd_clients_1
+stream_connect=client_arbiter.ack_clients_1:sendrecv.sts
+stream_connect=client_arbiter.cmd_cclo:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:client_arbiter.ack_cclo
+
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
+
+[profile]
+data=ccl_offload:all:all # Monitor data on all instances of kernel k1
+data=send_recv:all:all # Specific CU master
+memory=all # Monitor transfers for all memories
+stall=ccl_offload:all # Monitor stalls for all CUs of all kernels
+stall=send_recv:all # Stalls only for cu2
+exec=all:all # Monitor execution times for all CUs

From 79967790aa4010f539d06a1c44161ef260128db6 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 24 Oct 2022 08:34:53 +0100
Subject: [PATCH 128/318] Fix IP address for hardware execution

---
 shared/setup/fpga_setup_accl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index 8a5f685c..c560ee29 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -77,7 +77,7 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra
     std::vector ranks = {};
     for (int i = 0; i < current_size; ++i) {
         // TODO: Replace the ip addresses and ports here for execution of real hardware?
-        ACCL::rank_t new_rank = {"10.10.10."
+ std::to_string(current_rank), 5500 + i, i, ACCL_BUFFER_SIZE};
+        ACCL::rank_t new_rank = {"10.10.10." + std::to_string(i), 5500 + i, i, ACCL_BUFFER_SIZE};
         ranks.emplace_back(new_rank);
     }
     if (!useAcclEmulation) {

From 7237139d14ea8427ffab29350009969a38183b77 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 24 Oct 2022 08:53:33 +0100
Subject: [PATCH 129/318] Make debug output optional

---
 b_eff/src/host/execution_types/execution_accl.hpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
index c4686b29..8d1638d9 100644
--- a/b_eff/src/host/execution_types/execution_accl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -79,8 +79,19 @@ namespace network::execution_types::accl {
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
             for (int l = 0; l < looplength; l++) {
+#ifndef NDEBUG
+                std::cout << "Send " << size_in_values << " values to "
+                          << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl;
+#endif
                 config.accl->send(*acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
-                config.accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+#ifndef NDEBUG
+                std::cout << "Recv " << size_in_values << " values from "
+                          << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl;
+#endif
+                config.accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+#ifndef NDEBUG
+                std::cout << "Done" << std::endl;
+#endif
             }
             auto endCalculation = std::chrono::high_resolution_clock::now();
             calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count();

From 2f565a7a031055984cd5497975db0e9bc4f44cec Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 24 Oct 2022 10:32:33 +0200
Subject: [PATCH 130/318] Explicitly set ACCL buffer size in config

---
 b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake
index 45e2b5d7..81c20e1d 100644
--- a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake
+++ b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake
@@ -22,5 +22,6 @@ set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to con
 # Defaults to a total of ~12GB data
 set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
 set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+set(ACCL_BUFFER_SIZE 4194304 CACHE STRING "Size of the ACCL buffers" FORCE)

 set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE)

From d675b98444b20da78c84b745ce9d45b33242e8db Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 24 Oct 2022 10:56:45 +0200
Subject: [PATCH 131/318] Make BFM optional in ACCL PL

---
 b_eff/src/host/execution_types/execution_accl_pl.hpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp
index 5bbda303..0bac2fa8 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl.hpp
+++
b/b_eff/src/host/execution_types/execution_accl_pl.hpp
@@ -68,9 +68,10 @@ namespace network::execution_types::accl_pl {
         hlslib::Stream cmd, sts;

         std::vector dest = {0};
-        CCLO_BFM cclo(6000, current_rank, current_size, dest, cmd, sts, cclo2krnl, krnl2cclo);
+        std::unique_ptr cclo;
         if (config.programSettings->useAcclEmulation) {
-            cclo.run();
+            cclo = std::make_unique(6000, current_rank, current_size, dest, cmd, sts, cclo2krnl, krnl2cclo);
+            cclo->run();
         }
         MPI_Barrier(MPI_COMM_WORLD);

@@ -123,7 +124,7 @@ namespace network::execution_types::accl_pl {
         }

         if (config.programSettings->useAcclEmulation) {
-            cclo.stop();
+            cclo->stop();
         }
         // Read validation data from FPGA will be placed sequentially in buffer for all replications
         // The data order should not matter, because every byte should have the same value!

From 456a7825735d9f96f00fd602b28251aa27eac2c4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 25 Oct 2022 14:56:44 +0100
Subject: [PATCH 132/318] Fix ACCL pl emulation w/o bitstream

---
 b_eff/src/host/execution_types/execution_accl_pl.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp
index 0bac2fa8..ed35d552 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp
@@ -92,7 +92,10 @@ namespace network::execution_types::accl_pl {
             acclRecvBuffers.back()->sync_to_device();
         }

-        xrt::kernel sendrecvKernel(*config.device, *config.program, "send_recv");
+        xrt::kernel sendrecvKernel;
+        if (!config.programSettings->useAcclEmulation) {
+            sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv");
+        }

         double calculationTime = 0.0;
         for (int i = 0; i < config.programSettings->kernelReplications; i++) {

From 7ec36450a885c17005673de699d0bed4b4984736 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 25 Oct 2022 14:57:10 +0100
Subject: [PATCH 133/318] Profile everything

---
 .../settings.link.xilinx.accl_pl.u55c.hbm.profile.ini | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini
index 96e13497..9a1ce41d 100644
--- a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini
+++ b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini
@@ -80,9 +80,7 @@ stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
 stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl

 [profile]
-data=ccl_offload:all:all # Monitor data on all instances of kernel k1
-data=send_recv:all:all # Specific CU master
-memory=all # Monitor transfers for all memories
-stall=ccl_offload:all # Monitor stalls for all CUs of all kernels
-stall=send_recv:all # Stalls only for cu2
-exec=all:all # Monitor execution times for all CUs
+data=all:all:all
+memory=all
+stall=all:all
+exec=all:all

From 70a7ef200654c20d31309d518c8913db82cea3a3 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 25 Oct 2022 18:56:53 +0200
Subject: [PATCH 134/318] Fix memory bank for top kernel

---
 LINPACK/src/host/execution_types/execution_xrt_pcie.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
index 6de18915..1a2610e0 100644
--- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
@@
-111,7 +111,7 @@ std::unique_ptr calculate(
 *config.device,
 sizeof(HOST_DATA_TYPE) *
 (config.programSettings->blockSize) *
 (config.programSettings->blockSize),
- kernel_lu.group_id(0));
+ kernel_lu.group_id(1));
 }
 for (int i = 0; i < blocks_per_col; i++) {

From 30fdfc35075451af44373a4ef39ce6f53da402bb Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 26 Oct 2022 15:23:47 +0100
Subject: [PATCH 135/318] Fix memory bank in HPL ACCL host code

---
 LINPACK/src/host/execution_types/execution_accl_buffers.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp
index 5e26e267..e7db12b3 100644
--- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp
+++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp
@@ -149,7 +149,7 @@ std::unique_ptr calculate(
 sizeof(HOST_DATA_TYPE) *
 (config.programSettings->blockSize) *
 (config.programSettings->blockSize),
- lu_tmp_kernel.group_id(0));
+ lu_tmp_kernel.group_id(1));
 Buffer_top_list.back().push_back(
 config.accl->create_buffer(
 tmp_bos.back(),

From 3cbf9e98a5bd49d1c73f48ca8b37dc4422fec0b0 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 26 Oct 2022 15:56:58 +0100
Subject: [PATCH 136/318] Add PTRANS ACCL profile config

---
 .../Xilinx_U55C_HBM_ACCL_stream_profile.cmake | 30 +++++++
 ...x.transpose_pq_accl_stream.hbm.profile.ini | 82 +++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream_profile.cmake
 create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini

diff --git a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream_profile.cmake b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream_profile.cmake
new file mode 100644
index 00000000..a61bd058
--- /dev/null
+++ b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream_profile.cmake
@@ -0,0 +1,30 @@
+# This file contains the default configuration for the Xilinx Alveo U55C board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE)
+set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# PTRANS specific options
+# Defaults to a total of ~12GB data
+set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE)
+set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE)
+set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. 
Also specifies the width of memory" FORCE)
+set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)

diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini
new file mode 100644
index 00000000..1c3a4861
--- /dev/null
+++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini
@@ -0,0 +1,82 @@
+# /*******************************************************************************
+#  Copyright (C) 2021 Xilinx, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:0
+nk=transpose_read0:1
+nk=transpose_write0:1
+
+# Kernel floorplanning
+slr=compression_0_0:SLR0
+slr=compression_0_1:SLR0
+slr=compression_0_2:SLR0
+slr=arith_0:SLR0
+slr=ccl_offload_0:SLR0
+slr=hostctrl_0:SLR0
+slr=networklayer_0:SLR1
+slr=cmac_0:SLR1
+slr=transpose_read0_1:SLR2
+slr=transpose_write0_1:SLR0
+
+sp=ccl_offload_0.m_axi_0:HBM[31]
+sp=ccl_offload_0.m_axi_1:HBM[31]
+sp=transpose_read0_1.m_axi_gmem0:HBM[0:7]
+sp=transpose_write0_1.m_axi_gmem0:HBM[8:15]
+sp=transpose_write0_1.m_axi_gmem1:HBM[16:23]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl
+stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:512
+
+[profile]
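+# Syntax of the entries below is <kernel>:<compute unit>:<interface>, so
+# data=all:all:all instruments every interface of every CU. Each monitor adds
+# extra logic to the design, which is presumably why later revisions of these
+# profile configs narrow the selection down to the interesting ports again.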
+data=all:all:all +memory=all +stall=all:all +exec=all:all \ No newline at end of file From b9709f85360662153485a381eaf3c25ff8982cd5 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Oct 2022 16:02:26 +0100 Subject: [PATCH 137/318] LINPACK ACCL update configs --- .../configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake | 2 +- .../Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake | 2 +- ...linx.hpl_torus_accl.hbm.u55c.generator.ini | 88 +++++++++++++++++++ 3 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake index ec9d153b..800a33e0 100644 --- a/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake @@ -22,7 +22,7 @@ set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication k set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE) set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) -set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini" CACHE STRING "Link settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini" CACHE STRING "Link settings file" FORCE) set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake index bbf80c86..dfd8611b 100644 --- a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake @@ -9,7 +9,7 @@ set(USE_MPI Yes CACHE BOOL "" FORCE) set(USE_SVM No CACHE BOOL "" FORCE) set(USE_HBM No CACHE BOOL "" FORCE) -set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_ACCL No CACHE BOOL "" FORCE) set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini new file mode 100644 index 00000000..4783d320 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini @@ -0,0 +1,88 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR
+slr=lu_1:SLR0
+slr=left_update_1:SLR0
+slr=top_update_1:SLR0
+slr=inner_update_mm0_1:SLR2
+
+# matrix ports
+sp=lu_1.m_axi_gmem0:HBM[0:4]
+sp=lu_1.m_axi_gmem1:HBM[5:6]
+sp=lu_1.m_axi_gmem2:HBM[5:6]
+
+sp=top_update_1.m_axi_gmem0:HBM[0:4]
+sp=top_update_1.m_axi_gmem1:HBM[5:6]
+sp=top_update_1.m_axi_gmem2:HBM[5:6]
+
+sp=left_update_1.m_axi_gmem0:HBM[0:4]
+sp=left_update_1.m_axi_gmem1:HBM[5:6]
+sp=left_update_1.m_axi_gmem2:HBM[5:6]
+
+# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:4]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[5:6]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[5:6]
+# PY_CODE_GEN block_end
+
+#ACCL
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+
+# Kernel floorplanning
+slr=compression_0_0:SLR1
+slr=compression_0_1:SLR1
+slr=compression_0_2:SLR1
+slr=lb_user_krnl:SLR1
+slr=arith_0:SLR1
+slr=ccl_offload_0:SLR1
+slr=hostctrl_0:SLR1
+slr=networklayer_0:SLR1
+slr=cmac_0:SLR1
+
+sp=ccl_offload_0.m_axi_0:HBM[5:6]
+sp=ccl_offload_0.m_axi_1:HBM[5:6]
+
+
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl

From 41bd60c4d61abf6c306e6b65bff8e3e1a95098d9 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 26 Oct 2022 16:07:00 +0100
Subject: [PATCH 138/318] LINPACK add profile config for U55c

---
 .../Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake | 31 ++++++
 ..._torus_accl.hbm.u55c.profile.generator.ini | 94 +++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake
 create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini

diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake
new file mode 100644
index 00000000..ed8cc15a
--- /dev/null
+++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake
@@ -0,0 +1,31 @@
+# 
This file contains the default configuration for the Xilinx Alveo U55C board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE)
+
+# LINPACK specific options
+set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
+set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
+set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
+
+set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE)
+set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE)
+set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini" CACHE STRING "Link settings file" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
+set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE)
+

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini
new file mode 100644
index 00000000..d4d128dc
--- /dev/null
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini
@@ -0,0 +1,94 @@
+[connectivity]
+nk=lu:1
+nk=left_update:1
+nk=top_update:1
+nk=inner_update_mm0:$PY_CODE_GEN num_replications$
+
+# slrs
+# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR
+slr=lu_1:SLR0
+slr=left_update_1:SLR0
+slr=top_update_1:SLR0
+slr=inner_update_mm0_1:SLR2
+
+# matrix ports
+sp=lu_1.m_axi_gmem0:HBM[0:4]
+sp=lu_1.m_axi_gmem1:HBM[5:6]
+sp=lu_1.m_axi_gmem2:HBM[5:6]
+
+sp=top_update_1.m_axi_gmem0:HBM[0:4]
+sp=top_update_1.m_axi_gmem1:HBM[5:6]
+sp=top_update_1.m_axi_gmem2:HBM[5:6]
+
+sp=left_update_1.m_axi_gmem0:HBM[0:4]
+sp=left_update_1.m_axi_gmem1:HBM[5:6]
+sp=left_update_1.m_axi_gmem2:HBM[5:6]
+
+# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:4]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[5:6]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[5:6]
+# PY_CODE_GEN block_end
+
+#ACCL
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+
+# Kernel floorplanning
+slr=compression_0_0:SLR1
+slr=compression_0_1:SLR1
+slr=compression_0_2:SLR1
+slr=lb_user_krnl:SLR1
+slr=arith_0:SLR1
+slr=ccl_offload_0:SLR1
+slr=hostctrl_0:SLR1
+slr=networklayer_0:SLR1
+slr=cmac_0:SLR1
+
+sp=ccl_offload_0.m_axi_0:HBM[5:6]
+sp=ccl_offload_0.m_axi_1:HBM[5:6]
+
+
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
+
+[profile]
+data=all:all:all
+memory=all
+stall=all:all
+exec=all:all
\ No newline at end of file

From ecf3714d222dd19be119b6b1fa98b8caca6bca24 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 26 Oct 2022 16:37:00 +0100
Subject: [PATCH 139/318] Add profiling config for PTRANS ACCL on U280

---
 .../Xilinx_U280_HBM_ACCL_stream_profile.cmake | 30 +++++++
 ...nspose_pq_accl_stream.hbm.u280.profile.ini | 82 +++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 PTRANS/configs/Xilinx_U280_HBM_ACCL_stream_profile.cmake
 create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini

diff --git a/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream_profile.cmake b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream_profile.cmake
new file mode 100644
index 
00000000..1b1aa691
--- /dev/null
+++ b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream_profile.cmake
@@ -0,0 +1,30 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_gen3x16_xdma_1_202211_1" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE)
+set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# PTRANS specific options
+# Defaults to a total of ~12GB data
+set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE)
+set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE)
+set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE)
+set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)

diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini
new file mode 100644
index 00000000..3860eb41
--- /dev/null
+++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini
@@ -0,0 +1,82 @@
+# /*******************************************************************************
+#  Copyright (C) 2021 Xilinx, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License. 
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:0
+nk=transpose_read0:1
+nk=transpose_write0:1
+
+# Kernel floorplanning
+slr=compression_0_0:SLR1
+slr=compression_0_1:SLR1
+slr=compression_0_2:SLR1
+slr=arith_0:SLR1
+slr=ccl_offload_0:SLR1
+slr=hostctrl_0:SLR1
+slr=networklayer_0:SLR2
+slr=cmac_0:SLR2
+slr=transpose_read0_1:SLR1
+slr=transpose_write0_1:SLR0
+
+sp=ccl_offload_0.m_axi_0:HBM[31]
+sp=ccl_offload_0.m_axi_1:HBM[31]
+sp=transpose_read0_1.m_axi_gmem0:HBM[0:7]
+sp=transpose_write0_1.m_axi_gmem0:HBM[8:15]
+sp=transpose_write0_1.m_axi_gmem1:HBM[16:23]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl
+stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:512
+
+[profile]
+data=all:all:all
+memory=all
+stall=all:all
+exec=all:all
\ No newline at end of file

From d4739966fdca21f2a30557bc552b8fb8341bb683 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 26 Oct 2022 16:49:57 +0100
Subject: [PATCH 140/318] Set LINPACK ACCL buffer size sufficiently large

---
 LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake
index 800a33e0..20afd309 100644
--- a/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake
+++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake
@@ -23,6 +23,7 @@ set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication k
 set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE)
 set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE)
 set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini" CACHE STRING "Link settings file" FORCE)
+set(ACCL_BUFFER_SIZE 524288 CACHE STRING "Set ACCL buffer size to fit single matrix block" FORCE)
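+# Sizing note: 524288 B = 2^19 B = 256 * 256 * 8 B, i.e. one block of
+# 2^LOCAL_MEM_BLOCK_LOG x 2^LOCAL_MEM_BLOCK_LOG entries at an assumed
+# 8 B per value, so a complete block fits into a single ACCL buffer.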
set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
 set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE)

From 350bb3bea8e617d8751c363279f7c9edb664a350 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 27 Oct 2022 09:08:36 +0100
Subject: [PATCH 141/318] Add profile config for b_eff U280

---
 .../Xilinx_U280_HBM_ACCL_pl_profile.cmake | 27 ++++++
 ...s.link.xilinx.accl_pl.u280.hbm.profile.ini | 86 +++++++++++++++++++
 2 files changed, 113 insertions(+)
 create mode 100644 b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake
 create mode 100644 b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini

diff --git a/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake b/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake
new file mode 100644
index 00000000..94489fba
--- /dev/null
+++ b/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake
@@ -0,0 +1,27 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE)
+set(XILINX_KERNEL_NAMES "send_recv" CACHE STRING "" FORCE)
+set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to connect multiple kernels to the CCLO cmd stream" FORCE)
+# b_eff specific options
+# Defaults to a total of ~12GB data
+set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)

diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini
new file mode 100644
index 00000000..2f284b1c
--- /dev/null
+++ b/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini
@@ -0,0 +1,86 @@
+# /*******************************************************************************
+#  Copyright (C) 2021 Xilinx, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License. 
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+nk=client_arbiter:1:client_arbiter
+nk=send_recv:1:sendrecv
+
+# Kernel floorplanning
+slr=compression_0_0:SLR1
+slr=compression_0_1:SLR1
+slr=compression_0_2:SLR1
+slr=lb_user_krnl:SLR1
+slr=arith_0:SLR1
+slr=ccl_offload_0:SLR1
+slr=hostctrl_0:SLR1
+slr=networklayer_0:SLR2
+slr=cmac_0:SLR2
+slr=client_arbiter:SLR1
+slr=sendrecv:SLR1
+
+sp=ccl_offload_0.m_axi_0:HBM[0:5]
+sp=ccl_offload_0.m_axi_1:HBM[0:5]
+sp=sendrecv.m_axi_gmem:HBM[0:5]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:client_arbiter.cmd_clients_0
+stream_connect=client_arbiter.ack_clients_0:hostctrl_0.sts
+stream_connect=sendrecv.cmd:client_arbiter.cmd_clients_1
+stream_connect=client_arbiter.ack_clients_1:sendrecv.sts
+stream_connect=client_arbiter.cmd_cclo:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:client_arbiter.ack_cclo
+
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
+
+[profile]
+data=all:all:all
+memory=all
+stall=all:all
+exec=all:all

From 64b3602abecd37b3835967773298719ee59fcd77 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 28 Oct 2022 10:50:51 +0100
Subject: [PATCH 142/318] Fixes in b_eff ACCL PL host code

---
 b_eff/src/host/execution_types/execution_accl_pl.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp
index ed35d552..4b3ff2ee 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp
@@ -94,7 +94,7 @@ namespace network::execution_types::accl_pl {

 xrt::kernel sendrecvKernel;
 if (!config.programSettings->useAcclEmulation) {
- sendrecvKernel(*config.device, *config.program, "send_recv");
+ sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv");
 }

 double calculationTime = 0.0;
@@ -102,7 +102,7 @@ namespace network::execution_types::accl_pl {
 MPI_Barrier(MPI_COMM_WORLD);
 auto startCalculation = 
std::chrono::high_resolution_clock::now(); if (!config.programSettings->useAcclEmulation) { - auto run = sendrecvKernel(acclSendBuffers[i]->bo(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + auto run = sendrecvKernel(*(acclSendBuffers[i]->bo()), *(acclRecvBuffers[i]->bo()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.accl->get_communicator_addr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); run.wait(); } else { From 69bf7aaa7e5150e337b8023b62c699c6c97e9563 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 28 Oct 2022 11:10:13 +0100 Subject: [PATCH 143/318] Update b_eff ACCL profile link config --- .../settings.link.xilinx.accl_pl.u55c.hbm.profile.ini | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini index 9a1ce41d..778054e5 100644 --- a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini +++ b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini @@ -80,7 +80,12 @@ stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl [profile] -data=all:all:all +data=send_recv:all:all +data=client_arbiter:all:all +data=ccl_offload:all:m_axis_eth_tx_data +data=networklayer:all:M_AXIS_nl2sk +data=networklayer:all:M_AXIS_nl2eth +data=cmac_0:all:M_AXIS memory=all -stall=all:all +stall=all exec=all:all From ebf3c8a1f1c260b0ceefc3566d01457d6c6892aa Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 28 Oct 2022 11:26:39 +0100 Subject: [PATCH 144/318] Specify the important profiling metrics for PTRANS ACCL --- ....link.xilinx.transpose_pq_accl_stream.hbm.profile.ini | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini index 1c3a4861..9dec51d7 100644 --- a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini @@ -76,7 +76,8 @@ stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:512 [profile] -data=all:all:all -memory=all -stall=all:all -exec=all:all \ No newline at end of file +data=transpose_read0:all:all +data=transpose_write0:all:all +memory=transpose_read0_1.m_axi_gmem0 +memory=transpose_write0_1.m_axi_gmem0 +memory=transpose_write0_1.m_axi_gmem1 From 0dffc714c7464b053d8bd68f7cf9d92e74eb10a5 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 8 Nov 2022 11:16:14 +0100 Subject: [PATCH 145/318] Add R3 config for Linpack on U280 --- .../Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake | 30 ++++++++++++++++ ..._pcie.distribute_kernels.hbm.generator.ini | 34 +++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 LINPACK/configs/Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake new file mode 100644 index 00000000..de080ee7 --- /dev/null +++ 
b/LINPACK/configs/Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake
@@ -0,0 +1,30 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE)
+set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE)
+# LINPACK specific options
+set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
+set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
+set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(NUM_REPLICATIONS 3 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
+
+set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE)
+set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini" CACHE STRING "Link settings file" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
+set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini
new file mode 100644
index 00000000..2815cc38
--- /dev/null
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini
@@ -0,0 +1,34 @@
+[connectivity]
+nk=lu:1
+nk=left_update:1
+nk=top_update:1
+nk=inner_update_mm0:$PY_CODE_GEN num_replications$
+
+# slrs
+# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR1 +slr=top_update_1:SLR2 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN i % 3$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:5] +sp=lu_1.m_axi_gmem1:HBM[6] +sp=lu_1.m_axi_gmem2:HBM[7] + +sp=top_update_1.m_axi_gmem0:HBM[0:5] +sp=top_update_1.m_axi_gmem1:HBM[6] +sp=top_update_1.m_axi_gmem2:HBM[8] + +sp=left_update_1.m_axi_gmem0:HBM[0:5] +sp=left_update_1.m_axi_gmem1:HBM[7] +sp=left_update_1.m_axi_gmem2:HBM[9] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:5] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] +# PY_CODE_GEN block_end + From 1cb75def2fb56dc6d2e3e8b39e4151787ac84f83 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 8 Nov 2022 13:12:44 +0100 Subject: [PATCH 146/318] Profiling U280 b_eff PL --- b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake | 2 +- .../settings.link.xilinx.accl_pl.u280.hbm.profile.ini | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake b/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake index 94489fba..c40efff7 100644 --- a/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake +++ b/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake @@ -12,7 +12,7 @@ set(USE_HBM No CACHE BOOL "" FORCE) set(USE_ACCL Yes CACHE BOOL "" FORCE) set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) -set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini CACHE FILEPATH "" FORCE) set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE) diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini index 2f284b1c..374a41c9 100644 --- a/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini +++ b/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini @@ -80,7 +80,12 @@ stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl [profile] -data=all:all:all +data=send_recv:all:all +data=client_arbiter:all:all +data=ccl_offload:all:m_axis_eth_tx_data +data=networklayer:all:M_AXIS_nl2sk +data=networklayer:all:M_AXIS_nl2eth +data=cmac_0:all:M_AXIS memory=all -stall=all:all +stall=all exec=all:all From 809eb38d0165570cb3b3523235ec89a06cd0c79d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 8 Nov 2022 13:19:10 +0100 Subject: [PATCH 147/318] Fix config naming --- ...SB3_R3_DDR_PCIE.cmake => Xilinx_U280_B8_SB3_R3_HBM_PCIE.cmake} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename LINPACK/configs/{Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake => Xilinx_U280_B8_SB3_R3_HBM_PCIE.cmake} (100%) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R3_HBM_PCIE.cmake similarity index 100% rename from LINPACK/configs/Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake rename to 
LINPACK/configs/Xilinx_U280_B8_SB3_R3_HBM_PCIE.cmake

From cf5561b5c5c819705552831ebda8ea37a6acd2ac Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 8 Nov 2022 14:33:16 +0100
Subject: [PATCH 148/318] Add PTRANS U280 ACCL stream config

---
 .../configs/Xilinx_U280_HBM_ACCL_stream.cmake | 29 +++++++
 ...linx.transpose_pq_accl_stream.hbm.u280.ini | 76 +++++++++++++++++++
 2 files changed, 105 insertions(+)
 create mode 100644 PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake
 create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini

diff --git a/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake
new file mode 100644
index 00000000..827da9a9
--- /dev/null
+++ b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake
@@ -0,0 +1,29 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_gen3x16_xdma_1_202211_1" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE)
+set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# PTRANS specific options
+# Defaults to a total of ~12GB data
+set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE)
+set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE)
+set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE)
+set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)

diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini
new file mode 100644
index 00000000..83150287
--- /dev/null
+++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini
@@ -0,0 +1,76 @@
+# /*******************************************************************************
+#  Copyright (C) 2021 Xilinx, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License. 
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:0
+nk=transpose_read0:1
+nk=transpose_write0:1
+
+# Kernel floorplanning
+slr=compression_0_0:SLR1
+slr=compression_0_1:SLR1
+slr=compression_0_2:SLR1
+slr=arith_0:SLR1
+slr=ccl_offload_0:SLR1
+slr=hostctrl_0:SLR1
+slr=networklayer_0:SLR2
+slr=cmac_0:SLR2
+slr=transpose_read0_1:SLR1
+slr=transpose_write0_1:SLR0
+
+sp=ccl_offload_0.m_axi_0:HBM[31]
+sp=ccl_offload_0.m_axi_1:HBM[31]
+sp=transpose_read0_1.m_axi_gmem0:HBM[0:7]
+sp=transpose_write0_1.m_axi_gmem0:HBM[8:15]
+sp=transpose_write0_1.m_axi_gmem1:HBM[16:23]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl
+stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:512

From c1b697ee40c69e07093521cb4c37ac6fd1ab56be Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 8 Nov 2022 14:59:05 +0100
Subject: [PATCH 149/318] Fix PTRANS PCIE DDR config for U280

---
 .../settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini
index 882d5af1..3b7b0497 100644
--- a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini
+++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini
@@ -13,7 +13,5 @@ slr=transpose0_$PY_CODE_GEN i + 1$:SLR$PY_CODE_GEN i % num_slrs$

 # Assign the kernels to the memory ports
 # PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
-sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem0:DDR[$PY_CODE_GEN i % num_ddrs$]
-sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem1:DDR[$PY_CODE_GEN i % num_ddrs$]
-sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem2:DDR[$PY_CODE_GEN i % num_ddrs$]
+sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem:DDR[$PY_CODE_GEN i % num_ddrs$]
 # PY_CODE_GEN block_end

From bdc93c880494e23f181c84b3d2f67f790e1703a3 Mon Sep 17 00:00:00 
2001 From: Marius Meyer Date: Tue, 8 Nov 2022 17:51:20 +0100 Subject: [PATCH 150/318] Make zmq optional in PTRANS w/o ACCL --- PTRANS/src/host/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index e162b809..d17422ca 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -46,7 +46,9 @@ if (Vitis_FOUND) add_executable(${HOST_EXE_NAME}_xilinx main.cpp) target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) - target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx zmqpp) + if (USE_ACCL) + target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx zmqpp) + endif() target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") From 58a5b2139ed4c59c219b55a8ec3b7222258f9288 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 8 Nov 2022 18:43:56 +0100 Subject: [PATCH 151/318] Use old platform for synthesis --- PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake index 827da9a9..d5223408 100644 --- a/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake +++ b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake @@ -13,7 +13,7 @@ set(USE_ACCL Yes CACHE BOOL "" FORCE) set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE) -set(FPGA_BOARD_NAME "xilinx_u280_gen3x16_xdma_1_202211_1" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini CACHE FILEPATH "" FORCE) set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE) set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) From 4bb1fd23ae7ee62305e11af7b1f98adbc7200a6f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 9 Nov 2022 09:21:57 +0100 Subject: [PATCH 152/318] Reduce target clock frequency of design --- LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake index 5ddc6b30..37f843d3 100644 --- a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake @@ -13,7 +13,7 @@ set(USE_ACCL Yes CACHE BOOL "" FORCE) set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) -set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 200 CACHE STRING "" FORCE) set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE) set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) # LINPACK specific options From f2d148b27245ff240caae632ba7113e0d10f94d2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 9 Nov 2022 13:54:55 +0100 Subject: [PATCH 153/318] Fix device selection --- 
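Notes: without an explicit device index the XRT device is now derived from
the MPI rank, assuming three FPGAs per node (the hardcoded modulus below) and
consecutive rank placement per node. A minimal sketch of the intended
selection logic; the helper name is hypothetical and not part of the patch:

    // Round-robin MPI ranks over the FPGAs of one node. The device count is
    // hardcoded until xrt::system::enumerate_devices() can be used to query it.
    int selected_device(int mpi_rank, int devices_per_node = 3) {
        return mpi_rank % devices_per_node;
    }

With this, ranks 0..2 of a node map to devices 0..2, so the scheduler has to
place consecutive ranks on the same node for the mapping to line up.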
 PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake | 1 +
 shared/setup/fpga_setup_xrt.cpp | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake
index 46ef245c..a75d3fd4 100644
--- a/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake
+++ b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake
@@ -12,6 +12,7 @@ set(USE_HBM No CACHE BOOL "" FORCE)
 set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
 set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini CACHE FILEPATH "" FORCE)
 set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE)
+set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE)

 # STREAM specific options
 # Defaults to a total of ~12GB data

diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp
index eae39fe8..f5d7ef32 100644
--- a/shared/setup/fpga_setup_xrt.cpp
+++ b/shared/setup/fpga_setup_xrt.cpp
@@ -39,6 +39,10 @@ namespace fpga_setup {
 MPI_Comm_rank(MPI_COMM_WORLD, & current_device);
 if (defaultDevice >= 0) {
 current_device = defaultDevice;
+ } else {
+ //TODO Use xrt::system::enumerate_devices() in "experimental/xrt_system.h" for future XRT versions
+ // instead of hardcoded number of devices.
+ current_device = current_device % 3;
 }
 return std::unique_ptr<xrt::device>(new xrt::device(current_device));
 }

From 3fb8810fb2437285b2640d89963c378d75a4c3b4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 9 Nov 2022 14:25:13 +0100
Subject: [PATCH 154/318] Switch to XRT host code

---
 PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake
index a75d3fd4..eb878f8d 100644
--- a/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake
+++ b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake
@@ -9,10 +9,13 @@
 set(USE_MPI Yes CACHE BOOL "" FORCE)
 set(USE_SVM No CACHE BOOL "" FORCE)
 set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
 set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
 set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini CACHE FILEPATH "" FORCE)
 set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE)
 set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE)
+set(XILINX_KERNEL_NAMES transpose0 CACHE STRING "" FORCE)

 # STREAM specific options
 # Defaults to a total of ~12GB data

From 5a42ab3c4533a3fcd67e07e68e77824460c39db4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 9 Nov 2022 16:54:53 +0100
Subject: [PATCH 155/318] Fix build scripts for XRT

---
 PTRANS/src/host/CMakeLists.txt | 3 ++-
 shared/CMakeLists.txt | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt
index d17422ca..b9e0541b 100755
--- a/PTRANS/src/host/CMakeLists.txt
+++ b/PTRANS/src/host/CMakeLists.txt
@@ -46,8 +46,9 @@ if (Vitis_FOUND)
 add_executable(${HOST_EXE_NAME}_xilinx main.cpp)
 target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}")
 target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base)
+ target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx)
 if (USE_ACCL)
- target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx zmqpp)
+ 
target_link_libraries(${HOST_EXE_NAME}_xilinx zmqpp)
 endif()
 target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA)
 target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA)
 target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")

diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt
index 19ab7ff2..64260c94 100644
--- a/shared/CMakeLists.txt
+++ b/shared/CMakeLists.txt
@@ -4,13 +4,13 @@ set(HPCC_BASE_SOURCES "")

 if (USE_ACCL)
 add_subdirectory(${extern_accl_SOURCE_DIR}/driver/xrt ${CMAKE_BINARY_DIR}/lib/accl)
- list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp)
+ list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp ${extern_accl_SOURCE_DIR}/test/model/bfm/cclo_bfm.cpp)
 if (CMAKE_BUILD_TYPE EQUAL "Debug")
 set(ACCL_DEBUG Yes)
 endif()
 endif()
 if (USE_XRT_HOST)
- list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp ${extern_accl_SOURCE_DIR}/test/model/bfm/cclo_bfm.cpp)
+ list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp)
 endif()
 list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp)
 add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES})

From 2157327888e0e2034bc94c0b6aae556a146b170e Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 10 Nov 2022 10:06:51 +0100
Subject: [PATCH 156/318] Remove ACCL deps from PCIE config

---
 LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake
index 37f843d3..9d5cc02f 100644
--- a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake
+++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake
@@ -9,11 +9,11 @@
 set(USE_MPI Yes CACHE BOOL "" FORCE)
 set(USE_SVM No CACHE BOOL "" FORCE)
 set(USE_HBM No CACHE BOOL "" FORCE)
-set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_ACCL No CACHE BOOL "" FORCE)
 set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
 set(USE_OCL_HOST No CACHE BOOL "" FORCE)
 set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
-set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 200 CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
 set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE)
 set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE)
 # LINPACK specific options

From 242fa19ab0a36fb42d84a2b502dfb14f317e5d14 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 14 Nov 2022 11:41:13 +0100
Subject: [PATCH 157/318] Add config for U280 HPL with ACCL

---
 .../configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake | 30 +++++++
 ...linx.hpl_torus_accl.hbm.u280.generator.ini | 88 +++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake
 create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini

diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake
new file mode 100644
index 00000000..186266ca
--- /dev/null
+++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake
@@ -0,0 +1,30 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) + +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini" CACHE STRING "Link settings file" FORCE) +set(ACCL_BUFFER_SIZE 524288 CACHE STRING "Set ACCL buffer size to fit single matrix block" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) + diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini new file mode 100644 index 00000000..289a6263 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini @@ -0,0 +1,88 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR
+slr=lu_1:SLR0
+slr=left_update_1:SLR0
+slr=top_update_1:SLR0
+slr=inner_update_mm0_1:SLR1
+
+# matrix ports
+sp=lu_1.m_axi_gmem0:HBM[0:4]
+sp=lu_1.m_axi_gmem1:HBM[5:6]
+sp=lu_1.m_axi_gmem2:HBM[5:6]
+
+sp=top_update_1.m_axi_gmem0:HBM[0:4]
+sp=top_update_1.m_axi_gmem1:HBM[5:6]
+sp=top_update_1.m_axi_gmem2:HBM[5:6]
+
+sp=left_update_1.m_axi_gmem0:HBM[0:4]
+sp=left_update_1.m_axi_gmem1:HBM[5:6]
+sp=left_update_1.m_axi_gmem2:HBM[5:6]
+
+# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:4]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[5:6]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[5:6]
+# PY_CODE_GEN block_end
+
+#ACCL
+# Define number of kernels and their names
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+
+# Kernels Floorplanning
+slr=compression_0_0:SLR2
+slr=compression_0_1:SLR2
+slr=compression_0_2:SLR2
+slr=lb_user_krnl:SLR2
+slr=arith_0:SLR2
+slr=ccl_offload_0:SLR2
+slr=hostctrl_0:SLR2
+slr=networklayer_0:SLR2
+slr=cmac_0:SLR2
+
+sp=ccl_offload_0.m_axi_0:HBM[5:6]
+sp=ccl_offload_0.m_axi_1:HBM[5:6]
+
+
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl

From 72e66929f1048fa570ba60cf6749921b3d10a33c Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 14 Nov 2022 13:32:15 +0100
Subject: [PATCH 158/318] Profiling for LINPACK U280

---
 ...linx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake | 32 ++++++++++++++++
 ..._torus_pcie.hbm.u280.profile.generator.ini | 38 +++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake
 create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini

diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake
new file mode 100644
index 00000000..63210758
--- /dev/null
+++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake
@@ 
-0,0 +1,32 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+#     cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL No CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE)
+set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE)
+set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE)
+# LINPACK specific options
+set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
+set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
+set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
+
+set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE)
+set(XILINX_LINK_SETTINGS_FILE
+    "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini" CACHE STRING "Link settings file" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
+set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
new file mode 100644
index 00000000..ca99d858
--- /dev/null
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
@@ -0,0 +1,38 @@
+[connectivity]
+nk=lu:1
+nk=left_update:1
+nk=top_update:1
+nk=inner_update_mm0:$PY_CODE_GEN num_replications$
+
+# slrs
+# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +1) % 3$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:5] +sp=lu_1.m_axi_gmem1:HBM[6] +sp=lu_1.m_axi_gmem2:HBM[7] + +sp=top_update_1.m_axi_gmem0:HBM[0:5] +sp=top_update_1.m_axi_gmem1:HBM[6] +sp=top_update_1.m_axi_gmem2:HBM[8] + +sp=left_update_1.m_axi_gmem0:HBM[0:5] +sp=left_update_1.m_axi_gmem1:HBM[7] +sp=left_update_1.m_axi_gmem2:HBM[9] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] +# PY_CODE_GEN block_end + +[profile] +memory=all +exec=all:all + From d89939f5f0186896a97941409e31e897c10aaccb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 15 Nov 2022 17:01:53 +0100 Subject: [PATCH 159/318] Update config for HPL U280 profile --- LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake | 3 +-- ...s.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake index 63210758..654bc3f3 100644 --- a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake @@ -25,8 +25,7 @@ set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipul set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) -set(XILINX_LINK_SETTINGS_FILE - "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini" CACHE STRING "Link settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini" CACHE STRING "Link settings file" FORCE) set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini index ca99d858..e0bb5aaa 100644 --- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini @@ -27,7 +27,7 @@ sp=left_update_1.m_axi_gmem1:HBM[7] sp=left_update_1.m_axi_gmem2:HBM[9] # PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] -sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:5] sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9] sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] # PY_CODE_GEN block_end From 719353eb927f68f3a7b4a2f16c67e33edc95b7db Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 16 Nov 2022 11:30:52 +0100 Subject: [PATCH 160/318] Fix HBM 
link config for LINPACK

---
 ...ttings.link.xilinx.hpl_torus_pcie.hbm.generator.ini | 10 +++++-----
 ...ilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini
index df381966..fe68d728 100644
--- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini
@@ -19,15 +19,15 @@ sp=lu_1.m_axi_gmem1:HBM[6]
 sp=lu_1.m_axi_gmem2:HBM[7]
 
 sp=top_update_1.m_axi_gmem0:HBM[0:5]
-sp=top_update_1.m_axi_gmem1:HBM[6]
-sp=top_update_1.m_axi_gmem2:HBM[8]
+sp=top_update_1.m_axi_gmem1:HBM[8]
+sp=top_update_1.m_axi_gmem2:HBM[6]
 
 sp=left_update_1.m_axi_gmem0:HBM[0:5]
-sp=left_update_1.m_axi_gmem1:HBM[7]
-sp=left_update_1.m_axi_gmem2:HBM[9]
+sp=left_update_1.m_axi_gmem1:HBM[9]
+sp=left_update_1.m_axi_gmem2:HBM[7]
 
 # PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
-sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:5]
 sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9]
 sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8]
 # PY_CODE_GEN block_end

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
index e0bb5aaa..5a7bbbf0 100644
--- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
@@ -19,12 +19,12 @@ sp=lu_1.m_axi_gmem1:HBM[6]
 sp=lu_1.m_axi_gmem2:HBM[7]
 
 sp=top_update_1.m_axi_gmem0:HBM[0:5]
-sp=top_update_1.m_axi_gmem1:HBM[6]
-sp=top_update_1.m_axi_gmem2:HBM[8]
+sp=top_update_1.m_axi_gmem1:HBM[8]
+sp=top_update_1.m_axi_gmem2:HBM[6]
 
 sp=left_update_1.m_axi_gmem0:HBM[0:5]
-sp=left_update_1.m_axi_gmem1:HBM[7]
-sp=left_update_1.m_axi_gmem2:HBM[9]
+sp=left_update_1.m_axi_gmem1:HBM[9]
+sp=left_update_1.m_axi_gmem2:HBM[7]
 
 # PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
 sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:5]

From 3ffe0c4ab6198a1d5aa6390a1da5cfb116133653 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Wed, 16 Nov 2022 12:12:43 +0100
Subject: [PATCH 161/318] document new argument of selectFPGADevice

---
 shared/setup/fpga_setup.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp
index 70125df0..e6039973 100644
--- a/shared/setup/fpga_setup.cpp
+++ b/shared/setup/fpga_setup.cpp
@@ -220,6 +220,9 @@ choose a device.
 @param defaultDevice The index of the device that has to be used. If a
                      value < 0 is given, the device can be chosen
                      interactively
+@param platformString The platform string which should be chosen.
+                      If it is empty, it will be ignored. If it is not empty,
+                      but the string is not found, an exception is thrown.
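+                      As an illustration, hypothetical values could be
+                      "Intel(R) FPGA SDK for OpenCL(TM)" or "Xilinx"; the exact
+                      platform names depend on the installed OpenCL runtimes.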
@return A list containing a single selected device
*/

From a50dfe47ec49a64b852dada0d27c9754eda42256 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 17 Nov 2022 09:40:57 +0100
Subject: [PATCH 162/318] Update ACCL constructor

---
 shared/setup/fpga_setup_accl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index c560ee29..4d3207af 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -97,7 +97,7 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra
         std::vector mem(1, 0);
         std::cout << "Create ACCL" << std::endl;
         return std::unique_ptr(
-            new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, 0, ACCL::networkProtocol::UDP));
+            new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, ACCL::networkProtocol::UDP));
     } else {
         // TODO: Add start port here. Currenty hardcoded!
         return std::unique_ptr(

From af24978072c02e55328f2988676ff3889c846981 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 17 Nov 2022 11:12:32 +0100
Subject: [PATCH 163/318] Disable LINPACK AllBlockExternResult test

---
 LINPACK/tests/test_kernel_communication.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/LINPACK/tests/test_kernel_communication.cpp b/LINPACK/tests/test_kernel_communication.cpp
index dfcb8867..603bedef 100644
--- a/LINPACK/tests/test_kernel_communication.cpp
+++ b/LINPACK/tests/test_kernel_communication.cpp
@@ -1206,8 +1206,10 @@ class LinpackKernelCommunicationTestAll : public LinpackKernelCommunicationTest
     }
 };
 
-
-TEST_F(LinpackKernelCommunicationTestAll, AllBlockExternalResultisCorrect) {
+// TODO: This test is disabled because it fails non-deterministically although
+// calculations with benchmark host are correct.
+// Maybe this is related to a problem with Intel external channels in emulation.
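+// Note: the DISABLED_ prefix is the regular GoogleTest mechanism for skipping a
+// test; it can still be executed on demand by passing
+// --gtest_also_run_disabled_tests to the test binary.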
+TEST_F(LinpackKernelCommunicationTestAll, DISABLED_AllBlockExternalResultisCorrect) { uint matrix_size = bm->getExecutionSettings().programSettings->matrixSize; auto ref_data = bm->generateInputData(); From 49fde98adbd0b7f9c2b39c6684bac186ad2e846b Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 17 Nov 2022 14:20:30 +0100 Subject: [PATCH 164/318] Fix memory mapping of buffers in XRT HPL base version --- LINPACK/src/host/execution_types/execution_xrt_pcie.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp index 1a2610e0..f35df7b9 100644 --- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -111,7 +111,7 @@ std::unique_ptr calculate( *config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), - kernel_lu.group_id(1)); + kernel_top.group_id(1)); } for (int i = 0; i < blocks_per_col; i++) { @@ -119,7 +119,7 @@ std::unique_ptr calculate( *config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), - kernel_lu.group_id(2)); + kernel_left.group_id(1)); } } From d55e450cd9c9fae41b442d77f70a49d0b2f0b279 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 17 Nov 2022 14:31:30 +0100 Subject: [PATCH 165/318] Extend profile options in config --- LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake | 2 +- ...s.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake index 654bc3f3..9bc20f5c 100644 --- a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake @@ -13,7 +13,7 @@ set(USE_ACCL No CACHE BOOL "" FORCE) set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) -set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_COMPILE_FLAGS -g --profile.stall all:all CACHE STRING "" FORCE) set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE) set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE) set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini index 5a7bbbf0..4ac80a17 100644 --- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini @@ -33,6 +33,7 @@ sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] # PY_CODE_GEN block_end [profile] -memory=all +stall=all:all +data=all:all:all exec=all:all From 993f919359a1bfe2369d07c45823c22c4820be3c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 18 Nov 2022 12:29:24 +0100 Subject: [PATCH 166/318] Add 250MHz target to config --- LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake index 186266ca..94a9c4f6 100644 --- 
a/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake
+++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake
@@ -20,6 +20,7 @@ set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of t
 set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
 set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
 
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
 set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE)
 set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE)
 set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini" CACHE STRING "Link settings file" FORCE)

From f0d446f938aaf9c370fc9cbabcec03ed1a70275d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 18 Nov 2022 12:34:34 +0100
Subject: [PATCH 167/318] Fix clock for cclo in HPL

---
 .../settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini
index 289a6263..d20bdb66 100644
--- a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini
@@ -86,3 +86,6 @@ stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
 # Tie off user kernel interface
 stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
 stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
+
+[clock]
+freqHz=250000000:ccl_offload_0
\ No newline at end of file

From efdac5daf78faafef8b3f20c549d7c2710fddb0d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 18 Nov 2022 13:42:54 +0100
Subject: [PATCH 168/318] Remove fixed clock since it is not supported with the platform

---
 .../settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini
index d20bdb66..289a6263 100644
--- a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini
@@ -86,6 +86,3 @@ stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
 # Tie off user kernel interface
 stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
 stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
-
-[clock]
-freqHz=250000000:ccl_offload_0
\ No newline at end of file

From 83a0867cf1f0beba8ec979dc3f9fbedc1f1e9b57 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 23 Nov 2022 14:27:19 +0100
Subject: [PATCH 169/318] Add trace memory in HBM

---
 ...k.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
index 4ac80a17..aeea6acf 100644
--- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
+++ 
b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
@@ -33,7 +33,9 @@ sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8]
 # PY_CODE_GEN block_end
 
 [profile]
-stall=all:all
+stall=all:all:all
 data=all:all:all
-exec=all:all
-
+exec=all:all:all
+trace_memory=HBM[16]:SLR0
+trace_memory=HBM[17]:SLR1
+trace_memory=HBM[18]:SLR2

From d17a0a6356edd3397bf53bc58f22049ef935d512 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Fri, 25 Nov 2022 13:38:59 +0100
Subject: [PATCH 170/318] Extend documentation for communication types

---
 docs/source/index.rst                              | 11 +++++++++++
 .../source/technical_support/Basic Setup/index.rst | 14 ++++++++------
 .../Host Input Parameters/index.rst                |  3 +++
 shared/include/communication_types.hpp             |  7 -------
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 96fd71bc..13f1de5c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -25,6 +25,17 @@ The pages collected under **Benchmark Descriptions** contain information about t
 **Technical Support** tackles selected topics of configuration, build, and execution of the benchmarks.
 **Benchmark Results** for the base implementations of the benchmarks are listed at the bottom of this page. They are reported together with the used CPU and other relevant infrastructure, as well as the configuration and resource utilization of the bitstreams.
 
+The scalability and performance of applications executed over multiple FPGAs is not least dependent on the communication capabilities of these devices. The benchmark suite supports the implementation of different communication strategies to compare their impact on the overall benchmark performance. This is only available to the benchmarks which rely on communication: b_eff, PTRANS and LINPACK.
+
+The first and most obvious strategy is host-to-host communication using PCIe and MPI. This strategy requires, in most cases, no additional hardware or software and only relies on moving data between the host and FPGA.
+The data is then exchanged via the existing CPU network, which makes it broadly applicable in the HPC context.
+As a consequence, this approach is used for the base implementations in this benchmark suite.
+For comparison, the suite can be extended with different communication types.
+Intel provides external channels for direct communication between the FPGAs.
+This approach is based on point-to-point connections between FPGAs and requires manual routing of data through the network.
+
+Further optimized implementations that use such device-specific communication approaches will be added to the suite in the future.
+
 
 .. toctree::

diff --git a/docs/source/technical_support/Basic Setup/index.rst b/docs/source/technical_support/Basic Setup/index.rst
index ed80740a..7308fc23 100644
--- a/docs/source/technical_support/Basic Setup/index.rst
+++ b/docs/source/technical_support/Basic Setup/index.rst
@@ -103,20 +103,22 @@ You can always get an overview of the available targets by executing the followi
     BENCHMARK_VENDOR, "Builds the host application "
     BENCHMARK_test_VENDOR, "Compile the tests and its dependencies "
 
-Moreover, there are additional targets to generate kernel reports and bitstreams.
+Moreover, there are additional targets to generate device reports and bitstreams.
+
 The kernel targets are:
 
 .. csv-table:: Device code build targets
     :header: "Target","Description"
     :widths: 10, 30
 
-    BENCHMARK_VENDOR , Synthesizes the kernel (takes several hours!)
-    BENCHMARK_report_VENDOR , Just compile the kernel and create logs and reports
-    BENCHMARK_emulate_VENDOR , Create an emulation kernel
+    BASENAME_{COMM_}VENDOR , Synthesizes the device kernels (takes several hours!)
+    BASENAME_{COMM_}report_VENDOR , Just compile the kernels and create logs and reports
+    BASENAME_{COMM_}emulate_VENDOR , Creates the emulation kernels
 
 `VENDOR` is either `intel` or `xilinx` depending if the Intel SDK or Xilinx Vitis should be used.
-`BENCHMARK` is the kernel name.
-A benchmark can provide multiple kernels and thus, these targets will be generated for every kernel file.
+`BASENAME` is the name of the file containing the device code.
+A benchmark can provide multiple kernel implementations and thus, these targets will be generated for every file containing kernel code.
+For all benchmarks using communication between FPGAs, the different communication types are encoded into the device code file name and are therefore part of the target name. These are b_eff, PTRANS and LINPACK.
 
 ------------------------------------------------------
 Configure and Build STREAM for a fictional Xilinx FPGA

diff --git a/docs/source/technical_support/Host Input Parameters/index.rst b/docs/source/technical_support/Host Input Parameters/index.rst
index 46abe6f3..50121964 100644
--- a/docs/source/technical_support/Host Input Parameters/index.rst
+++ b/docs/source/technical_support/Host Input Parameters/index.rst
@@ -43,6 +43,9 @@ Input parameters (or options) can be appended to the host execution call like th
     Please note, that the benchmark will always fail with this option since it assumes the validation failed, so it will return a non-zero exit code!
     For reported measurements, the validation has to be enabled and the host should return with an exit code 0.
 
+``--comm-type COMM``:
+    This parameter chooses the communication strategy which will be used. Current options are "IEC" for using the Intel External Channel, "PCIE" for using the host-to-host communication, and "CPU" for calculating on the CPU.
+
 ``--test``:
     This option will also skip the execution of the benchmark. It can be used to test different data generation schemes or the benchmark summary before the actual execution.
     Please note, that the host will exit with a non-zero exit code, because it will not be able to validate the output.
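As a usage sketch of the documented option (a hypothetical invocation; the executable name, bitstream file, and rank count are placeholders and not taken from the repository), selecting the host-to-host strategy for a run with two MPI ranks could look as follows:

    mpirun -n 2 ./Transpose_xilinx -f transpose.xclbin --comm-type PCIE

Passing "CPU" instead would execute the reference calculation on the host, which is mainly useful for validating results without an FPGA.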
diff --git a/shared/include/communication_types.hpp b/shared/include/communication_types.hpp index bb46bb8d..005f4c03 100644 --- a/shared/include/communication_types.hpp +++ b/shared/include/communication_types.hpp @@ -46,12 +46,6 @@ typedef enum _CommunicationType { */ pcie_mpi, - /** - * @brief Communcation using the Streaming Message Interface - * - */ - smi, - /** * @brief Calculate the benchmark on CPU instead of FPGA * @@ -75,7 +69,6 @@ typedef enum _CommunicationType { static const std::map comm_to_str_map{ {"IEC", CommunicationType::intel_external_channels}, {"PCIE", CommunicationType::pcie_mpi}, - {"SMI", CommunicationType::smi}, {"CPU", CommunicationType::cpu_only}, {"UNSUPPORTED", CommunicationType::unsupported}, {"AUTO", CommunicationType::automatic} From d74711be72feef99fbbb9c77bf66cdfe55f21ae4 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 28 Nov 2022 19:33:06 +0100 Subject: [PATCH 171/318] Add TCP host setup for ACCL --- .../host/execution_types/execution_accl.hpp | 22 +++---- .../execution_types/execution_accl_pl.hpp | 8 +-- b_eff/src/host/network_benchmark.cpp | 4 +- b_eff/src/host/network_benchmark.hpp | 5 +- shared/include/hpcc_benchmark.hpp | 45 ++++++------- shared/include/setup/fpga_setup_accl.hpp | 28 +++++++- shared/setup/fpga_setup_accl.cpp | 64 +++++++++++++++---- 7 files changed, 116 insertions(+), 60 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp index 8d1638d9..2ade570b 100644 --- a/b_eff/src/host/execution_types/execution_accl.hpp +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -45,11 +45,11 @@ namespace network::execution_types::accl { cl::vector &validationData) { int err; - std::vector> dummyBufferContents; - std::vector> recvBufferContents; - std::vector>> acclSendBuffers; - std::vector>> acclRecvBuffers; - cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + size_t size_in_bytes = std::max(static_cast(validationData.size()), static_cast(1 << messageSize)); int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); @@ -66,10 +66,10 @@ namespace network::execution_types::accl { int size_in_values = (size_in_bytes + 3) / 4; // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { - dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); - recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.accl->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); - acclRecvBuffers.push_back(config.accl->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + dummyBufferContents.emplace_back(size_in_values, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_values, static_cast(0)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_values, ACCL::dataType::float32)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_values, ACCL::dataType::float32)); acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } @@ -83,12 +83,12 @@ namespace network::execution_types::accl { std::cout << "Send " << size_in_values << " bytes to " << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; #endif - config.accl->send(*acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); + config.context->accl->send(*acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); #ifndef NDEBUG std::cout << "Recv " << size_in_values << " bytes from " << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; #endif - config.accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); + config.context->accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); #ifndef NDEBUG std::cout << "Done" << std::endl; #endif diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index 4b3ff2ee..eecb552e 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -86,8 +86,8 @@ namespace network::execution_types::accl_pl { for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.accl->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); - acclRecvBuffers.push_back(config.accl->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } @@ -103,11 +103,11 @@ namespace network::execution_types::accl_pl { auto startCalculation = std::chrono::high_resolution_clock::now(); if (!config.programSettings->useAcclEmulation) { auto run = 
sendrecvKernel(*(acclSendBuffers[i]->bo()), *(acclRecvBuffers[i]->bo()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, - config.accl->get_communicator_addr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); run.wait(); } else { send_recv(reinterpret_cast(acclSendBuffers[i]->buffer()), reinterpret_cast(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, - config.accl->get_communicator_addr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32}), + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32}), cmd, sts); } auto endCalculation = std::chrono::high_resolution_clock::now(); diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 2eef9621..5265ac46 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -110,7 +110,7 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { for (auto& run : data.items) { if (world_rank == 0) { - std::cout << "Measure for " << (1 << run.messageSize) << " Byte" << std::endl; + std::cout << std::dec << "Measure for " << (1 << run.messageSize) << " Byte" << std::endl; } std::shared_ptr timing; switch (executionSettings->programSettings->communicationType) { @@ -211,7 +211,7 @@ network::NetworkBenchmark::collectAndPrintResults(const network::NetworkExecutio maxBandwidths.push_back(maxCalcBW); - std::cout << std::setw(ENTRY_SPACE) << (1 << msgSizeResults.first) << " " + std::cout << std::dec << std::setw(ENTRY_SPACE) << (1 << msgSizeResults.first) << " " << std::setw(ENTRY_SPACE) << looplength << " " << std::setw(ENTRY_SPACE) << totalMaxMinCalculationTime[i] << " " << std::setw(ENTRY_SPACE) << maxCalcBW diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 472ab15d..cb0c61ea 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -251,8 +251,11 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark #endif #ifdef USE_XRT_HOST +#ifdef USE_ACCL + public hpcc_base::HpccFpgaBenchmark +#else public hpcc_base::HpccFpgaBenchmark - +#endif #endif { protected: diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 7ff91bae..494d18c8 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -126,11 +126,19 @@ class BaseSettings { */ CommunicationType communicationType; +#ifdef USE_ACCL /** * @brief Use ACCL emulation constructor instead of hardware execution */ bool useAcclEmulation; + /** + * @brief Used ACCL network stack + * + */ + ACCL::networkProtocol acclProtocol; +#endif + /** * @brief Construct a new Base Settings object * @@ -153,8 +161,7 @@ class BaseSettings { #endif #ifdef USE_ACCL useAcclEmulation(static_cast(results.count("accl-emulation"))), -#else - useAcclEmulation(false), + acclProtocol(fpga_setup::acclProtocolStringToEnum(results["accl-protocol"].as())), #endif #ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())), @@ -219,14 +226,6 @@ class ExecutionSettings { */ 
std::unique_ptr program;
-#ifdef USE_ACCL
-    /**
-     * @brief Pointer to ACCL instance
-     *
-     */
-    std::unique_ptr accl;
-#endif
-
     /**
      * @brief Construct a new Execution Settings object
      *
@@ -237,16 +236,10 @@ class ExecutionSettings {
      */
     ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_,
                       std::unique_ptr context_, std::unique_ptr program_
-#ifdef USE_ACCL
-                      , std::unique_ptr accl_
-#endif
                       ):
                       programSettings(std::move(programSettings_)), device(std::move(device_)),
-                      context(std::move(context_)), program(std::move(program_))
-#ifdef USE_ACCL
-                      , accl(std::move(accl_))
-#endif
+                      context(std::move(context_)), program(std::move(program_))
                       {}
 
     /**
@@ -406,6 +399,8 @@ class HpccFpgaBenchmark {
 #endif
 #ifdef USE_ACCL
             ("accl-emulation", "Use the accl emulation instead of hardware execution")
+            ("accl-protocol", "Specify the network protocol that should be used with ACCL.",
+                cxxopts::value()->default_value("UDP"))
 #endif
             ("skip-validation", "Skip the validation of the output data. This will speed up execution and helps when working with special data types.")
             ("device", "Index of the device that has to be used. If not given you "\
@@ -510,13 +505,13 @@ class HpccFpgaBenchmark {
         std::unique_ptr context;
         std::unique_ptr program;
         std::unique_ptr usedDevice;
-#ifdef USE_ACCL
-        std::unique_ptr accl;
-#endif
+
         if (!programSettings->testOnly) {
 #ifdef USE_XRT_HOST
             usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultDevice);
+#ifndef USE_ACCL
             context = std::unique_ptr(new bool(false));
+#endif
             if (!programSettings->useAcclEmulation) {
                 program = fpga_setup::fpgaSetup(*usedDevice, programSettings->kernelFileName);
             }
@@ -530,19 +525,17 @@ class HpccFpgaBenchmark {
 #endif
 #ifdef USE_ACCL
             if (programSettings->communicationType == CommunicationType::accl) {
-                accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program, programSettings->useAcclEmulation);
+                context = std::unique_ptr(new fpga_setup::ACCLContext(fpga_setup::fpgaSetupACCL(*usedDevice, *program, programSettings->useAcclEmulation,
+                    programSettings->acclProtocol)));
             } else {
-                accl = std::unique_ptr(nullptr);
+                context = std::unique_ptr(new fpga_setup::ACCLContext());
             }
 #endif
         }
 
         executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice),
-                                                                        std::move(context), std::move(program)
-#ifdef USE_ACCL
-                                                                        , std::move(accl)
-#endif
+                                                                        std::move(context), std::move(program)
                                                                         ));
         if (mpi_comm_rank == 0) {
             if (!checkInputParameters()) {

diff --git a/shared/include/setup/fpga_setup_accl.hpp b/shared/include/setup/fpga_setup_accl.hpp
index dcf2a530..ff493ccc 100644
--- a/shared/include/setup/fpga_setup_accl.hpp
+++ b/shared/include/setup/fpga_setup_accl.hpp
@@ -36,6 +36,29 @@ SOFTWARE.
 
 namespace fpga_setup {
 
+
+struct ACCLContext {
+    std::unique_ptr accl;
+    std::unique_ptr tx_buf_network;
+    std::unique_ptr rx_buf_network;
+};
+
+
+static const std::map acclProtocolMap = {
+    {"UDP", ACCL::networkProtocol::UDP},
+    {"TCP", ACCL::networkProtocol::TCP}
+};
+
+static ACCL::networkProtocol acclProtocolStringToEnum(std::string string_representation) {
+    if (acclProtocolMap.count(string_representation)) {
+        return acclProtocolMap.at(string_representation);
+    }
+    else {
+        throw std::runtime_error("ACCL network protocol could not be parsed from string: " + string_representation);
+    }
+    return ACCL::networkProtocol::UDP;
+}
+
 /**
 Sets up the given FPGA with the kernel in the provided file.
@param useAcclEmulation Construct an ACCL emulation instance instead of hardware execution @return The ACCL instance used for communication */ -std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &program, - bool useAcclEmulation); +ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program, + bool useAcclEmulation, + ACCL::networkProtocol protocol); } // namespace fpga_setup #endif // SRC_HOST_FPGA_SETUP_H_ diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 4d3207af..ed84ea08 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -66,8 +66,23 @@ void configure_vnx(CMAC &cmac, Networklayer &network_layer, network_layer.arp_discovery(); } -std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &program, - bool useAcclEmulation) { +void configure_tcp(ACCL::BaseBuffer &tx_buf_network, ACCL::BaseBuffer &rx_buf_network, + xrt::kernel &network_krnl, std::vector &ranks, + int rank) { + std::cout << "Configure TCP Network Kernel" << std::endl; + tx_buf_network.sync_to_device(); + rx_buf_network.sync_to_device(); + + uint local_fpga_ip = ACCL::ip_encode(ranks[rank].ip); + std::cout << "rank: " << rank << " FPGA IP: " << std::hex << local_fpga_ip + << std::endl; + + network_krnl(local_fpga_ip, static_cast(rank), local_fpga_ip, + *(tx_buf_network.bo()), *(rx_buf_network.bo())); +} + +ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program, + bool useAcclEmulation, ACCL::networkProtocol protocol) { int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, ¤t_rank); @@ -80,29 +95,50 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra ACCL::rank_t new_rank = {"10.10.10." + std::to_string(i), 5500 + i, i, ACCL_BUFFER_SIZE}; ranks.emplace_back(new_rank); } + + ACCLContext accl; + if (!useAcclEmulation) { std::cout << "Create cclo ip" << std::endl; auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}"); std::cout << "Create hostctrl" << std::endl; auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", xrt::kernel::cu_access_mode::exclusive); - std::cout << "Create CMAC" << std::endl; - auto cmac = CMAC(xrt::ip(device, program, "cmac_0:{cmac_0}")); - std::cout << "Create Network Layer" << std::endl; - auto network_layer = Networklayer( - xrt::ip(device, program, "networklayer:{networklayer_0}")); - std::cout << "Configure VNX" << std::endl; - configure_vnx(cmac, network_layer, ranks, current_rank); - + if (protocol == ACCL::networkProtocol::UDP) { + std::cout << "Create CMAC" << std::endl; + auto cmac = CMAC(xrt::ip(device, program, "cmac_0:{cmac_0}")); + std::cout << "Create Network Layer" << std::endl; + auto network_layer = Networklayer( + xrt::ip(device, program, "networklayer:{networklayer_0}")); + std::cout << "Configure VNX" << std::endl; + configure_vnx(cmac, network_layer, ranks, current_rank); + } + if (protocol == ACCL::networkProtocol::TCP) { + auto network_krnl = xrt::kernel(device, program, "network_krnl:{network_krnl_0}", + xrt::kernel::cu_access_mode::exclusive); + accl.tx_buf_network = std::unique_ptr(new ACCL::FPGABuffer( + 64 * 1024 * 1024, ACCL::dataType::int8, device, network_krnl.group_id(3))); + accl.rx_buf_network = std::unique_ptr(new ACCL::FPGABuffer( + 64 * 1024 * 1024, ACCL::dataType::int8, device, network_krnl.group_id(4))); + configure_tcp(*accl.tx_buf_network, *accl.rx_buf_network, network_krnl, ranks, current_rank); + } std::vector mem(1, 0); std::cout << "Create ACCL" << std::endl; - 
return std::unique_ptr( - new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, ACCL::networkProtocol::UDP)); + accl.accl = std::unique_ptr( + new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, protocol, 16, ACCL_BUFFER_SIZE)); } else { // TODO: Add start port here. Currenty hardcoded! - return std::unique_ptr( - new ACCL::ACCL(ranks, current_rank, 6000, device, ACCL::networkProtocol::UDP, 16, ACCL_BUFFER_SIZE)); + accl.accl = std::unique_ptr( + new ACCL::ACCL(ranks, current_rank, 6000, device, protocol, 16, ACCL_BUFFER_SIZE)); + } + + if (protocol == ACCL::networkProtocol::TCP) { + MPI_Barrier(MPI_COMM_WORLD); + accl.accl->open_port(); + MPI_Barrier(MPI_COMM_WORLD); + accl.accl->open_con(); } + return accl; } } // namespace fpga_setup From a2e7b37871f2ff5b86691a39ee9c62bb51df1728 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 29 Nov 2022 10:47:53 +0100 Subject: [PATCH 172/318] Fix ACCL configuration bug --- cmake/general_benchmark_build_setup.cmake | 7 +++---- extern/CMakeLists.txt | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 1537b092..cc59db91 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -3,9 +3,6 @@ INCLUDE (CheckTypeSize) set (CMAKE_CXX_STANDARD 14) -# Download build dependencies -add_subdirectory(${CMAKE_SOURCE_DIR}/../extern ${CMAKE_BINARY_DIR}/extern) - if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) enable_testing() endif() @@ -45,12 +42,14 @@ if (NOT KERNEL_REPLICATION_ENABLED) unset(NUM_REPLICATIONS) endif() - if (HPCC_FPGA_CONFIG) message(STATUS "HPCC FPGA configuration defined. Overwrite default values with configuration: ${HPCC_FPGA_CONFIG}") include(${HPCC_FPGA_CONFIG}) endif() +# Download build dependencies +add_subdirectory(${CMAKE_SOURCE_DIR}/../extern ${CMAKE_BINARY_DIR}/extern) + # Set the used data type if (NOT DATA_TYPE) set(DATA_TYPE float CACHE STRING "Data type used for calculation") diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 341f73cd..3bbf1a84 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -55,7 +55,7 @@ if(NOT extern_cxxopts_POPULATED) EXCLUDE_FROM_ALL) endif() -if (DEFINED USE_ACCL) +if (USE_ACCL) # ------------------------------------------------------------------------------- # ACCL Library FetchContent_Declare( From 57febee258edeb89a45e4a18b319a5ff0ee6f466 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 29 Nov 2022 11:34:15 +0100 Subject: [PATCH 173/318] Include ACCL earlier in the config process --- cmake/general_benchmark_build_setup.cmake | 5 +++++ cmake/kernelTargets.cmake | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index cc59db91..82ba4ac7 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -50,6 +50,11 @@ endif() # Download build dependencies add_subdirectory(${CMAKE_SOURCE_DIR}/../extern ${CMAKE_BINARY_DIR}/extern) +# Enable ACCL if required +if (USE_ACCL) + include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake) +endif() + # Set the used data type if (NOT DATA_TYPE) set(DATA_TYPE float CACHE STRING "Data type used for calculation") diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 86aeeb1c..4b8adee3 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -9,10 +9,6 @@ else() 
set(VPP_FLAGS "-O3")
 endif()
 
-if (USE_ACCL)
-    include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake)
-endif()
-
 set(file_endings "cl" "cpp" )
 
 ##

From 48976ddfa814557926ad40f8c3b739cb4d2d11c0 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 29 Nov 2022 11:35:16 +0100
Subject: [PATCH 174/318] Build PTRANS with ACCL context

---
 .../execution_types/execution_xrt_accl_pq.hpp | 14 +++++++-------
 .../execution_xrt_accl_stream_pq.hpp          |  8 ++++----
 .../execution_xrt_accl_stream_pq_sendrecv.hpp | 16 ++++++++--------
 .../execution_types/execution_xrt_pcie_pq.hpp |  7 ++++---
 PTRANS/src/host/main.cpp                      |  4 ++++
 PTRANS/src/host/transpose_data.hpp            |  7 +++++--
 6 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
index 8e6c0f5b..3a2111f3 100644
--- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
@@ -40,8 +40,8 @@ namespace accl_pq {
 void accl_exchangeData(
     ACCL::ACCL &accl,
     transpose::data_handler::DistributedPQTransposeDataHandler<
-        xrt::device, bool, xrt::uuid> &handler,
-    transpose::TransposeData &data, std::vector &bufferAXrt,
+        xrt::device, fpga_setup::ACCLContext, xrt::uuid> &handler,
+    transpose::TransposeData &data, std::vector &bufferAXrt,
     int global_width) {
 
   int pq_width = handler.getP();
@@ -368,10 +368,10 @@ void accl_exchangeData(
  */
 static std::unique_ptr calculate(
     const hpcc_base::ExecutionSettings &config,
-    transpose::TransposeData &data,
+        xrt::device, fpga_setup::ACCLContext, xrt::uuid> &config,
+    transpose::TransposeData &data,
     transpose::data_handler::DistributedPQTransposeDataHandler<
-        xrt::device, bool, xrt::uuid> &handler) {
+        xrt::device, fpga_setup::ACCLContext, xrt::uuid> &handler) {
   int err;
 
   if (config.programSettings->dataHandlerIdentifier !=
@@ -494,7 +494,7 @@ static std::unique_ptr calculate(
 #ifndef NDEBUG
   std::cout << "Start data exchange with ACCL" << std::endl;
 #endif
-  accl_exchangeData(*config.accl, handler, data, bufferListA,
+  accl_exchangeData(*(config.context->accl), handler, data, bufferListA,
                     config.programSettings->matrixSize / data.blockSize);
 #ifndef NDEBUG
   std::cout << "End data exchange with ACCL" << std::endl;
 #endif
@@ -578,7 +578,7 @@ static std::unique_ptr calculate(
     }
     endTransfer = std::chrono::high_resolution_clock::now();
 
-    accl_exchangeData(*config.accl, handler, data, bufferListA,
+    accl_exchangeData(*(config.context->accl), handler, data, bufferListA,
                       config.programSettings->matrixSize / data.blockSize);
 
     transferTime +=
        std::chrono::duration_cast>(

diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp
index 50a07998..27e240e6 100644
--- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp
@@ -68,10 +68,10 @@
  */
 static std::unique_ptr calculate(
     const hpcc_base::ExecutionSettings &config,
-    transpose::TransposeData &data,
+        xrt::device, fpga_setup::ACCLContext, xrt::uuid> &config,
+    transpose::TransposeData &data,
     transpose::data_handler::DistributedPQTransposeDataHandler<
-        xrt::device, bool, xrt::uuid> &handler) {
+        xrt::device, fpga_setup::ACCLContext, xrt::uuid> &handler) {
   int err;
 
   if (config.programSettings->dataHandlerIdentifier !=
@@ -269,7 +269,7 @@ static std::unique_ptr calculate(
       }
     }
     // Exchange A data via ACCL
-
config.accl->stream_put(ACCL::dataType::float32, data.blockSize * data.blockSize * data.numBlocks, + config.context->accl->stream_put(ACCL::dataType::float32, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0); #ifndef NDEBUG std::cout << "Wait for kernels to complete" << std::endl; diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp index c01bab4c..5282b5da 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp @@ -72,10 +72,10 @@ namespace accl_stream_sendrecv_pq { */ static std::unique_ptr calculate( const hpcc_base::ExecutionSettings &config, - transpose::TransposeData &data, + xrt::device, fpga_setup::ACCLContext, xrt::uuid> &config, + transpose::TransposeData &data, transpose::data_handler::DistributedPQTransposeDataHandler< - xrt::device, bool, xrt::uuid> &handler) { + xrt::device, fpga_setup::ACCLContext, xrt::uuid> &handler) { int err; if (config.programSettings->dataHandlerIdentifier != @@ -169,9 +169,9 @@ static std::unique_ptr calculate( // The vector list variable can be interpreted as 2D matrix. Every entry // represents the target rank of the sub-block Since the LCM block will // repeat, we only need to store this small amount of data! - auto target_list = config.accl->create_buffer(least_common_multiple / pq_height * + auto target_list = config.context->accl->create_buffer(least_common_multiple / pq_height * least_common_multiple / pq_width, ACCL::dataType::int32); - bufferListCopy.push_back(config.accl->create_buffer(buffer_size, ACCL::dataType::float32)); + bufferListCopy.push_back(config.context->accl->create_buffer(buffer_size, ACCL::dataType::float32)); for (int row = 0; row < least_common_multiple / pq_height; row++) { for (int col = 0; col < least_common_multiple / pq_width; col++) { int global_block_col = pq_col + col * pq_width; @@ -313,7 +313,7 @@ static std::unique_ptr calculate( #ifndef NDEBUG std::cout << "Start ACCL send/recv" << std::endl; #endif - auto dbuffer = config.accl->create_buffer(1,ACCL::dataType::float32); + auto dbuffer = config.context->accl->create_buffer(1,ACCL::dataType::float32); int g = transpose::data_handler::mod(pq_row - pq_col, gcd); int p = transpose::data_handler::mod(pq_col + g, pq_width); int q = transpose::data_handler::mod(pq_row - g, pq_height); @@ -361,7 +361,7 @@ static std::unique_ptr calculate( //TODO copy from and to string not implemented in driver yet // config.accl->copy_from_stream(*bufferListCopy[0], sending_size); } else { - config.accl->send(ACCL::dataType::float32, sending_size, send_rank, 0); + config.context->accl->send(ACCL::dataType::float32, sending_size, send_rank, 0); } } else { #ifndef NDEBUG @@ -371,7 +371,7 @@ static std::unique_ptr calculate( //TODO copy from and to string not implemented in driver yet // config.accl->copy_to_stream(*bufferListCopy[0], receiving_size); } else { - config.accl->recv(ACCL::dataType::float32, receiving_size, recv_rank, 0); + config.context->accl->recv(ACCL::dataType::float32, receiving_size, recv_rank, 0); } } } diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index f0d4eeed..0fa0f9c2 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -48,12 +48,13 @@ namespace pcie_pq { * @return 
std::unique_ptr The measured * execution times */ +template static std::unique_ptr calculate( const hpcc_base::ExecutionSettings &config, - transpose::TransposeData &data, + xrt::device, TContext, xrt::uuid> &config, + transpose::TransposeData &data, transpose::data_handler::DistributedPQTransposeDataHandler< - xrt::device, bool, xrt::uuid> &handler) { + xrt::device, TContext, xrt::uuid> &handler) { int err; if (config.programSettings->dataHandlerIdentifier != diff --git a/PTRANS/src/host/main.cpp b/PTRANS/src/host/main.cpp index d4db9803..126f6ff3 100644 --- a/PTRANS/src/host/main.cpp +++ b/PTRANS/src/host/main.cpp @@ -11,7 +11,11 @@ main(int argc, char *argv[]) { #ifdef USE_OCL_HOST TransposeBenchmark bm(argc, argv); #else +#ifndef USE_ACCL TransposeBenchmark bm(argc, argv); +#else + TransposeBenchmark bm(argc, argv); +#endif #endif bool success = bm.executeBenchmark(); if (success) { diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index cd9020e4..9949aede 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -159,8 +159,11 @@ class TransposeData { * @param block_size size of the quadratic blocks that are stored within this object * @param y_size number of blocks that are stored within this object per replication */ - TransposeData(TContext context, uint block_size, uint y_size): context(context), - numBlocks(y_size), blockSize(block_size) { + TransposeData(TContext &context, uint block_size, uint y_size): +#ifdef USE_SVM + context(context), +#endif + numBlocks(y_size), blockSize(block_size) { if (numBlocks * blockSize > 0) { #ifdef USE_SVM A = reinterpret_cast( From ac6b911b5cabfe38835e0380bfa2c432347268f3 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 29 Nov 2022 12:45:25 +0100 Subject: [PATCH 175/318] Compile LINPACK with new ACCL context --- .../execution_accl_buffers.hpp | 26 +++++------ .../host/execution_types/execution_iec.hpp | 2 +- .../host/execution_types/execution_pcie.hpp | 2 +- .../execution_types/execution_xrt_pcie.hpp | 5 ++- LINPACK/src/host/linpack_benchmark.hpp | 14 +++--- LINPACK/src/host/linpack_data.cpp | 31 ------------- LINPACK/src/host/linpack_data.hpp | 43 +++++++++++++++---- LINPACK/src/host/main.cpp | 4 ++ 8 files changed, 64 insertions(+), 63 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp index e7db12b3..4266f605 100644 --- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -49,8 +49,8 @@ namespace accl_buffers { */ std::unique_ptr calculate( const hpcc_base::ExecutionSettings &config, - linpack::LinpackData &data) { + xrt::device, fpga_setup::ACCLContext, xrt::uuid> &config, + linpack::LinpackData &data) { cl_int err; @@ -67,7 +67,7 @@ std::unique_ptr calculate( // Get group of global communicator std::vector all_accl_ranks = - config.accl->get_comm_group(ACCL::GLOBAL_COMM); + config.context->accl->get_comm_group(ACCL::GLOBAL_COMM); std::vector row_ranks; std::vector col_ranks; @@ -86,9 +86,9 @@ std::unique_ptr calculate( } // Create communicators from sub-groups - ACCL::communicatorId row_comm = config.accl->create_communicator( + ACCL::communicatorId row_comm = config.context->accl->create_communicator( row_ranks, config.programSettings->torus_col); - ACCL::communicatorId col_comm = config.accl->create_communicator( + ACCL::communicatorId col_comm = config.context->accl->create_communicator( 
col_ranks, config.programSettings->torus_row); // Create global memory buffers @@ -120,7 +120,7 @@ std::unique_ptr calculate( (config.programSettings->blockSize) * (config.programSettings->blockSize), lu_tmp_kernel.group_id(1)); - auto Buffer_lu1 = config.accl->create_buffer( + auto Buffer_lu1 = config.context->accl->create_buffer( tmp_bos.back(), (config.programSettings->blockSize) * (config.programSettings->blockSize), ACCL::dataType::float32); @@ -129,7 +129,7 @@ std::unique_ptr calculate( (config.programSettings->blockSize) * (config.programSettings->blockSize), lu_tmp_kernel.group_id(2)); - auto Buffer_lu2 = config.accl->create_buffer( + auto Buffer_lu2 = config.context->accl->create_buffer( tmp_bos.back(), (config.programSettings->blockSize) * (config.programSettings->blockSize), ACCL::dataType::float32); @@ -151,7 +151,7 @@ std::unique_ptr calculate( (config.programSettings->blockSize), lu_tmp_kernel.group_id(1)); Buffer_top_list.back().push_back( - config.accl->create_buffer( + config.context->accl->create_buffer( tmp_bos.back(), (config.programSettings->blockSize) * (config.programSettings->blockSize), @@ -166,7 +166,7 @@ std::unique_ptr calculate( (config.programSettings->blockSize), lu_tmp_kernel.group_id(2)); Buffer_left_list.back().push_back( - config.accl->create_buffer( + config.context->accl->create_buffer( tmp_bos.back(), (config.programSettings->blockSize) * (config.programSettings->blockSize), @@ -291,12 +291,12 @@ std::unique_ptr calculate( // FPGAs // Broadcast LU block in column to update all left blocks - config.accl->bcast(*Buffer_lu2, + config.context->accl->bcast(*Buffer_lu2, config.programSettings->blockSize * config.programSettings->blockSize, local_block_row_remainder, col_comm, true, true); // Broadcast LU block in row to update all top blocks - config.accl->bcast(*Buffer_lu1, + config.context->accl->bcast(*Buffer_lu1, config.programSettings->blockSize * config.programSettings->blockSize, local_block_col_remainder, row_comm, true, true); @@ -352,7 +352,7 @@ std::unique_ptr calculate( lbi < std::max(static_cast(blocks_per_col - local_block_col), 0); lbi++) { - config.accl->bcast(*Buffer_left_list[block_row % 2][lbi], + config.context->accl->bcast(*Buffer_left_list[block_row % 2][lbi], config.programSettings->blockSize * config.programSettings->blockSize, local_block_col_remainder, row_comm, true, true); @@ -361,7 +361,7 @@ std::unique_ptr calculate( tbi < std::max(static_cast(blocks_per_row - local_block_row), 0); tbi++) { - config.accl->bcast(*Buffer_top_list[block_row % 2][tbi], + config.context->accl->bcast(*Buffer_top_list[block_row % 2][tbi], config.programSettings->blockSize * config.programSettings->blockSize, local_block_row_remainder, col_comm, true, true); diff --git a/LINPACK/src/host/execution_types/execution_iec.hpp b/LINPACK/src/host/execution_types/execution_iec.hpp index b07ed6a6..279db54a 100644 --- a/LINPACK/src/host/execution_types/execution_iec.hpp +++ b/LINPACK/src/host/execution_types/execution_iec.hpp @@ -46,7 +46,7 @@ namespace iec { */ std::unique_ptr calculate(const hpcc_base::ExecutionSettings&config, - linpack::LinpackData& data) { + linpack::LinpackData& data) { int err; diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp index 5ef4ad27..b484a822 100644 --- a/LINPACK/src/host/execution_types/execution_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_pcie.hpp @@ -52,7 +52,7 @@ namespace pcie { */ std::unique_ptr calculate(const 
hpcc_base::ExecutionSettings&config, - linpack::LinpackData& data) { + linpack::LinpackData& data) { cl_int err; diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp index f35df7b9..aa0484e1 100644 --- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -47,10 +47,11 @@ namespace xrt_pcie { @copydoc bm_execution::calculate() */ +template std::unique_ptr calculate( const hpcc_base::ExecutionSettings &config, - linpack::LinpackData &data) { + xrt::device, TContext, xrt::uuid> &config, + linpack::LinpackData &data) { cl_int err; diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index d1d3093c..48819296 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -48,7 +48,7 @@ namespace linpack { * */ template -class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark { +class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark, LinpackExecutionTimings> { protected: @@ -77,7 +77,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark& data) { uint global_matrix_size = this->executionSettings->programSettings->matrixSize; uint matrix_width = data.matrix_width; uint matrix_height = data.matrix_height; @@ -209,7 +209,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark The input and output data of the benchmark */ - std::unique_ptr + std::unique_ptr> generateInputData() override { int local_matrix_width = this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->torus_width; int local_matrix_height = this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->torus_height; @@ -219,7 +219,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark(new linpack::LinpackData(*this->executionSettings->context ,local_matrix_width, local_matrix_height)); + auto d = std::unique_ptr>(new linpack::LinpackData(*this->executionSettings->context ,local_matrix_width, local_matrix_height)); std::mt19937 gen(this->mpi_comm_rank); std::uniform_real_distribution<> dis(0.0, 1.0); d->norma = 0.0; @@ -305,7 +305,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark Measured runtimes of the kernel execution */ std::unique_ptr - executeKernel(LinpackData &data) override { + executeKernel(LinpackData &data) override { std::unique_ptr timings; switch (this->executionSettings->programSettings->communicationType) { #ifdef USE_OCL_HOST @@ -335,7 +335,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark &data) override { uint n= this->executionSettings->programSettings->matrixSize; uint matrix_width = data.matrix_width; uint matrix_height = data.matrix_height; @@ -577,7 +577,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark(argc, argv) { + LinpackBenchmark(int argc, char* argv[]) : hpcc_base::HpccFpgaBenchmark, linpack::LinpackExecutionTimings>(argc, argv) { this->setupBenchmark(argc, argv); } diff --git a/LINPACK/src/host/linpack_data.cpp b/LINPACK/src/host/linpack_data.cpp index 2c724796..f2c3cfa4 100644 --- a/LINPACK/src/host/linpack_data.cpp +++ b/LINPACK/src/host/linpack_data.cpp @@ -62,37 +62,6 @@ linpack::LinpackProgramSettings::getSettingsMap() { return map; } -linpack::LinpackData::LinpackData(cl::Context context, size_t width, size_t height) : norma(0.0), context(context), - matrix_width(width), matrix_height(height) { -#ifdef USE_SVM - A = 
reinterpret_cast<HOST_DATA_TYPE*>(
-                        clSVMAlloc(context(), 0 ,
-                        size * size * sizeof(HOST_DATA_TYPE), 1024));
-    b = reinterpret_cast<HOST_DATA_TYPE*>(
-                        clSVMAlloc(context(), 0 ,
-                        size * sizeof(HOST_DATA_TYPE), 1024));
-    ipvt = reinterpret_cast<cl_int*>(
-                        clSVMAlloc(context(), 0 ,
-                        size * sizeof(cl_int), 1024));
-#else
-    posix_memalign(reinterpret_cast<void**>(&A), 4096, width * height * sizeof(HOST_DATA_TYPE));
-    posix_memalign(reinterpret_cast<void**>(&b), 4096, width * sizeof(HOST_DATA_TYPE));
-    posix_memalign(reinterpret_cast<void**>(&ipvt), 4096, height * sizeof(cl_int));
-#endif
-    }
-
-linpack::LinpackData::~LinpackData() {
-#ifdef USE_SVM
-    clSVMFree(context(), reinterpret_cast<void*>(A));
-    clSVMFree(context(), reinterpret_cast<void*>(b));
-    clSVMFree(context(), reinterpret_cast<void*>(ipvt));
-#else
-    free(A);
-    free(b);
-    free(ipvt);
-#endif
-}
-
 /**
 Standard LU factorization on a block with fixed size
diff --git a/LINPACK/src/host/linpack_data.hpp b/LINPACK/src/host/linpack_data.hpp
index 51324a5c..341ce0a2 100644
--- a/LINPACK/src/host/linpack_data.hpp
+++ b/LINPACK/src/host/linpack_data.hpp
@@ -117,6 +117,7 @@ class LinpackProgramSettings : public hpcc_base::BaseSettings {
  * @brief Data class containing the data the kernel is executed with
  *
  */
+template <class TContext>
 class LinpackData {
 
 public:
@@ -155,7 +156,7 @@ class LinpackData {
      * @brief The context that is used to allocate memory in SVM mode
      *
      */
-    cl::Context context;
+    TContext context;
 
     /**
      * @brief The maximum value of A that will be used for the error calculation
@@ -176,13 +177,39 @@ class LinpackData {
      * @param width width of the local matrix in values
      * @param height height of the local matrix in values
      */
-    LinpackData(cl::Context context, size_t width, size_t height);
-
-    /**
-     * @brief Destroy the Linpack Data object. Free the allocated memory
-     *
-     */
-    ~LinpackData();
+    LinpackData(TContext &context, size_t width, size_t height) : norma(0.0),
+#ifdef USE_SVM
+    context(context),
+#endif
+    matrix_width(width), matrix_height(height) {
+#ifdef USE_SVM
+        A = reinterpret_cast<HOST_DATA_TYPE*>(
+                            clSVMAlloc(context(), 0 ,
+                            width * height * sizeof(HOST_DATA_TYPE), 1024));
+        b = reinterpret_cast<HOST_DATA_TYPE*>(
+                            clSVMAlloc(context(), 0 ,
+                            width * sizeof(HOST_DATA_TYPE), 1024));
+        ipvt = reinterpret_cast<cl_int*>(
+                            clSVMAlloc(context(), 0 ,
+                            height * sizeof(cl_int), 1024));
+#else
+        posix_memalign(reinterpret_cast<void**>(&A), 4096, width * height * sizeof(HOST_DATA_TYPE));
+        posix_memalign(reinterpret_cast<void**>(&b), 4096, width * sizeof(HOST_DATA_TYPE));
+        posix_memalign(reinterpret_cast<void**>(&ipvt), 4096, height * sizeof(cl_int));
+#endif
+    }
+
+    ~LinpackData() {
+#ifdef USE_SVM
+        clSVMFree(context(), reinterpret_cast<void*>(A));
+        clSVMFree(context(), reinterpret_cast<void*>(b));
+        clSVMFree(context(), reinterpret_cast<void*>(ipvt));
+#else
+        free(A);
+        free(b);
+        free(ipvt);
+#endif
+    }
 
 };
diff --git a/LINPACK/src/host/main.cpp b/LINPACK/src/host/main.cpp
index cfd89914..51c4d292 100644
--- a/LINPACK/src/host/main.cpp
+++ b/LINPACK/src/host/main.cpp
@@ -16,7 +16,11 @@ main(int argc, char *argv[]) {
     LinpackBenchmark<cl::Device, cl::Context, cl::Program> bm(argc, argv);
 #endif
 #ifdef USE_XRT_HOST
+#ifndef USE_ACCL
     LinpackBenchmark<xrt::device, bool, xrt::uuid> bm(argc, argv);
+#else
+    LinpackBenchmark<xrt::device, fpga_setup::ACCLContext, xrt::uuid> bm(argc, argv);
+#endif
 #endif
     bool success = bm.executeBenchmark();
     if (success) {
From 0a5981efea2fc2e5cb0cdf1181a0dd60b9b7ecc9 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 29 Nov 2022 15:39:47 +0100
Subject: [PATCH 176/318] Split up parameter.h generation to reduce redundancy

---
 LINPACK/src/common/parameters.h.in        | 15 ++-------------
 PTRANS/src/common/parameters.h.in         | 18 +-----------------
 b_eff/src/common/parameters.h.in          | 18 ++----------------
cmake/general_benchmark_build_setup.cmake | 4 ++++ shared/include/base_parameters.h.in | 22 ++++++++++++++++++++++ 5 files changed, 31 insertions(+), 46 deletions(-) create mode 100644 shared/include/base_parameters.h.in diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in index a5bac5e0..5c7b0331 100644 --- a/LINPACK/src/common/parameters.h.in +++ b/LINPACK/src/common/parameters.h.in @@ -1,14 +1,11 @@ #ifndef SRC_COMMON_PARAMETERS_H_ #define SRC_COMMON_PARAMETERS_H_ +#include "base_parameters.h" + /** * Host specific parameters */ -#define VERSION "@PROJECT_VERSION@" -#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ -#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ -#define DEFAULT_DEVICE @DEFAULT_DEVICE@ -#define HOST_DATA_TYPE @HOST_DATA_TYPE@ #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@ #define DEFAULT_P_VALUE @DEFAULT_P_VALUE@ #cmakedefine _DP @@ -22,15 +19,12 @@ /** * Device specific parameters */ -#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ #define LOCAL_MEM_BLOCK_LOG @LOCAL_MEM_BLOCK_LOG@ #define REGISTER_BLOCK_LOG @REGISTER_BLOCK_LOG@ #define REGISTER_BLOCK_MM_LOG @REGISTER_BLOCK_MM_LOG@ -#define NUM_REPLICATIONS @NUM_REPLICATIONS@ #cmakedefine USE_SVM #cmakedefine DISTRIBUTED_VALIDATION -#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@ /* Short description of the program @@ -39,11 +33,6 @@ Short description of the program " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" -/** -Output separator -*/ -#define HLINE "-------------------------------------------------------------\n" - #define LEFT_BLOCK (1 << 1) #define TOP_BLOCK (1 << 2) #define LU_BLOCK_OUT (1 << 3) diff --git a/PTRANS/src/common/parameters.h.in b/PTRANS/src/common/parameters.h.in index e42792ff..9575d69e 100644 --- a/PTRANS/src/common/parameters.h.in +++ b/PTRANS/src/common/parameters.h.in @@ -1,32 +1,21 @@ #ifndef SRC_COMMON_PARAMETERS_H_ #define SRC_COMMON_PARAMETERS_H_ -#define VERSION "@PROJECT_VERSION@" +#include "base_parameters.h" #define READ_KERNEL_NAME "@READ_KERNEL_NAME@" #define WRITE_KERNEL_NAME "@WRITE_KERNEL_NAME@" -#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@ #define DEFAULT_COMM_TYPE "@DEFAULT_COMM_TYPE@" #define DEFAULT_DIST_TYPE "@DEFAULT_DIST_TYPE@" -#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ -#define DEFAULT_DEVICE @DEFAULT_DEVICE@ #define DEFAULT_P_VALUE @DEFAULT_P_VALUE@ -#define NUM_REPLICATIONS @NUM_REPLICATIONS@ -#cmakedefine HOST_EMULATION_REORDER - -#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@ - /** * Kernel Parameters */ #define BLOCK_SIZE @BLOCK_SIZE@ #define CHANNEL_WIDTH @CHANNEL_WIDTH@ -#define HOST_DATA_TYPE @HOST_DATA_TYPE@ -#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ - #cmakedefine USE_SVM #cmakedefine USE_BUFFER_WRITE_RECT_FOR_A #cmakedefine XILINX_UNROLL_INNER_LOOPS @@ -39,9 +28,4 @@ Moreover the version and build time is also compiled into the description. 
" proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" -/** -Output separator -*/ -#define HLINE "-------------------------------------------------------------\n" - #endif // SRC_COMMON_PARAMETERS_H_ \ No newline at end of file diff --git a/b_eff/src/common/parameters.h.in b/b_eff/src/common/parameters.h.in index 5c823610..3dc3e8a0 100644 --- a/b_eff/src/common/parameters.h.in +++ b/b_eff/src/common/parameters.h.in @@ -1,12 +1,10 @@ #ifndef SRC_COMMON_PARAMETERS_H_ #define SRC_COMMON_PARAMETERS_H_ -#define VERSION "@PROJECT_VERSION@" +#include "base_parameters.h" + #define SEND_KERNEL_NAME "@SEND_KERNEL_NAME@" #define RECV_KERNEL_NAME "@RECV_KERNEL_NAME@" -#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ -#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ -#define DEFAULT_DEVICE @DEFAULT_DEVICE@ #define DEFAULT_MAX_MESSAGE_SIZE @DEFAULT_MAX_MESSAGE_SIZE@ #define DEFAULT_MAX_LOOP_LENGTH @DEFAULT_MAX_LOOP_LENGTH@ #define DEFAULT_MIN_LOOP_LENGTH @DEFAULT_MIN_LOOP_LENGTH@ @@ -17,13 +15,6 @@ * Kernel Parameters */ #define CHANNEL_WIDTH @CHANNEL_WIDTH@ -#define NUM_REPLICATIONS @NUM_REPLICATIONS@ - -#define HOST_DATA_TYPE @HOST_DATA_TYPE@ -#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ - -#cmakedefine HOST_EMULATION_REORDER -#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@ /* Short description of the program. @@ -34,9 +25,4 @@ Moreover the version and build time is also compiled into the description. " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" -/** -Output separator -*/ -#define HLINE "-------------------------------------------------------------\n" - #endif // SRC_COMMON_PARAMETERS_H_ \ No newline at end of file diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 82ba4ac7..427aaab4 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -164,6 +164,10 @@ list(APPEND CMAKE_EXTRA_INCLUDE_FILES "CL/opencl.h") check_type_size("${HOST_DATA_TYPE}" DATA_TYPE_SIZE) # Configure the header file with definitions used by the host code +configure_file( + "${CMAKE_SOURCE_DIR}/../shared/include/base_parameters.h.in" + "${CMAKE_BINARY_DIR}/src/common/base_parameters.h" +) configure_file( "${CMAKE_SOURCE_DIR}/src/common/parameters.h.in" "${CMAKE_BINARY_DIR}/src/common/parameters.h" diff --git a/shared/include/base_parameters.h.in b/shared/include/base_parameters.h.in new file mode 100644 index 00000000..45a1100b --- /dev/null +++ b/shared/include/base_parameters.h.in @@ -0,0 +1,22 @@ +#ifndef BASE_PARAMETERS_H +#define BASE_PARAMETERS_H + +#define VERSION "@PROJECT_VERSION@" +#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ +#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ +#define DEFAULT_DEVICE @DEFAULT_DEVICE@ +#cmakedefine NUM_REPLICATIONS @NUM_REPLICATIONS@ +#define HOST_DATA_TYPE @HOST_DATA_TYPE@ +#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ + +#cmakedefine HOST_EMULATION_REORDER +#cmakedefine DEFAULT_ACCL_BUFFER_SIZE @DEFAULT_ACCL_BUFFER_SIZE@ +#cmakedefine DEFAULT_ACCL_BUFFER_COUNT @DEFAULT_ACCL_BUFFER_COUNT@ +#cmakedefine ACCL_STACK_TYPE "@ACCL_STACK_TYPE@" + +/** +Output separator +*/ +#define HLINE "-------------------------------------------------------------\n" + +#endif \ No newline at end of file From c46559f66bfb4dbd02d000caf2d6e5a21418519a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 29 Nov 2022 15:40:55 +0100 Subject: [PATCH 177/318] Explicitly load ACCL network stacks as dep --- cmake/accl.cmake | 7 ++++--- extern/CMakeLists.txt | 31 
+++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index 7c3d1f08..fd29f4ee 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -3,12 +3,13 @@ set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL") set(ACCL_UDP_ETH_IFS 1 CACHE STRING "Number of Ethernet interfaces to synthesize for UDP stack") set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform") -set(ACCL_BUFFER_SIZE 8192 CACHE STRING "Size of ACCL buffers in bytes") +set(DEFAULT_ACCL_BUFFER_SIZE 8192 CACHE STRING "Size of ACCL buffers in KB") +set(DEFAULT_ACCL_BUFFER_COUNT 16 CACHE STRING "Number of ACCL buffers") set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware) set(ACCL_CCLO_ADDITIONAL_BUILD_ARGS "" CACHE STRING "Add additional build arguments that will be passed to the CCLO makefile") set(ACCL_CCLO_BUILD_ARGS ${ACCL_CCLO_ADDITIONAL_BUILD_ARGS}) # UDP related definitions -set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/) +set(ACCL_VNX_DIR ${extern_accl_udp_SOURCE_DIR}) set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core) set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo) set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HBM) @@ -42,7 +43,7 @@ add_custom_target( DEPENDS ${ACCL_UDP_MAC_XOS} ${ACCL_UDP_NET_XO}) # TCP related definitions -set(ACCL_TCP_BASE_DIR ${ACCL_HARDWARE_DIR}/Vitis_with_100Gbps_TCP-IP) +set(ACCL_TCP_BASE_DIR ${extern_accl_tcp_SOURCE_DIR}) set(ACCL_TCP_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/network_krnl.xo) set(ACCL_TCP_CMAC_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/cmac_krnl.xo) if (ACCL_STACK_TYPE STREQUAL "TCP") diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 3bbf1a84..eec6a24d 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -56,6 +56,7 @@ if(NOT extern_cxxopts_POPULATED) endif() if (USE_ACCL) +message(STATUS "ACCL was selected. 
Fetch ACCL dependencies") # ------------------------------------------------------------------------------- # ACCL Library FetchContent_Declare( @@ -70,4 +71,34 @@ if(NOT extern_accl_POPULATED) FetchContent_Populate(extern_accl) set(extern_accl_SOURCE_DIR ${extern_accl_SOURCE_DIR} PARENT_SCOPE) endif() + +# ------------------------------------------------------------------------------- +# UDP Library +FetchContent_Declare( + extern_accl_udp + + GIT_REPOSITORY https://github.com/Xilinx/xup_vitis_network_example.git + GIT_TAG master) + +FetchContent_GetProperties(extern_accl_udp) +if(NOT extern_accl_udp_POPULATED) + message(STATUS "Fetching mandatory build dependency ACCL UDP stack") + FetchContent_Populate(extern_accl_udp) + set(extern_accl_udp_SOURCE_DIR ${extern_accl_udp_SOURCE_DIR} PARENT_SCOPE) +endif() + +# ------------------------------------------------------------------------------- +# TCP Library +FetchContent_Declare( + extern_accl_tcp + + GIT_REPOSITORY https://github.com/fpgasystems/Vitis_with_100Gbps_TCP-IP.git + GIT_TAG vitis_2022_1) + +FetchContent_GetProperties(extern_accl_tcp) +if(NOT extern_accl_tcp_POPULATED) + message(STATUS "Fetching mandatory build dependency ACCL TCP stack") + FetchContent_Populate(extern_accl_tcp) + set(extern_accl_tcp_SOURCE_DIR ${extern_accl_tcp_SOURCE_DIR} PARENT_SCOPE) +endif() endif() From c7172d76a5c5f92ca4f591d3a29b5d622c599851 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 29 Nov 2022 15:42:32 +0100 Subject: [PATCH 178/318] Refactor hpcc base code --- shared/CMakeLists.txt | 2 +- shared/hpcc_settings.cpp | 59 +++++++ shared/include/hpcc_benchmark.hpp | 212 ++--------------------- shared/include/hpcc_settings.hpp | 200 +++++++++++++++++++++ shared/include/setup/fpga_setup_accl.hpp | 6 +- shared/setup/fpga_setup_accl.cpp | 16 +- shared/setup/fpga_setup_xrt.cpp | 2 +- 7 files changed, 282 insertions(+), 215 deletions(-) create mode 100644 shared/hpcc_settings.cpp create mode 100644 shared/include/hpcc_settings.hpp diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 64260c94..70d8184d 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -12,7 +12,7 @@ endif() if (USE_XRT_HOST) list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp) endif() -list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) +list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp ${CMAKE_CURRENT_SOURCE_DIR}/hpcc_settings.cpp) add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES}) if (USE_ACCL) add_subdirectory(${extern_accl_SOURCE_DIR}/test/hardware/xup_vitis_network_example/xrt_host_api diff --git a/shared/hpcc_settings.cpp b/shared/hpcc_settings.cpp new file mode 100644 index 00000000..3751d10f --- /dev/null +++ b/shared/hpcc_settings.cpp @@ -0,0 +1,59 @@ +#include "hpcc_settings.hpp" + +#ifdef USE_ACCL +#include "setup/fpga_setup_accl.hpp" +#endif + + /** + * @brief Construct a new Base Settings object + * + * @param results The resulting map from parsing the program input parameters + */ +hpcc_base::BaseSettings::BaseSettings(cxxopts::ParseResult &results) : numRepetitions(results["n"].as()), +#ifdef INTEL_FPGA + useMemoryInterleaving(static_cast(results.count("i"))), +#else + useMemoryInterleaving(true), +#endif + skipValidation(static_cast(results.count("skip-validation"))), + defaultPlatform(results["platform"].as()), + defaultDevice(results["device"].as()), + kernelFileName(results["f"].as()), +#ifdef NUM_REPLICATIONS + kernelReplications(results.count("r") > 
0 ? results["r"].as() : NUM_REPLICATIONS), +#else + kernelReplications(results.count("r") > 0 ? results["r"].as() : 1), +#endif +#ifdef USE_ACCL + useAcclEmulation(static_cast(results.count("accl-emulation"))), + acclProtocol(fpga_setup::acclProtocolStringToEnum(results["accl-protocol"].as())), + acclBufferSize(results["accl-buffer-size"].as() * 1024), + acclBufferCount(results["accl-buffer-count"].as()), +#endif +#ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED + communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())), +#else + communicationType(retrieveCommunicationType("UNSUPPORTED", results["f"].as())), +#endif + testOnly(static_cast(results.count("test"))) {} + +/** + * @brief Get a map of the settings. This map will be used to print the final configuration. + * Derived classes should override it to add additional configuration options + * + * @return std::map + */ +std::map +hpcc_base::BaseSettings::getSettingsMap() { + int mpi_size = 0; +#ifdef _USE_MPI_ + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); +#endif + std::string str_mpi_ranks = "None"; + if (mpi_size > 0) { + str_mpi_ranks = std::to_string(mpi_size); + } + return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)}, + {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? "Yes" : "No"}, + {"Communication Type", commToString(communicationType)}}; +} \ No newline at end of file diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 494d18c8..6fad3147 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -45,6 +45,7 @@ SOFTWARE. #include "cxxopts.hpp" #include "parameters.h" #include "communication_types.hpp" +#include "hpcc_settings.hpp" #define STR_EXPAND(tok) #tok #define STR(tok) STR_EXPAND(tok) @@ -58,204 +59,6 @@ SOFTWARE. */ namespace hpcc_base { -/** - * @brief This class should be derived and extended for every benchmark. - * It is a pure data object containing the benchmark settings that are - * used to execute the benchmark kernel. - * - */ -class BaseSettings { - -public: - - /** - * @brief Number of times the kernel execution will be repeated - * - */ - uint numRepetitions; - - /** - * @brief Boolean showing if memory interleaving is used that is - * triggered from the host side (Intel specific) - * - */ - bool useMemoryInterleaving; - - /** - * @brief Boolean showing if the output data of the benchmark kernel - * should be validated or not - * - */ - bool skipValidation; - - /** - * @brief The default platform that should be used for execution. - * A number representing the index in the list of available platforms - * - */ - int defaultPlatform; - - /** - * @brief The default device that should be used for execution. - * A number representing the index in the list of available devices - * - */ - int defaultDevice; - - /** - * @brief Path to the kernel file that is used for execution - * - */ - std::string kernelFileName; - - /** - * @brief Number of times the kernel is replicated - * - */ - uint kernelReplications; - - /** - * @brief Only test the given configuration. 
Do not execute the benchmarks - * - */ - bool testOnly; - - /** - * @brief Type of inter-FPGA communication used - * - */ - CommunicationType communicationType; - -#ifdef USE_ACCL - /** - * @brief Use ACCL emulation constructor instead of hardware execution - */ - bool useAcclEmulation; - - /** - * @brief Used ACCL network stack - * - */ - ACCL::networkProtocol acclProtocol; -#endif - - /** - * @brief Construct a new Base Settings object - * - * @param results The resulting map from parsing the program input parameters - */ - BaseSettings(cxxopts::ParseResult &results) : numRepetitions(results["n"].as()), -#ifdef INTEL_FPGA - useMemoryInterleaving(static_cast(results.count("i"))), -#else - useMemoryInterleaving(true), -#endif - skipValidation(static_cast(results.count("skip-validation"))), - defaultPlatform(results["platform"].as()), - defaultDevice(results["device"].as()), - kernelFileName(results["f"].as()), -#ifdef NUM_REPLICATIONS - kernelReplications(results.count("r") > 0 ? results["r"].as() : NUM_REPLICATIONS), -#else - kernelReplications(results.count("r") > 0 ? results["r"].as() : 1), -#endif -#ifdef USE_ACCL - useAcclEmulation(static_cast(results.count("accl-emulation"))), - acclProtocol(fpga_setup::acclProtocolStringToEnum(results["accl-protocol"].as())), -#endif -#ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED - communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())), -#else - communicationType(retrieveCommunicationType("UNSUPPORTED", results["f"].as())), -#endif - testOnly(static_cast(results.count("test"))) {} - - /** - * @brief Get a map of the settings. This map will be used to print the final configuration. - * Derived classes should override it to add additional configuration options - * - * @return std::map - */ - virtual std::map getSettingsMap() { - int mpi_size = 0; -#ifdef _USE_MPI_ - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); -#endif - std::string str_mpi_ranks = "None"; - if (mpi_size > 0) { - str_mpi_ranks = std::to_string(mpi_size); - } - return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)}, - {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? 
"Yes" : "No"}, - {"Communication Type", commToString(communicationType)}}; - } - -}; - -/** - * @brief Settings class that is containing the program settings together with - * additional information about the OpenCL runtime - * - * @tparam TSettings The program settings class that should be used (Must derive from BaseSettings) - */ -template -class ExecutionSettings { -public: - - /** - * @brief Pointer to the additional program settings - * - */ - std::unique_ptr programSettings; - - /** - * @brief The OpenCL device that should be used for execution - * - */ - std::unique_ptr device; - - /** - * @brief The OpenCL context that should be used for execution - * - */ - std::unique_ptr context; - - /** - * @brief The OpenCL program that contains the benchmark kernel - * - */ - std::unique_ptr program; - - /** - * @brief Construct a new Execution Settings object - * - * @param programSettings_ Pointer to an existing program settings object that is derived from BaseSettings - * @param device_ Used OpenCL device - * @param context_ Used OpenCL context - * @param program_ Used OpenCL program - */ - ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_, - std::unique_ptr context_, std::unique_ptr program_ - - ): - programSettings(std::move(programSettings_)), device(std::move(device_)), - context(std::move(context_)), program(std::move(program_)) - {} - - /** - * @brief Destroy the Execution Settings object. Used to specify the order the contained objects are destroyed - * to prevent segmentation faults during exit. - * - */ - ~ExecutionSettings() { - program = nullptr; - context = nullptr; - device = nullptr; - programSettings = nullptr; - } - -}; - /** * @brief Base benchmark class. Every benchmark should be derived from this class and implement its abstract methods. * @@ -263,7 +66,8 @@ class ExecutionSettings { * @tparam TData Class used to represent the benchmark input and output data * @tparam TOutput Class representing the measurements like timings etc */ -template +template ::value>::type> class HpccFpgaBenchmark { private: @@ -400,7 +204,11 @@ class HpccFpgaBenchmark { #ifdef USE_ACCL ("accl-emulation", "Use the accl emulation instead of hardware execution") ("accl-protocol", "Specify the network protocol that should be used with ACCL.", - cxxopts::value()->default_value("UDP")) + cxxopts::value()->default_value(ACCL_STACK_TYPE)) + ("accl-buffer-size", "Specify the size of the ACCL buffers in KB", + cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_SIZE))) + ("accl-buffer-count", "Specify the number of ACCL buffers used within the benchmark", + cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_COUNT))) #endif ("skip-validation", "Skip the validation of the output data. This will speed up execution and helps when working with special data types.") ("device", "Index of the device that has to be used. 
If not given you "\ @@ -525,8 +333,8 @@ class HpccFpgaBenchmark { #endif #ifdef USE_ACCL if (programSettings->communicationType == CommunicationType::accl) { - context = std::unique_ptr(new fpga_setup::ACCLContext(fpga_setup::fpgaSetupACCL(*usedDevice, *program, programSettings->useAcclEmulation, - programSettings->acclProtocol))); + context = std::unique_ptr(new fpga_setup::ACCLContext( + fpga_setup::fpgaSetupACCL(*usedDevice, *program, *programSettings))); } else { context = std::unique_ptr(new fpga_setup::ACCLContext()); diff --git a/shared/include/hpcc_settings.hpp b/shared/include/hpcc_settings.hpp new file mode 100644 index 00000000..defa2892 --- /dev/null +++ b/shared/include/hpcc_settings.hpp @@ -0,0 +1,200 @@ +#ifndef HPCC_BASE_SETTINGS_H_ +#define HPCC_BASE_SETTINGS_H_ + +#include "cxxopts.hpp" +#include "parameters.h" +#include "communication_types.hpp" + +#ifdef _USE_MPI_ +#include "mpi.h" +#endif + +#ifdef USE_ACCL +#include "accl.hpp" +#endif + +/** + * @brief Contains all classes and functions that are used as basis + * for all benchmarks. + * + */ +namespace hpcc_base { + +/** + * @brief This class should be derived and extended for every benchmark. + * It is a pure data object containing the benchmark settings that are + * used to execute the benchmark kernel. + * + */ +class BaseSettings { + +public: + + /** + * @brief Number of times the kernel execution will be repeated + * + */ + uint numRepetitions; + + /** + * @brief Boolean showing if memory interleaving is used that is + * triggered from the host side (Intel specific) + * + */ + bool useMemoryInterleaving; + + /** + * @brief Boolean showing if the output data of the benchmark kernel + * should be validated or not + * + */ + bool skipValidation; + + /** + * @brief The default platform that should be used for execution. + * A number representing the index in the list of available platforms + * + */ + int defaultPlatform; + + /** + * @brief The default device that should be used for execution. + * A number representing the index in the list of available devices + * + */ + int defaultDevice; + + /** + * @brief Path to the kernel file that is used for execution + * + */ + std::string kernelFileName; + + /** + * @brief Number of times the kernel is replicated + * + */ + uint kernelReplications; + + /** + * @brief Only test the given configuration. Do not execute the benchmarks + * + */ + bool testOnly; + + /** + * @brief Type of inter-FPGA communication used + * + */ + CommunicationType communicationType; + +#ifdef USE_ACCL + /** + * @brief Use ACCL emulation constructor instead of hardware execution + */ + bool useAcclEmulation; + + /** + * @brief Used ACCL network stack + * + */ + ACCL::networkProtocol acclProtocol; + + /** + * @brief Size of the ACCL buffers in bytes + * + */ + uint acclBufferSize; + + /** + * @brief Number of ACCL buffers to use + * + */ + uint acclBufferCount; +#endif + + /** + * @brief Construct a new Base Settings object + * + * @param results The resulting map from parsing the program input parameters + */ + BaseSettings(cxxopts::ParseResult &results); + + /** + * @brief Get a map of the settings. This map will be used to print the final configuration. 
+ * Derived classes should override it to add additional configuration options + * + * @return std::map + */ + virtual std::map getSettingsMap(); + +}; + +/** + * @brief Settings class that is containing the program settings together with + * additional information about the OpenCL runtime + * + * @tparam TSettings The program settings class that should be used (Must derive from BaseSettings) + */ +template ::value>::type> +class ExecutionSettings { +public: + + /** + * @brief Pointer to the additional program settings + * + */ + std::unique_ptr programSettings; + + /** + * @brief The OpenCL device that should be used for execution + * + */ + std::unique_ptr device; + + /** + * @brief The OpenCL context that should be used for execution + * + */ + std::unique_ptr context; + + /** + * @brief The OpenCL program that contains the benchmark kernel + * + */ + std::unique_ptr program; + + /** + * @brief Construct a new Execution Settings object + * + * @param programSettings_ Pointer to an existing program settings object that is derived from BaseSettings + * @param device_ Used OpenCL device + * @param context_ Used OpenCL context + * @param program_ Used OpenCL program + */ + ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_, + std::unique_ptr context_, std::unique_ptr program_ + + ): + programSettings(std::move(programSettings_)), device(std::move(device_)), + context(std::move(context_)), program(std::move(program_)) + {} + + /** + * @brief Destroy the Execution Settings object. Used to specify the order the contained objects are destroyed + * to prevent segmentation faults during exit. + * + */ + ~ExecutionSettings() { + program = nullptr; + context = nullptr; + device = nullptr; + programSettings = nullptr; + } + +}; + +} + +#endif \ No newline at end of file diff --git a/shared/include/setup/fpga_setup_accl.hpp b/shared/include/setup/fpga_setup_accl.hpp index ff493ccc..0f451ced 100644 --- a/shared/include/setup/fpga_setup_accl.hpp +++ b/shared/include/setup/fpga_setup_accl.hpp @@ -33,6 +33,7 @@ SOFTWARE. /* External libraries */ #include "accl.hpp" #include "xrt/xrt_device.h" +#include "hpcc_settings.hpp" namespace fpga_setup { @@ -64,12 +65,11 @@ Sets up the given FPGA with the kernel in the provided file. 
@param device The device used for the program
@param program The program used to find the ACCL kernels for hardware execution
-@param useAcclEmulation Construct an ACCL emulation instance instead of hardware execution
+@param programSettings Pass current program settings to configure ACCL according to user specification

@return The ACCL instance used for communication
*/
ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program,
-                          bool useAcclEmulation,
-                          ACCL::networkProtocol protocol);
+                          hpcc_base::BaseSettings &programSettings);

}  // namespace fpga_setup
#endif  // SRC_HOST_FPGA_SETUP_H_
diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index ed84ea08..36561553 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -82,7 +82,7 @@ void configure_tcp(ACCL::BaseBuffer &tx_buf_network, ACCL::BaseBuffer &rx_buf_ne
 }
 
 ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program,
-                          bool useAcclEmulation, ACCL::networkProtocol protocol) {
+                          hpcc_base::BaseSettings &programSettings) {
   int current_rank;
   MPI_Comm_rank(MPI_COMM_WORLD, &current_rank);
 
@@ -92,19 +92,19 @@ ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program,
   std::vector<ACCL::rank_t> ranks = {};
   for (int i = 0; i < current_size; ++i) {
     // TODO: Replace the IP addresses and ports here for execution on real hardware
-    ACCL::rank_t new_rank = {"10.10.10." + std::to_string(i), 5500 + i, i, ACCL_BUFFER_SIZE};
+    ACCL::rank_t new_rank = {"10.10.10." + std::to_string(i), 6000 + i, i, programSettings.acclBufferSize};
     ranks.emplace_back(new_rank);
   }
   ACCLContext accl;
-  if (!useAcclEmulation) {
+  if (!programSettings.useAcclEmulation) {
     std::cout << "Create cclo ip" << std::endl;
     auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}");
     std::cout << "Create hostctrl" << std::endl;
     auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}",
                                    xrt::kernel::cu_access_mode::exclusive);
-    if (protocol == ACCL::networkProtocol::UDP) {
+    if (programSettings.acclProtocol == ACCL::networkProtocol::UDP) {
      std::cout << "Create CMAC" << std::endl;
      auto cmac = CMAC(xrt::ip(device, program, "cmac_0:{cmac_0}"));
      std::cout << "Create Network Layer" << std::endl;
      auto network_layer = Networklayer(
          xrt::ip(device, program, "networklayer:{networklayer_0}"));
      std::cout << "Configure VNX" << std::endl;
      configure_vnx(cmac, network_layer, ranks, current_rank);
     }
-    if (protocol == ACCL::networkProtocol::TCP) {
+    if (programSettings.acclProtocol == ACCL::networkProtocol::TCP) {
      auto network_krnl = xrt::kernel(device, program, "network_krnl:{network_krnl_0}",
                                      xrt::kernel::cu_access_mode::exclusive);
      accl.tx_buf_network = std::unique_ptr<ACCL::BaseBuffer>(new ACCL::FPGABuffer<int8_t>(
@@ -125,14 +125,14 @@ ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program,
     std::vector<int> mem(1, 0);
     std::cout << "Create ACCL" << std::endl;
     accl.accl = std::unique_ptr<ACCL::ACCL>(
-        new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, protocol, 16, ACCL_BUFFER_SIZE));
+        new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize));
   } else {
     // TODO: Add start port here. Currently hardcoded!
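    // Sketch of the emulator wiring assumed here: 6000 is the base port handed
    // to the emulated CCLO, so rank i is expected to listen on port 6000 + i,
    // which is why the rank_t entries above use the same 6000 + i scheme.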
accl.accl = std::unique_ptr<ACCL::ACCL>(
-        new ACCL::ACCL(ranks, current_rank, 6000, device, protocol, 16, ACCL_BUFFER_SIZE));
+        new ACCL::ACCL(ranks, current_rank, 6000, device, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize));
   }
 
-  if (protocol == ACCL::networkProtocol::TCP) {
+  if (programSettings.acclProtocol == ACCL::networkProtocol::TCP) {
     MPI_Barrier(MPI_COMM_WORLD);
     accl.accl->open_port();
     MPI_Barrier(MPI_COMM_WORLD);
diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp
index f5d7ef32..1b41f9e0 100644
--- a/shared/setup/fpga_setup_xrt.cpp
+++ b/shared/setup/fpga_setup_xrt.cpp
@@ -42,7 +42,7 @@ namespace fpga_setup {
         } else {
             //TODO Use xrt::system::enumerate_devices() in "experimental/xrt_system.h" for future XRT versions
            // instead of hardcoded number of devices.
-            current_device = current_device % 3;
+            current_device = (current_device + 1) % 3;
         }
         return std::unique_ptr<xrt::device>(new xrt::device(current_device));
     }
From 43c4f78edc76d84a597b8b876f95497cd8b69ec3 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 29 Nov 2022 18:30:35 +0100
Subject: [PATCH 179/318] Fix base tests to compile with xrt

---
 shared/tests/hpcc_base_benchmark_test.cpp | 167 ++++++++++++++--------
 1 file changed, 108 insertions(+), 59 deletions(-)

diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp
index a93a2a69..42a49a70 100644
--- a/shared/tests/hpcc_base_benchmark_test.cpp
+++ b/shared/tests/hpcc_base_benchmark_test.cpp
@@ -16,7 +16,8 @@
 // and enable the included tests
 void use_hpcc_base_lib() {}
 
-class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int> {
+template <class T>
+class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, typename std::tuple_element<0, T>::type, typename std::tuple_element<1, T>::type, typename std::tuple_element<2, T>::type, int, int> {
 
 protected:
 
@@ -47,12 +48,12 @@ class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark
 
-    MinimalBenchmark() : HpccFpgaBenchmark(0, { nullptr}) {}
+    MinimalBenchmark() : hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, typename std::tuple_element<0, T>::type, typename std::tuple_element<1, T>::type, typename std::tuple_element<2, T>::type, int, int>(0, { nullptr}) {}
 
 };
 
-class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int> {
+template <class TDevice, class TContext, class TProgram>
+class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, TDevice, TContext, TProgram, int, int> {
 
 protected:
 
@@ -102,29 +103,66 @@ class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark
         if (forceSetupFail) {
             return false;
         } else {
-            return hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int>::checkInputParameters();
+            return hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, TDevice, TContext, TProgram, int, int>::checkInputParameters();
         }
     }
 
-    SuccessBenchmark() : HpccFpgaBenchmark(0, { nullptr}) {}
+    SuccessBenchmark() : hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, TDevice, TContext, TProgram, int, int>(0, { nullptr}) {}
 
 };
 
+template <class T>
 class BaseHpccBenchmarkTest :public ::testing::Test {
+using TDevice = typename std::tuple_element<0,T>::type;
+using TContext = typename std::tuple_element<1,T>::type;
+using TProgram = typename std::tuple_element<2,T>::type;
+
 public:
-    std::unique_ptr<SuccessBenchmark> bm;
+    std::unique_ptr<SuccessBenchmark<TDevice, TContext, TProgram>> bm;
 
     BaseHpccBenchmarkTest() {
-        bm = std::unique_ptr<SuccessBenchmark>(new SuccessBenchmark());
+        bm = std::unique_ptr<SuccessBenchmark<TDevice, TContext, TProgram>>(new SuccessBenchmark<TDevice, TContext, TProgram>());
        bm->setupBenchmark(global_argc, global_argv);
    }
};

-
-TEST_F(BaseHpccBenchmarkTest, SetupSucceedsForBenchmarkTest) {
-    bool success = bm->setupBenchmark(global_argc, global_argv);
+template <class T>
+class SetupTest : public ::testing::Test {};
+
+#ifdef USE_OCL_HOST
+typedef ::testing::Types<std::tuple<cl::Device, cl::Context, cl::Program>> cl_types;
+TYPED_TEST_SUITE(
+    BaseHpccBenchmarkTest,
+    cl_types);
+TYPED_TEST_SUITE(
+    SetupTest,
+    cl_types);
+#endif
+#ifdef USE_XRT_HOST
+#ifndef USE_ACCL
+typedef ::testing::Types<std::tuple<xrt::device, bool, xrt::uuid>> xrt_types;
+TYPED_TEST_SUITE(
+    BaseHpccBenchmarkTest,
+    xrt_types);
+TYPED_TEST_SUITE(
+    SetupTest,
+    xrt_types);
+#else
+typedef ::testing::Types<std::tuple<xrt::device, fpga_setup::ACCLContext, xrt::uuid>>
accl_types; +TYPED_TEST_SUITE( + BaseHpccBenchmarkTest, + accl_types); +TYPED_TEST_SUITE( + SetupTest, + accl_types); +#endif +#endif + + +TYPED_TEST(BaseHpccBenchmarkTest, SetupSucceedsForBenchmarkTest) { + bool success = this->bm->setupBenchmark(global_argc, global_argv); EXPECT_TRUE(success); } @@ -132,97 +170,108 @@ TEST_F(BaseHpccBenchmarkTest, SetupSucceedsForBenchmarkTest) { /** * Checks if the testing flag works as expected */ -TEST_F(BaseHpccBenchmarkTest, AllExecutedWhenNotTestOnly) { - bm->getExecutionSettings().programSettings->testOnly = false; - bm->executeBenchmark(); - EXPECT_EQ(bm->validateOutputcalled, 1); - EXPECT_EQ(bm->executeKernelcalled, 1); - EXPECT_EQ(bm->generateInputDatacalled, 1); +TYPED_TEST(BaseHpccBenchmarkTest, AllExecutedWhenNotTestOnly) { + this->bm->getExecutionSettings().programSettings->testOnly = false; + this->bm->executeBenchmark(); + EXPECT_EQ(this->bm->validateOutputcalled, 1); + EXPECT_EQ(this->bm->executeKernelcalled, 1); + EXPECT_EQ(this->bm->generateInputDatacalled, 1); } -TEST_F(BaseHpccBenchmarkTest, NothingExecutedWhenTestOnly) { - bm->getExecutionSettings().programSettings->testOnly = true; - bm->executeBenchmark(); - EXPECT_EQ(bm->validateOutputcalled, 0); - EXPECT_EQ(bm->executeKernelcalled, 0); - EXPECT_EQ(bm->generateInputDatacalled, 0); +TYPED_TEST(BaseHpccBenchmarkTest, NothingExecutedWhenTestOnly) { + this->bm->getExecutionSettings().programSettings->testOnly = true; + this->bm->executeBenchmark(); + EXPECT_EQ(this->bm->validateOutputcalled, 0); + EXPECT_EQ(this->bm->executeKernelcalled, 0); + EXPECT_EQ(this->bm->generateInputDatacalled, 0); } -TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenNotTestOnly) { - bm->getExecutionSettings().programSettings->testOnly = false; - EXPECT_TRUE(bm->executeBenchmark()); +TYPED_TEST(BaseHpccBenchmarkTest, ExecutionSuccessWhenNotTestOnly) { + this->bm->getExecutionSettings().programSettings->testOnly = false; + EXPECT_TRUE(this->bm->executeBenchmark()); } -TEST_F(BaseHpccBenchmarkTest, ExecutionFailsWhenTestOnlyAndSetupFails) { - bm->getExecutionSettings().programSettings->testOnly = true; - bm->forceSetupFail = true; - bm->setupBenchmark(global_argc, global_argv); - EXPECT_FALSE(bm->executeBenchmark()); +TYPED_TEST(BaseHpccBenchmarkTest, ExecutionFailsWhenTestOnlyAndSetupFails) { + this->bm->getExecutionSettings().programSettings->testOnly = true; + this->bm->forceSetupFail = true; + this->bm->setupBenchmark(global_argc, global_argv); + EXPECT_FALSE(this->bm->executeBenchmark()); } -TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenTestOnlyAndSetupSuccess) { - bm->getExecutionSettings().programSettings->testOnly = true; - EXPECT_TRUE(bm->executeBenchmark()); +TYPED_TEST(BaseHpccBenchmarkTest, ExecutionSuccessWhenTestOnlyAndSetupSuccess) { + this->bm->getExecutionSettings().programSettings->testOnly = true; + EXPECT_TRUE(this->bm->executeBenchmark()); } /** - * Checks if using default platform and device is successful + * Checks if non existing device leads to an error */ -TEST_F(BaseHpccBenchmarkTest, SuccessUseDefaultPlatform) { - EXPECT_NE(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, bm->getExecutionSettings().programSettings->defaultDevice).get(), nullptr); +TYPED_TEST(BaseHpccBenchmarkTest, FindNonExistingDevice) { +#ifdef USE_OCL_HOST + ASSERT_THROW(fpga_setup::selectFPGADevice(this->bm->getExecutionSettings().programSettings->defaultPlatform, 100).get(), fpga_setup::FpgaSetupException); +#else + 
ASSERT_THROW(fpga_setup::selectFPGADevice(100).get(), fpga_setup::FpgaSetupException);
+#endif
 }
 
 /**
- * Checks if non-existing platform leads to an error
+ * Checks if using default platform and device is successful
 */
-TEST_F(BaseHpccBenchmarkTest, SuccessUseDefaultPlatform) {
-    EXPECT_NE(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, bm->getExecutionSettings().programSettings->defaultDevice).get(), nullptr);
+TYPED_TEST(BaseHpccBenchmarkTest, SuccessUseDefaultPlatformAndDevice) {
+#ifdef USE_OCL_HOST
+    EXPECT_NE(fpga_setup::selectFPGADevice(this->bm->getExecutionSettings().programSettings->defaultPlatform, this->bm->getExecutionSettings().programSettings->defaultDevice).get(), nullptr);
+#else
+    EXPECT_NE(fpga_setup::selectFPGADevice(this->bm->getExecutionSettings().programSettings->defaultDevice).get(), nullptr);
+#endif
 }
 
+#ifdef USE_OCL_HOST
 /**
 * Checks if non-existing platform leads to an error
 */
-TEST_F(BaseHpccBenchmarkTest, FindNonExistingPlatform) {
-    ASSERT_THROW(fpga_setup::selectFPGADevice(100, bm->getExecutionSettings().programSettings->defaultDevice).get(), fpga_setup::FpgaSetupException);
+TYPED_TEST(BaseHpccBenchmarkTest, FindNonExistingPlatform) {
+    ASSERT_THROW(fpga_setup::selectFPGADevice(100, this->bm->getExecutionSettings().programSettings->defaultDevice).get(), fpga_setup::FpgaSetupException);
 }
+#endif
 
 /**
 * Execute kernel and validation is successful
 */
-TEST_F(BaseHpccBenchmarkTest, SuccessfulExeAndVal) {
-    EXPECT_TRUE(bm->executeBenchmark());
+TYPED_TEST(BaseHpccBenchmarkTest, SuccessfulExeAndVal) {
+    EXPECT_TRUE(this->bm->executeBenchmark());
 }
 
 /**
 * Execute kernel succeeds, but validation fails
 */
-TEST_F(BaseHpccBenchmarkTest, SuccessfulExeFailedVal) {
-    bm->returnValidate = false;
-    EXPECT_FALSE(bm->executeBenchmark());
+TYPED_TEST(BaseHpccBenchmarkTest, SuccessfulExeFailedVal) {
+    this->bm->returnValidate = false;
+    EXPECT_FALSE(this->bm->executeBenchmark());
 }
 
 /**
 * Execute kernel fails
 */
-TEST_F(BaseHpccBenchmarkTest, FailedExe) {
-    bm->returnExecuteKernel = false;
-    EXPECT_FALSE(bm->executeBenchmark());
+TYPED_TEST(BaseHpccBenchmarkTest, FailedExe) {
+    this->bm->returnExecuteKernel = false;
+    EXPECT_FALSE(this->bm->executeBenchmark());
 }
 
 /**
 * Benchmark Setup is successful with default data
 */
-TEST(SetupTest, BenchmarkSetupIsSuccessful) {
-    std::unique_ptr<MinimalBenchmark> bm = std::unique_ptr<MinimalBenchmark>(new MinimalBenchmark());
+TYPED_TEST(SetupTest, BenchmarkSetupIsSuccessful) {
+    std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>());
     EXPECT_TRUE(bm->setupBenchmark(global_argc, global_argv));
 }
 
 /**
 * Benchmark Setup fails because of failing configuration check
 */
-TEST(SetupTest, BenchmarkConfigurationFailsSetup) {
-    std::unique_ptr<MinimalBenchmark> bm = std::unique_ptr<MinimalBenchmark>(new MinimalBenchmark());
+TYPED_TEST(SetupTest, BenchmarkConfigurationFailsSetup) {
+    std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>());
     bm->configurationCheckSucceeds = false;
     EXPECT_FALSE(bm->setupBenchmark(global_argc, global_argv));
 }
 
 /**
 * Benchmark Execution fails if configuration check failed
 */
-TEST(SetupTest, BenchmarkConfigurationFailsExecution) {
-    std::unique_ptr<MinimalBenchmark> bm = std::unique_ptr<MinimalBenchmark>(new MinimalBenchmark());
+TYPED_TEST(SetupTest, BenchmarkConfigurationFailsExecution) {
+    std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>());
     bm->configurationCheckSucceeds = false;
     bm->setupBenchmark(global_argc, global_argv);
     EXPECT_FALSE(bm->executeBenchmark());
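The conversion in these hunks follows one recipe throughout: every TEST_F/TEST
becomes a TYPED_TEST over a tuple of device, context, and program types, fixture
members are reached through this-> because the fixture is now a template, and
TypeParam names the active tuple. A condensed, self-contained sketch of the
pattern (ExampleTest, ExampleTriple, and example_types are illustrative names,
not part of the patch; plain types stand in for the device/context/program triple):

    #include <tuple>
    #include "gtest/gtest.h"

    // stand-ins for a (device, context, program) triple such as
    // cl::Device, cl::Context, cl::Program
    using ExampleTriple = std::tuple<int, float, double>;

    template <class T>
    class ExampleTest : public ::testing::Test {
    public:
        // unpack one slot of the tuple type parameter
        using TDevice = typename std::tuple_element<0, T>::type;
    };

    typedef ::testing::Types<ExampleTriple> example_types;
    TYPED_TEST_SUITE(ExampleTest, example_types);

    TYPED_TEST(ExampleTest, RunsOncePerTriple) {
        // TypeParam is the tuple currently under test
        typename std::tuple_element<0, TypeParam>::type device{};
        (void)device;
    }
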
@@ -240,8 +289,8 @@ TEST(SetupTest, BenchmarkConfigurationFailsExecution) {
 /**
 * Benchmark Setup fails with empty data
 */
-TEST(SetupTest, BenchmarkSetupFails) {
-    std::unique_ptr<MinimalBenchmark> bm = std::unique_ptr<MinimalBenchmark>(new MinimalBenchmark());
+TYPED_TEST(SetupTest, BenchmarkSetupFails) {
+    std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>());
     char** tmp_argv = new char*[2];
     char* name_str = new char[5];
     strcpy(name_str, "name");
From b4a1c610d212a89d1646afb5d5d94779d7102258 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 30 Nov 2022 14:10:23 +0100
Subject: [PATCH 180/318] Fix test build for network kernel

---
 cmake/kernelTargets.cmake   |  4 ++--
 cmake/unitTestTargets.cmake | 11 ++++++++---
 shared/tests/CMakeLists.txt | 11 +++++++++++
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake
index 4b8adee3..50da445f 100644
--- a/cmake/kernelTargets.cmake
+++ b/cmake/kernelTargets.cmake
@@ -117,13 +117,13 @@ function(generate_kernel_targets_xilinx)
             set(kernel_name_flag -k ${CMAKE_MATCH_1})
         endif()
         add_custom_command(OUTPUT ${kernel}
-                COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -DKERNEL_${CMAKE_MATCH_1} -DEMULATE -t sw_emu ${kernel_name_flag} ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} -f ${FPGA_BOARD_NAME} -g -c ${XILINX_COMPILE_FLAGS} -o ${kernel} ${source_f}
+                COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -DKERNEL_${CMAKE_MATCH_1} -DEMULATE -t sw_emu ${kernel_name_flag} ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} -f ${FPGA_BOARD_NAME} -c ${XILINX_COMPILE_FLAGS} -o ${kernel} ${source_f}
                 MAIN_DEPENDENCY ${source_f}
                 DEPENDS ${XILINX_COMPILE_SETTINGS_FILE}
                 )
     endforeach()
     add_custom_command(OUTPUT ${bitstream_emulate_f}
-            COMMAND ${Vitis_COMPILER} ${local_CL_FLAGS} ${VPP_FLAGS} -DEMULATE -t sw_emu ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} -f ${FPGA_BOARD_NAME} -g -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_emulate_f} ${bitstream_compile_emulate}
+            COMMAND ${Vitis_COMPILER} ${local_CL_FLAGS} ${VPP_FLAGS} -DEMULATE -t sw_emu ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} -f ${FPGA_BOARD_NAME} -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_emulate_f} ${bitstream_compile_emulate}
            DEPENDS ${bitstream_compile_emulate}
            DEPENDS ${xilinx_link_settings}
            )
diff --git a/cmake/unitTestTargets.cmake b/cmake/unitTestTargets.cmake
index 776269e7..263d4033 100644
--- a/cmake/unitTestTargets.cmake
+++ b/cmake/unitTestTargets.cmake
@@ -21,14 +21,19 @@ endif()
 
 if (Vitis_FOUND)
     include_directories(SYSTEM ${Vitis_INCLUDE_DIRS})
+    if (USE_ACCL)
+        set(CMAKE_SKIP_BUILD_RPATH No)
+        set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes)
+        list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib)
+    endif()
    add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES})
    target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}")
    target_link_libraries(${HOST_EXE_NAME}_test_xilinx hpcc_fpga_base_test)
-    if (NOT "${kernel_emulation_targets_xilinx}" STREQUAL "")
-        add_dependencies(${HOST_EXE_NAME}_test_xilinx ${kernel_emulation_targets_xilinx})
-    endif()
    target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA)
    target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")
+    if (USE_ACCL)
+        target_link_libraries(${HOST_EXE_NAME}_test_xilinx zmqpp)
+    endif()
    foreach (kernel_target ${kernel_emulation_targets_xilinx})
        string(REPLACE "_xilinx" ".xclbin" kernel_name ${kernel_target})
        add_test(NAME test_unit_${kernel_target} COMMAND $<TARGET_FILE:${HOST_EXE_NAME}_test_xilinx> -f ${kernel_name} ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
diff --git a/shared/tests/CMakeLists.txt b/shared/tests/CMakeLists.txt
index a4ea0a4d..5d4c441b 100644
--- a/shared/tests/CMakeLists.txt
+++ b/shared/tests/CMakeLists.txt
@@ -1,6 +1,14 @@
 set(HPCC_BASE_TEST_SOURCES main.cpp
                 hpcc_base_benchmark_test.cpp)
 
+if (USE_ACCL)
+    set(ACCL_EMULATOR_DIR ${CMAKE_BINARY_DIR}/lib/accl-emulator CACHE STRING "Directory of ACCL emulator")
+    add_subdirectory(${extern_accl_SOURCE_DIR}/test/model/emulator ${ACCL_EMULATOR_DIR})
+    if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+        set(ACCL_DEBUG Yes)
+    endif()
+endif()
+
 add_library(hpcc_fpga_base_test STATIC ${HPCC_BASE_TEST_SOURCES})
 target_link_libraries(hpcc_fpga_base_test gtest gmock hpcc_fpga_base)
 target_include_directories(hpcc_fpga_base_test PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
@@ -11,6 +19,9 @@ if (INTELFPGAOPENCL_FOUND)
 elseif(Vitis_FOUND)
     target_include_directories(hpcc_fpga_base_test PUBLIC ${Vitis_INCLUDE_DIRS})
     target_link_libraries(hpcc_fpga_base_test ${Vitis_LIBRARIES})
+if (USE_ACCL)
+    add_dependencies(hpcc_fpga_base_test cclo_emu)
+endif()
 else()
     message(ERROR "No OpenCL header found on system!")
 endif()
From 401465ea7e5361888206a72c8e7f72392fe0f0d9 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 1 Dec 2022 16:56:06 +0100
Subject: [PATCH 181/318] Fix performance issue with XRT PCIe HPL host

---
 LINPACK/src/host/execution_types/execution_xrt_pcie.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
index aa0484e1..0269a8e3 100644
--- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
@@ -413,11 +413,13 @@ std::unique_ptr<linpack::LinpackExecutionTimings> calculate(
     for (auto &run : outer_mms) {
       run.wait();
     }
+
+#ifndef NDEBUG
+    // Wait for inner MMs in this communication round to keep
+    // in sync with the prints
     for (auto &run : inner_mms) {
       run.wait();
     }
-
-#ifndef NDEBUG
     MPI_Barrier(MPI_COMM_WORLD);
     if (is_calulating_lu_block) std::cout << "---------------" << std::endl;
From bc5f38e958dfc74cbb56535e823933f3371a707a Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Tue, 22 Mar 2022 11:08:49 +0100
Subject: [PATCH 182/318] add generator2.py v0.1

---
 STREAM/src/device/stream_kernels_single.cl | 17 +++----
 cmake/general_benchmark_build_setup.cmake  |  2 +-
 scripts/code_generator/generator2.py       | 55 ++++++++++++++++++++++
 3 files changed, 62 insertions(+), 12 deletions(-)
 create mode 100644 scripts/code_generator/generator2.py

diff --git a/STREAM/src/device/stream_kernels_single.cl b/STREAM/src/device/stream_kernels_single.cl
index 678d4fc1..6d421b94 100644
--- a/STREAM/src/device/stream_kernels_single.cl
+++ b/STREAM/src/device/stream_kernels_single.cl
@@ -15,19 +15,14 @@ KERNEL_NUMBER will be replaced by the build script with the ID of the current re
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
 
-/* PY_CODE_GEN
-try:
-    kernel_param_attributes = generate_attributes(num_replications)
-except:
-    kernel_param_attributes = ["" for i in range(num_replications)]
-*/
+{% set kernel_param_attributes = generate_attributes(num_replications) %}
 
-// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
+{% for i in range(num_replications) %}
 
 __kernel
 __attribute__((uses_global_work_offset(0)))
-void calc_/*PY_CODE_GEN i*/(__global
/*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_ARRAY_DATA_TYPE *restrict in1, - __global /*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_ARRAY_DATA_TYPE *restrict in2, - __global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_ARRAY_DATA_TYPE *restrict out, +void calc_{{ i }}(__global {{ kernel_param_attributes[i] }} const DEVICE_ARRAY_DATA_TYPE *restrict in1, + __global {{ kernel_param_attributes[i] }} const DEVICE_ARRAY_DATA_TYPE *restrict in2, + __global {{ kernel_param_attributes[i] }} DEVICE_ARRAY_DATA_TYPE *restrict out, const DEVICE_SCALAR_DATA_TYPE scalar, const uint array_size, const uint operation_type) { @@ -126,4 +121,4 @@ void calc_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 64aa8d0a..0a3f8d66 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -214,7 +214,7 @@ if (INTELFPGAOPENCL_FOUND) separate_arguments(AOC_FLAGS) endif() -set(CODE_GENERATOR "${CMAKE_SOURCE_DIR}/../scripts/code_generator/generator.py" CACHE FILEPATH "Path to the code generator executable") +set(CODE_GENERATOR "${CMAKE_SOURCE_DIR}/../scripts/code_generator/generator2.py" CACHE FILEPATH "Path to the code generator executable") set(CUSTOM_KERNEL_FOLDER ${CMAKE_SOURCE_DIR}/src/device/custom/) diff --git a/scripts/code_generator/generator2.py b/scripts/code_generator/generator2.py new file mode 100644 index 00000000..09a0c142 --- /dev/null +++ b/scripts/code_generator/generator2.py @@ -0,0 +1,55 @@ +import argparse +import sys +from jinja2 import Environment, PackageLoader, BaseLoader, TemplateNotFound, select_autoescape +from os.path import join, exists, getmtime + +parser = argparse.ArgumentParser(description='Preprocessor for code replication and advanced code modification.') +parser.add_argument('file', metavar='CODE_FILE', type=str, + help='Path to the file that is used as input') +parser.add_argument("-o", dest="output_file", default=None, help="Path to the output file. If not given, output will printed to stdout.") +parser.add_argument("-p", dest="params", default=[], action="append", help="Python statement that is parsed before modifying the files. 
Can be used to define global variables.") + +# create a simple loader to load templates from the file system +class SimpleLoader(BaseLoader): + def __init__(self, path): + self.path = path + + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path) as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) + +env = Environment( + loader=SimpleLoader("./"), + autoescape=select_autoescape() +) + +if __name__ == '__main__': + args = parser.parse_args() + if not args.file: + print('no input file given') + exit(1) + if not args.output_file: + print('no output file given') + exit(1) + for p in args.params: + print("Parse statement: %s" % p) + exec(p, globals()) + + template = env.get_template(args.file) + + try: + template.globals.update({"generate_attributes": generate_attributes}) + except: + generate_attributes = lambda r : ["" for i in range(r)] + template.globals.update({"generate_attributes": generate_attributes}) + + if num_replications is None: + num_replications = 1 + + with open(args.output_file, 'w') as f: + f.write(template.render(num_replications=num_replications)) \ No newline at end of file From 9b703894f809b1321ef9b06b974bd6a0f7618ba5 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 23 Mar 2022 13:30:47 +0100 Subject: [PATCH 183/318] use new generator for all benchmarks --- .../settings.gen.intel.fft1d_float_8.hbm.py | 2 +- .../settings.gen.intel.fft1d_float_8.svm.py | 2 +- FFT/src/device/fft1d_float_8.cl | 83 ++++--- .../settings.gen.intel.gemm_base.520n_mx.py | 2 +- .../settings.gen.intel.gemm_base.hbm.py | 2 +- GEMM/src/device/gemm_base.cl | 29 ++- LINPACK/src/device/hpl_torus_IEC.cl | 6 +- LINPACK/src/device/hpl_torus_PCIE.cl | 6 +- PTRANS/src/device/transpose_DIAG_IEC.cl | 24 +-- PTRANS/src/device/transpose_DIAG_PCIE.cl | 6 +- PTRANS/src/device/transpose_PQ_IEC.cl | 24 +-- PTRANS/src/device/transpose_PQ_PCIE.cl | 20 +- PTRANS/src/device/transpose_c2_DIAG_IEC.cl | 32 +-- .../device/random_access_kernels_single.cl | 15 +- STREAM/src/device/stream_kernels.cl | 12 +- b_eff/src/device/communication_bw520n_IEC.cl | 40 ++-- cmake/general_benchmark_build_setup.cmake | 2 +- scripts/code_generator/README.md | 4 +- scripts/code_generator/generator.py | 202 +++++------------- scripts/code_generator/generator2.py | 55 ----- 20 files changed, 201 insertions(+), 367 deletions(-) mode change 100755 => 100644 scripts/code_generator/generator.py delete mode 100644 scripts/code_generator/generator2.py diff --git a/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py b/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py index b4775387..c72f4081 100644 --- a/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py +++ b/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py @@ -1,7 +1,7 @@ global_memory_name = "HBM" -def generate_attributes(num_replications, num_global_memory_banks=32): +def generate_bi_map_attributes(num_replications, num_global_memory_banks=32): """ Generates the kernel attributes for the global memory. They specify in which global memory the buffer is located. 
The buffers will be placed using a
diff --git a/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py b/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py
index 86e3cc3a..2cb14bde 100644
--- a/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py
+++ b/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py
@@ -1,5 +1,5 @@

-def generate_attributes(num_replications, num_global_memory_banks=32):
+def generate_bi_map_attributes(num_replications, num_global_memory_banks=32):
     """
     Generates the kernel attributes for the global memory.
     They specify in which global memory the buffer is located.
     The buffers will be placed using a
diff --git a/FFT/src/device/fft1d_float_8.cl b/FFT/src/device/fft1d_float_8.cl
index 69da1432..9cfe70c7 100644
--- a/FFT/src/device/fft1d_float_8.cl
+++ b/FFT/src/device/fft1d_float_8.cl
@@ -51,12 +51,7 @@
 // code generation expects an array of maps of size num_replications with the keys "in" and "out".
 // The value of the keys have to be strings containing the attributes that
 // have to be assigned to input and output buffers in global memory
-/* PY_CODE_GEN
-try:
-    kernel_param_attributes = generate_attributes(num_replications)
-except:
-    kernel_param_attributes = [{"in": "", "out": ""} for i in range(num_replications)]
-*/
+{% set kernel_param_attributes = generate_bi_map_attributes(num_replications) %}

 #define min(a,b) (a<b?a:b)
> LOGPOINTS)][(current_index + shift) & (POINTS - 1)];
         }
 #ifdef XILINX_FPGA
-      buf2x8.i0 = write_chunk[0];
-      buf2x8.i1 = write_chunk[1];
-      buf2x8.i2 = write_chunk[2];
-      buf2x8.i3 = write_chunk[3];
-      buf2x8.i4 = write_chunk[4];
+        buf2x8.i0 = write_chunk[0];
+        buf2x8.i1 = write_chunk[1];
+        buf2x8.i2 = write_chunk[2];
+        buf2x8.i3 = write_chunk[3];
+        buf2x8.i4 = write_chunk[4];
         buf2x8.i5 = write_chunk[5];
         buf2x8.i6 = write_chunk[6];
         buf2x8.i7 = write_chunk[7];

         // Start in the second iteration to forward the buffered data over the pipe
-        write_pipe_block(chanin/*PY_CODE_GEN i*/, &buf2x8);
+        write_pipe_block(chanin{{ i }}, &buf2x8);
 #endif
 #ifdef INTEL_FPGA
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[0], write_chunk[0]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[1], write_chunk[1]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[2], write_chunk[2]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[3], write_chunk[3]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[4], write_chunk[4]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[5], write_chunk[5]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[6], write_chunk[6]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[7], write_chunk[7]);
+        write_channel_intel(chanin{{ i }}[0], write_chunk[0]);
+        write_channel_intel(chanin{{ i }}[1], write_chunk[1]);
+        write_channel_intel(chanin{{ i }}[2], write_chunk[2]);
+        write_channel_intel(chanin{{ i }}[3], write_chunk[3]);
+        write_channel_intel(chanin{{ i }}[4], write_chunk[4]);
+        write_channel_intel(chanin{{ i }}[5], write_chunk[5]);
+        write_channel_intel(chanin{{ i }}[6], write_chunk[6]);
+        write_channel_intel(chanin{{ i }}[7], write_chunk[7]);
 #endif
         }
     }

@@ -193,10 +188,10 @@ void fetch/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["i
 __attribute__ ((max_global_work_dim(0)))
 __attribute__((reqd_work_group_size(1,1,1)))
-kernel void fft1d/*PY_CODE_GEN i*/(
+kernel void fft1d{{ i }}(
 #ifdef INTEL_FPGA
     // Intel does not need a store kernel and directly writes back the result to global memory
-              __global /*PY_CODE_GEN kernel_param_attributes[i]["out"]*/ float2 * restrict dest,
+              __global {{ kernel_param_attributes[i]["out"] }} float2 * restrict dest,
 #endif
int count, int inverse) { @@ -235,17 +230,17 @@ kernel void fft1d/*PY_CODE_GEN i*/( // Perform memory transfers only when reading data in range if (i < count * (N / POINTS)) { #ifdef INTEL_FPGA - data.i0 = read_channel_intel(chanin/*PY_CODE_GEN i*/[0]); - data.i1 = read_channel_intel(chanin/*PY_CODE_GEN i*/[1]); - data.i2 = read_channel_intel(chanin/*PY_CODE_GEN i*/[2]); - data.i3 = read_channel_intel(chanin/*PY_CODE_GEN i*/[3]); - data.i4 = read_channel_intel(chanin/*PY_CODE_GEN i*/[4]); - data.i5 = read_channel_intel(chanin/*PY_CODE_GEN i*/[5]); - data.i6 = read_channel_intel(chanin/*PY_CODE_GEN i*/[6]); - data.i7 = read_channel_intel(chanin/*PY_CODE_GEN i*/[7]); + data.i0 = read_channel_intel(chanin{{ i }}[0]); + data.i1 = read_channel_intel(chanin{{ i }}[1]); + data.i2 = read_channel_intel(chanin{{ i }}[2]); + data.i3 = read_channel_intel(chanin{{ i }}[3]); + data.i4 = read_channel_intel(chanin{{ i }}[4]); + data.i5 = read_channel_intel(chanin{{ i }}[5]); + data.i6 = read_channel_intel(chanin{{ i }}[6]); + data.i7 = read_channel_intel(chanin{{ i }}[7]); #endif #ifdef XILINX_FPGA - read_pipe_block(chanin/*PY_CODE_GEN i*/, &data); + read_pipe_block(chanin{{ i }}, &data); #endif } else { data.i0 = data.i1 = data.i2 = data.i3 = @@ -274,7 +269,7 @@ kernel void fft1d/*PY_CODE_GEN i*/( #endif #ifdef XILINX_FPGA // For Xilinx send the data to the store kernel to enable memory bursts - write_pipe_block(chanout/*PY_CODE_GEN i*/, &data); + write_pipe_block(chanout{{ i }}, &data); #endif } } @@ -287,14 +282,14 @@ This kernel works without conditional branches which enables memory bursts. */ __kernel __attribute__ ((max_global_work_dim(0), reqd_work_group_size(1,1,1))) -void store/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["out"]*/ float2 * restrict dest, int iter) { +void store{{ i }}(__global {{ kernel_param_attributes[i]["out"] }} float2 * restrict dest, int iter) { const int N = (1 << LOGN); // write the data back to global memory using memory bursts for(unsigned k = 0; k < iter * (N / POINTS); k++){ float2x8 buf2x8; - read_pipe_block(chanout/*PY_CODE_GEN i*/, &buf2x8); + read_pipe_block(chanout{{ i }}, &buf2x8); dest[(k << LOGPOINTS)] = buf2x8.i0; dest[(k << LOGPOINTS) + 1] = buf2x8.i1; @@ -308,4 +303,4 @@ void store/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["o } #endif -//PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/GEMM/settings/settings.gen.intel.gemm_base.520n_mx.py b/GEMM/settings/settings.gen.intel.gemm_base.520n_mx.py index ba180d5c..023500c0 100644 --- a/GEMM/settings/settings.gen.intel.gemm_base.520n_mx.py +++ b/GEMM/settings/settings.gen.intel.gemm_base.520n_mx.py @@ -1,7 +1,7 @@ global_memory_name = "HBM" -def generate_attributes(num_replications, num_global_memory_banks=32): +def generate_map_attributes(num_replications, num_global_memory_banks=32): """ Generates the kernel attributes for the global memory. They specify in which global memory the buffer is located. The buffers will be placed using a diff --git a/GEMM/settings/settings.gen.intel.gemm_base.hbm.py b/GEMM/settings/settings.gen.intel.gemm_base.hbm.py index ab88f63a..4b3f1813 100644 --- a/GEMM/settings/settings.gen.intel.gemm_base.hbm.py +++ b/GEMM/settings/settings.gen.intel.gemm_base.hbm.py @@ -1,7 +1,7 @@ global_memory_name = "HBM" -def generate_attributes(num_replications, num_global_memory_banks=32): +def generate_map_attributes(num_replications, num_global_memory_banks=32): """ Generates the kernel attributes for the global memory. 
They specify in which global memory the buffer is located. The buffers will be placed using a diff --git a/GEMM/src/device/gemm_base.cl b/GEMM/src/device/gemm_base.cl index 3599e6cd..6511a221 100644 --- a/GEMM/src/device/gemm_base.cl +++ b/GEMM/src/device/gemm_base.cl @@ -33,12 +33,7 @@ SOFTWARE. // code generation expects an array of maps of size num_replications with the keys a,b,c,out. // The value of the keys have to be strings containing the attributes that // have to be assigned to input and output buffers in global memory -/* PY_CODE_GEN -try: - kernel_param_attributes = generate_attributes(num_replications) -except: - kernel_param_attributes = [{"a": "", "b": "", "c": "", "out": ""} for i in range(num_replications)] -*/ +{% set kernel_param_attributes = generate_map_attributes(num_replications) %} /** Calculate for the Level 2 block: @@ -260,7 +255,7 @@ to BRAM. // Here we use the total replications. This will also create three kernels for the Xilinx compiler because they all // use different hard-coded ranges in the outer loop -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** Two level blocked GEMM kernel @@ -277,21 +272,21 @@ calculates C_OUT = alpha * A.dot(B) + beta * C */ __attribute__((uses_global_work_offset(0))) __kernel -void gemm/*PY_CODE_GEN i*/( +void gemm{{ i }}( #ifdef ENABLE_MIXED_PRECISION // In mixed precision convert the values accordingly // from single precision to the target precision on the FPGA - __global /*PY_CODE_GEN kernel_param_attributes[i]["a"]*/ const float* restrict a, - __global /*PY_CODE_GEN kernel_param_attributes[i]["b"]*/ const float* restrict b, - __global /*PY_CODE_GEN kernel_param_attributes[i]["c"]*/ const float* restrict c, - __global /*PY_CODE_GEN kernel_param_attributes[i]["out"]*/ float* restrict c_out, + __global {{ kernel_param_attributes[i]["a"] }} const float* restrict a, + __global {{ kernel_param_attributes[i]["b"] }} const float* restrict b, + __global {{ kernel_param_attributes[i]["c"] }} const float* restrict c, + __global {{ kernel_param_attributes[i]["out"] }} float* restrict c_out, const float alpha, const float beta, #else - __global /*PY_CODE_GEN kernel_param_attributes[i]["a"]*/ const DEVICE_DATA_TYPE* restrict a, - __global /*PY_CODE_GEN kernel_param_attributes[i]["b"]*/ const DEVICE_DATA_TYPE* restrict b, - __global /*PY_CODE_GEN kernel_param_attributes[i]["c"]*/ const DEVICE_DATA_TYPE* restrict c, - __global /*PY_CODE_GEN kernel_param_attributes[i]["out"]*/ DEVICE_DATA_TYPE* restrict c_out, + __global {{ kernel_param_attributes[i]["a"] }} const DEVICE_DATA_TYPE* restrict a, + __global {{ kernel_param_attributes[i]["b"] }} const DEVICE_DATA_TYPE* restrict b, + __global {{ kernel_param_attributes[i]["c"] }} const DEVICE_DATA_TYPE* restrict c, + __global {{ kernel_param_attributes[i]["out"] }} DEVICE_DATA_TYPE* restrict c_out, const DEVICE_DATA_TYPE alpha, const DEVICE_DATA_TYPE beta, #endif @@ -445,4 +440,4 @@ __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) } } -// PY_CODE_GEN block_end +{% endfor %} diff --git a/LINPACK/src/device/hpl_torus_IEC.cl b/LINPACK/src/device/hpl_torus_IEC.cl index fc3d0257..7e8f57ea 100644 --- a/LINPACK/src/device/hpl_torus_IEC.cl +++ b/LINPACK/src/device/hpl_torus_IEC.cl @@ -839,7 +839,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a, } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** Update the 
inner blocks using the left and right column and rows @@ -847,7 +847,7 @@ Update the inner blocks using the left and right column and rows */ __attribute__((uses_global_work_offset(0))) __kernel -void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, +void inner_update_mm{{ i }}(__global DEVICE_DATA_TYPE* restrict a, __global DEVICE_DATA_TYPE* restrict left_global_buffer, __global DEVICE_DATA_TYPE* restrict top_global_buffer, const uint block_col, @@ -945,4 +945,4 @@ void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, } } -// PY_CODE_GEN block_end +{% endfor %} diff --git a/LINPACK/src/device/hpl_torus_PCIE.cl b/LINPACK/src/device/hpl_torus_PCIE.cl index 2b3d312d..2b86657d 100644 --- a/LINPACK/src/device/hpl_torus_PCIE.cl +++ b/LINPACK/src/device/hpl_torus_PCIE.cl @@ -708,7 +708,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a, } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** Update the inner blocks using the left and right column and rows @@ -716,7 +716,7 @@ Update the inner blocks using the left and right column and rows */ __attribute__((uses_global_work_offset(0))) __kernel -void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, +void inner_update_mm{{ i }}(__global DEVICE_DATA_TYPE* restrict a, __global DEVICE_DATA_TYPE* restrict left_global_buffer, __global DEVICE_DATA_TYPE* restrict top_global_buffer, const uint block_col, @@ -862,4 +862,4 @@ void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/PTRANS/src/device/transpose_DIAG_IEC.cl b/PTRANS/src/device/transpose_DIAG_IEC.cl index 513b39e8..94077736 100644 --- a/PTRANS/src/device/transpose_DIAG_IEC.cl +++ b/PTRANS/src/device/transpose_DIAG_IEC.cl @@ -16,11 +16,11 @@ typedef struct { DEVICE_DATA_TYPE data[CHANNEL_WIDTH]; } ch_data; -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(i) + "\""*/), depth(1))); -channel ch_data chan_a_in/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2 * (i // 2) + ((i + 1) % 2)) + "\""*/), depth(1))); -// PY_CODE_GEN block_end +channel ch_data chan_a_out{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(i) }}), depth(1))); +channel ch_data chan_a_in{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2 * (i // 2) + ((i + 1) % 2)) }}), depth(1))); +{% endfor %} #endif /** @@ -64,7 +64,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} /** * send a chunk of A into local memory in a reordered fashion @@ -77,7 +77,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) * */ void -send_chunk_of_a/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], +send_chunk_of_a{{ i }}(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], const ulong row, const ulong col) { @@ -104,7 +104,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) data.data[unroll_count] = rotate_out[(unroll_count + rot_out) & 
(CHANNEL_WIDTH - 1)]; } - write_channel_intel(chan_a_out/*PY_CODE_GEN i*/, data); + write_channel_intel(chan_a_out{{ i }}, data); } /** @@ -121,7 +121,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, +void transpose_read{{ i }}(__global DEVICE_DATA_TYPE *restrict A, const ulong block_offset, const ulong number_of_blocks) { @@ -139,7 +139,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, load_chunk_of_a(A, a_block[block & 1], block, row, col); } if (block > 0) { - send_chunk_of_a/*PY_CODE_GEN i*/(a_block[(block - 1) & 1], row, col); + send_chunk_of_a{{ i }}(a_block[(block - 1) & 1], row, col); } } } @@ -162,7 +162,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, +void transpose_write{{ i }}(__global DEVICE_DATA_TYPE *restrict B, __global DEVICE_DATA_TYPE *restrict A_out, const ulong block_offset, const ulong number_of_blocks) { @@ -173,7 +173,7 @@ void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, for (ulong row = 0; row < BLOCK_SIZE; row++) { for (ulong col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { - ch_data data = read_channel_intel(chan_a_in/*PY_CODE_GEN i*/); + ch_data data = read_channel_intel(chan_a_in{{ i }}); unsigned rot_out = row & (CHANNEL_WIDTH - 1); // rotate temporary buffer to store data into local buffer @@ -188,4 +188,4 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_end +{% endfor %} diff --git a/PTRANS/src/device/transpose_DIAG_PCIE.cl b/PTRANS/src/device/transpose_DIAG_PCIE.cl index 614800f3..b443803d 100644 --- a/PTRANS/src/device/transpose_DIAG_PCIE.cl +++ b/PTRANS/src/device/transpose_DIAG_PCIE.cl @@ -127,7 +127,7 @@ store_a(__global DEVICE_DATA_TYPE *restrict A_out, } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** * Read blocks of matrix A and transpose them in memory. 
@@ -144,7 +144,7 @@ store_a(__global DEVICE_DATA_TYPE *restrict A_out, */ __attribute__((max_global_work_dim(0))) __kernel -void transpose/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, +void transpose{{ i }}(__global DEVICE_DATA_TYPE *restrict A, __global DEVICE_DATA_TYPE *restrict B, __global DEVICE_DATA_TYPE *restrict A_out, const uint number_of_blocks) { @@ -172,4 +172,4 @@ void transpose/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, } } -// PY_CODE_GEN block_end +{% endfor %} diff --git a/PTRANS/src/device/transpose_PQ_IEC.cl b/PTRANS/src/device/transpose_PQ_IEC.cl index e219ae1c..5bce8ab7 100644 --- a/PTRANS/src/device/transpose_PQ_IEC.cl +++ b/PTRANS/src/device/transpose_PQ_IEC.cl @@ -16,11 +16,11 @@ typedef struct { DEVICE_DATA_TYPE data[CHANNEL_WIDTH]; } ch_data; -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(i) + "\""*/), depth(1))); -channel ch_data chan_a_in/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2 * (i // 2) + ((i + 1) % 2)) + "\""*/), depth(1))); -// PY_CODE_GEN block_end +channel ch_data chan_a_out{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(i) }}), depth(1))); +channel ch_data chan_a_in{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2 * (i // 2) + ((i + 1) % 2)) }}), depth(1))); +{% endfor %} #endif /** @@ -69,7 +69,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} /** * send a chunk of A into local memory in a reordered fashion @@ -82,7 +82,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) * */ void -send_chunk_of_a/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], +send_chunk_of_a{{ i }}(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], const ulong row, const ulong col) { @@ -109,7 +109,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) data.data[unroll_count] = rotate_out[(unroll_count + rot_out) & (CHANNEL_WIDTH - 1)]; } - write_channel_intel(chan_a_out/*PY_CODE_GEN i*/, data); + write_channel_intel(chan_a_out{{ i }}, data); } /** @@ -126,7 +126,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, +void transpose_read{{ i }}(__global DEVICE_DATA_TYPE *restrict A, const ulong offset, const ulong width_in_blocks, const ulong height_in_blocks, @@ -148,7 +148,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, load_chunk_of_a(A, a_block[block & 1], block_row, block_col, width_in_blocks, row, col); } if (block > offset) { - send_chunk_of_a/*PY_CODE_GEN i*/(a_block[(block - 1) & 1], row, col); + send_chunk_of_a{{ i }}(a_block[(block - 1) & 1], row, col); } } } @@ -171,7 +171,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, +void transpose_write{{ i }}(__global DEVICE_DATA_TYPE *restrict B, __global DEVICE_DATA_TYPE *restrict A_out, const ulong offset, const ulong 
width_in_blocks, @@ -183,7 +183,7 @@ void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, for (ulong row = 0; row < BLOCK_SIZE; row++) { for (ulong col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { - ch_data data = read_channel_intel(chan_a_in/*PY_CODE_GEN i*/); + ch_data data = read_channel_intel(chan_a_in{{ i }}); ulong block_col = block % width_in_blocks; ulong block_row = block / width_in_blocks; @@ -202,4 +202,4 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cl b/PTRANS/src/device/transpose_PQ_PCIE.cl index 161fcb88..caa20143 100644 --- a/PTRANS/src/device/transpose_PQ_PCIE.cl +++ b/PTRANS/src/device/transpose_PQ_PCIE.cl @@ -8,14 +8,9 @@ #include "parameters.h" -/* PY_CODE_GEN -try: - kernel_param_attributes = generate_attributes(num_replications) -except: - kernel_param_attributes = ["" for i in range(num_replications)] -*/ +{% set kernel_param_attributes = generate_attributes(num_replications) %} -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** * Read blocks of matrix A and transpose them in memory. @@ -37,11 +32,10 @@ except: */ __attribute__((max_global_work_dim(0))) __kernel -void transpose/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE *restrict A, - __global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE *restrict B, - __global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE *restrict A_out, - const uint offset_a, - const uint offset_b, +void transpose{{ i }}(__global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict A, + __global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict B, + __global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict A_out, + const uint offset, const uint number_of_blocks, const uint width_in_blocks, const uint height_in_blocks) { @@ -190,4 +184,4 @@ void transpose/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/PTRANS/src/device/transpose_c2_DIAG_IEC.cl b/PTRANS/src/device/transpose_c2_DIAG_IEC.cl index dfad9f87..cf2455e7 100644 --- a/PTRANS/src/device/transpose_c2_DIAG_IEC.cl +++ b/PTRANS/src/device/transpose_c2_DIAG_IEC.cl @@ -16,13 +16,13 @@ typedef struct { DEVICE_DATA_TYPE data[CHANNEL_WIDTH/2]; } ch_data; -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out1/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(2*i) + "\""*/), depth(1))); -channel ch_data chan_a_out2/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(2*i + 1) + "\""*/), depth(1))); -channel ch_data chan_a_in1/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2*i + 1) + "\""*/), depth(1))); -channel ch_data chan_a_in2/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2*i) + "\""*/), depth(1))); -// PY_CODE_GEN block_end +channel ch_data chan_a_out1{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(2*i) }}), depth(1))); +channel ch_data chan_a_out2{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(2*i + 1) }}), depth(1))); +channel ch_data chan_a_in1{{ i }} __attribute((io({{ 
"\"kernel_input_ch{}\"".format(2*i + 1) }}), depth(1))); +channel ch_data chan_a_in2{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2*i) }}), depth(1))); +{% endfor %} #endif /** @@ -65,7 +65,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} /** * send a chunk of A into local memory in a reordered fashion @@ -78,7 +78,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) * */ void -send_chunk_of_a/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], +send_chunk_of_a{{ i }}(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], const ulong row, const ulong col) { @@ -111,7 +111,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH/2; unroll_count++) { data1.data[unroll_count] = channel_data[unroll_count]; } - write_channel_intel(chan_a_out1/*PY_CODE_GEN i*/, data1); + write_channel_intel(chan_a_out1{{ i }}, data1); ch_data data2; // rotate temporary buffer to store data into local buffer @@ -119,7 +119,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH/2; unroll_count++) { data2.data[unroll_count] = channel_data[CHANNEL_WIDTH/2 + unroll_count]; } - write_channel_intel(chan_a_out2/*PY_CODE_GEN i*/, data2); + write_channel_intel(chan_a_out2{{ i }}, data2); } /** @@ -136,7 +136,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, +void transpose_read{{ i }}(__global DEVICE_DATA_TYPE *restrict A, const ulong block_offset, const ulong number_of_blocks) { @@ -154,7 +154,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, load_chunk_of_a(A, a_block[block & 1], block, row, col); } if (block > 0) { - send_chunk_of_a/*PY_CODE_GEN i*/(a_block[(block - 1) & 1], row, col); + send_chunk_of_a{{ i }}(a_block[(block - 1) & 1], row, col); } } } @@ -177,7 +177,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, +void transpose_write{{ i }}(__global DEVICE_DATA_TYPE *restrict B, __global DEVICE_DATA_TYPE *restrict A_out, const ulong block_offset, const ulong number_of_blocks) { @@ -190,13 +190,13 @@ void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, DEVICE_DATA_TYPE channel_data[CHANNEL_WIDTH]; - ch_data data1 = read_channel_intel(chan_a_in1/*PY_CODE_GEN i*/); + ch_data data1 = read_channel_intel(chan_a_in1{{ i }}); __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH/2; unroll_count++) { channel_data[unroll_count] = data1.data[unroll_count]; } - ch_data data2 = read_channel_intel(chan_a_in2/*PY_CODE_GEN i*/); + ch_data data2 = read_channel_intel(chan_a_in2{{ i }}); __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH/2; unroll_count++) { channel_data[CHANNEL_WIDTH/2 + unroll_count] = data2.data[unroll_count]; @@ -217,4 +217,4 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff 
--git a/RandomAccess/src/device/random_access_kernels_single.cl b/RandomAccess/src/device/random_access_kernels_single.cl index f7c59260..16637065 100644 --- a/RandomAccess/src/device/random_access_kernels_single.cl +++ b/RandomAccess/src/device/random_access_kernels_single.cl @@ -34,14 +34,9 @@ Constant used to update the pseudo random number #define BLOCK_SIZE_LOG GLOBAL_MEM_UNROLL_LOG #define BLOCK_SIZE (1 << BLOCK_SIZE_LOG) -/* PY_CODE_GEN -try: - kernel_param_attributes = generate_attributes(num_replications) -except: - kernel_param_attributes = ["" for i in range(num_replications)] -*/ +{% set kernel_param_attributes = generate_attributes(num_replications) %} -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /* Kernel, that will update the given data array accoring to a predefined pseudo- @@ -56,8 +51,8 @@ to the kernel. */ __attribute__((max_global_work_dim(0),uses_global_work_offset(0))) __kernel -void accessMemory_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE_UNSIGNED volatile * restrict data, - __constant /*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_DATA_TYPE_UNSIGNED * restrict random_init, +void accessMemory_{{ i }}(__global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE_UNSIGNED volatile * restrict data, + __constant {{ kernel_param_attributes[i] }} const DEVICE_DATA_TYPE_UNSIGNED * restrict random_init, const DEVICE_DATA_TYPE_UNSIGNED m, const DEVICE_DATA_TYPE_UNSIGNED data_chunk, const uint num_cache_operations, @@ -190,4 +185,4 @@ void accessMemory_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attribut } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/STREAM/src/device/stream_kernels.cl b/STREAM/src/device/stream_kernels.cl index cd569727..c8a99e2b 100644 --- a/STREAM/src/device/stream_kernels.cl +++ b/STREAM/src/device/stream_kernels.cl @@ -6,11 +6,11 @@ KERNEL_NUMBER will be replaced by the build script with the ID of the current re */ #include "parameters.h" -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} __kernel __attribute__((uses_global_work_offset(0))) -void copy_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, +void copy_{{ i }}(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const uint array_size) { uint number_elements = array_size / VECTOR_COUNT; @@ -22,7 +22,7 @@ void copy_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __kernel __attribute__((uses_global_work_offset(0))) -void add_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, +void add_{{ i }}(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, __global const DEVICE_ARRAY_DATA_TYPE * restrict in2, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const uint array_size) { @@ -35,7 +35,7 @@ void add_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, __kernel __attribute__((uses_global_work_offset(0))) -void scale_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, +void scale_{{ i }}(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const DEVICE_SCALAR_DATA_TYPE scalar, const uint array_size) { @@ -48,7 +48,7 @@ void scale_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in __kernel 
__attribute__((uses_global_work_offset(0))) -void triad_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, +void triad_{{ i }}(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, __global const DEVICE_ARRAY_DATA_TYPE * restrict in2, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const DEVICE_SCALAR_DATA_TYPE scalar, @@ -60,4 +60,4 @@ void triad_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/b_eff/src/device/communication_bw520n_IEC.cl b/b_eff/src/device/communication_bw520n_IEC.cl index ce128d8c..e3d61b74 100644 --- a/b_eff/src/device/communication_bw520n_IEC.cl +++ b/b_eff/src/device/communication_bw520n_IEC.cl @@ -49,17 +49,17 @@ typedef struct { /** * Definition of the external channels */ - // PY_CODE_GEN block_start [replace(local_variables=locals()) for r in range(num_replications)] -channel message_part ch_out_/*PY_CODE_GEN 2*r+1*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(r % 4) + "\""*/))); -channel message_part ch_out_/*PY_CODE_GEN 2*r+2*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str((r + 2) % 4) + "\""*/))); -channel message_part ch_in_/*PY_CODE_GEN 2*r+1*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(r % 4) + "\""*/))); -channel message_part ch_in_/*PY_CODE_GEN 2*r+2*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str((r + 2) % 4) + "\""*/))); -channel message_part ch_exchange/*PY_CODE_GEN 2*r+1*/; -channel message_part ch_exchange/*PY_CODE_GEN 2*r+2*/; -// PY_CODE_GEN block_end +{% for i in range(num_replications) %} +channel message_part ch_out_{{ 2*i+1 }} __attribute((io({{ "\"kernel_output_ch{}\"".format(i % 4) }}))); +channel message_part ch_out_{{ 2*i+2 }} __attribute((io({{ "\"kernel_output_ch{}\"".format((i + 2) % 4) }}))); +channel message_part ch_in_{{ 2*i+1 }} __attribute((io({{ "\"kernel_input_ch{}\"".format(i % 4) }}))); +channel message_part ch_in_{{ 2*i+2 }} __attribute((io({{ "\"kernel_input_ch{}\"".format((i + 2) % 4) }}))); +channel message_part ch_exchange{{ 2*i+1 }}; +channel message_part ch_exchange{{ 2*i+2 }}; +{% endfor %} -// PY_CODE_GEN block_start [replace(local_variables=locals()) for r in range(num_replications)] +{% for i in range(num_replications) %} /** * Send kernel that will send messages through two channels * @@ -68,7 +68,7 @@ channel message_part ch_exchange/*PY_CODE_GEN 2*r+2*/; */ __kernel __attribute__ ((max_global_work_dim(0))) -void send/*PY_CODE_GEN r*/(const unsigned data_size, +void send{{ i }}(const unsigned data_size, const unsigned repetitions) { const unsigned send_iterations = ((1 << data_size) + 2 * ITEMS_PER_CHANNEL - 1) / (2 * ITEMS_PER_CHANNEL); message_part send_part1; @@ -85,13 +85,13 @@ void send/*PY_CODE_GEN r*/(const unsigned data_size, for (unsigned i=0; i < repetitions; i++) { // Send a single message sent over two channels split into multiple chunks for (unsigned k=0; k < send_iterations; k++) { - write_channel_intel(ch_out_/*PY_CODE_GEN 2*r+1*/, send_part1); - write_channel_intel(ch_out_/*PY_CODE_GEN 2*r+2*/, send_part2); + write_channel_intel(ch_out_{{ 2*i+1 }}, send_part1); + write_channel_intel(ch_out_{{ 2*i+2 }}, send_part2); } #ifndef EMULATE // Introduce data dependency between loop iterations to prevent coalescing of loop - send_part1 = read_channel_intel(ch_exchange/*PY_CODE_GEN 2*r+1*/); - send_part2 = read_channel_intel(ch_exchange/*PY_CODE_GEN 2*r+2*/); + send_part1 = read_channel_intel(ch_exchange{{ 2*i+1 }}); + send_part2 = 
read_channel_intel(ch_exchange{{ 2*i+2 }});
 #endif
     }
 }

@@ -106,7 +106,7 @@ void send/*PY_CODE_GEN r*/(const unsigned data_size,
  */
 __kernel
 __attribute__ ((max_global_work_dim(0)))
-void recv/*PY_CODE_GEN r*/(__global DEVICE_DATA_TYPE* validation_buffer,
+void recv{{ i }}(__global DEVICE_DATA_TYPE* validation_buffer,
               const unsigned data_size,
               const unsigned repetitions) {
     const unsigned send_iterations = ((1 << data_size) + 2 * ITEMS_PER_CHANNEL - 1) / (2 * ITEMS_PER_CHANNEL);
@@ -117,14 +117,14 @@ void recv/*PY_CODE_GEN r*/(__global DEVICE_DATA_TYPE* validation_buffer,
     for (unsigned i=0; i < repetitions; i++) {
         // Receive a single message sent over two channels split into multiple chunks
        for (unsigned k=0; k < send_iterations; k++) {
-            recv_part1 = read_channel_intel(ch_in_/*PY_CODE_GEN 2*r+1*/);
-            recv_part2 = read_channel_intel(ch_in_/*PY_CODE_GEN 2*r+2*/);
+            recv_part1 = read_channel_intel(ch_in_{{ 2*i+1 }});
+            recv_part2 = read_channel_intel(ch_in_{{ 2*i+2 }});
         }
 #ifndef EMULATE
         // Introduce data dependency between loop iterations to prevent coalescing of loop
         // by sending the data to the send kernel
-        write_channel_intel(ch_exchange/*PY_CODE_GEN 2*r+1*/, recv_part1);
-        write_channel_intel(ch_exchange/*PY_CODE_GEN 2*r+2*/, recv_part2);
+        write_channel_intel(ch_exchange{{ 2*i+1 }}, recv_part1);
+        write_channel_intel(ch_exchange{{ 2*i+2 }}, recv_part2);
 #endif
     }

@@ -139,4 +139,4 @@ void recv/*PY_CODE_GEN r*/(__global DEVICE_DATA_TYPE* validation_buffer,
     }
 }

-//PY_CODE_GEN block_end
+{% endfor %}
diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake
index 0a3f8d66..64aa8d0a 100644
--- a/cmake/general_benchmark_build_setup.cmake
+++ b/cmake/general_benchmark_build_setup.cmake
@@ -214,7 +214,7 @@ if (INTELFPGAOPENCL_FOUND)
     separate_arguments(AOC_FLAGS)
 endif()

-set(CODE_GENERATOR "${CMAKE_SOURCE_DIR}/../scripts/code_generator/generator2.py" CACHE FILEPATH "Path to the code generator executable")
+set(CODE_GENERATOR "${CMAKE_SOURCE_DIR}/../scripts/code_generator/generator.py" CACHE FILEPATH "Path to the code generator executable")
diff --git a/scripts/code_generator/README.md b/scripts/code_generator/README.md
index 20730682..2847e7ac 100644
--- a/scripts/code_generator/README.md
+++ b/scripts/code_generator/README.md
@@ -81,7 +81,7 @@ As an example the dynamic construction of a switch statement:

     switch(i) {
     // PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(replicate)]
-    case /*PY_CODE_GEN i*/: return /*PY_CODE_GEN i+1*/; break;
+    case {{ i }}: return /*PY_CODE_GEN i+1*/; break;
     // PY_CODE_GEN block_end
     }

@@ -94,7 +94,7 @@ would result in:
     case 3: return 4; break;
     }

-Note, that the variables that have to be replaced are written in inline comments `/*PY_CODE_GEN i*/`.
+Note that the variables that have to be replaced are now written as template expressions like `{{ i }}`.
 The given statement will be evaluated and the comment will be replaced by the result. Thus, it is
 also possible to call functions or do arithmetic. 
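For context on the rewrite below: the old generator evaluated PY_CODE_GEN comment pragmas with eval/exec, while the new generator hands the whole file to Jinja2 and only injects a few globals before rendering. A minimal sketch of that rendering model, assuming only that the jinja2 package is installed; the template text and kernel name are illustrative and not taken from the repository:

    from jinja2 import Template

    # Hypothetical replicated-kernel template in the same style as the .cl files above.
    kernel_source = Template("""
    {% for i in range(num_replications) %}
    __kernel void copy_{{ i }}(__global const float *restrict in,
                               __global float *restrict out) { /* ... */ }
    {% endfor %}
    """)

    # Rendering with num_replications=2 emits copy_0 and copy_1, the same effect
    # the PY_CODE_GEN block_start/block_end pragmas used to produce via exec().
    print(kernel_source.render(num_replications=2))
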
diff --git a/scripts/code_generator/generator.py b/scripts/code_generator/generator.py old mode 100755 new mode 100644 index 7b27ee93..b0452121 --- a/scripts/code_generator/generator.py +++ b/scripts/code_generator/generator.py @@ -1,49 +1,33 @@ -#!/usr/bin/env python3 -# -# Copyright (c) 2019 Marius Meyer -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal in -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -# of the Software, and to permit persons to whom the Software is furnished to do -# so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -## - import argparse -import itertools import sys import logging -import re - - -comment_symbol = "//" -ml_comment_symbol_start = "/*" -ml_comment_symbol_end = "*/" -pycodegen_cmd = "PY_CODE_GEN" -pragma_cmd = comment_symbol +"\\s*"+ pycodegen_cmd +from jinja2 import Environment, PackageLoader, BaseLoader, TemplateNotFound, select_autoescape +from os.path import join, exists, getmtime parser = argparse.ArgumentParser(description='Preprocessor for code replication and advanced code modification.') parser.add_argument('file', metavar='CODE_FILE', type=str, help='Path to the file that is used as input') parser.add_argument("-o", dest="output_file", default=None, help="Path to the output file. If not given, output will printed to stdout.") -parser.add_argument("--comment", dest="comment_symbol", default=comment_symbol, help="Symbols that are used to comment out lines in the target language. Default='%s'" % comment_symbol) -parser.add_argument("--comment-ml-start", dest="comment_symbol_ml_start", default=ml_comment_symbol_start, help="Symbols that are used to start a multi line comment in the target language. Default='%s'" % ml_comment_symbol_start) -parser.add_argument("--comment-ml-end", dest="comment_symbol_ml_end", default=ml_comment_symbol_end, help="Symbols that are used to end a multi line comment in the target language. Default='%s'" % ml_comment_symbol_end) parser.add_argument("-p", dest="params", default=[], action="append", help="Python statement that is parsed before modifying the files. 
Can be used to define global variables.") -CODE = "" +# create a simple loader to load templates from the file system +class SimpleLoader(BaseLoader): + def __init__(self, path): + self.path = path + + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path) as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) + +env = Environment( + loader=SimpleLoader("./"), + autoescape=select_autoescape() +) def use_file(file_name): """ @@ -67,124 +51,50 @@ def use_file(file_name): print("Error while parsing external file. See logs for more information.",file=sys.stderr) exit(1) +if __name__ == '__main__': + args = parser.parse_args() -def replace(code_block=None, local_variables=None): - """ - Evaluate or execute inline code and replace the code with the result. - - @param code_block The input code block that will be parsed and modified - @param local_variables A dictionary containing local variables that should also be considered (like locals()) - - @return the modified code - """ - global CODE - if not code_block: - code_block = CODE - if local_variables is not None: - variables = {**globals(), **local_variables} + if args.output_file: + log_file_name = args.output_file + ".log" else: - variables = globals() - matches = itertools.chain(re.finditer("%s\\s*%s\\s+(?P(.|\n)+?)%s" % (ml_comment_symbol_start, pycodegen_cmd, ml_comment_symbol_end), code_block, flags=0), - re.finditer("%s\\s+(?!block_start\\s+)(?!block_end\\s+)(?P(.)+?)\n" % (pragma_cmd), code_block, flags=0)) - for res_ml in matches: - res_ml_code = res_ml.group(0) - try: - evaluated = str(eval(res_ml.groupdict()["code"], variables)) - code_block = code_block.replace(res_ml_code, evaluated) - logging.debug("Evaluated '%s' to '%s'" % (res_ml.groupdict()["code"], evaluated)) - continue - except Exception as e: - logging.debug("Failed to evaluate inline code") - try: - exec(res_ml.groupdict()["code"], globals()) - code_block = code_block.replace(res_ml_code, "") - logging.debug("Executed in global space: '%s'" % res_ml.groupdict()["code"]) - except Exception as e: - logging.warning("Could not execute inline code:\n\tCommand: '''\n%s\n'''\n\tError: %s" % (res_ml.groupdict()["code"], e)) - return code_block - - -def modify_block(code_block, cmd_str, out): - global CODE - CODE = code_block - if cmd_str == "": - cmd_str = "None" - try: - mod_code = eval(cmd_str, {**globals(), **locals()}) - except Exception as e: - logging.error("Block: %s \n %s" % (code_block, e)) - logging.error("Global variables: %s" % globals()) - print( "Block: %s \n %s" % (code_block, e),file=sys.stderr) - exit(1) - if type(mod_code) is list: - mod_code = "".join(mod_code) - elif mod_code is None: - mod_code = "" - elif type(mod_code) is not str: - logging.warning("%s is not a string. Automatic convert to string!" 
% mod_code)
-        mod_code = str(mod_code)
-    return mod_code
-    #logging.debug("Start parsing of modified sub-block")
-    #parse_string(mod_code, out)
-    #logging.debug("Finished parsing of modified sub-block")
-
-
-def parse_string(code_string, out):
-    try:
-        code_string = replace(code_string)
-        for res in re.finditer("%s\\s+block_start\\s+(?P<cmd>.*)\n(?P<code>(.|\n)+?)%s\\s+block_end\\s*\n" % (pragma_cmd, pragma_cmd), code_string, flags=0):
-            logging.debug("Found block match!")
-            d = res.groupdict()
-            code_block = d["code"]
-            logging.debug("Modify the block!")
-            code_block = modify_block(code_block, d["cmd"], out)
-            code_string = code_string.replace(res.group(0), code_block)
-        logging.debug("Parsing complete. Write result to file.")
-        output.write(code_string)
-    except Exception as e:
-        logging.error("Block: %s \n %s" % (code_string, e))
-        logging.error("Global variables: %s" % globals())
-        logging.error("Local variables: %s" % locals())
-        print( "Error while parsing code block: %s \n %s" % (e),file=sys.stderr)
+    log_file_name = "generator.log"
+    logging.basicConfig(filename=log_file_name, filemode='w', level=logging.DEBUG)
+    if not args.file:
+        logging.debug('no input file given')
+        exit(1)
+    if not args.output_file:
+        logging.debug('no output file given')
+        exit(1)
+    for p in args.params:
+        logging.debug("Parse statement: %s" % p)
+        exec(p, globals())

-def parse_file(file_name, out):
-    """
-    Opens a single source code file and applies the changes to it.
+    template = env.get_template(args.file)

-    The function will output the modified source code into the given output stream.
+    try:
+        template.globals.update({"generate_attributes": generate_attributes})
+    except NameError:
+        generate_attributes = lambda r : ["" for i in range(r)]
+        template.globals.update({"generate_attributes": generate_attributes})

-    @param file_name The path to the source code file relative to the current working directory
-    @param out Output stream that is used to output the modified source code
-    """
     try:
-        with open(file_name) as f:
-            parse_string(f.read(), out)
-    except Exception as e:
-        logging.error("Error when opening and parsing file %s: %s" % (file_name, e))
-        print("Error occurred when parsing file. 
See logs for more details.",file=sys.stderr) + template.globals.update({"generate_map_attributes": generate_map_attributes}) + except: + generate_map_attributes = lambda r : [{"a": "", "b": "", "c": "", "out": ""} for i in range(r)] + template.globals.update({"generate_map_attributes": generate_map_attributes}) + try: + template.globals.update({"generate_bi_map_attributes": generate_bi_map_attributes}) + except: + generate_bi_map_attributes = lambda r : [{"in": "", "out": ""} for i in range(r)] + template.globals.update({"generate_bi_map_attributes": generate_bi_map_attributes}) + if num_replications is None: + num_replications = 1 + if num_total_replications is None: + num_total_replications = 1 -if __name__=="__main__": - args = parser.parse_args() - if args.output_file: - log_file_name = args.output_file + ".log" - else: - log_file_name = "generator.log" - logging.basicConfig(filename=log_file_name, filemode='w', level=logging.DEBUG) - output = sys.stdout - for p in args.params: - logging.debug("Parse statement: %s" % p) - exec(p, globals()) - if args.output_file: - logging.debug("Use output file: %s" % args.output_file) - output = open(args.output_file, 'w') - comment_symbol = re.escape(args.comment_symbol) - ml_comment_symbol_start = re.escape(args.comment_symbol_ml_start) - ml_comment_symbol_end = re.escape(args.comment_symbol_ml_end) - pragma_cmd = comment_symbol +"\\s*"+ pycodegen_cmd - logging.debug("Use pragma command: %s", pragma_cmd) - logging.debug("Start parsing file: %s" % args.file) - parse_file(args.file, output) + with open(args.output_file, 'w') as f: + f.write(template.render(num_replications=num_replications, num_total_replications=num_total_replications)) \ No newline at end of file diff --git a/scripts/code_generator/generator2.py b/scripts/code_generator/generator2.py deleted file mode 100644 index 09a0c142..00000000 --- a/scripts/code_generator/generator2.py +++ /dev/null @@ -1,55 +0,0 @@ -import argparse -import sys -from jinja2 import Environment, PackageLoader, BaseLoader, TemplateNotFound, select_autoescape -from os.path import join, exists, getmtime - -parser = argparse.ArgumentParser(description='Preprocessor for code replication and advanced code modification.') -parser.add_argument('file', metavar='CODE_FILE', type=str, - help='Path to the file that is used as input') -parser.add_argument("-o", dest="output_file", default=None, help="Path to the output file. If not given, output will printed to stdout.") -parser.add_argument("-p", dest="params", default=[], action="append", help="Python statement that is parsed before modifying the files. 
Can be used to define global variables.") - -# create a simple loader to load templates from the file system -class SimpleLoader(BaseLoader): - def __init__(self, path): - self.path = path - - def get_source(self, environment, template): - path = join(self.path, template) - if not exists(path): - raise TemplateNotFound(template) - mtime = getmtime(path) - with open(path) as f: - source = f.read() - return source, path, lambda: mtime == getmtime(path) - -env = Environment( - loader=SimpleLoader("./"), - autoescape=select_autoescape() -) - -if __name__ == '__main__': - args = parser.parse_args() - if not args.file: - print('no input file given') - exit(1) - if not args.output_file: - print('no output file given') - exit(1) - for p in args.params: - print("Parse statement: %s" % p) - exec(p, globals()) - - template = env.get_template(args.file) - - try: - template.globals.update({"generate_attributes": generate_attributes}) - except: - generate_attributes = lambda r : ["" for i in range(r)] - template.globals.update({"generate_attributes": generate_attributes}) - - if num_replications is None: - num_replications = 1 - - with open(args.output_file, 'w') as f: - f.write(template.render(num_replications=num_replications)) \ No newline at end of file From 016a19c7bda3f7e0e38115839b41fcfdc716b70d Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 30 Mar 2022 09:15:12 +0200 Subject: [PATCH 184/318] do not use format in template --- PTRANS/src/device/transpose_DIAG_IEC.cl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PTRANS/src/device/transpose_DIAG_IEC.cl b/PTRANS/src/device/transpose_DIAG_IEC.cl index 94077736..a5ab3a03 100644 --- a/PTRANS/src/device/transpose_DIAG_IEC.cl +++ b/PTRANS/src/device/transpose_DIAG_IEC.cl @@ -18,8 +18,8 @@ typedef struct { {% for i in range(num_total_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(i) }}), depth(1))); -channel ch_data chan_a_in{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2 * (i // 2) + ((i + 1) % 2)) }}), depth(1))); +channel ch_data chan_a_out{{ i }} __attribute((io("kernel_output_ch{{ i }}"), depth(1))); +channel ch_data chan_a_in{{ i }} __attribute((io("kernel_input_ch{{ (2 * (i // 2) + ((i + 1) % 2)) }}"), depth(1))); {% endfor %} #endif From 1612e26562f254d39cd9b3449e00cec833c58ea4 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 30 Mar 2022 09:15:57 +0200 Subject: [PATCH 185/318] move default attributes to template --- .../settings.gen.intel.fft1d_float_8.hbm.py | 2 +- .../settings.gen.intel.fft1d_float_8.svm.py | 2 +- FFT/src/device/fft1d_float_8.cl | 11 +++++++++-- .../settings.gen.intel.gemm_base.520n_mx.py | 2 +- GEMM/settings/settings.gen.intel.gemm_base.hbm.py | 2 +- GEMM/src/device/gemm_base.cl | 10 +++++++++- STREAM/src/device/stream_kernels_single.cl | 10 +++++++++- scripts/code_generator/generator.py | 15 +-------------- 8 files changed, 32 insertions(+), 22 deletions(-) diff --git a/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py b/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py index c72f4081..b4775387 100644 --- a/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py +++ b/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py @@ -1,7 +1,7 @@ global_memory_name = "HBM" -def generate_bi_map_attributes(num_replications, num_global_memory_banks=32): +def generate_attributes(num_replications, num_global_memory_banks=32): """ Generates the kernel attributes for the global memory. 
They specify in which global memory the buffer is located. The buffers will be placed using a diff --git a/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py b/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py index 2cb14bde..86e3cc3a 100644 --- a/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py +++ b/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py @@ -1,5 +1,5 @@ -def generate_bi_map_attributes(num_replications, num_global_memory_banks=32): +def generate_attributes(num_replications, num_global_memory_banks=32): """ Generates the kernel attributes for the global memory. They specify in which global memory the buffer is located. The buffers will be placed using a diff --git a/FFT/src/device/fft1d_float_8.cl b/FFT/src/device/fft1d_float_8.cl index 9cfe70c7..763399cf 100644 --- a/FFT/src/device/fft1d_float_8.cl +++ b/FFT/src/device/fft1d_float_8.cl @@ -51,8 +51,15 @@ // code generation expects an array of maps of size num_replications with the keys "in" and "out". // The value of the keys have to be strings containing the attributes that // have to be assigned to input and output buffers in global memory -{% set kernel_param_attributes = generate_bi_map_attributes(num_replications) %} - +{% macro list(content, count) -%} + [{% for i in range(count) %} content {% if not loop.last %}, {% endif %} {% endfor %} +{%- endmacro %} + +{% if generate_attributes is defined %} + {% set kernel_param_attributes = generate_attributes(num_replications) %} +{% else %} + {% set kernel_param_attributes = list({"in": "", "out": ""}, num_replications) %} +{% endif %} #define min(a,b) (a Date: Wed, 30 Mar 2022 12:45:56 +0200 Subject: [PATCH 186/318] python list comprehension is better than jinja macros --- FFT/src/device/fft1d_float_8.cl | 6 +----- GEMM/src/device/gemm_base.cl | 6 +----- STREAM/src/device/stream_kernels_single.cl | 6 +----- scripts/code_generator/generator.py | 5 +++++ 4 files changed, 8 insertions(+), 15 deletions(-) diff --git a/FFT/src/device/fft1d_float_8.cl b/FFT/src/device/fft1d_float_8.cl index 763399cf..ba451fa8 100644 --- a/FFT/src/device/fft1d_float_8.cl +++ b/FFT/src/device/fft1d_float_8.cl @@ -51,14 +51,10 @@ // code generation expects an array of maps of size num_replications with the keys "in" and "out". 
// The value of the keys have to be strings containing the attributes that // have to be assigned to input and output buffers in global memory -{% macro list(content, count) -%} - [{% for i in range(count) %} content {% if not loop.last %}, {% endif %} {% endfor %} -{%- endmacro %} - {% if generate_attributes is defined %} {% set kernel_param_attributes = generate_attributes(num_replications) %} {% else %} - {% set kernel_param_attributes = list({"in": "", "out": ""}, num_replications) %} + {% set kernel_param_attributes = create_list({"in": "", "out": ""}, num_replications) %} {% endif %} #define min(a,b) (a Date: Wed, 30 Mar 2022 13:04:50 +0200 Subject: [PATCH 187/318] use templating instead of format everywhere --- PTRANS/src/device/transpose_PQ_IEC.cl | 4 ++-- PTRANS/src/device/transpose_c2_DIAG_IEC.cl | 8 ++++---- b_eff/src/device/communication_bw520n_IEC.cl | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/PTRANS/src/device/transpose_PQ_IEC.cl b/PTRANS/src/device/transpose_PQ_IEC.cl index 5bce8ab7..9bfb6485 100644 --- a/PTRANS/src/device/transpose_PQ_IEC.cl +++ b/PTRANS/src/device/transpose_PQ_IEC.cl @@ -18,8 +18,8 @@ typedef struct { {% for i in range(num_total_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(i) }}), depth(1))); -channel ch_data chan_a_in{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2 * (i // 2) + ((i + 1) % 2)) }}), depth(1))); +channel ch_data chan_a_out{{ i }} __attribute((io("kernel_output_ch{{ i }}"), depth(1))); +channel ch_data chan_a_in{{ i }} __attribute((io("kernel_input_ch{{ 2 * (i // 2) + ((i + 1) % 2) }}"), depth(1))); {% endfor %} #endif diff --git a/PTRANS/src/device/transpose_c2_DIAG_IEC.cl b/PTRANS/src/device/transpose_c2_DIAG_IEC.cl index cf2455e7..a40d6bb0 100644 --- a/PTRANS/src/device/transpose_c2_DIAG_IEC.cl +++ b/PTRANS/src/device/transpose_c2_DIAG_IEC.cl @@ -18,10 +18,10 @@ typedef struct { {% for i in range(num_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out1{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(2*i) }}), depth(1))); -channel ch_data chan_a_out2{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(2*i + 1) }}), depth(1))); -channel ch_data chan_a_in1{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2*i + 1) }}), depth(1))); -channel ch_data chan_a_in2{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2*i) }}), depth(1))); +channel ch_data chan_a_out1{{ i }} __attribute((io("kernel_output_ch{{ 2*i }}"), depth(1))); +channel ch_data chan_a_out2{{ i }} __attribute((io("kernel_output_ch{{ 2*i + 1 }}"), depth(1))); +channel ch_data chan_a_in1{{ i }} __attribute((io("kernel_input_ch{{ 2*i + 1 }}"), depth(1))); +channel ch_data chan_a_in2{{ i }} __attribute((io("kernel_input_ch{{ 2*i }}"), depth(1))); {% endfor %} #endif diff --git a/b_eff/src/device/communication_bw520n_IEC.cl b/b_eff/src/device/communication_bw520n_IEC.cl index e3d61b74..f5dcefe8 100644 --- a/b_eff/src/device/communication_bw520n_IEC.cl +++ b/b_eff/src/device/communication_bw520n_IEC.cl @@ -50,12 +50,12 @@ typedef struct { * Definition of the external channels */ {% for i in range(num_replications) %} -channel message_part ch_out_{{ 2*i+1 }} __attribute((io({{ "\"kernel_output_ch{}\"".format(i % 4) }}))); -channel message_part ch_out_{{ 2*i+2 }} __attribute((io({{ "\"kernel_output_ch{}\"".format((i + 2) % 4) }}))); -channel message_part ch_in_{{ 
2*i+1 }} __attribute((io({{ "\"kernel_input_ch{}\"".format(i % 4) }}))); -channel message_part ch_in_{{ 2*i+2 }} __attribute((io({{ "\"kernel_input_ch{}\"".format((i + 2) % 4) }}))); -channel message_part ch_exchange{{ 2*i+1 }}; -channel message_part ch_exchange{{ 2*i+2 }}; +channel message_part ch_out_{{ 2*i + 1 }} __attribute((io("kernel_output_ch{{ i % 4 }}"))); +channel message_part ch_out_{{ 2*i + 2 }} __attribute((io("kernel_output_ch{{ (i + 2) % 4 }}"))); +channel message_part ch_in_{{ 2*i + 1 }} __attribute((io("kernel_input_ch{{ i % 4 }} "))); +channel message_part ch_in_{{ 2*i + 2 }} __attribute((io("kernel_input_ch{{ (i + 2) % 4 }}"))); +channel message_part ch_exchange{{ 2*i + 1 }}; +channel message_part ch_exchange{{ 2*i + 2 }}; {% endfor %} From 25b6afaeb501ffeeb1eef01a5ea15562a2950b1f Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Thu, 10 Nov 2022 19:04:34 +0100 Subject: [PATCH 188/318] add default attributes for PTRANS and RandomAccess --- PTRANS/src/device/transpose_PQ_PCIE.cl | 9 +++++++-- RandomAccess/src/device/random_access_kernels_single.cl | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cl b/PTRANS/src/device/transpose_PQ_PCIE.cl index caa20143..3fccf79a 100644 --- a/PTRANS/src/device/transpose_PQ_PCIE.cl +++ b/PTRANS/src/device/transpose_PQ_PCIE.cl @@ -8,7 +8,11 @@ #include "parameters.h" -{% set kernel_param_attributes = generate_attributes(num_replications) %} +{% if generate_attributes is defined %} + {% set kernel_param_attributes = generate_attributes(num_replications) %} +{% else %} + {% set kernel_param_attributes = create_list("", num_replications) %} +{% endif %} {% for i in range(num_replications) %} @@ -35,7 +39,8 @@ __kernel void transpose{{ i }}(__global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict A, __global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict B, __global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict A_out, - const uint offset, + const uint offset_a, + const uint offset_b, const uint number_of_blocks, const uint width_in_blocks, const uint height_in_blocks) { diff --git a/RandomAccess/src/device/random_access_kernels_single.cl b/RandomAccess/src/device/random_access_kernels_single.cl index 16637065..5ebc1376 100644 --- a/RandomAccess/src/device/random_access_kernels_single.cl +++ b/RandomAccess/src/device/random_access_kernels_single.cl @@ -34,7 +34,11 @@ Constant used to update the pseudo random number #define BLOCK_SIZE_LOG GLOBAL_MEM_UNROLL_LOG #define BLOCK_SIZE (1 << BLOCK_SIZE_LOG) -{% set kernel_param_attributes = generate_attributes(num_replications) %} +{% if generate_attributes is defined %} + {% set kernel_param_attributes = generate_attributes(num_replications) %} +{% else %} + {% set kernel_param_attributes = create_list("", num_replications) %} +{% endif %} {% for i in range(num_replications) %} From 97b0c829e1f4e9e97fbf30854eee71504fa88d27 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Thu, 17 Nov 2022 19:27:10 +0100 Subject: [PATCH 189/318] add build:docs pipeline --- .gitlab-ci.yml | 21 ++++++++++++++++++++- scripts/code_generator/requirements.txt | 1 + scripts/evaluation/requirements.txt | 2 +- 3 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 scripts/code_generator/requirements.txt diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 40ca7a1f..6d05430b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -10,7 +10,8 @@ default: - jacamar before_script: - module load fpga/intel/opencl_sdk/21.2.0 
fpga/bittware/520n/20.4.0_max toolchain/foss/2021a devel/CMake/3.20.1-GCCcore-10.3.0 lang/Python/3.9.5-GCCcore-10.3.0 - - python -m pip install pandas + - python -m pip install -r scripts/evaluation/requirements.txt + - python -m pip install -r scripts/code_generator/requirements.txt ### # @@ -18,6 +19,23 @@ default: # ### +build:docs: + stage: build + script: + - python -m pip install -r docs/requirements.txt + - module load devel/Doxygen/1.9.1-GCCcore-10.3.0 + - cd docs + - make html + - doxygen doxy.config + only: + changes: + - docs/**/* + - .gitlab-ci.yml + artifacts: + paths: + - docs/build + - docs/xml + build:STREAM: stage: build script: @@ -174,6 +192,7 @@ build:LINPACK_DP: - shared/**/* - scripts/**/* - cmake/**/* + - .gitlab-ci.yml build:GEMM: stage: build diff --git a/scripts/code_generator/requirements.txt b/scripts/code_generator/requirements.txt new file mode 100644 index 00000000..ea18cd6f --- /dev/null +++ b/scripts/code_generator/requirements.txt @@ -0,0 +1 @@ +jinja2==2.11.3 diff --git a/scripts/evaluation/requirements.txt b/scripts/evaluation/requirements.txt index f9ccbaa9..efd4927b 100644 --- a/scripts/evaluation/requirements.txt +++ b/scripts/evaluation/requirements.txt @@ -1 +1 @@ -pandas==0.23.3 +pandas==1.4.3 From 7eb91bf26982c6e331b028240a5cf3f9982af324 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 18 Nov 2022 13:01:53 +0100 Subject: [PATCH 190/318] update code_generator README.md --- scripts/code_generator/README.md | 91 ++++++++++---------------------- 1 file changed, 29 insertions(+), 62 deletions(-) diff --git a/scripts/code_generator/README.md b/scripts/code_generator/README.md index 2847e7ac..9ff4d8ab 100644 --- a/scripts/code_generator/README.md +++ b/scripts/code_generator/README.md @@ -4,85 +4,57 @@ This is a small and highly extendable Python script for Code generation. The main application area is the generation of OpenCL code, but the generator works independently of the used programming language. It can be seen as an extension of the usually used preprocessors to adapt the code before compilation. With this code it is also possible to replicate code sections and do more complex modifications while keeping the code readable. -This is done using inline scripting in code comments. -A generator code line always starts with `PY_CODE_GEN`. +This is done using the [jinja templating engine](https://jinja.palletsprojects.com/en/3.1.x/). ## Execution -The script needs Python3 to run. +The script needs Python3 with the module "jinja2" to run. It will be used by the CMake build system to generate source code and settings for some of the benchmarks. A short summary of the usage of the script that can also be printed by running `./generator.py -h`: - usage: generator.py [-h] [-o OUTPUT_FILE] [--comment COMMENT_SYMBOL] - [--comment-ml-start COMMENT_SYMBOL_ML_START] - [--comment-ml-end COMMENT_SYMBOL_ML_END] [-p PARAMS] - CODE_FILE + usage: generator.py [-h] [-o OUTPUT_FILE] [-p PARAMS] CODE_FILE - Preprocessor for code replication and advanced code modification. + Preprocessor for code replication and advanced code modification using jinja. positional arguments: - CODE_FILE Path to the file that is used as input + CODE_FILE Path to the file that is used as input optional arguments: - -h, --help show this help message and exit - -o OUTPUT_FILE Path to the output file. If not given, output will - printed to stdout. - --comment COMMENT_SYMBOL - Symbols that are used to comment out lines in the - target language. 
Default='//'
-    --comment-ml-start COMMENT_SYMBOL_ML_START
-                        Symbols that are used to start a multi line comment in
-                        the target language. Default='/*'
-    --comment-ml-end COMMENT_SYMBOL_ML_END
-                        Symbols that are used to end a multi line comment in
-                        the target language. Default='*/'
-    -p PARAMS             Python statement that is parsed before modifying the
-                        files. Can be used to define global variables.
-
+      -h, --help      show this help message and exit
+      -o OUTPUT_FILE  Path to the output file. If not given, output will be printed
+                      to stdout.
+      -p PARAMS       Python statement that is parsed before modifying the files.
+                      Can be used to define global variables.
 
 ## Code Examples
 
-The generator takes arbitrary code files as input and only applies changes when specific comment patterns are found.
+The generator takes arbitrary code files as input and only applies changes when the specific jinja templating syntax is used.
 The code insertions have the following syntax:
 
-    // PY_CODE_GEN [block_start STATEMENT|block_end|STATEMENT]
-
-it is also possible to write multiple lines of code:
-
-    /* PY_CODE_GEN
-    STATEMENT1
-    STATEMENT2
-    ...
-    */
-
-Where `STATEMENT`is an arbitrary python statement.
-The input file will be parsed from the beginning to the end and generation statements will be executed immediately.
 Example for the definition of a global variable:
 
-    PY_CODE_GEN replicate=4
+    {% set replicate = 4 %}
 
 This variable can then be used within the following template statements to further modify the code.
 E.g. the defined variable can be used to modify a code block:
 
-    // PY_CODE_GEN block_start CODE.replace("$R", str(replicate))
-    int i = $R;
-    printf("i should be $R");
-    // PY_CODE_GEN block_end
-
-`CODE` is a global variable containing the code within the recent block. It can be modified like every other Python string.
-In most cases it is recommended to use the build-in function `replace()` for replacing variables, but it might be used for more advanced code modifications.
-The result of the given Python statement will then be printed in the modified file.
+    int i = {{ replicate }};
+    printf("i should be {{ replicate }}");
 
 This is functionality which would also be possible using the standard preprocessor.
 A case where this script becomes handy is code replication.
-This can easily be doe using list comprehension.
+This can easily be done using the for-syntax, similar to list comprehension.
 As an example the dynamic construction of a switch statement:
 
     switch(i)
     {
-    // PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(replicate)]
+    {% for i in range(replicate) %}
     case {{ i }}: return {{ i + 1 }}; break;
-    // PY_CODE_GEN block_end
+    {% endfor %}
     }
 
 would result in:
@@ -94,25 +66,20 @@ would result in:
     case 3: return 4; break;
     }
 
-Note, that the variables that have to be replaced are written in inline comments `{{ i }}`.
+Note that the variables that have to be replaced are written in double curly braces `{{ i }}`.
 The given expression will be evaluated and replaced by the result.
 Thus, it is also possible to call functions or do arithmetic.
 
 ## Built-In Functions
 
-The generator can easily be extended by including additional file with the `use_file(FILENAME)` command.
-
-    PY_CODE_GEN use_file(helpers.py)
-
-This will read the file and make all functions and global variables available within following blocks.
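+A typical invocation is sketched below (the output file name is illustrative); template variables such as `num_replications` can be set directly on the command line:
+
+    ./generator.py STREAM/src/device/stream_kernels_single.cl -o stream_kernels_single_gen.cl -p "num_replications=4"
+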
+It is possible to insert variables or function definitions with the -p parameter, but they need to be defined explicitly in the script itself to be available in the template engine.
 
-`replace()` makes it easier to replace global variables within the code:
+To make functions accessible, the `globals` dictionary of the template needs to be updated:
 
-    // PY_CODE_GEN block_start replace(local_variables={"test": 2})
-    int var = /*PY_CODE_GEN test*/
-    // PY_CODE_GEN block_end
+    template.globals.update({'function': function})
+
+Variables need to be passed in the render step:
 
-will generate the code `int var = 2`.
+    template.render(variable=variable)
 
-It is easily possible to add other helper functions and extend the functionality of the generator using the `use_file` method
-or by declaring functions in multi line comments.
+This is very inflexible compared to the previous version of this script. Further evaluation is needed to find out whether an automatic merge of the script's globals with the template's globals is possible.
\ No newline at end of file

From b88cfab356f750b5c686cceb53147565db41f823 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Fri, 18 Nov 2022 13:02:07 +0100
Subject: [PATCH 191/318] write to stdout if no output file is given

---
 scripts/code_generator/generator.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/scripts/code_generator/generator.py b/scripts/code_generator/generator.py
index 2c915876..f8b1da3a 100644
--- a/scripts/code_generator/generator.py
+++ b/scripts/code_generator/generator.py
@@ -4,7 +4,7 @@
 from jinja2 import Environment, PackageLoader, BaseLoader, TemplateNotFound, select_autoescape
 from os.path import join, exists, getmtime
 
-parser = argparse.ArgumentParser(description='Preprocessor for code replication and advanced code modification.')
+parser = argparse.ArgumentParser(description='Preprocessor for code replication and advanced code modification using jinja.')
 parser.add_argument('file', metavar='CODE_FILE', type=str,
                     help='Path to the file that is used as input')
 parser.add_argument("-o", dest="output_file", default=None, help="Path to the output file. 
If not given, output will printed to stdout.") @@ -66,9 +66,6 @@ def create_list(content, count): if not args.file: logging.debug('no input file given') exit(1) - if not args.output_file: - logging.debug('no output file given') - exit(1) for p in args.params: logging.debug("Parse statement: %s" % p) exec(p, globals()) @@ -82,11 +79,15 @@ def create_list(content, count): except: pass - if num_replications is None: + if not 'num_replications' in globals(): num_replications = 1 - if num_total_replications is None: + if not 'num_total_replications' in globals(): num_total_replications = 1 - with open(args.output_file, 'w') as f: - f.write(template.render(num_replications=num_replications, num_total_replications=num_total_replications)) \ No newline at end of file + rendered_template = template.render(num_replications=num_replications, num_total_replications=num_total_replications) + try: + with open(args.output_file, 'w') as f: + f.write(rendered_template) + except: + sys.stdout.write(rendered_template) From abfc85b3c4975358319e217da5e50be8abdd3edb Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 28 Nov 2022 17:05:29 +0100 Subject: [PATCH 192/318] update main README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c8ae0604..1f830da8 100755 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ All benchmarks come with the following build dependencies: - CMake >= 3.13 - C++ compiler with C++11 and support (GCC 4.9.0+) - Intel OpenCL FPGA SDK or Xilinx Vitis -- Python 3 for code generation and with [pandas](https://pandas.pydata.org) installed for the evaluation scripts +- Python 3 with [jinja2](https://jinja.palletsprojects.com) for code generation and [pandas](https://pandas.pydata.org) for the evaluation scripts. 
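+
+  For example, both packages can be installed with pip (version pins, if needed, can be taken from the requirements files under `scripts/`):
+
+      pip3 install jinja2 pandas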
Moreover, additional libraries are fetched by the build system during configuration: From a9ee4fa2a44e1f89c77441b3b4e913bdcbbc2794 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 5 Dec 2022 10:17:02 +0100 Subject: [PATCH 193/318] fix b_eff template --- b_eff/src/device/communication_bw520n_IEC.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/b_eff/src/device/communication_bw520n_IEC.cl b/b_eff/src/device/communication_bw520n_IEC.cl index f5dcefe8..8f43756b 100644 --- a/b_eff/src/device/communication_bw520n_IEC.cl +++ b/b_eff/src/device/communication_bw520n_IEC.cl @@ -52,7 +52,7 @@ typedef struct { {% for i in range(num_replications) %} channel message_part ch_out_{{ 2*i + 1 }} __attribute((io("kernel_output_ch{{ i % 4 }}"))); channel message_part ch_out_{{ 2*i + 2 }} __attribute((io("kernel_output_ch{{ (i + 2) % 4 }}"))); -channel message_part ch_in_{{ 2*i + 1 }} __attribute((io("kernel_input_ch{{ i % 4 }} "))); +channel message_part ch_in_{{ 2*i + 1 }} __attribute((io("kernel_input_ch{{ i % 4 }}"))); channel message_part ch_in_{{ 2*i + 2 }} __attribute((io("kernel_input_ch{{ (i + 2) % 4 }}"))); channel message_part ch_exchange{{ 2*i + 1 }}; channel message_part ch_exchange{{ 2*i + 2 }}; From d7177cadeb948b610c3bd283d40d18cf6b25231e Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Tue, 4 Oct 2022 20:48:11 +0200 Subject: [PATCH 194/318] add check stage (without really checking) --- .gitlab-ci.yml | 675 ++++++++++++++++--------------------------------- 1 file changed, 215 insertions(+), 460 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6d05430b..144932ad 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,5 @@ stages: + - check - build - test @@ -15,7 +16,7 @@ default: ### # -# Build all benchmarks +# Build documentation # ### @@ -36,287 +37,203 @@ build:docs: - docs/build - docs/xml -build:STREAM: - stage: build +### +# +# Check formatting of all benchmarks +# +### + +.check: &check + stage: check script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../STREAM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make -j 40 all - artifacts: - paths: - - build/bin/stream_kernels_single_emulate.aocx - - build/bin/stream_kernels_emulate.aocx - - build/bin/STREAM_FPGA_intel - - build/bin/STREAM_FPGA_test_intel + - module load compiler/Clang/13.0.1-GCCcore-11.2.0 + - find $BENCHMARK_FOLDER -regex '.*\.\(cpp\|hpp\|cc\|cxx\|h\)' -exec clang-format -style=file -i {} \; + - git diff | cat + ## do not test for real yet + #- test -z "$(git status --porcelain)" + only: changes: - - STREAM/**/* + - $BENCHMARK_FOLDER/**/* - shared/**/* - scripts/**/* - cmake/**/* - .gitlab-ci.yml -build:STREAM_HP: +check:STREAM: + <<: *check + variables: + BENCHMARK_FOLDER: STREAM + +check:RandomAccess: + <<: *check + variables: + BENCHMARK_FOLDER: RandomAccess + +check:PTRANS: + <<: *check + variables: + BENCHMARK_FOLDER: PTRANS + +check:LINPACK: + <<: *check + variables: + BENCHMARK_FOLDER: LINPACK + +check:GEMM: + <<: *check + variables: + BENCHMARK_FOLDER: GEMM + +check:FFT: + <<: *check + variables: + BENCHMARK_FOLDER: FFT + +check:b_eff: + <<: *check + variables: + BENCHMARK_FOLDER: b_eff + +### +# +# Build all benchmarks +# +### + +.build: &build stage: build script: - rm -rf build - mkdir -p build - cd build - - cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - cmake ../$BENCHMARK_FOLDER -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 $BENCHMARK_OPTIONS - make -j 40 all artifacts: paths: - - 
build/bin/stream_kernels_single_emulate.aocx - - build/bin/stream_kernels_emulate.aocx - - build/bin/STREAM_FPGA_intel - - build/bin/STREAM_FPGA_test_intel + - build/bin/* only: changes: - - STREAM/**/* + - $BENCHMARK_FOLDER/**/* - shared/**/* - scripts/**/* - cmake/**/* - .gitlab-ci.yml +build:STREAM: + <<: *build + variables: + BENCHMARK_FOLDER: STREAM + dependencies: + - check:STREAM + needs: ["check:STREAM"] + +build:STREAM_HP: + <<: *build + variables: + BENCHMARK_FOLDER: STREAM + BENCHMARK_OPTIONS: -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 + dependencies: + - check:STREAM + needs: ["check:STREAM"] + build:STREAM_DP: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make -j 40 all - artifacts: - paths: - - build/bin/stream_kernels_single_emulate.aocx - - build/bin/stream_kernels_emulate.aocx - - build/bin/STREAM_FPGA_intel - - build/bin/STREAM_FPGA_test_intel - only: - changes: - - STREAM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml - + <<: *build + variables: + BENCHMARK_FOLDER: STREAM + BENCHMARK_OPTIONS: -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 + dependencies: + - check:STREAM + needs: ["check:STREAM"] + build:RandomAccess: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../RandomAccess -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make -j 40 all - artifacts: - paths: - - build/bin/random_access_kernels_single_emulate.aocx - - build/bin/RandomAccess_intel - - build/bin/RandomAccess_test_intel - only: - changes: - - RandomAccess/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml - + <<: *build + variables: + BENCHMARK_FOLDER: RandomAccess + dependencies: + - check:RandomAccess + needs: ["check:RandomAccess"] build:PTRANS: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../PTRANS -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DHOST_EMULATION_REORDER=Yes - - make -j 40 all - artifacts: - paths: - - build/bin/transpose_DIAG_IEC_emulate.aocx - - build/bin/transpose_PQ_IEC_emulate.aocx - - build/bin/transpose_PQ_PCIE_emulate.aocx - - build/bin/transpose_DIAG_PCIE_emulate.aocx - - build/bin/transpose_c2_DIAG_IEC_emulate.aocx - - build/bin/Transpose_intel - - build/bin/Transpose_test_intel - only: - changes: - - PTRANS/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: PTRANS + BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes + dependencies: + - check:PTRANS + needs: ["check:PTRANS"] build:LINPACK: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DUSE_PCIE_MPI_COMMUNICATION=Yes - - make -j 40 all - artifacts: - paths: - - build/bin/hpl_torus_PCIE_emulate.aocx - - build/bin/hpl_torus_IEC_emulate.aocx - - build/bin/Linpack_intel - - build/bin/Linpack_test_intel - only: - changes: - - LINPACK/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml - + <<: *build + variables: + BENCHMARK_FOLDER: LINPACK + BENCHMARK_OPTIONS: -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 + dependencies: + - check:LINPACK + needs: ["check:LINPACK"] build:LINPACK_DP: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../LINPACK 
-DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double - - make -j 40 all - artifacts: - paths: - - build/bin/hpl_torus_PCIE_emulate.aocx - - build/bin/hpl_torus_IEC_emulate.aocx - - build/bin/Linpack_intel - - build/bin/Linpack_test_intel - only: - changes: - - LINPACK/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: LINPACK + BENCHMARK_OPTIONS: -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double + dependencies: + - check:LINPACK + needs: ["check:LINPACK"] build:GEMM: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../GEMM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make -j 40 all - artifacts: - paths: - - build/bin/gemm_base_emulate.aocx - - build/bin/GEMM_intel - - build/bin/GEMM_test_intel - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DBLOCK_SIZE=32 + dependencies: + - check:GEMM + needs: ["check:GEMM"] build:GEMM_HP_REP2: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../GEMM -DDATA_TYPE=half -DNUM_REPLICATIONS=2 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make -j 40 all - artifacts: - paths: - - build/bin/gemm_base_emulate.aocx - - build/bin/GEMM_intel - - build/bin/GEMM_test_intel - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DDATA_TYPE=half -DNUM_REPLICATIONS=2 -DBLOCK_SIZE=32 + dependencies: + - check:GEMM + needs: ["check:GEMM"] build:GEMM_DP_REP2: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../GEMM -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make -j 40 all - artifacts: - paths: - - build/bin/gemm_base_emulate.aocx - - build/bin/GEMM_intel - - build/bin/GEMM_test_intel - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DBLOCK_SIZE=32 + dependencies: + - check:GEMM + needs: ["check:GEMM"] build:FFT: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make -j 40 all - artifacts: - paths: - - build/bin/fft1d_float_8_emulate.aocx - - build/bin/FFT_intel - - build/bin/FFT_test_intel - only: - changes: - - FFT/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: FFT + dependencies: + - check:FFT + needs: ["check:FFT"] build:FFT_small: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 - - make -j 40 all - artifacts: - paths: - - build/bin/fft1d_float_8_emulate.aocx - - build/bin/FFT_intel - - build/bin/FFT_test_intel - only: - changes: - - FFT/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: FFT + BENCHMARK_OPTIONS: -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 + dependencies: + - check:FFT + needs: ["check:FFT"] build:b_eff: - stage: build - script: - - rm -rf build - - mkdir -p build - 
- cd build - - cmake ../b_eff -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DHOST_EMULATION_REORDER=Yes - - make -j 40 all - artifacts: - paths: - - build/bin/* - only: - changes: - - b_eff/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: b_eff + BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes + dependencies: + - check:b_eff + needs: ["check:b_eff"] + ### # @@ -324,300 +241,138 @@ build:b_eff: # ### -test:STREAM: +.test: &test stage: test script: - cd build - - cmake ../STREAM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - cmake ../$BENCHMARK_FOLDER -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - make CTEST_OUTPUT_ON_FAILURE=1 test - dependencies: - - build:STREAM artifacts: when: on_failure paths: - build/Testing/Temporary/LastTest.log only: changes: - - STREAM/**/* + - $BENCHMARK_FOLDER/**/* - shared/**/* - scripts/**/* - cmake/**/* - .gitlab-ci.yml + + +test:STREAM: + <<: *test + variables: + BENCHMARK_FOLDER: STREAM + dependencies: + - build:STREAM needs: ["build:STREAM"] test:STREAM_HP: - stage: test - script: - - cd build - - cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: STREAM + BENCHMARK_OPTIONS: -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 dependencies: - build:STREAM_HP - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - STREAM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:STREAM_HP"] - # Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE) - allow_failure: true test:STREAM_DP: - stage: test - script: - - cd build - - cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: STREAM + BENCHMARK_OPTIONS: -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 dependencies: - build:STREAM_DP - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - STREAM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:STREAM_DP"] - + test:RandomAccess: - stage: test - script: - - cd build - - cmake ../RandomAccess -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: RandomAccess dependencies: - build:RandomAccess - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - RandomAccess/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:RandomAccess"] test:PTRANS: - stage: test - script: - - cd build - - cmake ../PTRANS -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DPTRANS_HOST_EMULATION_REORDER=Yes - - cd bin - - touch kernel_output_ch0 - - touch kernel_output_ch1 - - touch kernel_output_ch2 - - touch kernel_output_ch3 - - ln -s kernel_output_ch0 kernel_input_ch1 - - ln -s kernel_output_ch2 kernel_input_ch3 - - ln -s kernel_output_ch1 kernel_input_ch0 - - ln -s kernel_output_ch3 kernel_input_ch2 - - cd .. 
- - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: PTRANS + BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes dependencies: - build:PTRANS - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - PTRANS/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:PTRANS"] test:LINPACK: - stage: test - script: - - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: LINPACK + BENCHMARK_OPTIONS: -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 dependencies: - build:LINPACK - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - LINPACK/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:LINPACK"] test:LINPACK_DP: - stage: test - script: - - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: LINPACK + BENCHMARK_OPTIONS: -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double dependencies: - build:LINPACK_DP - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - LINPACK/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* needs: ["build:LINPACK_DP"] test:GEMM: - stage: test - script: - - cd build - - cmake ../GEMM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DBLOCK_SIZE=32 dependencies: - build:GEMM - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:GEMM"] test:GEMM_HP_REP2: - stage: test - script: - - cd build - - cmake ../GEMM -DNUM_REPLICATIONS=2 -DATA_TYPE=half -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DDATA_TYPE=half -DNUM_REPLICATIONS=2 -DBLOCK_SIZE=32 dependencies: - build:GEMM_HP_REP2 - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:GEMM_HP_REP2"] - # Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE) - allow_failure: true + test:GEMM_DP_REP2: - stage: test - script: - - cd build - - cmake ../GEMM -DNUM_REPLICATIONS=2 -DATA_TYPE=double -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DBLOCK_SIZE=32 dependencies: - build:GEMM_DP_REP2 - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:GEMM_DP_REP2"] test:FFT: - stage: test - script: - - cd build - - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make 
CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: FFT dependencies: - build:FFT - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - FFT/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:FFT"] test:FFT_small: - stage: test - script: - - cd build - - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: FFT + BENCHMARK_OPTIONS: -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 dependencies: - build:FFT_small - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - FFT/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:FFT_small"] test:b_eff: - stage: test - script: - - cd build - - cmake ../b_eff -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DHOST_EMULATION_REORDER=Yes - - cd bin - - touch kernel_output_ch0 - - touch kernel_output_ch1 - - touch kernel_output_ch2 - - touch kernel_output_ch3 - - ln -s kernel_output_ch0 kernel_input_ch1 - - ln -s kernel_output_ch2 kernel_input_ch3 - - ln -s kernel_output_ch1 kernel_input_ch0 - - ln -s kernel_output_ch3 kernel_input_ch2 - - cd .. - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: b_eff + BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes dependencies: - build:b_eff - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - b_eff/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:b_eff"] From f4552d62fbfbc348aca3a69212d93aa21b36ca31 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 25 Nov 2022 13:10:15 +0100 Subject: [PATCH 195/318] use explicit artifacts --- .gitlab-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 144932ad..015fb208 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -244,8 +244,10 @@ build:b_eff: .test: &test stage: test script: + - mkdir -p build - cd build - - cmake ../$BENCHMARK_FOLDER -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - cmake ../$BENCHMARK_FOLDER $BENCHMARK_OPTIONS -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - $PREPARE_SCRIPT - make CTEST_OUTPUT_ON_FAILURE=1 test artifacts: when: on_failure From 81fb766bbc9c4f0ee304b306ed713367da650f16 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 25 Nov 2022 13:25:43 +0100 Subject: [PATCH 196/318] add prepare_tests script for b_eff and PTRANS --- .gitlab-ci.yml | 3 +++ PTRANS/scripts/prepare_tests.sh | 11 +++++++++++ b_eff/scripts/prepare_tests.sh | 11 +++++++++++ 3 files changed, 25 insertions(+) create mode 100755 PTRANS/scripts/prepare_tests.sh create mode 100755 b_eff/scripts/prepare_tests.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 015fb208..e8d9c70e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -247,6 +247,7 @@ build:b_eff: - mkdir -p build - cd build - cmake ../$BENCHMARK_FOLDER $BENCHMARK_OPTIONS -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - make all -j41 - $PREPARE_SCRIPT - make CTEST_OUTPUT_ON_FAILURE=1 test artifacts: @@ -301,6 +302,7 @@ test:PTRANS: variables: BENCHMARK_FOLDER: PTRANS BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes + PREPARE_SCRIPT: ../$BENCHMARK_FOLDER/scripts/prepare_tests.sh ./bin dependencies: - build:PTRANS needs: ["build:PTRANS"] @@ -373,6 +375,7 @@ test:b_eff: variables: BENCHMARK_FOLDER: b_eff BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes + 
PREPARE_SCRIPT: ../$BENCHMARK_FOLDER/scripts/prepare_tests.sh ./bin dependencies: - build:b_eff needs: ["build:b_eff"] diff --git a/PTRANS/scripts/prepare_tests.sh b/PTRANS/scripts/prepare_tests.sh new file mode 100755 index 00000000..2705d74d --- /dev/null +++ b/PTRANS/scripts/prepare_tests.sh @@ -0,0 +1,11 @@ +#!/usr/bin/bash + +cd $1 +touch kernel_output_ch0 +touch kernel_output_ch1 +touch kernel_output_ch2 +touch kernel_output_ch3 +ln -s kernel_output_ch0 kernel_input_ch1 +ln -s kernel_output_ch2 kernel_input_ch3 +ln -s kernel_output_ch1 kernel_input_ch0 +ln -s kernel_output_ch3 kernel_input_ch2 diff --git a/b_eff/scripts/prepare_tests.sh b/b_eff/scripts/prepare_tests.sh new file mode 100755 index 00000000..2705d74d --- /dev/null +++ b/b_eff/scripts/prepare_tests.sh @@ -0,0 +1,11 @@ +#!/usr/bin/bash + +cd $1 +touch kernel_output_ch0 +touch kernel_output_ch1 +touch kernel_output_ch2 +touch kernel_output_ch3 +ln -s kernel_output_ch0 kernel_input_ch1 +ln -s kernel_output_ch2 kernel_input_ch3 +ln -s kernel_output_ch1 kernel_input_ch0 +ln -s kernel_output_ch3 kernel_input_ch2 From e24873c073a1ccc2aedcf383a13fd644f506a98a Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Sat, 10 Dec 2022 13:24:32 +0100 Subject: [PATCH 197/318] allow failure of STREAM_HP and GEMM_HP_REP2 --- .gitlab-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e8d9c70e..5136df6c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -198,6 +198,7 @@ build:GEMM_HP_REP2: dependencies: - check:GEMM needs: ["check:GEMM"] + allow_failure: true build:GEMM_DP_REP2: <<: *build @@ -279,6 +280,7 @@ test:STREAM_HP: dependencies: - build:STREAM_HP needs: ["build:STREAM_HP"] + allow_failure: true test:STREAM_DP: <<: *test @@ -342,7 +344,7 @@ test:GEMM_HP_REP2: dependencies: - build:GEMM_HP_REP2 needs: ["build:GEMM_HP_REP2"] - + allow_failure: true test:GEMM_DP_REP2: <<: *test From 202edd148c379b12eaac3d0cefdd4d187082abb9 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Sat, 10 Dec 2022 13:35:03 +0100 Subject: [PATCH 198/318] add .clang-format file --- .clang-format | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..5b1cfaa3 --- /dev/null +++ b/.clang-format @@ -0,0 +1,125 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +# https://releases.llvm.org/12.0.1/tools/clang/docs/ClangFormatStyleOptions.html +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: true +BinPackParameters: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: true +BreakBeforeBraces: Linux +BreakBeforeInheritanceComma: false +BreakInheritanceList: 
BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 120 +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Preserve +IndentCaseLabels: false +IndentCaseBlocks: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +IndentRequires: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyIndentedWhitespace: 0 +PointerAlignment: Right +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceAroundPointerQualifiers: Default +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: Both +Standard: Latest +UseCRLF: false +UseTab: Never +CommentPragmas: '^ IWYU pragma:' +ForEachMacros: + - foreach +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +StatementMacros: [] +StatementAttributeLikeMacros: + - Q_EMIT +WhitespaceSensitiveMacros: [] +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false From 5c5f02775d356cc4ce913f51ed3b6a4aaa1b286c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 13 Dec 2022 17:01:23 +0100 Subject: [PATCH 199/318] Fix platform string behavior --- shared/setup/fpga_setup.cpp | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index e6039973..5d0b79d1 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -245,20 +245,23 @@ choose a device. 
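// Selection logic in the block below: a non-empty platform string takes
// precedence and must exactly match the name of an available platform,
// otherwise an FpgaSetupException is thrown; the numeric platform index is
// only consulted as a fallback when no platform string was given.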
// Choose the target platform long unsigned int chosenPlatformId = 0; - if (defaultPlatform >= 0) { - if (platformString.size() > 0) { - bool found = false; - for (int i = 0; i < platformList.size(); i++) { - if (platformList[i].getInfo() == platformString) { - chosenPlatformId = i; - found = true; - break; - } + if (platformString.size() > 0) { + // Platform string has highest priority + bool found = false; + for (int i = 0; i < platformList.size(); i++) { + if (platformList[i].getInfo() == platformString) { + chosenPlatformId = i; + found = true; + break; } - if (!found) { - throw FpgaSetupException("Invalid platform string specified: " + platformString); - } - } else if (defaultPlatform < static_cast(platformList.size())) { + } + if (!found) { + throw FpgaSetupException("Invalid platform string specified: " + platformString); + } + } + else if (defaultPlatform >= 0) { + // Otherwise, select platform by index + if (defaultPlatform < static_cast(platformList.size())) { chosenPlatformId = defaultPlatform; } else { std::cerr << "Default platform " << defaultPlatform From ede7793eae0f85044917dc5dde03ef090e7da6a4 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 23 Sep 2022 11:51:22 +0200 Subject: [PATCH 200/318] first approach --- LINPACK/src/host/linpack_benchmark.cpp | 56 ++++++++----- LINPACK/src/host/linpack_benchmark.hpp | 6 +- extern/CMakeLists.txt | 21 +++++ shared/CMakeLists.txt | 2 +- shared/include/hpcc_benchmark.hpp | 95 ++++++++++++++++++++--- shared/tests/hpcc_base_benchmark_test.cpp | 10 ++- 6 files changed, 158 insertions(+), 32 deletions(-) diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index d60be9d1..f0cd9867 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -126,11 +126,11 @@ linpack::LinpackBenchmark::executeKernel(LinpackData &data) { } void -linpack::LinpackBenchmark::collectAndPrintResults(const linpack::LinpackExecutionTimings &output) { +linpack::LinpackBenchmark::collectResults(const linpack::LinpackExecutionTimings &output) { // Calculate performance for kernel execution plus data transfer - double tmean = 0; - double tlumean = 0; - double tslmean = 0; + double t = 0; + double tlu = 0; + double tsl = 0; double tmin = std::numeric_limits::max(); double lu_min = std::numeric_limits::max(); double sl_min = std::numeric_limits::max(); @@ -154,13 +154,13 @@ linpack::LinpackBenchmark::collectAndPrintResults(const linpack::LinpackExecutio } double total_matrix_size = static_cast(executionSettings->programSettings->matrixSize); - double gflops_lu = ((2.0e0*total_matrix_size * total_matrix_size * total_matrix_size)/ 3.0) / 1.0e9; - double gflops_sl = (2.0*(total_matrix_size * total_matrix_size))/1.0e9; + double gflop_lu = ((2.0e0*total_matrix_size * total_matrix_size * total_matrix_size)/ 3.0) / 1.0e9; + double gflop_sl = (2.0*(total_matrix_size * total_matrix_size))/1.0e9; for (int i =0; i < global_lu_times.size(); i++) { double currentTime = global_lu_times[i] + global_sl_times[i]; - tmean += currentTime; - tlumean += global_lu_times[i]; - tslmean += global_sl_times[i]; + t += currentTime; + tlu += global_lu_times[i]; + tsl += global_sl_times[i]; if (currentTime < tmin) { tmin = currentTime; } @@ -171,29 +171,47 @@ linpack::LinpackBenchmark::collectAndPrintResults(const linpack::LinpackExecutio sl_min = global_sl_times[i]; } } - tmean = tmean / global_lu_times.size(); - tlumean = tlumean / global_lu_times.size(); - tslmean = tslmean / global_sl_times.size(); + + 
results.emplace("t_mean", hpcc_base::HpccResult(t / global_lu_times.size(), "s")); + results.emplace("t_min", hpcc_base::HpccResult(tmin, "?")); + results.emplace("tlu_mean", hpcc_base::HpccResult(tlu / global_lu_times.size(), "s")); + results.emplace("tlu_min", hpcc_base::HpccResult(lu_min, "s")); + results.emplace("tsl_mean", hpcc_base::HpccResult(tsl / global_sl_times.size(), "s")); + results.emplace("tsl_min", hpcc_base::HpccResult(sl_min, "s")); + results.emplace("gflops", hpcc_base::HpccResult((gflop_lu + gflop_sl) / tmin, "GFLOP/s")); + results.emplace("gflops_lu", hpcc_base::HpccResult(gflop_lu / lu_min, "GFLOP/s")); + results.emplace("gflops_sl", hpcc_base::HpccResult(gflop_sl / sl_min, "GFLOP/s")); + + return; +} - std::cout << std::setw(ENTRY_SPACE) +void +linpack::LinpackBenchmark::printResults() { + if (mpi_comm_rank > 0) { + return; + } + + std::cout << std::setw(ENTRY_SPACE) << "Method" << std::setw(ENTRY_SPACE) << "best" << std::setw(ENTRY_SPACE) << "mean" << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; + /* std::cout << std::setw(ENTRY_SPACE) << "total" << std::setw(ENTRY_SPACE) - << tmin << std::setw(ENTRY_SPACE) << tmean - << std::setw(ENTRY_SPACE) << ((gflops_lu + gflops_sl) / tmin) + << results["t_min"] << std::setw(ENTRY_SPACE) << results["t_mean"] + << std::setw(ENTRY_SPACE) << results["gflops"] << std::endl; std::cout << std::setw(ENTRY_SPACE) << "GEFA" << std::setw(ENTRY_SPACE) - << lu_min << std::setw(ENTRY_SPACE) << tlumean - << std::setw(ENTRY_SPACE) << ((gflops_lu) / lu_min) + << results["tlu_min"] << std::setw(ENTRY_SPACE) << results["tlu_mean"] + << std::setw(ENTRY_SPACE) << results["gflops_lu"] << std::endl; std::cout << std::setw(ENTRY_SPACE) << "GESL" << std::setw(ENTRY_SPACE) - << sl_min << std::setw(ENTRY_SPACE) << tslmean - << std::setw(ENTRY_SPACE) << (gflops_sl / sl_min) + << results["tsl_min"] << std::setw(ENTRY_SPACE) << results["tsl_mean"] + << std::setw(ENTRY_SPACE) << results["gflops_sl"] << std::endl; + */ } std::unique_ptr diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index c05b323a..adbae5ef 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -264,8 +264,12 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark +#include /* External library headers */ #ifdef USE_DEPRECATED_HPP_HEADER @@ -37,6 +38,7 @@ SOFTWARE. /* Project's headers */ #include "setup/fpga_setup.hpp" #include "cxxopts.hpp" +#include "nlohmann/json.hpp" #include "parameters.h" #include "communication_types.hpp" @@ -45,6 +47,8 @@ SOFTWARE. #define ENTRY_SPACE 15 +using json = nlohmann::json; + /** * @brief Contains all classes and functions that are used as basis * for all benchmarks. @@ -52,6 +56,25 @@ SOFTWARE. */ namespace hpcc_base { +class HpccResult { + double value; + std::string unit; + +public: + HpccResult(double value, std::string unit): value(value), unit(unit) {} + + friend std::ostream &operator<<(std::ostream &os, const HpccResult &result) { + os << result.value << " " << result.unit; + return os; + } + + std::string to_string() const { + std::ostringstream oss; + oss << *this; + return oss.str(); + } +}; + /** * @brief This class should be derived and extended for every benchmark. 
* It is a pure data object containing the benchmark settings that are @@ -119,6 +142,8 @@ class BaseSettings { * */ bool testOnly; + + std::string dumpfilePath; /** * @brief Type of inter-FPGA communication used @@ -152,6 +177,7 @@ class BaseSettings { #else communicationType(retrieveCommunicationType("UNSUPPORTED", results["f"].as())), #endif + dumpfilePath(results["dump"].as()), testOnly(static_cast(results.count("test"))) {} /** @@ -236,6 +262,17 @@ class ExecutionSettings { programSettings = nullptr; } + std::string + getDeviceName() const { + std::string device_name; + if (!programSettings->testOnly) { + device->getInfo(CL_DEVICE_NAME, &device_name); + } else { + device_name = "TEST RUN: Not selected!"; + } + return device_name; + } + }; /** @@ -294,6 +331,15 @@ class HpccFpgaBenchmark { * */ bool mpi_external_init = true; + + + /** + * + * @brief vector containing the benchmark results + * + */ + std::map results; + public: @@ -331,7 +377,10 @@ class HpccFpgaBenchmark { * @param output The measurement data of the kernel execution */ virtual void - collectAndPrintResults(const TOutput &output) = 0; + collectResults(const TOutput &output) = 0; + + virtual void + printResults() = 0; /** * @brief Method that can be overwritten by inheriting classes to check the validity of input parameters. @@ -396,6 +445,7 @@ class HpccFpgaBenchmark { ("comm-type", "Used communication type for inter-FPGA communication", cxxopts::value()->default_value(DEFAULT_COMM_TYPE)) #endif + ("dump", "dump benchmark configuration and results to this file", cxxopts::value()->default_value(std::string(""))) ("test", "Only test given configuration and skip execution and validation") ("h,help", "Print this help"); @@ -448,6 +498,32 @@ class HpccFpgaBenchmark { std::cout << "Summary:" << std::endl; std::cout << *executionSettings << std::endl; } + + std::map getResultsMap() { + // TODO: nested maps, recursive? + std::map results_string; + for (auto const &result: results) { + results_string[result.first] = result.second.to_string(); + } + return results_string; + } + + void + dumpConfigurationAndResults(std::string file_path) { + std::fstream fs; + fs.open(file_path, std::ios_base::out); + if (!fs.is_open()) { + std::cout << "Unable to open file for dumping configuration and results" << std::endl; + } else { + json dump; + std::string device_name = executionSettings->getDeviceName(); + dump["device"] = device_name; + dump["settings"] = json(executionSettings->programSettings->getSettingsMap()); + dump["results"] = getResultsMap(); + + fs << dump; + } + } /** * @brief Selects and prepares the target device and prints the final configuration. 
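 *
 * A sketch of the JSON emitted by dumpConfigurationAndResults above; only the
 * top-level keys "device", "settings" and "results" are fixed by the code,
 * all values shown here are illustrative:
 *
 *   { "device": "...",                               // CL_DEVICE_NAME of the selected device
 *     "settings": { ... },                           // from programSettings->getSettingsMap()
 *     "results": { "gflops": "1.23 GFLOP/s", ... } } // each HpccResult printed as "value unit"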
@@ -586,7 +662,13 @@ class HpccFpgaBenchmark {
                     std::cout << "Validation Time: " << eval_time.count() << " s" << std::endl;
                 }
             }
-            collectAndPrintResults(*output);
+            collectResults(*output);
+
+            if (executionSettings->programSettings->dumpfilePath.size() > 0) {
+                dumpConfigurationAndResults(executionSettings->programSettings->dumpfilePath);
+            }
+
+            printResults();
 
             if (mpi_comm_rank == 0) {
                 if (!validateSuccess) {
@@ -658,6 +740,7 @@ class HpccFpgaBenchmark {
 
 };
 
+
 /**
  * @brief Prints the execution settings to an output stream
  *
@@ -668,14 +751,8 @@ class HpccFpgaBenchmark {
  */
 template <class TSettings, class TDevice, class TContext, class TProgram>
 std::ostream& operator<<(std::ostream& os, ExecutionSettings<TSettings, TDevice, TContext, TProgram> const& printedExecutionSettings){
-    std::string device_name;
+    std::string device_name = printedExecutionSettings.getDeviceName();
     os << std::left;
-    if (!printedExecutionSettings.programSettings->testOnly) {
-        printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name);
-    }
-    else {
-        device_name = "TEST RUN: Not selected!";
-    }
     for (auto k : printedExecutionSettings.programSettings->getSettingsMap()) {
         os << std::setw(2 * ENTRY_SPACE) << k.first << k.second << std::endl;
     }
diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp
index 1c491b49..05489e17 100644
--- a/shared/tests/hpcc_base_benchmark_test.cpp
+++ b/shared/tests/hpcc_base_benchmark_test.cpp
@@ -45,7 +45,10 @@ class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int> {
 
     void
-    collectAndPrintResults(const int &output) override {}
+    collectResults(const int &output) override {}
+
+    void
+    printResults() override {}
 
     bool

Date: Wed, 5 Oct 2022 09:55:51 +0200
Subject: [PATCH 201/318] dump env, timings, git commit and config time

---
 LINPACK/src/common/parameters.h.in                  |  3 +-
 LINPACK/src/host/execution_types/execution_iec.hpp  | 11 +++--
 LINPACK/src/host/execution_types/execution_pcie.hpp | 10 ++--
 LINPACK/src/host/linpack_benchmark.cpp              | 18 +++----
 LINPACK/src/host/linpack_benchmark.hpp              |  6 +--
 ...nel_functionality_and_host_integration.cpp       |  4 +-
 shared/include/hpcc_benchmark.hpp                   | 47 +++++++++++++------
 shared/tests/hpcc_base_benchmark_test.cpp           | 18 +++----
 8 files changed, 67 insertions(+), 50 deletions(-)

diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in
index 4c036fb9..7d192e56 100644
--- a/LINPACK/src/common/parameters.h.in
+++ b/LINPACK/src/common/parameters.h.in
@@ -34,7 +34,8 @@
 
 /*
 Short description of the program
 */
-#define PROGRAM_DESCRIPTION "Implementation of the LINPACK benchmark"\
+#define PROGRAM_NAME "LINPACK"
+#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\
                             " proposed in the HPCC benchmark suite for FPGA.\n"\
                             "Version: " VERSION "\n"
 
diff --git a/LINPACK/src/host/execution_types/execution_iec.hpp b/LINPACK/src/host/execution_types/execution_iec.hpp
index b98bcc31..1584f8e2 100644
--- a/LINPACK/src/host/execution_types/execution_iec.hpp
+++ b/LINPACK/src/host/execution_types/execution_iec.hpp
@@ -44,7 +44,7 @@ namespace iec {
 /*
 Prepare kernels and execute benchmark for a bitstream that makes use of intel external channels
 */
-std::unique_ptr<linpack::LinpackExecutionTimings>
+std::map<std::string, std::vector<double>>
 calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>& config,
           linpack::LinpackData& data) {
@@ -722,13 +722,14 @@ calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>& config,
     }
     buffer_queue.finish();
 #endif
+
+    std::map<std::string, std::vector<double>> timings;
 
-    std::unique_ptr<linpack::LinpackExecutionTimings> results(
-            new linpack::LinpackExecutionTimings{gefaExecutionTimes, geslExecutionTimes});
+    timings["gefa"] = gefaExecutionTimes;
+    timings["gesl"] = geslExecutionTimes;
 
     MPI_Barrier(MPI_COMM_WORLD);
-
-    return results;
+    return timings;
 }
 
 }  // namespace iec
diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp
index 51b9c546..e86600d2 100644
--- a/LINPACK/src/host/execution_types/execution_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_pcie.hpp
@@ -50,7 +50,7 @@ namespace pcie {
 
 @copydoc bm_execution::calculate()
 */
-std::unique_ptr<linpack::LinpackExecutionTimings>
+std::map<std::string, std::vector<double>>
 calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>& config,
           linpack::LinpackData& data) {
@@ -717,12 +717,14 @@ calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>& config,
     MPI_Comm_free(&row_communicator);
     MPI_Comm_free(&col_communicator);
 
-    std::unique_ptr<linpack::LinpackExecutionTimings> results(
-            new linpack::LinpackExecutionTimings{gefaExecutionTimes, geslExecutionTimes});
+    std::map<std::string, std::vector<double>> timings;
+
+    timings["gefa"] = gefaExecutionTimes;
+    timings["gesl"] = geslExecutionTimes;
 
     MPI_Barrier(MPI_COMM_WORLD);
 
-    return results;
+    return timings;
 }
 
 }  // namespace pcie
diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp
index f0cd9867..b4156d57 100644
--- a/LINPACK/src/host/linpack_benchmark.cpp
+++ b/LINPACK/src/host/linpack_benchmark.cpp
@@ -111,9 +111,8 @@ linpack::LinpackBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
         ("emulation", "Use kernel arguments for emulation. This may be necessary to simulate persistent local memory on the FPGA");
 }
 
-std::unique_ptr<linpack::LinpackExecutionTimings>
+void
 linpack::LinpackBenchmark::executeKernel(LinpackData &data) {
-    std::unique_ptr<linpack::LinpackExecutionTimings> timings;
     switch (executionSettings->programSettings->communicationType) {
         case hpcc_base::CommunicationType::pcie_mpi : timings = execution::pcie::calculate(*executionSettings, data); break;
         case hpcc_base::CommunicationType::intel_external_channels: timings = execution::iec::calculate(*executionSettings, data); break;
@@ -122,11 +121,10 @@ linpack::LinpackBenchmark::executeKernel(LinpackData &data) {
 #ifdef DISTRIBUTED_VALIDATION
     distributed_gesl_nopvt_ref(data);
 #endif
-    return timings;
 }
 
 void
-linpack::LinpackBenchmark::collectResults(const linpack::LinpackExecutionTimings &output) {
+linpack::LinpackBenchmark::collectResults() {
     // Calculate performance for kernel execution plus data transfer
     double t = 0;
     double tlu = 0;
@@ -139,10 +137,10 @@ linpack::LinpackBenchmark::collectResults() {
     std::cout << "Rank " << mpi_comm_rank << ": Result collection started" << std::endl;
 #endif
-    std::vector<double> global_lu_times(output.gefaTimings.size());
-    MPI_Reduce(output.gefaTimings.data(), global_lu_times.data(), output.gefaTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-    std::vector<double> global_sl_times(output.geslTimings.size());
-    MPI_Reduce(output.geslTimings.data(), global_sl_times.data(), output.geslTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    std::vector<double> global_lu_times(timings["gefa"].size());
+    MPI_Reduce(timings["gefa"].data(), global_lu_times.data(), timings["gefa"].size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    std::vector<double> global_sl_times(timings["gesl"].size());
+    MPI_Reduce(timings["gesl"].data(), global_sl_times.data(), timings["gesl"].size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
 #ifndef NDEBUG
     std::cout << "Rank " << mpi_comm_rank << ": Result collection done" << std::endl;
 #endif
@@ -187,10 +185,6 @@ linpack::LinpackBenchmark::collectResults() {
 
 void
 linpack::LinpackBenchmark::printResults() {
-    if (mpi_comm_rank > 0) {
-        return;
-    }
-
     std::cout << std::setw(ENTRY_SPACE)
               << "Method" << std::setw(ENTRY_SPACE)
               << "best" << std::setw(ENTRY_SPACE) << "mean"
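All execution types now hand back the same `std::map<std::string, std::vector<double>>` shape instead of per-benchmark timing classes. The payoff appears later in this patch, where the base class serializes all phases with a single `dump["timings"] = timings;`. The sketch below is not from the patch; the phase names and values are hypothetical, but it demonstrates the nlohmann::json container conversion that makes the one-liner work:

    // Hedged sketch: nested STL containers convert to JSON directly, so the
    // base class never needs to know the phase names ("gefa", "gesl", ...).
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>
    #include "nlohmann/json.hpp"

    using json = nlohmann::json;

    int main() {
        std::map<std::string, std::vector<double>> timings;
        timings["gefa"] = {0.51, 0.50, 0.52};  // hypothetical seconds per repetition
        timings["gesl"] = {0.11, 0.10, 0.10};

        json dump;
        dump["timings"] = timings;  // {"gefa":[0.51,0.5,0.52],"gesl":[0.11,0.1,0.1]}
        std::cout << dump.dump(2) << std::endl;
    }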
diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp
index adbae5ef..7c7ce315 100644
--- a/LINPACK/src/host/linpack_benchmark.hpp
+++ b/LINPACK/src/host/linpack_benchmark.hpp
@@ -210,7 +210,7 @@ class LinpackExecutionTimings {
  * @brief Implementation of the Linpack benchmark
  *
 */
-class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark<linpack::LinpackProgramSettings, linpack::LinpackData, linpack::LinpackExecutionTimings> {
+class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark<linpack::LinpackProgramSettings, linpack::LinpackData> {
 
 protected:
 
@@ -246,7 +246,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark<linpack::LinpackProgramSettings, linpack::LinpackData, linpack::LinpackExecutionTimings> {
      * @return std::unique_ptr<linpack::LinpackExecutionTimings> Measured runtimes of the kernel execution
      */
-    std::unique_ptr<linpack::LinpackExecutionTimings>
+    void
     executeKernel(LinpackData &data) override;
 
     /**
@@ -266,7 +266,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark<linpack::LinpackProgramSettings, linpack::LinpackData, linpack::LinpackExecutionTimings> {
      * @param output  The measurement data of the kernel execution
      */
     void
-    collectResults(const linpack::LinpackExecutionTimings &output) override;
+    collectResults() override;
 
     /**
diff --git a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp
--- a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp
@@ -43,7 +43,7 @@
 * Execution returns correct results for a single repetition
 */
 TEST_P(LinpackKernelTest, FPGACorrectResultsOneRepetition) {
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     for (int i = 0; i < array_size; i++) {
         EXPECT_NEAR(data->b[i], 1.0, 1.0e-3);
     }
@@ -50,7 +50,7 @@ TEST_P(LinpackKernelTest, FPGACorrectResultsOneRepetition) {
 * GEFA Execution returns correct results for a single repetition
 */
 TEST_P(LinpackKernelTest, DISABLED_FPGACorrectResultsGEFA) {
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     auto data2 = bm->generateInputData();
     if (bm->getExecutionSettings().programSettings->isDiagonallyDominant) {
         linpack::gefa_ref_nopvt(data2->A, array_size, array_size);
diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 56c060e9..20fecd43 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -282,7 +282,7 @@ class ExecutionSettings {
  * @tparam TData Class used to represent the benchmark input and output data
  * @tparam TOutput Class representing the measurements like timings etc
 */
-template <class TSettings, class TData, class TOutput, class TDevice = cl::Device, class TContext = cl::Context, class TProgram = cl::Program>
+template <class TSettings, class TData, class TDevice = cl::Device, class TContext = cl::Context, class TProgram = cl::Program>
 class HpccFpgaBenchmark {
 
 private:
@@ -332,10 +332,16 @@ class HpccFpgaBenchmark {
      */
     bool mpi_external_init = true;
 
+    /**
+     *
+     * @brief map containing the benchmark timings
+     *
+     */
+    std::map<std::string, std::vector<double>> timings;
 
     /**
      *
-     * @brief vector containing the benchmark results
+     * @brief map containing the benchmark results
      *
      */
     std::map<std::string, HpccResult> results;
@@ -357,7 +363,7 @@ class HpccFpgaBenchmark {
      * @param data The initialized data for the kernel. It will be replaced by the kernel output for validation
      * @return std::unique_ptr<TOutput> A data class containing the measurement results of the execution
      */
-    virtual std::unique_ptr<TOutput>
+    virtual void
     executeKernel(TData &data) = 0;
 
     /**
@@ -377,7 +383,7 @@ class HpccFpgaBenchmark {
      * @param output  The measurement data of the kernel execution
      */
     virtual void
-    collectResults(const TOutput &output) = 0;
+    collectResults() = 0;
 
     virtual void
     printResults() = 0;
@@ -508,6 +514,12 @@ class HpccFpgaBenchmark {
         return results_string;
     }
 
+    std::map<std::string, std::string> getEnvironmentMap() {
+        std::map<std::string, std::string> env;
+        env["LD_LIBRARY_PATH"] = std::getenv("LD_LIBRARY_PATH") ? std::getenv("LD_LIBRARY_PATH") : "";
+        return env;
+    }
+
     void
     dumpConfigurationAndResults(std::string file_path) {
         std::fstream fs;
@@ -516,10 +528,17 @@ class HpccFpgaBenchmark {
             std::cout << "Unable to open file for dumping configuration and results" << std::endl;
         } else {
             json dump;
-            std::string device_name = executionSettings->getDeviceName();
-            dump["device"] = device_name;
-            dump["settings"] = json(executionSettings->programSettings->getSettingsMap());
+            dump["name"] = PROGRAM_NAME;
+#ifdef _USE_MPI_
+            dump["mpi"] = {{"version", MPI_VERSION}, {"subversion", MPI_SUBVERSION}};
+#endif
+            dump["config_time"] = CONFIG_TIME;
+            dump["git_commit"] = GIT_COMMIT_HASH;
+            dump["device"] = executionSettings->getDeviceName();
+            dump["settings"] = executionSettings->programSettings->getSettingsMap();
+            dump["timings"] = timings;
             dump["results"] = getResultsMap();
+            dump["environment"] = getEnvironmentMap();
 
             fs << dump;
         }
@@ -639,7 +658,7 @@ class HpccFpgaBenchmark {
         bool validateSuccess = false;
         auto exe_start = std::chrono::high_resolution_clock::now();
-        std::unique_ptr<TOutput> output = executeKernel(*data);
+        executeKernel(*data);
 
 #ifdef _USE_MPI_
         MPI_Barrier(MPI_COMM_WORLD);
@@ -662,15 +681,15 @@ class HpccFpgaBenchmark {
                     std::cout << "Validation Time: " << eval_time.count() << " s" << std::endl;
                 }
             }
-            collectResults(*output);
+            collectResults();
 
-            if (executionSettings->programSettings->dumpfilePath.size() > 0) {
-                dumpConfigurationAndResults(executionSettings->programSettings->dumpfilePath);
-            }
+            if (mpi_comm_rank == 0) {
+                if (executionSettings->programSettings->dumpfilePath.size() > 0) {
+                    dumpConfigurationAndResults(executionSettings->programSettings->dumpfilePath);
+                }
 
-            printResults();
+                printResults();
 
-            if (mpi_comm_rank == 0) {
                 if (!validateSuccess) {
                     std::cerr << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" << std::endl;
                 }
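A note on the `getEnvironmentMap()` hunk above: the patch originally built the string with `std::string(std::getenv("LD_LIBRARY_PATH"))`, but `std::getenv` returns a null pointer when the variable is unset, and constructing a `std::string` from a null pointer is undefined behavior. The hunk therefore carries a null-safe one-liner; a more general helper could look like the sketch below, where `env_or_empty` is a hypothetical name, not part of the patch:

    // Hedged sketch of a null-safe environment lookup.
    #include <cstdlib>
    #include <iostream>
    #include <string>

    // Returns the variable's value, or an empty string if it is unset.
    std::string env_or_empty(const char *name) {
        const char *value = std::getenv(name);
        return value ? std::string(value) : std::string();
    }

    int main() {
        std::cout << "LD_LIBRARY_PATH=" << env_or_empty("LD_LIBRARY_PATH") << std::endl;
    }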
diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp
index 05489e17..c3cc7c2f 100644
--- a/shared/tests/hpcc_base_benchmark_test.cpp
+++ b/shared/tests/hpcc_base_benchmark_test.cpp
@@ -16,7 +16,7 @@
 // and enable the included tests
 void use_hpcc_base_lib() {}
 
-class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int> {
+class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int> {
 
 protected:
 
@@ -35,8 +35,8 @@ class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int> {
     std::unique_ptr<int> generateInputData() override { return returnInputData ? std::unique_ptr<int>(new int) : std::unique_ptr<int>(nullptr);}
 
-    std::unique_ptr<int>
-    executeKernel(int &data) override { return returnExecuteKernel ? std::unique_ptr<int>(new int) : std::unique_ptr<int>(nullptr);}
+    void
+    executeKernel(int &data) override { return;}
 
     bool
     validateOutputAndPrintError(int &data) override { return returnValidate;}
@@ -45,7 +45,7 @@ class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int> {
 
-    collectResults(const int &output) override {}
+    collectResults() override {}
 
-class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int> {
+class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int> {
 
 protected:
 
@@ -83,13 +83,13 @@ class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int> {
     std::unique_ptr<int> generateInputData() override { return std::unique_ptr<int>(new int);}
 
-    std::unique_ptr<int>
+    void
     executeKernel(int &data) override {
         if (!returnExecuteKernel) {
             throw fpga_setup::FpgaSetupException("Test execute kernel failed");
         }
         executeKernelcalled++;
-        return std::unique_ptr<int>(new int);}
+        return;}
 
     bool
     validateOutputAndPrintError(int &data) override {
@@ -97,7 +97,7 @@ class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int> {
-        return hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int>::checkInputParameters();
+        return hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int>::checkInputParameters();
     }
 }

From 55c05112a21c4b99224d2b9d063f8695796611eb Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Mon, 17 Oct 2022 13:43:10 +0200
Subject: [PATCH 202/318] output unit and value explicitly

---
 shared/include/hpcc_benchmark.hpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 20fecd43..410800e8 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -57,10 +57,10 @@ using json = nlohmann::json;
 namespace hpcc_base {
 
 class HpccResult {
+public:
     double value;
     std::string unit;
 
-public:
     HpccResult(double value, std::string unit): value(value), unit(unit) {}
 
     friend std::ostream &operator<<(std::ostream &os, const HpccResult &result) {
@@ -505,11 +505,14 @@ class HpccFpgaBenchmark {
         std::cout << *executionSettings << std::endl;
     }
 
-    std::map<std::string, std::string> getResultsMap() {
+    std::map<std::string, json> getResultsJson() {
         // TODO: nested maps, recursive?
-        std::map<std::string, std::string> results_string;
+        std::map<std::string, json> results_string;
         for (auto const &result: results) {
-            results_string[result.first] = result.second.to_string();
+            json j;
+            j["unit"] = result.second.unit;
+            j["value"] = result.second.value;
+            results_string[result.first] = j;
         }
         return results_string;
     }
@@ -537,7 +540,7 @@ class HpccFpgaBenchmark {
             dump["device"] = executionSettings->getDeviceName();
             dump["settings"] = executionSettings->programSettings->getSettingsMap();
             dump["timings"] = timings;
-            dump["results"] = getResultsMap();
+            dump["results"] = getResultsJson();
             dump["environment"] = getEnvironmentMap();
 
             fs << dump;

From d26f3770b9cd806a658829f9dd1238944dfdebc8 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Mon, 17 Oct 2022 13:43:32 +0200
Subject: [PATCH 203/318] output correct unit in json

---
 shared/include/hpcc_benchmark.hpp | 38 +++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 410800e8..e7f6a664 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -517,11 +517,45 @@ class HpccFpgaBenchmark {
         return results_string;
     }
 
-    std::map<std::string, std::string> getEnvironmentMap() {
+    std::map<std::string, std::string>
+    getEnvironmentMap() {
         std::map<std::string, std::string> env;
         env["LD_LIBRARY_PATH"] = std::getenv("LD_LIBRARY_PATH") ? std::getenv("LD_LIBRARY_PATH") : "";
         return env;
     }
+
+    json
+    parseFPGATorusString(std::string str) {
+        json j;
+        size_t space = str.find(" ");
+        std::string p_str = str.substr(0, space);
+        std::string q_str = str.substr(space, str.size());
+        j["P"] = stoi(p_str.substr(p_str.find("=") + 1, p_str.find(",")));
+        j["Q"] = stoi(q_str.substr(q_str.find("=") + 1, q_str.size()));
+        return j;
+    }
+
+    std::map<std::string, json>
+    jsonifySettingsMap(std::map<std::string, std::string> settings_map) {
+        json j;
+        for (const auto& item: settings_map) {
+            std::string key = item.first;
+            std::string value = item.second;
+            try {
+                int value_int = stoi(value);
+                j[key] = value_int;
+            } catch (std::invalid_argument const &ex) {
+                if (key == "FPGA Torus") {
+                    j[key] = parseFPGATorusString(value);
+                } else if (key == "Emulate") {
+                    j[key] = value == "Yes";
+                } else {
+                    j[key] = value;
+                }
+            }
+        }
+        return j;
+    }
 
     void
     dumpConfigurationAndResults(std::string file_path) {
@@ -538,7 +572,7 @@ class HpccFpgaBenchmark {
             dump["device"] = executionSettings->getDeviceName();
-            dump["settings"] = executionSettings->programSettings->getSettingsMap();
+            dump["settings"] = jsonifySettingsMap(executionSettings->programSettings->getSettingsMap());
             dump["timings"] = timings;
             dump["results"] = getResultsJson();
             dump["environment"] = getEnvironmentMap();
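To see what `parseFPGATorusString()` actually does, the following standalone trace replicates it on the input format the key name suggests, assumed here to be "P=<int>, Q=<int>" (the exact rendering is defined elsewhere in the PTRANS host code). Note that `substr`'s second argument is a length, not an end position; the `find()` results merely happen to be large enough, and `stoi` stops at the trailing comma, so the parse works but is fragile:

    // Standalone trace of the torus-string parsing above; input is assumed.
    #include <iostream>
    #include <string>

    int main() {
        std::string str = "P=2, Q=3";
        size_t space = str.find(" ");                      // index 4
        std::string p_str = str.substr(0, space);          // "P=2,"
        std::string q_str = str.substr(space, str.size()); // " Q=3"
        int p = std::stoi(p_str.substr(p_str.find("=") + 1, p_str.find(","))); // stoi("2,") -> 2
        int q = std::stoi(q_str.substr(q_str.find("=") + 1, q_str.size()));    // stoi("3")  -> 3
        std::cout << "P=" << p << " Q=" << q << std::endl; // P=2 Q=3
    }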
results.emplace("tlu_min", hpcc_base::HpccResult(lu_min, "s")); results.emplace("tsl_mean", hpcc_base::HpccResult(tsl / global_sl_times.size(), "s")); @@ -190,22 +190,20 @@ linpack::LinpackBenchmark::printResults() { << "best" << std::setw(ENTRY_SPACE) << "mean" << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; - /* std::cout << std::setw(ENTRY_SPACE) << "total" << std::setw(ENTRY_SPACE) - << results["t_min"] << std::setw(ENTRY_SPACE) << results["t_mean"] - << std::setw(ENTRY_SPACE) << results["gflops"] + << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean") + << std::setw(ENTRY_SPACE) << results.at("gflops") << std::endl; std::cout << std::setw(ENTRY_SPACE) << "GEFA" << std::setw(ENTRY_SPACE) - << results["tlu_min"] << std::setw(ENTRY_SPACE) << results["tlu_mean"] - << std::setw(ENTRY_SPACE) << results["gflops_lu"] + << results.at("tlu_min") << std::setw(ENTRY_SPACE) << results.at("tlu_mean") + << std::setw(ENTRY_SPACE) << results.at("gflops_lu") << std::endl; std::cout << std::setw(ENTRY_SPACE) << "GESL" << std::setw(ENTRY_SPACE) - << results["tsl_min"] << std::setw(ENTRY_SPACE) << results["tsl_mean"] - << std::setw(ENTRY_SPACE) << results["gflops_sl"] + << results.at("tsl_min") << std::setw(ENTRY_SPACE) << results.at("tsl_mean") + << std::setw(ENTRY_SPACE) << results.at("gflops_sl") << std::endl; - */ } std::unique_ptr From c70f32a602720fd825cbc9be6feaec8e72e68525 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 17 Oct 2022 14:51:15 +0200 Subject: [PATCH 205/318] add version to dump --- shared/include/hpcc_benchmark.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index e7f6a664..60291bec 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -571,6 +571,7 @@ class HpccFpgaBenchmark { #endif dump["config_time"] = CONFIG_TIME; dump["git_commit"] = GIT_COMMIT_HASH; + dump["version"] = VERSION; dump["device"] = executionSettings->getDeviceName(); dump["settings"] = jsonifySettingsMap(executionSettings->programSettings->getSettingsMap()); dump["timings"] = timings; From 791831f89ef26004cc8d1f49b23e401f438a8e07 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Tue, 18 Oct 2022 15:11:18 +0200 Subject: [PATCH 206/318] add dump for GEMM and FFT --- FFT/src/common/parameters.h.in | 3 +- FFT/src/host/execution.h | 2 +- FFT/src/host/execution_default.cpp | 11 +++--- FFT/src/host/fft_benchmark.cpp | 29 +++++++++------ FFT/src/host/fft_benchmark.hpp | 9 +++-- FFT/tests/test_execution_functionality.cpp | 24 ++++++------- GEMM/src/common/parameters.h.in | 6 ++-- GEMM/src/host/execution.h | 4 +-- GEMM/src/host/execution_default.cpp | 10 +++--- GEMM/src/host/gemm_benchmark.cpp | 36 +++++++++++-------- GEMM/src/host/gemm_benchmark.hpp | 23 +++--------- ...nel_functionality_and_host_integration.cpp | 20 +++++------ shared/include/hpcc_benchmark.hpp | 7 +++- 13 files changed, 98 insertions(+), 86 deletions(-) diff --git a/FFT/src/common/parameters.h.in b/FFT/src/common/parameters.h.in index 52a87a98..57c85c61 100644 --- a/FFT/src/common/parameters.h.in +++ b/FFT/src/common/parameters.h.in @@ -27,7 +27,8 @@ Short description of the program. Moreover the version and build time is also compiled into the description. 
*/ -#define PROGRAM_DESCRIPTION "Implementation of the FFT benchmark"\ +#define PROGRAM_NAME "FFT" +#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\ " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" diff --git a/FFT/src/host/execution.h b/FFT/src/host/execution.h index 2d588ded..fa44dc38 100644 --- a/FFT/src/host/execution.h +++ b/FFT/src/host/execution.h @@ -45,7 +45,7 @@ simple exchange of the different calculation methods. @return The resulting matrix */ - std::unique_ptr + std::map> calculate(hpcc_base::ExecutionSettings const& config, std::complex* data, std::complex* data_out, unsigned iterations, bool inverse); } // namespace bm_execution diff --git a/FFT/src/host/execution_default.cpp b/FFT/src/host/execution_default.cpp index 59a81f87..d0d565da 100644 --- a/FFT/src/host/execution_default.cpp +++ b/FFT/src/host/execution_default.cpp @@ -44,7 +44,7 @@ namespace bm_execution { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::unique_ptr + std::map> calculate(hpcc_base::ExecutionSettings const& config, std::complex* data, std::complex* data_out, @@ -210,10 +210,11 @@ namespace bm_execution { ASSERT_CL(err) #endif } - std::unique_ptr result(new fft::FFTExecutionTimings{ - calculationTimings - }); - return result; + std::map> timings; + + timings["calculation"] = calculationTimings; + + return timings; } } // namespace bm_execution diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index cf7ad994..ff0710ef 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -86,37 +86,44 @@ fft::FFTBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { ("inverse", "If set, the inverse FFT is calculated instead"); } -std::unique_ptr +void fft::FFTBenchmark::executeKernel(FFTData &data) { - return bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations, + timings = bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations, executionSettings->programSettings->inverse); } void -fft::FFTBenchmark::collectAndPrintResults(const fft::FFTExecutionTimings &output) { +fft::FFTBenchmark::collectResults() { double gflop = static_cast(5 * (1 << LOG_FFT_SIZE) * LOG_FFT_SIZE) * executionSettings->programSettings->iterations * 1.0e-9 * mpi_comm_size; - uint number_measurements = output.timings.size(); + uint number_measurements = timings["calculation"].size(); std::vector avg_measures(number_measurements); #ifdef _USE_MPI_ // Copy the object variable to a local variable to make it accessible to the lambda function int mpi_size = mpi_comm_size; - MPI_Reduce(output.timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + MPI_Reduce(timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); std::for_each(avg_measures.begin(),avg_measures.end(), [mpi_size](double& x) {x /= mpi_size;}); #else - std::copy(output.timings.begin(), output.timings.end(), avg_measures.begin()); + std::copy(timings["calculation"].begin(), timings["calculation"].end(), avg_measures.begin()); #endif if (mpi_comm_rank == 0) { double minTime = *min_element(avg_measures.begin(), avg_measures.end()); double avgTime = accumulate(avg_measures.begin(), avg_measures.end(), 0.0) / avg_measures.size(); + results.emplace("t_min", hpcc_base::HpccResult(minTime / 
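The fft_benchmark.cpp hunk that follows keeps the usual 5·N·log2(N) FLOP estimate for a radix-2 FFT: gflop = 5 · N · log2(N) · iterations · 1e-9 · ranks. A worked example, not taken from the patch (LOG_FFT_SIZE, the iteration count, and the timing are all assumed values), makes the magnitudes concrete:

    // Worked example for the FFT performance model used below; all inputs assumed.
    #include <iostream>

    int main() {
        const int LOG_FFT_SIZE = 12;   // assumed: N = 4096
        const int iterations = 100;    // assumed
        const int mpi_comm_size = 1;   // assumed single rank
        double gflop = static_cast<double>(5 * (1 << LOG_FFT_SIZE) * LOG_FFT_SIZE)
                       * iterations * 1.0e-9 * mpi_comm_size;     // 0.024576 GFLOP
        double minTime = 2.5e-4;       // assumed best total time in seconds
        std::cout << gflop / minTime << " GFLOP/s" << std::endl;  // ~98.3 GFLOP/s
    }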
diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp
index cf7ad994..ff0710ef 100644
--- a/FFT/src/host/fft_benchmark.cpp
+++ b/FFT/src/host/fft_benchmark.cpp
@@ -86,37 +86,44 @@ fft::FFTBenchmark::addAdditionalParseOptions(cxxopts::Options &options) {
         ("inverse", "If set, the inverse FFT is calculated instead");
 }
 
-std::unique_ptr<fft::FFTExecutionTimings>
+void
 fft::FFTBenchmark::executeKernel(FFTData &data) {
-    return bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations,
+    timings = bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations,
                                     executionSettings->programSettings->inverse);
 }
 
 void
-fft::FFTBenchmark::collectAndPrintResults(const fft::FFTExecutionTimings &output) {
+fft::FFTBenchmark::collectResults() {
     double gflop = static_cast<double>(5 * (1 << LOG_FFT_SIZE) * LOG_FFT_SIZE) * executionSettings->programSettings->iterations * 1.0e-9 * mpi_comm_size;
 
-    uint number_measurements = output.timings.size();
+    uint number_measurements = timings["calculation"].size();
     std::vector<double> avg_measures(number_measurements);
 #ifdef _USE_MPI_
     // Copy the object variable to a local variable to make it accessible to the lambda function
     int mpi_size = mpi_comm_size;
-    MPI_Reduce(output.timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(timings["calculation"].data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
     std::for_each(avg_measures.begin(),avg_measures.end(), [mpi_size](double& x) {x /= mpi_size;});
 #else
-    std::copy(output.timings.begin(), output.timings.end(), avg_measures.begin());
+    std::copy(timings["calculation"].begin(), timings["calculation"].end(), avg_measures.begin());
 #endif
     if (mpi_comm_rank == 0) {
         double minTime = *min_element(avg_measures.begin(), avg_measures.end());
         double avgTime = accumulate(avg_measures.begin(), avg_measures.end(), 0.0) / avg_measures.size();
+        results.emplace("t_min", hpcc_base::HpccResult(minTime / (executionSettings->programSettings->iterations * executionSettings->programSettings->kernelReplications), "s"));
+        results.emplace("t_avg", hpcc_base::HpccResult(avgTime / (executionSettings->programSettings->iterations * executionSettings->programSettings->kernelReplications), "s"));
+        results.emplace("gflops_min", hpcc_base::HpccResult(gflop / minTime, "GFLOP/s"));
+        results.emplace("gflops_avg", hpcc_base::HpccResult(gflop / avgTime, "GFLOP/s"));
+    }
+}
 
+void
+fft::FFTBenchmark::printResults() {
     std::cout << std::setw(ENTRY_SPACE) << " " << std::setw(ENTRY_SPACE) << "avg"
               << std::setw(ENTRY_SPACE) << "best" << std::endl;
-    std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << std::setw(ENTRY_SPACE) << avgTime / (executionSettings->programSettings->iterations * executionSettings->programSettings->kernelReplications)
-              << std::setw(ENTRY_SPACE) << minTime / (executionSettings->programSettings->iterations * executionSettings->programSettings->kernelReplications) << std::endl;
-    std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << std::setw(ENTRY_SPACE) << gflop / avgTime
-              << std::setw(ENTRY_SPACE) << gflop / minTime << std::endl;
-    }
+    std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << std::setw(ENTRY_SPACE) << results.at("t_avg")
+              << std::setw(ENTRY_SPACE) << results.at("t_min") << std::endl;
+    std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << std::setw(ENTRY_SPACE) << results.at("gflops_avg")
+              << std::setw(ENTRY_SPACE) << results.at("gflops_min") << std::endl;
 }
 
 std::unique_ptr<fft::FFTData>
diff --git a/FFT/src/host/fft_benchmark.hpp b/FFT/src/host/fft_benchmark.hpp
index 4ee82f12..99fd3458 100644
--- a/FFT/src/host/fft_benchmark.hpp
+++ b/FFT/src/host/fft_benchmark.hpp
@@ -137,7 +137,7 @@ class FFTExecutionTimings {
  * @brief Implementation of the FFT benchmark
  *
 */
-class FFTBenchmark : public hpcc_base::HpccFpgaBenchmark<fft::FFTProgramSettings, fft::FFTData, fft::FFTExecutionTimings> {
+class FFTBenchmark : public hpcc_base::HpccFpgaBenchmark<fft::FFTProgramSettings, fft::FFTData> {
 
 protected:
 
@@ -165,7 +165,7 @@ class FFTBenchmark : public hpcc_base::HpccFpgaBenchmark<fft::FFTProgramSettings, fft::FFTData, fft::FFTExecutionTimings> {
      * @return std::unique_ptr<fft::FFTExecutionTimings> Measured runtimes of the kernel execution
      */
-    std::unique_ptr<fft::FFTExecutionTimings>
+    void
     executeKernel(FFTData &data) override;
 
     /**
@@ -184,7 +184,10 @@ class FFTBenchmark : public hpcc_base::HpccFpgaBenchmark<fft::FFTProgramSettings, fft::FFTData, fft::FFTExecutionTimings> {
      */
     void
-    collectAndPrintResults(const fft::FFTExecutionTimings &output) override;
+    collectResults() override;
+
+    void
+    printResults() override;
 
     /**
diff --git a/FFT/tests/test_execution_functionality.cpp b/FFT/tests/test_execution_functionality.cpp
--- a/FFT/tests/test_execution_functionality.cpp
+++ b/FFT/tests/test_execution_functionality.cpp
@@ -33,8 +33,8 @@
 TEST_F(FFTKernelTest, CalculateReturnsCorrectExecutionResultFor11False) {
     bm->getExecutionSettings().programSettings->numRepetitions = 1;
     data = bm->generateInputData();
-    auto result = bm->executeKernel(*data);
-    EXPECT_EQ(1, result->timings.size());
+    bm->executeKernel(*data);
+    EXPECT_EQ(1, bm->getTimingsMap().at("calculation").size());
 }
 
 /**
@@ -44,8 +44,8 @@ TEST_F(FFTKernelTest, CalculateReturnsCorrectExecutionResultFor11False) {
 TEST_F(FFTKernelTest, CalculateReturnsCorrectExecutionResultFor24True) {
     bm->getExecutionSettings().programSettings->numRepetitions = 2;
     data = bm->generateInputData();
-    auto result = bm->executeKernel(*data);
-    EXPECT_EQ(2, result->timings.size());
+    bm->executeKernel(*data);
+    EXPECT_EQ(2, bm->getTimingsMap().at("calculation").size());
 }
 
 /**
@@ -56,7 +56,7 @@ TEST_F(FFTKernelTest, FFTReturnsZero) {
         data->data[i].real(0.0);
         data->data[i].imag(0.0);
     }
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
         EXPECT_FLOAT_EQ(std::abs(data->data_out[i]), 0.0);
     }
@@ -71,7 +71,7 @@ TEST_F(FFTKernelTest, FFTCloseToZeroForAll1And1) {
         data->data[i].real(1.0);
         data->data[i].imag(1.0);
     }
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     EXPECT_NEAR(data->data_out[0].real(), (1 << LOG_FFT_SIZE), 0.00001);
     EXPECT_NEAR(data->data_out[0].imag(), (1 << LOG_FFT_SIZE), 0.00001);
     for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
@@ -88,7 +88,7 @@ TEST_F(FFTKernelTest, FFTCloseToZeroForAll0And0) {
         data->data[i].real(0.0);
         data->data[i].imag(0.0);
     }
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     for (int i=0; i < (1 << LOG_FFT_SIZE); i++) {
         EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001);
         EXPECT_NEAR(data->data[i].imag(), 0.0, 0.00001);
@@ -104,7 +104,7 @@ TEST_F(FFTKernelTest, IFFTCloseToZeroForAll1And1) {
         data->data[i].real(1.0);
         data->data[i].imag(0.0);
     }
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     EXPECT_NEAR(data->data_out[0].real(), static_cast<HOST_DATA_TYPE>(1 << LOG_FFT_SIZE), 0.00001);
     EXPECT_NEAR(data->data_out[0].imag(), 0.0, 0.00001);
     for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
@@ -119,7 +119,7 @@ TEST_F(FFTKernelTest, IFFTCloseToZeroForAll1And1) {
 TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) {
     auto verify_data = bm->generateInputData();
 
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
 
     // Normalize iFFT result
     for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
@@ -135,7 +135,7 @@ TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) {
     }
 
     bm->getExecutionSettings().programSettings->inverse = true;
-    auto result2 = bm->executeKernel(*data);
+    bm->executeKernel(*data);
 
     // Since data was already sorted by iFFT the bit reversal of the kernel has to be undone
     fft::bit_reverse(data->data_out, 1);
@@ -150,7 +150,7 @@ TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) {
 TEST_F(FFTKernelTest, FPGAFFTAndCPUFFTGiveSameResults) {
     auto verify_data = bm->generateInputData();
 
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
 
     fft::fourier_transform_gold(false,LOG_FFT_SIZE,verify_data->data);
     fft::bit_reverse(verify_data->data, 1);
@@ -171,7 +171,7 @@ TEST_F(FFTKernelTest, FPGAiFFTAndCPUiFFTGiveSameResults) {
     auto verify_data = bm->generateInputData();
 
     bm->getExecutionSettings().programSettings->inverse = true;
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
 
     fft::fourier_transform_gold(true,LOG_FFT_SIZE,verify_data->data);
     fft::bit_reverse(verify_data->data, 1);
diff --git a/GEMM/src/common/parameters.h.in b/GEMM/src/common/parameters.h.in
index 3e35bf01..82ca5a25 100644
--- a/GEMM/src/common/parameters.h.in
+++ b/GEMM/src/common/parameters.h.in
@@ -29,7 +29,9 @@
 
 /*
 Short description of the program
 */
-#define PROGRAM_DESCRIPTION "Implementation of the GEMM benchmark"\
+#define PROGRAM_NAME "GEMM"
+
+#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\
                             " proposed in the HPCC benchmark adapted for FPGA\n"\
                             "Version: " VERSION "\n"
 
@@ -49,4 +51,4 @@
 Output separator
 
 #endif
 #endif
-#endif // SRC_COMMON_PARAMETERS_H_
\ No newline at end of file
+#endif // SRC_COMMON_PARAMETERS_H_
diff --git a/GEMM/src/host/execution.h b/GEMM/src/host/execution.h
index 9446c16f..c4ce1412 100644
--- a/GEMM/src/host/execution.h
+++ b/GEMM/src/host/execution.h
@@ -48,9 +48,9 @@ simple exchange of the different calculation methods.
                   execution in number of items
 @param blockSize  Size of a block that is calculated by the kernel
 
-@return The time measurements and the error rate counted from the executions
+@return The time measurements
 */
-std::unique_ptr<gemm::GEMMExecutionTimings>
+std::map<std::string, std::vector<double>>
 calculate(hpcc_base::ExecutionSettings<gemm::GEMMProgramSettings> const& config,
           HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, HOST_DATA_TYPE* c_out, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta);
 }  // namespace bm_execution
 
diff --git a/GEMM/src/host/execution_default.cpp b/GEMM/src/host/execution_default.cpp
index aa89d258..e608a35a 100644
--- a/GEMM/src/host/execution_default.cpp
+++ b/GEMM/src/host/execution_default.cpp
@@ -42,7 +42,7 @@ namespace bm_execution {
 
     @copydoc bm_execution::calculate()
 */
-std::unique_ptr<gemm::GEMMExecutionTimings>
+std::map<std::string, std::vector<double>>
 calculate(hpcc_base::ExecutionSettings<gemm::GEMMProgramSettings> const& config,
           HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, HOST_DATA_TYPE* c_out, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta) {
@@ -257,10 +257,10 @@ calculate(hpcc_base::ExecutionSettings<gemm::GEMMProgramSettings> const& config,
     }
 #endif
 
-    std::unique_ptr<gemm::GEMMExecutionTimings> results(
-            new gemm::GEMMExecutionTimings{executionTimes});
-    return results;
+    std::map<std::string, std::vector<double>> timings;
+
+    timings["execution"] = executionTimes;
+    return timings;
 }
 
 }  // namespace bm_execution
diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp
index 8910aacf..141ea160 100644
--- a/GEMM/src/host/gemm_benchmark.cpp
+++ b/GEMM/src/host/gemm_benchmark.cpp
@@ -99,29 +99,25 @@ gemm::GEMMBenchmark::addAdditionalParseOptions(cxxopts::Options &options) {
         ("replicate-inputs", "Also replicates the input buffer for each kernel");
 }
 
-std::unique_ptr<gemm::GEMMExecutionTimings>
+void
 gemm::GEMMBenchmark::executeKernel(GEMMData &data) {
-    return bm_execution::calculate(*executionSettings, data.A, data.B, data.C, data.C_out, data.alpha, data.beta);
+    timings = bm_execution::calculate(*executionSettings, data.A, data.B, data.C, data.C_out, data.alpha, data.beta);
 }
 
 void
-gemm::GEMMBenchmark::collectAndPrintResults(const gemm::GEMMExecutionTimings &output) {
+gemm::GEMMBenchmark::collectResults() {
 
-    uint number_measurements = output.timings.size();
+    uint number_measurements = timings.at("execution").size();
     std::vector<double> avg_measures(number_measurements);
 #ifdef _USE_MPI_
     // Copy the object variable to a local variable to make it accessible to the lambda function
     int mpi_size = mpi_comm_size;
-    MPI_Reduce(output.timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(timings.at("execution").data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
     std::for_each(avg_measures.begin(),avg_measures.end(), [mpi_size](double& x) {x /= mpi_size;});
 #else
-    std::copy(output.timings.begin(), output.timings.end(), avg_measures.begin());
+    std::copy(timings.at("execution").begin(), timings.at("execution").end(), avg_measures.begin());
 #endif
     if (mpi_comm_rank == 0) {
-        std::cout << std::setw(ENTRY_SPACE)
-                  << "best" << std::setw(ENTRY_SPACE) << "mean"
-                  << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl;
-
         // Calculate performance for kernel execution
         double tmean = 0;
         double tmin = std::numeric_limits<double>::max();
@@ -136,14 +132,24 @@ gemm::GEMMBenchmark::collectResults() {
             }
         }
         tmean = tmean / avg_measures.size();
-
-        std::cout << std::setw(ENTRY_SPACE)
-                  << tmin << std::setw(ENTRY_SPACE) << tmean
-                  << std::setw(ENTRY_SPACE) << gflops / tmin
-                  << std::endl;
+        results.emplace("t_mean", hpcc_base::HpccResult(tmean, "s"));
+        results.emplace("t_min", hpcc_base::HpccResult(tmin, "s"));
+        results.emplace("gflops", hpcc_base::HpccResult(gflops / tmin, "GFLOP/s"));
     }
 }
 
+void
+gemm::GEMMBenchmark::printResults() {
+    std::cout << std::setw(ENTRY_SPACE)
+              << "best" << std::setw(ENTRY_SPACE) << "mean"
+              << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl;
+
+    std::cout << std::setw(ENTRY_SPACE)
+              << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean")
+              << std::setw(ENTRY_SPACE) << results.at("gflops")
+              << std::endl;
+}
+
 std::unique_ptr<gemm::GEMMData>
 gemm::GEMMBenchmark::generateInputData() {
     auto d = std::unique_ptr<gemm::GEMMData>(new gemm::GEMMData(*executionSettings->context, executionSettings->programSettings->matrixSize));
diff --git a/GEMM/src/host/gemm_benchmark.hpp b/GEMM/src/host/gemm_benchmark.hpp
index fde2e2ae..534a5bab 100644
--- a/GEMM/src/host/gemm_benchmark.hpp
+++ b/GEMM/src/host/gemm_benchmark.hpp
@@ -170,25 +170,11 @@ class GEMMData {
 
 };
 
-/**
- * @brief Measured execution timing from the kernel execution
- *
- */
-class GEMMExecutionTimings {
-public:
-    /**
-     * @brief A vector containing the timings for all repetitions for the kernel execution
-     *
-     */
-    std::vector<double> timings;
-
-};
-
 /**
  * @brief Implementation of the GEMM benchmark
  *
 */
-class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark<gemm::GEMMProgramSettings, gemm::GEMMData, gemm::GEMMExecutionTimings> {
+class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark<gemm::GEMMProgramSettings, gemm::GEMMData> {
 
 protected:
 
@@ -203,7 +189,7 @@ class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark<gemm::GEMMProgramSettings, gemm::GEMMData, gemm::GEMMExecutionTimings> {
      * @return std::unique_ptr<GEMMData> The input and output data of the benchmark
      */
@@ -216,7 +202,7 @@ class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark<gemm::GEMMProgramSettings, gemm::GEMMData, gemm::GEMMExecutionTimings> {
      * @return std::unique_ptr<GEMMExecutionTimings> Measured runtimes of the kernel execution
      */
-    std::unique_ptr<gemm::GEMMExecutionTimings>
+    void
     executeKernel(GEMMData &data) override;
 
     /**
@@ -229,13 +215,14 @@ class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark<gemm::GEMMProgramSettings, gemm::GEMMData, gemm::GEMMExecutionTimings> {
      */
     void
-    collectAndPrintResults(const gemm::GEMMExecutionTimings &output) override;
+    collectResults() override;
+
+    void
+    printResults() override;
 
     /**
diff --git a/GEMM/tests/test_kernel_functionality_and_host_integration.cpp b/GEMM/tests/test_kernel_functionality_and_host_integration.cpp
--- a/GEMM/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/GEMM/tests/test_kernel_functionality_and_host_integration.cpp
@@ -43,8 +43,8 @@
 */
TEST_P(GEMMKernelTest, FPGACorrectAmulB) { } data->alpha = 1.0; data->beta = 1.0; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE c_ref_out[matrix_size * matrix_size]; ref_matmul(data->A,data->B,c_ref_out,matrix_size); @@ -150,7 +150,7 @@ TEST_P(GEMMKernelTest, FPGACorrectCplusA) { data->alpha = 1.0; data->beta = 1.0; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { EXPECT_FLOAT_EQ(data->C_out[i * matrix_size + j], data->A[i * matrix_size + j] + data->C[i * matrix_size + j]); @@ -165,7 +165,7 @@ TEST_P(GEMMKernelTest, FPGACorrectCplusA) { TEST_P(GEMMKernelTest, FPGACorrectbetaCplusalphaAB) { HOST_DATA_TYPE c_ref_out[matrix_size * matrix_size]; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { c_ref_out[i * matrix_size + j] = data->C[i * matrix_size + j]; diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 60291bec..4c8454b1 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -505,6 +505,11 @@ class HpccFpgaBenchmark { std::cout << *executionSettings << std::endl; } + std::map> + getTimingsMap() { + return timings; + } + std::map getResultsJson() { // TODO: nested maps, recursive? std::map results_string; @@ -547,7 +552,7 @@ class HpccFpgaBenchmark { } catch (std::invalid_argument const &ex) { if (key == "FPGA Torus") { j[key] = parseFPGATorusString(value); - } else if (key == "Emulate") { + } else if (key == "Emulate" || key == "Replicate Inputs") { j[key] = value == "Yes"; } else { j[key] = value; From 14ad00434dc786c0864c23192c7a626f3479e524 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Tue, 18 Oct 2022 16:10:06 +0200 Subject: [PATCH 207/318] add dump for transpose --- PTRANS/src/common/parameters.h.in | 3 +- .../host/execution_types/execution_cpu.hpp | 10 +-- .../host/execution_types/execution_intel.hpp | 13 ++- .../execution_types/execution_intel_pq.hpp | 11 ++- .../host/execution_types/execution_pcie.hpp | 10 +-- .../execution_types/execution_pcie_pq.hpp | 12 ++- PTRANS/src/host/transpose_benchmark.cpp | 85 ++++++++++--------- PTRANS/src/host/transpose_benchmark.hpp | 10 ++- PTRANS/src/host/transpose_data.hpp | 20 ----- PTRANS/tests/test_host_functionality.cpp | 15 ++-- ...nel_functionality_and_host_integration.cpp | 10 +-- shared/include/hpcc_benchmark.hpp | 5 ++ 12 files changed, 98 insertions(+), 106 deletions(-) diff --git a/PTRANS/src/common/parameters.h.in b/PTRANS/src/common/parameters.h.in index 68b50dd7..2f5f95b3 100644 --- a/PTRANS/src/common/parameters.h.in +++ b/PTRANS/src/common/parameters.h.in @@ -33,7 +33,8 @@ Short description of the program. Moreover the version and build time is also compiled into the description. 
 */
-#define PROGRAM_DESCRIPTION "Implementation of the matrix transposition benchmark"\
+#define PROGRAM_NAME "matrix transposition"
+#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\
                             " proposed in the HPCC benchmark suite for FPGA.\n"\
                             "Version: " VERSION "\n"
 
diff --git a/PTRANS/src/host/execution_types/execution_cpu.hpp b/PTRANS/src/host/execution_types/execution_cpu.hpp
index ab74fdc9..a2775809 100644
--- a/PTRANS/src/host/execution_types/execution_cpu.hpp
+++ b/PTRANS/src/host/execution_types/execution_cpu.hpp
@@ -50,7 +50,7 @@ namespace transpose
      * @param data data object that contains all required data for the execution
      * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
      */
-    static std::unique_ptr<transpose::TransposeExecutionTimings>
+    static std::map<std::string, std::vector<double>>
     calculate(const hpcc_base::ExecutionSettings<transpose::TransposeProgramSettings> &config, transpose::TransposeData &data,
               transpose::data_handler::TransposeDataHandler &handler) {
         int err;
@@ -115,10 +115,10 @@ namespace transpose
             transferTimings.push_back(transferTime.count());
         }
 
-        std::unique_ptr<transpose::TransposeExecutionTimings> result(new transpose::TransposeExecutionTimings{
-            transferTimings,
-            calculationTimings});
-        return result;
+        std::map<std::string, std::vector<double>> timings;
+        timings["transfer"] = transferTimings;
+        timings["calculation"] = calculationTimings;
+        return timings;
     }
 
 }  // namespace bm_execution
diff --git a/PTRANS/src/host/execution_types/execution_intel.hpp b/PTRANS/src/host/execution_types/execution_intel.hpp
index d95bf578..fc752d0f 100644
--- a/PTRANS/src/host/execution_types/execution_intel.hpp
+++ b/PTRANS/src/host/execution_types/execution_intel.hpp
@@ -40,9 +40,9 @@ namespace intel {
  *
  * @param config The program configuration
  * @param data data object that contains all required data for the execution on the FPGA
- * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
+ * @return std::map<std::string, std::vector<double>> The measured execution times
 */
-static std::unique_ptr<transpose::TransposeExecutionTimings>
+static std::map<std::string, std::vector<double>>
 calculate(const hpcc_base::ExecutionSettings<transpose::TransposeProgramSettings>& config, transpose::TransposeData& data) {
     int err;
@@ -264,11 +264,10 @@ static std::unique_ptr<transpose::TransposeExecutionTimings>
         transferTimings.push_back(transferTime.count());
     }
 
-    std::unique_ptr<transpose::TransposeExecutionTimings> result(new transpose::TransposeExecutionTimings{
-        transferTimings,
-        calculationTimings
-    });
-    return result;
+    std::map<std::string, std::vector<double>> timings;
+    timings["transfer"] = transferTimings;
+    timings["calculation"] = calculationTimings;
+    return timings;
 }
 
 }  // namespace transpose
diff --git a/PTRANS/src/host/execution_types/execution_intel_pq.hpp b/PTRANS/src/host/execution_types/execution_intel_pq.hpp
index 431ff40d..8dcc080e 100644
--- a/PTRANS/src/host/execution_types/execution_intel_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_intel_pq.hpp
@@ -43,7 +43,7 @@ namespace intel_pq {
  * @param data data object that contains all required data for the execution on the FPGA
  * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
 */
-static std::unique_ptr<transpose::TransposeExecutionTimings>
+static std::map<std::string, std::vector<double>>
 calculate(const hpcc_base::ExecutionSettings<transpose::TransposeProgramSettings>& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) {
     int err;
@@ -343,11 +343,10 @@ static std::unique_ptr<transpose::TransposeExecutionTimings>
         transferTimings.push_back(transferTime.count());
     }
 
-    std::unique_ptr<transpose::TransposeExecutionTimings> result(new transpose::TransposeExecutionTimings{
-        transferTimings,
-        calculationTimings
-    });
-    return result;
+    std::map<std::string, std::vector<double>> timings;
+    timings["transfer"] = transferTimings;
+    timings["calculation"] = calculationTimings;
+    return timings;
 }
 
 }  // namespace transpose
diff --git a/PTRANS/src/host/execution_types/execution_pcie.hpp b/PTRANS/src/host/execution_types/execution_pcie.hpp
index 5e29ad2e..aa0d589f 100644
--- a/PTRANS/src/host/execution_types/execution_pcie.hpp
+++ b/PTRANS/src/host/execution_types/execution_pcie.hpp
@@ -48,7 +48,7 @@ namespace transpose
      * @param handler data handler instance that should be used to exchange data between hosts
     * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
      */
-    static std::unique_ptr<transpose::TransposeExecutionTimings>
+    static std::map<std::string, std::vector<double>>
     calculate(const hpcc_base::ExecutionSettings<transpose::TransposeProgramSettings> &config, transpose::TransposeData &data,
               transpose::data_handler::TransposeDataHandler &handler) {
         int err;
@@ -227,10 +227,10 @@ namespace transpose
             transferTimings.push_back(transferTime.count());
         }
 
-        std::unique_ptr<transpose::TransposeExecutionTimings> result(new transpose::TransposeExecutionTimings{
-            transferTimings,
-            calculationTimings});
-        return result;
+        std::map<std::string, std::vector<double>> timings;
+        timings["transfer"] = transferTimings;
+        timings["calculation"] = calculationTimings;
+        return timings;
     }
 
 }  // namespace bm_execution
diff --git a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp
index d2cfae7e..6be472a4 100644
--- a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp
@@ -44,7 +44,7 @@ namespace pcie_pq {
  * @param handler data handler instance that should be used to exchange data between hosts
 * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
 */
-static std::unique_ptr<transpose::TransposeExecutionTimings>
+static std::map<std::string, std::vector<double>>
 calculate(const hpcc_base::ExecutionSettings<transpose::TransposeProgramSettings>& config, transpose::TransposeData& data,
           transpose::data_handler::DistributedPQTransposeDataHandler &handler) {
     int err;
@@ -366,12 +366,10 @@ static std::unique_ptr<transpose::TransposeExecutionTimings>
         transferTimings.push_back(transferTime.count());
     }
 
-    std::unique_ptr<transpose::TransposeExecutionTimings> result(new transpose::TransposeExecutionTimings{
-        transferTimings,
-        calculationTimings
-    });
-
-    return result;
+    std::map<std::string, std::vector<double>> timings;
+    timings["transfer"] = transferTimings;
+    timings["calculation"] = calculationTimings;
+    return timings;
 }
 
 }  // namespace transpose
diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp
index 755b11a0..1c2682e3 100644
--- a/PTRANS/src/host/transpose_benchmark.cpp
+++ b/PTRANS/src/host/transpose_benchmark.cpp
@@ -65,22 +65,22 @@ transpose::TransposeBenchmark::addAdditionalParseOptions(cxxopts::Options &optio
         cxxopts::value<std::string>()->default_value(DEFAULT_DIST_TYPE));
 }
 
-std::unique_ptr<transpose::TransposeExecutionTimings>
+void
 transpose::TransposeBenchmark::executeKernel(TransposeData &data) {
     switch (executionSettings->programSettings->communicationType) {
         case hpcc_base::CommunicationType::intel_external_channels:
             if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) {
-                return transpose::fpga_execution::intel::calculate(*executionSettings, data);
+                timings = transpose::fpga_execution::intel::calculate(*executionSettings, data);
             }
             else {
-                return transpose::fpga_execution::intel_pq::calculate(*executionSettings, data, reinterpret_cast<transpose::data_handler::DistributedPQTransposeDataHandler&>(*dataHandler));
+                timings = transpose::fpga_execution::intel_pq::calculate(*executionSettings, data, reinterpret_cast<transpose::data_handler::DistributedPQTransposeDataHandler&>(*dataHandler));
            } break;
         case hpcc_base::CommunicationType::pcie_mpi :
             if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) {
-                return transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler);
+                timings = transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler);
            }
            else {
-                return transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, reinterpret_cast<transpose::data_handler::DistributedPQTransposeDataHandler&>(*dataHandler));
+                timings = transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, reinterpret_cast<transpose::data_handler::DistributedPQTransposeDataHandler&>(*dataHandler));
            } break;
 #ifdef MKL_FOUND
-        case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break;
+        case hpcc_base::CommunicationType::cpu_only : timings = transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break;
 #endif
@@ -90,63 +90,70 @@ transpose::TransposeBenchmark::executeKernel(TransposeData &data) {
 }
 
 void
-transpose::TransposeBenchmark::collectAndPrintResults(const transpose::TransposeExecutionTimings &output) {
+transpose::TransposeBenchmark::collectResults() {
     double flops = static_cast<double>(executionSettings->programSettings->matrixSize) * executionSettings->programSettings->matrixSize;
 
     // Number of experiment repetitions
-    uint number_measurements = output.calculationTimings.size();
+    uint number_measurements = timings.at("calculation").size();
     std::vector<double> max_measures(number_measurements);
     std::vector<double> max_transfers(number_measurements);
 #ifdef _USE_MPI_
     // Copy the object variable to a local variable to make it accessible to the lambda function
     int mpi_size = mpi_comm_size;
-    MPI_Reduce(output.calculationTimings.data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-    MPI_Reduce(output.transferTimings.data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    MPI_Reduce(timings.at("calculation").data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    MPI_Reduce(timings.at("transfer").data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
 #else
-    std::copy(output.calculationTimings.begin(), output.calculationTimings.end(), max_measures.begin());
-    std::copy(output.transferTimings.begin(), output.transferTimings.end(), max_transfers.begin());
+    std::copy(timings.at("calculation").begin(), timings.at("calculation").end(), max_measures.begin());
+    std::copy(timings.at("transfer").begin(), timings.at("transfer").end(), max_transfers.begin());
 #endif
 
     double avgCalculationTime = accumulate(max_measures.begin(), max_measures.end(), 0.0) / max_measures.size();
+    results.emplace("avg_calc_t", hpcc_base::HpccResult(avgCalculationTime, "s"));
+
     double minCalculationTime = *min_element(max_measures.begin(), max_measures.end());
+    results.emplace("min_calc_t", hpcc_base::HpccResult(minCalculationTime, "s"));
 
     double avgTransferTime = accumulate(max_transfers.begin(), max_transfers.end(), 0.0) / max_transfers.size();
-    double minTransferTime = *min_element(max_transfers.begin(), max_transfers.end());
-
-    double avgCalcFLOPS = flops / avgCalculationTime;
-    double maxCalcFLOPS = flops / minCalculationTime;
-    double avgMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime;
-    double maxMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime;
-    double avgTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime;
-    double maxTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime;
-
+    results.emplace("avg_transfer_t", hpcc_base::HpccResult(avgTransferTime, "s"));
+
+    double minTransferTime = *min_element(max_transfers.begin(), max_transfers.end());
+    results.emplace("min_transfer_t", hpcc_base::HpccResult(minTransferTime, "s"));
+
+    results.emplace("avg_t", hpcc_base::HpccResult(avgCalculationTime + avgTransferTime, "s"));
+    results.emplace("min_t", hpcc_base::HpccResult(minCalculationTime + minTransferTime, "s"));
+
+    results.emplace("avg_calc_flops", hpcc_base::HpccResult(flops / avgCalculationTime, "FLOP/s"));
+    results.emplace("max_calc_flops", hpcc_base::HpccResult(flops / minCalculationTime, "FLOP/s"));
+    results.emplace("avg_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime, "B/s"));
+    results.emplace("max_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime, "B/s"));
+    results.emplace("avg_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime, "B/s"));
+    results.emplace("max_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime, "B/s"));
+}
 
-    if (mpi_comm_rank == 0) {
-        std::cout << "       total [s]    transfer [s]  calc [s]      calc FLOPS    Mem [B/s]     PCIe [B/s]" << std::endl;
-        std::cout << "avg:   " << (avgTransferTime + avgCalculationTime)
-                  << "   " << avgTransferTime
-                  << "   " << avgCalculationTime
-                  << "   " << avgCalcFLOPS
-                  << "   " << avgMemBandwidth
-                  << "   " << avgTransferBandwidth
-                  << std::endl;
-        std::cout << "best:  " << (minTransferTime + minCalculationTime)
-                  << "   " << minTransferTime
-                  << "   " << minCalculationTime
-                  << "   " << maxCalcFLOPS
-                  << "   " << maxMemBandwidth
-                  << "   " << maxTransferBandwidth
-                  << std::endl;
-    }
+void
+transpose::TransposeBenchmark::printResults() {
+    std::cout << "       total [s]    transfer [s]  calc [s]      calc FLOPS    Mem [B/s]     PCIe [B/s]" << std::endl;
+    std::cout << "avg:   " << results.at("avg_t")
+              << "   " << results.at("avg_transfer_t")
+              << "   " << results.at("avg_calc_t")
+              << "   " << results.at("avg_calc_flops")
+              << "   " << results.at("avg_mem_bandwidth")
+              << "   " << results.at("avg_transfer_bandwidth")
+              << std::endl;
+    std::cout << "best:  " << results.at("min_t")
+              << "   " << results.at("min_transfer_t")
+              << "   " << results.at("min_calc_t")
+              << "   " << results.at("max_calc_flops")
+              << "   " << results.at("max_mem_bandwidth")
+              << "   " << results.at("max_transfer_bandwidth")
+              << std::endl;
 }
 
 std::unique_ptr<transpose::TransposeData>
 transpose::TransposeBenchmark::generateInputData() {
     return dataHandler->generateData(*executionSettings);
 }
 
 bool
diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp
index 5de333ca..cd595637 100644
--- a/PTRANS/src/host/transpose_benchmark.hpp
+++ b/PTRANS/src/host/transpose_benchmark.hpp
@@ -46,7 +46,7 @@ namespace transpose {
 * @brief Implementation of the transpose benchmark
 *
 */
-class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark<transpose::TransposeProgramSettings, transpose::TransposeData, transpose::TransposeExecutionTimings> {
+class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark<transpose::TransposeProgramSettings, transpose::TransposeData> {
 
 protected:
 
@@ -81,9 +81,8 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark<transpose::TransposeProgramSettings, transpose::TransposeData, transpose::TransposeExecutionTimings> {
      * @param data The data that is used as input and output
-     * @return std::unique_ptr<transpose::TransposeExecutionTimings> Measured runtimes of the kernel execution
      */
-    std::unique_ptr<transpose::TransposeExecutionTimings>
+    void
     executeKernel(TransposeData &data) override;
 
     /**
@@ -102,7 +101,10 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark<transpose::TransposeProgramSettings, transpose::TransposeData, transpose::TransposeExecutionTimings> {
      */
     void
-    collectAndPrintResults(const transpose::TransposeExecutionTimings &output) override;
+    collectResults() override;
+
+    void
+    printResults() override;
 
     /**
diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp
--- a/PTRANS/src/host/transpose_data.hpp
+++ b/PTRANS/src/host/transpose_data.hpp
-/**
- * @brief Measured execution timing from the kernel execution
- *
- */
-class TransposeExecutionTimings {
-public:
-    /**
-     * @brief A vector containing the timings for all repetitions for the data transfer
-     *
-     */
-    std::vector<double> transferTimings;
-
-    /**
-     * @brief A vector containing the timings for all repetitions for the calculation
-     *
-     */
-    std::vector<double> calculationTimings;
-
-};
-
 }
 
 #endif
bm->addTimings("calculation", calculateTimings); // Redirect stout buffer to local buffer to make checks possible std::stringstream newStdOutBuffer; std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - bm->collectAndPrintResults(*results); + bm->collectResults(); + bm->printResults(); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); @@ -50,8 +50,8 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatValues) { std::vector calculateTimings; transferTimings.push_back(1.0); calculateTimings.push_back(1.0); - std::shared_ptr results( - new transpose::TransposeExecutionTimings{transferTimings, calculateTimings}); + bm->addTimings("transfer", transferTimings); + bm->addTimings("calculation", calculateTimings); // Redirect stout buffer to local buffer to make checks possible @@ -59,7 +59,8 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatValues) { std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - bm->collectAndPrintResults(*results); + bm->collectResults(); + bm->printResults(); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); diff --git a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp index d7bc0c7f..985a0698 100644 --- a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp +++ b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp @@ -195,12 +195,12 @@ TEST_F(TransposeKernelTest, FPGAAAndBAreSummedUp4Blocks) { */ TEST_F(TransposeKernelTest, FPGATimingsMeasuredForEveryIteration) { bm->getExecutionSettings().programSettings->numRepetitions = 10; - auto result = bm->executeKernel(*data); - EXPECT_EQ(result->calculationTimings.size(), 10); - EXPECT_EQ(result->transferTimings.size(), 10); + bm->executeKernel(*data); + EXPECT_EQ(bm->getTimingsMap().at("calculation").size(), 10); + EXPECT_EQ(bm->getTimingsMap().at("transfer").size(), 10); for (int t = 0; t < 10; t++) { - EXPECT_GE(result->transferTimings[t], 0.0); - EXPECT_GE(result->calculationTimings[t], 0.0); + EXPECT_GE(bm->getTimingsMap().at("transfer")[t], 0.0); + EXPECT_GE(bm->getTimingsMap().at("calculation")[t], 0.0); } } diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 4c8454b1..68127eab 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -510,6 +510,11 @@ class HpccFpgaBenchmark { return timings; } + void + addTimings(std::string key, std::vector value) { + timings.emplace(key, value); + } + std::map getResultsJson() { // TODO: nested maps, recursive? std::map results_string; From 8ab9c86484b6ea682a4196a43ecd52de7c6aad0f Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Tue, 18 Oct 2022 21:02:58 +0200 Subject: [PATCH 208/318] add dump for RandomAccess --- RandomAccess/src/common/parameters.h.in | 6 +-- RandomAccess/src/host/execution.h | 2 +- RandomAccess/src/host/execution_single.cpp | 9 ++-- .../src/host/random_access_benchmark.cpp | 50 ++++++++++--------- .../src/host/random_access_benchmark.hpp | 24 +++------ ...nel_functionality_and_host_integration.cpp | 10 ++-- 6 files changed, 48 insertions(+), 53 deletions(-) diff --git a/RandomAccess/src/common/parameters.h.in b/RandomAccess/src/common/parameters.h.in index 837d3c74..a47f850e 100644 --- a/RandomAccess/src/common/parameters.h.in +++ b/RandomAccess/src/common/parameters.h.in @@ -35,8 +35,8 @@ Short description of the program. 
 Moreover the version and build time is also compiled into the description.
 */
-
-#define PROGRAM_DESCRIPTION "Implementation of the random access benchmark"\
+#define PROGRAM_NAME "random access"
+#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\
 " proposed in the HPCC benchmark suite for FPGA.\n"\
 "Version: " VERSION "\n"
 
@@ -62,4 +62,4 @@ Output separator
 
 
-#endif // SRC_COMMON_PARAMETERS_H_
\ No newline at end of file
+#endif // SRC_COMMON_PARAMETERS_H_
diff --git a/RandomAccess/src/host/execution.h b/RandomAccess/src/host/execution.h
index 88cf6736..51d1796d 100644
--- a/RandomAccess/src/host/execution.h
+++ b/RandomAccess/src/host/execution.h
@@ -40,7 +40,7 @@ namespace bm_execution {
  * @param data The data that is used as input and output of the random accesses
  * @return std::unique_ptr The measured runtimes of the kernel
  */
-std::unique_ptr<random_access::RandomAccessExecutionTimings>
+std::map<std::string, std::vector<double>>
 calculate(hpcc_base::ExecutionSettings<random_access::RandomAccessProgramSettings> const& config, HOST_DATA_TYPE * data, int mpi_rank, int mpi_size);
 
 } // namespace bm_execution
diff --git a/RandomAccess/src/host/execution_single.cpp b/RandomAccess/src/host/execution_single.cpp
index 486234bf..d4718083 100644
--- a/RandomAccess/src/host/execution_single.cpp
+++ b/RandomAccess/src/host/execution_single.cpp
@@ -40,7 +40,7 @@ namespace bm_execution {
     Implementation for the single kernel.
     @copydoc bm_execution::calculate()
     */
-    std::unique_ptr<random_access::RandomAccessExecutionTimings>
+    std::map<std::string, std::vector<double>>
    calculate(hpcc_base::ExecutionSettings<random_access::RandomAccessProgramSettings> const& config, HOST_DATA_TYPE * data, int mpi_rank, int mpi_size) {
         // int used to check for OpenCL errors
         int err;
@@ -204,7 +204,10 @@ namespace bm_execution {
 
         free(random_inits);
 
-        return std::unique_ptr<random_access::RandomAccessExecutionTimings>(new random_access::RandomAccessExecutionTimings{executionTimes});
-    }
+        std::map<std::string, std::vector<double>> timings;
+
+        timings["execution"] = executionTimes;
+        return timings;
+    }
 
 } // namespace bm_execution
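The map keyed by measurement name is what every refactored calculate() now hands back. A minimal sketch of reducing such a map on the caller side (settings, data, rank and size are placeholders for the real arguments; requires <algorithm> and <numeric>):

    std::map<std::string, std::vector<double>> timings =
        bm_execution::calculate(settings, data, rank, size);
    // one entry per repetition under the key used in execution_single.cpp
    const std::vector<double>& runs = timings.at("execution");
    double t_min  = *std::min_element(runs.begin(), runs.end());
    double t_mean = std::accumulate(runs.begin(), runs.end(), 0.0) / runs.size();

This is exactly the reduction the reworked collectResults() in the next diff performs before wrapping the values into HpccResult entries.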
diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp
index e51e1fe2..a5f06303 100644
--- a/RandomAccess/src/host/random_access_benchmark.cpp
+++ b/RandomAccess/src/host/random_access_benchmark.cpp
@@ -87,45 +87,49 @@ random_access::RandomAccessBenchmark::addAdditionalParseOptions(cxxopts::Options &options) {
             cxxopts::value<uint>()->default_value(std::to_string(HPCC_FPGA_RA_RNG_COUNT_LOG)));
 }
 
-std::unique_ptr<random_access::RandomAccessExecutionTimings>
+void
 random_access::RandomAccessBenchmark::executeKernel(RandomAccessData &data) {
-    return bm_execution::calculate(*executionSettings, data.data, mpi_comm_rank, mpi_comm_size);
+    timings = bm_execution::calculate(*executionSettings, data.data, mpi_comm_rank, mpi_comm_size);
 }
 
 void
-random_access::RandomAccessBenchmark::collectAndPrintResults(const random_access::RandomAccessExecutionTimings &output) {
+random_access::RandomAccessBenchmark::collectResults() {
 
-    std::vector<double> avgTimings(output.times.size());
+    std::vector<double> avgTimings(timings.at("execution").size());
 #ifdef _USE_MPI_
     // Copy the object variable to a local variable to make it accessible to the lambda function
     int mpi_size = mpi_comm_size;
-    MPI_Reduce(output.times.data(),avgTimings.data(),output.times.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-    std::for_each(avgTimings.begin(),avgTimings.end(), [mpi_size](double& x) {x /= mpi_size;});
+    MPI_Reduce(timings.at("execution").data(), avgTimings.data(),timings.at("execution").size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    std::for_each(avgTimings.begin(), avgTimings.end(), [mpi_size](double& x) {x /= mpi_size;});
 #else
-    std::copy(output.times.begin(), output.times.end(), avgTimings.begin());
+    std::copy(timings.at("execution").begin(), timings.at("execution").end(), avgTimings.begin());
 #endif
 
-    if (mpi_comm_rank == 0) {
+    // Calculate performance for kernel execution
+    double tmean = 0;
+    double tmin = std::numeric_limits<double>::max();
+    double gups = static_cast<double>(4 * executionSettings->programSettings->dataSize * mpi_comm_size) / 1000000000;
+    for (double currentTime : avgTimings) {
+        tmean += currentTime;
+        if (currentTime < tmin) {
+            tmin = currentTime;
+        }
+    }
+    tmean = tmean / timings.at("execution").size();
+
+    results.emplace("t_min", hpcc_base::HpccResult(tmin, "s"));
+    results.emplace("t_mean", hpcc_base::HpccResult(tmean, "s"));
+    results.emplace("guops", hpcc_base::HpccResult(gups / tmin, "GUOP/s"));
+}
+
+void random_access::RandomAccessBenchmark::printResults() {
         std::cout << std::setw(ENTRY_SPACE) << "best"
                 << std::setw(ENTRY_SPACE) << "mean"
                 << std::setw(ENTRY_SPACE) << "GUOPS" << std::endl;
 
-        // Calculate performance for kernel execution
-        double tmean = 0;
-        double tmin = std::numeric_limits<double>::max();
-        double gups = static_cast<double>(4 * executionSettings->programSettings->dataSize * mpi_comm_size) / 1000000000;
-        for (double currentTime : avgTimings) {
-            tmean += currentTime;
-            if (currentTime < tmin) {
-                tmin = currentTime;
-            }
-        }
-        tmean = tmean / output.times.size();
-
         std::cout << std::setw(ENTRY_SPACE)
-                << tmin << std::setw(ENTRY_SPACE) << tmean
-                << std::setw(ENTRY_SPACE) << gups / tmin
+                << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean")
+                << std::setw(ENTRY_SPACE) << results.at("guops")
                 << std::endl;
-    }
 }
 
 bool
diff --git a/RandomAccess/src/host/random_access_benchmark.hpp b/RandomAccess/src/host/random_access_benchmark.hpp
index 393c9b53..56c7ff40 100644
--- a/RandomAccess/src/host/random_access_benchmark.hpp
+++ b/RandomAccess/src/host/random_access_benchmark.hpp
@@ -114,25 +114,11 @@ class RandomAccessData {
 
 };
 
-/**
- * @brief Measured execution timing from the kernel execution
- *
- */
-class RandomAccessExecutionTimings {
-public:
-    /**
-     * @brief A vector containing the timings for all repetitions
-     *
-     */
-    std::vector<double> times;
-
-};
-
 /**
  * @brief Implementation of the random access benchmark
  *
  */
-class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark<RandomAccessProgramSettings, RandomAccessData, RandomAccessExecutionTimings> {
+class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark<RandomAccessProgramSettings, RandomAccessData> {
 
 protected:
 
@@ -158,9 +144,8 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark<RandomAccessProgramSettings, RandomAccessData> {
      */
-    std::unique_ptr<random_access::RandomAccessExecutionTimings>
+    void
     executeKernel(RandomAccessData &data) override;
 
     /**
@@ -179,7 +164,10 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark<RandomAccessProgramSettings, RandomAccessData> {
      */
     void
-    collectAndPrintResults(const RandomAccessExecutionTimings &output) override;
+    collectResults() override;
+
+    void
+    printResults() override;
 
diff --git a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
--- a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
@@ -28,8 +28,8 @@
 */
 TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements1Rep) {
-    auto result = bm->executeKernel( *data);
-    EXPECT_EQ(result->times.size(), 1);
+    bm->executeKernel( *data);
+    EXPECT_EQ(bm->getTimingsMap().at("execution").size(), 1);
 }
 
 /**
@@ -37,15 +37,15 @@
 */
 TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements3Rep) {
     bm->getExecutionSettings().programSettings->numRepetitions = 3;
-    auto result = bm->executeKernel(*data);
-    EXPECT_EQ(result->times.size(), 3);
+    bm->executeKernel(*data);
+    EXPECT_EQ(bm->getTimingsMap().at("execution").size(), 3);
 }
 
 /**
 * Execution returns correct results for a single repetition
 */
 TEST_F(RandomAccessKernelTest, FPGAErrorBelow1Percent) {
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     bool success = bm->validateOutputAndPrintError(*data);
     EXPECT_TRUE(success);
 }
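The same refactoring recipe repeats for each benchmark in this series: the per-benchmark ExecutionTimings class disappears, executeKernel() stores its raw measurements in the protected timings map of the base class, and the old collectAndPrintResults() is split into two overridable hooks. A minimal sketch of the resulting host-side call sequence, using only names introduced by these patches (error handling omitted):

    bm->executeKernel(*data);                     // fills the timings map
    bm->collectResults();                         // reduces timings into HpccResult entries in results
    bm->printResults();                           // formats results.at(...) for stdout
    bm->dumpConfigurationAndResults("out.json");  // serializes settings, timings and results as JSON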
From 58a06c4378f1d4d602f8b5218eb2032c570b8682 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Tue, 18 Oct 2022 21:55:01 +0200
Subject: [PATCH 209/318] add dump for STREAM

---
 STREAM/src/common/parameters.h.in             |  5 +-
 STREAM/src/host/execution.hpp                 |  8 ++-
 STREAM/src/host/execution_default.cpp         | 10 +---
 STREAM/src/host/stream_benchmark.cpp          | 57 ++++++++++---------
 STREAM/src/host/stream_benchmark.hpp          | 30 ++--------
 ...nel_functionality_and_host_integration.cpp |  4 +-
 6 files changed, 50 insertions(+), 64 deletions(-)

diff --git a/STREAM/src/common/parameters.h.in b/STREAM/src/common/parameters.h.in
index 57bb0d0a..8d822247 100644
--- a/STREAM/src/common/parameters.h.in
+++ b/STREAM/src/common/parameters.h.in
@@ -33,7 +33,8 @@
 #cmakedefine USE_SVM
 #cmakedefine USE_HBM
 
-#define PROGRAM_DESCRIPTION "Implementation of the STREAM benchmark"\
+#define PROGRAM_NAME "STREAM"
+#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\
 " proposed in the HPCC benchmark suite for FPGA.\n"\
 "Version: " VERSION "\n"
 
@@ -48,4 +49,4 @@ Output separator
 
 #define TRIAD_KERNEL_TYPE 3
 
-#endif // SRC_COMMON_PARAMETERS_H_
\ No newline at end of file
+#endif // SRC_COMMON_PARAMETERS_H_
diff --git a/STREAM/src/host/execution.hpp b/STREAM/src/host/execution.hpp
index 70d6f948..d3e1c31b 100644
--- a/STREAM/src/host/execution.hpp
+++ b/STREAM/src/host/execution.hpp
@@ -35,13 +35,15 @@ SOFTWARE.
 #include "half.hpp"
 
 // Map keys for execution timings
-#define PCIE_WRITE_KEY "PCI write"
-#define PCIE_READ_KEY "PCI read"
+#define PCIE_WRITE_KEY "PCI_write"
+#define PCIE_READ_KEY "PCI_read"
 #define COPY_KEY "Copy"
 #define SCALE_KEY "Scale"
 #define ADD_KEY "Add"
 #define TRIAD_KEY "Triad"
 
+const std::string keys[] = {PCIE_WRITE_KEY, PCIE_READ_KEY, COPY_KEY, SCALE_KEY, ADD_KEY, TRIAD_KEY};
+
 namespace bm_execution {
 
 static std::map<std::string, double> multiplicatorMap = {
@@ -62,7 +64,7 @@ namespace bm_execution {
      * @param C The array C of the stream benchmark
      * @return std::unique_ptr The measured timings for all stream operations
      */
-    std::unique_ptr<stream::StreamExecutionTimings>
+    std::map<std::string, std::vector<double>>
     calculate(const hpcc_base::ExecutionSettings<stream::StreamProgramSettings>& config,
               HOST_DATA_TYPE* A,
               HOST_DATA_TYPE* B,
diff --git a/STREAM/src/host/execution_default.cpp b/STREAM/src/host/execution_default.cpp
index 71a4d04f..a8cc5d83 100644
--- a/STREAM/src/host/execution_default.cpp
+++ b/STREAM/src/host/execution_default.cpp
@@ -67,7 +67,7 @@ namespace bm_execution {
     Implementation for the single kernel.
@copydoc bm_execution::calculate() */ - std::unique_ptr + std::map> calculate(const hpcc_base::ExecutionSettings& config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, @@ -105,7 +105,7 @@ namespace bm_execution { add_kernels, triad_kernels, command_queues); } if (!success) { - return std::unique_ptr(nullptr); + return std::map>(); } // @@ -331,11 +331,7 @@ namespace bm_execution { } - std::unique_ptr result(new stream::StreamExecutionTimings{ - timingMap, - config.programSettings->streamArraySize - }); - return result; + return timingMap; } bool initialize_queues_and_kernels(const hpcc_base::ExecutionSettings &config, diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index 4dac0ea0..07da82b3 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -102,19 +102,18 @@ stream::StreamBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { ("multi-kernel", "Use the legacy multi kernel implementation"); } -std::unique_ptr +void stream::StreamBenchmark::executeKernel(StreamData &data) { - return bm_execution::calculate(*executionSettings, + timings = bm_execution::calculate(*executionSettings, data.A, data.B, data.C); } void -stream::StreamBenchmark::collectAndPrintResults(const stream::StreamExecutionTimings &output) { - +stream::StreamBenchmark::collectResults() { std::map> totalTimingsMap; - for (auto v : output.timings) { + for (auto v : timings) { // Number of experiment repetitions uint number_measurements = v.second.size(); // create a new @@ -127,29 +126,35 @@ stream::StreamBenchmark::collectAndPrintResults(const stream::StreamExecutionTim #else std::copy(v.second.begin(), v.second.end(), avg_measures.begin()); #endif - totalTimingsMap.insert({v.first,avg_measures}); - } - if (mpi_comm_rank == 0) { - std::cout << std::setw(ENTRY_SPACE) << "Function"; - std::cout << std::setw(ENTRY_SPACE) << "Best Rate MB/s"; - std::cout << std::setw(ENTRY_SPACE) << "Avg time s"; - std::cout << std::setw(ENTRY_SPACE) << "Min time" ; - std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::endl; + double minTime = *min_element(v.second.begin(), v.second.end()); + double avgTime = accumulate(v.second.begin(), v.second.end(), 0.0) + / v.second.size(); + double maxTime = *max_element(v.second.begin(), v.second.end()); + + double bestRate = (static_cast(sizeof(HOST_DATA_TYPE)) * executionSettings->programSettings->streamArraySize * bm_execution::multiplicatorMap[v.first] / minTime) * 1.0e-6 * mpi_comm_size; + + results.emplace(v.first + "_min_t", hpcc_base::HpccResult(minTime, "s")); + results.emplace(v.first + "_avg_t", hpcc_base::HpccResult(avgTime, "s")); + results.emplace(v.first + "_max_t", hpcc_base::HpccResult(maxTime, "s")); + results.emplace(v.first + "_best_rate", hpcc_base::HpccResult(bestRate, "MB/s")); + } +} - for (auto v : totalTimingsMap) { - double minTime = *min_element(v.second.begin(), v.second.end()); - double avgTime = accumulate(v.second.begin(), v.second.end(), 0.0) - / v.second.size(); - double maxTime = *max_element(v.second.begin(), v.second.end()); +void +stream::StreamBenchmark::printResults() { + std::cout << std::setw(ENTRY_SPACE) << "Function"; + std::cout << std::setw(ENTRY_SPACE) << "Best Rate"; + std::cout << std::setw(ENTRY_SPACE) << "Avg time"; + std::cout << std::setw(ENTRY_SPACE) << "Min time" ; + std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << v.first; - std::cout << std::setw(ENTRY_SPACE) - << (static_cast(sizeof(HOST_DATA_TYPE)) * 
output.arraySize * bm_execution::multiplicatorMap[v.first] / minTime) * 1.0e-6 * mpi_comm_size - << std::setw(ENTRY_SPACE) << avgTime - << std::setw(ENTRY_SPACE) << minTime - << std::setw(ENTRY_SPACE) << maxTime << std::endl; - } + for (auto key : keys) { + std::cout << std::setw(ENTRY_SPACE) << key; + std::cout << std::setw(ENTRY_SPACE) << results.at(key + "_best_rate") + << std::setw(ENTRY_SPACE) << results.at(key + "_avg_t") + << std::setw(ENTRY_SPACE) << results.at(key + "_min_t") + << std::setw(ENTRY_SPACE) << results.at(key + "_max_t") << std::endl; } } @@ -265,4 +270,4 @@ stream::StreamBenchmark::validateOutputAndPrintError(stream::StreamData &data) { return false; } return true; -} \ No newline at end of file +} diff --git a/STREAM/src/host/stream_benchmark.hpp b/STREAM/src/host/stream_benchmark.hpp index 401a899d..8377b744 100644 --- a/STREAM/src/host/stream_benchmark.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -127,30 +127,11 @@ class StreamData { }; -/** - * @brief Measured execution timing from the kernel execution - * - */ -class StreamExecutionTimings { -public: - /** - * @brief A map containing the timings for all stream operation types - * - */ - std::map> timings; - - /** - * @brief The used array size - * - */ - uint arraySize; -}; - /** * @brief Implementation of the Sream benchmark * */ -class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark { +class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark { protected: @@ -176,9 +157,8 @@ class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark */ - std::unique_ptr + void executeKernel( StreamData &data) override; /** @@ -194,10 +174,12 @@ class StreamBenchmark : public hpcc_base::HpccFpgaBenchmarkgetExecutionSettings().programSettings->numRepetitions = 1; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < bm->getExecutionSettings().programSettings->streamArraySize; i++) { EXPECT_FLOAT_EQ(data->A[i], 30.0); EXPECT_FLOAT_EQ(data->B[i], 6.0); @@ -42,7 +42,7 @@ TEST_F(StreamKernelTest, FPGACorrectResultsOneRepetition) { */ TEST_F(StreamKernelTest, FPGACorrectResultsThreeRepetition) { bm->getExecutionSettings().programSettings->numRepetitions = 3; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < bm->getExecutionSettings().programSettings->streamArraySize; i++) { EXPECT_FLOAT_EQ(data->A[i], 6750.0); EXPECT_FLOAT_EQ(data->B[i], 1350.0); From eddef646d30f566bdaf975333f3e06a31da7634b Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 24 Oct 2022 17:29:29 +0200 Subject: [PATCH 210/318] add dump for b_eff --- b_eff/src/common/parameters.h.in | 4 +- b_eff/src/host/execution.h | 2 +- .../host/execution_types/execution_cpu.hpp | 7 +- .../host/execution_types/execution_iec.hpp | 7 +- .../host/execution_types/execution_pcie.hpp | 7 +- b_eff/src/host/network_benchmark.cpp | 89 ++++++++++--------- b_eff/src/host/network_benchmark.hpp | 66 ++++++++++---- ...nel_functionality_and_host_integration.cpp | 40 ++++----- shared/include/hpcc_benchmark.hpp | 7 +- 9 files changed, 131 insertions(+), 98 deletions(-) diff --git a/b_eff/src/common/parameters.h.in b/b_eff/src/common/parameters.h.in index d404bfd7..a334f7db 100644 --- a/b_eff/src/common/parameters.h.in +++ b/b_eff/src/common/parameters.h.in @@ -29,7 +29,9 @@ Short description of the program. Moreover the version and build time is also compiled into the description. 
*/ -#define PROGRAM_DESCRIPTION "Implementation of the effective bandwidth benchmark"\ +#define PROGRAM_NAME "effective bandwidth" + +#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\ " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" diff --git a/b_eff/src/host/execution.h b/b_eff/src/host/execution.h index 195b97b1..f43c31de 100644 --- a/b_eff/src/host/execution.h +++ b/b_eff/src/host/execution.h @@ -44,7 +44,7 @@ simple exchange of the different calculation methods. @return The resulting matrix */ - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData); } // namespace bm_execution diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp index 778dc2f1..ec37dcb6 100644 --- a/b_eff/src/host/execution_types/execution_cpu.hpp +++ b/b_eff/src/host/execution_types/execution_cpu.hpp @@ -38,7 +38,7 @@ namespace network::execution_types::cpu { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { @@ -105,12 +105,11 @@ namespace network::execution_types::cpu { err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); ASSERT_CL(err); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution diff --git a/b_eff/src/host/execution_types/execution_iec.hpp b/b_eff/src/host/execution_types/execution_iec.hpp index 2ec348e5..4225c783 100644 --- a/b_eff/src/host/execution_types/execution_iec.hpp +++ b/b_eff/src/host/execution_types/execution_iec.hpp @@ -39,7 +39,7 @@ namespace network::execution_types::iec { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { @@ -164,12 +164,11 @@ namespace network::execution_types::iec { err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); ASSERT_CL(err); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp index 73156b7e..50d357e6 100644 --- a/b_eff/src/host/execution_types/execution_pcie.hpp +++ b/b_eff/src/host/execution_types/execution_pcie.hpp @@ -38,7 +38,7 @@ namespace network::execution_types::pcie { Implementation for the single kernel. 
@copydoc bm_execution::calculate() */ - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { @@ -111,12 +111,11 @@ namespace network::execution_types::pcie { err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); ASSERT_CL(err); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 7bf728a2..73cd27a7 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -89,7 +89,7 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options) cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE))); } -std::unique_ptr +void network::NetworkBenchmark::executeKernel(NetworkData &data) { // Get the number of processes int world_size; @@ -99,13 +99,13 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { int world_rank; MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); - std::vector> timing_results; + std::vector timing_results; for (auto& run : data.items) { if (world_rank == 0) { std::cout << "Measure for " << (1 << run.messageSize) << " Byte" << std::endl; } - std::shared_ptr timing; + network::ExecutionTimings timing; switch (executionSettings->programSettings->communicationType) { case hpcc_base::CommunicationType::cpu_only: timing = execution_types::cpu::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; case hpcc_base::CommunicationType::pcie_mpi: timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; @@ -115,16 +115,15 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { timing_results.push_back(timing); } - std::unique_ptr collected_results = std::unique_ptr (new network::NetworkExecutionTimings()); if (world_rank > 0) { for (const auto& t : timing_results) { - MPI_Send(&(t->messageSize), + MPI_Send(&(t.messageSize), 1, MPI_UNSIGNED, 0, 0, MPI_COMM_WORLD); - MPI_Send(&(t->looplength), + MPI_Send(&(t.looplength), 1, MPI_UNSIGNED, 0, 1, MPI_COMM_WORLD); - MPI_Send(&(t->calculationTimings.front()), + MPI_Send(&(t.calculationTimings.front()), executionSettings->programSettings->numRepetitions, MPI_DOUBLE, 0, 2, MPI_COMM_WORLD); } @@ -132,84 +131,86 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { std::cout << "Collect results over MPI."; int k = 0; for (auto& run : data.items) { - std::vector> tmp_timings; + std::vector tmp_timings; std::cout << "."; for (int i=1; i < world_size; i++) { - auto execution_result = std::shared_ptr( new network::ExecutionTimings { + auto execution_result = network::ExecutionTimings{ 0,0,std::vector(executionSettings->programSettings->numRepetitions) - }); + }; MPI_Status status; - MPI_Recv(&(execution_result->messageSize), + MPI_Recv(&(execution_result.messageSize), 1, MPI_UNSIGNED, i, 0, MPI_COMM_WORLD, &status); - MPI_Recv(&(execution_result->looplength), + MPI_Recv(&(execution_result.looplength), 1, MPI_UNSIGNED, i, 1, MPI_COMM_WORLD, &status); - MPI_Recv(&(execution_result->calculationTimings.front()), 
+ MPI_Recv(&(execution_result.calculationTimings.front()), executionSettings->programSettings->numRepetitions, MPI_DOUBLE, i, 2, MPI_COMM_WORLD, &status); tmp_timings.push_back(execution_result); - if (execution_result->messageSize != run.messageSize) { - std::cerr << "Wrong message size: " << execution_result->messageSize << " != " << run.messageSize << " from rank " << i << std::endl; + if (execution_result.messageSize != run.messageSize) { + std::cerr << "Wrong message size: " << execution_result.messageSize << " != " << run.messageSize << " from rank " << i << std::endl; throw std::runtime_error("Wrong message size received! Something went wrong in the MPI communication"); } } tmp_timings.push_back(timing_results[k]); k++; - collected_results->timings.emplace(run.messageSize, std::make_shared>>(tmp_timings)); + collected_timings.emplace(run.messageSize, network::ExecutionResult{tmp_timings, 0.0, 0.0}); } std::cout << " done!" << std::endl; } - - return collected_results; + return; } void -network::NetworkBenchmark::collectAndPrintResults(const network::NetworkExecutionTimings &output) { +network::NetworkBenchmark::collectResults() { std::vector maxBandwidths; if (mpi_comm_rank == 0) { - std::cout << std::setw(ENTRY_SPACE) << "MSize" << " " - << std::setw(ENTRY_SPACE) << "looplength" << " " - << std::setw(ENTRY_SPACE) << "transfer" << " " - << std::setw(ENTRY_SPACE) << "B/s" << std::endl; - std::vector totalMaxMinCalculationTime; - for (long unsigned int i =0; i < output.timings.size(); i++) { - totalMaxMinCalculationTime.push_back(0.0); - } int i = 0; - for (const auto& msgSizeResults : output.timings) { - for (const auto& r : *msgSizeResults.second) { - double localMinCalculationTime = *min_element(r->calculationTimings.begin(), r->calculationTimings.end()); - totalMaxMinCalculationTime[i] = std::max(totalMaxMinCalculationTime[i], localMinCalculationTime); + for (auto& timing : collected_timings) { + for (auto& r : timing.second.execution_timings) { + double localMinCalculationTime = *min_element(r.calculationTimings.begin(), r.calculationTimings.end()); + timing.second.maxMinCalculationTime = std::max(timing.second.maxMinCalculationTime, localMinCalculationTime); } i++; } i = 0; - for (const auto& msgSizeResults : output.timings) { - int looplength = msgSizeResults.second->at(0)->looplength; + for (auto& timing : collected_timings) { + int looplength = timing.second.execution_timings.at(0).looplength; + int messageSize = timing.first; + int num_timings = timing.second.execution_timings.size(); // The total sent data in bytes will be: // #Nodes * message_size * looplength * 2 // the * 2 is because we have two kernels per bitstream that will send and receive simultaneously. // This will be divided by half of the maximum of the minimum measured runtime over all ranks. 
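            // A worked example with illustrative values: for num_timings = 2 ranks,
            // messageSize = 10 (1 KiB messages) and looplength = 100, the volume is
            // 2 * 2 * 1024 B * 100 = 409600 B; with maxMinCalculationTime = 0.0002 s
            // this yields maxCalcBW = 409600 B / 0.0002 s = 2.048e9 B/s.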
- double maxCalcBW = static_cast(msgSizeResults.second->size() * 2 * (1 << msgSizeResults.first) * looplength) - / (totalMaxMinCalculationTime[i]); + timing.second.maxCalcBW = static_cast(num_timings * 2 * (1 << messageSize) * looplength) + / timing.second.maxMinCalculationTime; - maxBandwidths.push_back(maxCalcBW); + maxBandwidths.push_back(timing.second.maxCalcBW); - std::cout << std::setw(ENTRY_SPACE) << (1 << msgSizeResults.first) << " " - << std::setw(ENTRY_SPACE) << looplength << " " - << std::setw(ENTRY_SPACE) << totalMaxMinCalculationTime[i] << " " - << std::setw(ENTRY_SPACE) << maxCalcBW - << std::endl; i++; } + results.emplace("b_eff", hpcc_base::HpccResult(accumulate(maxBandwidths.begin(), maxBandwidths.end(), 0.0) / static_cast(maxBandwidths.size()), "B/s")); + } +} - double b_eff = accumulate(maxBandwidths.begin(), maxBandwidths.end(), 0.0) / static_cast(maxBandwidths.size()); - - std::cout << std::endl << "b_eff = " << b_eff << " B/s" << std::endl; +void network::NetworkBenchmark::printResults() { + std::cout << std::setw(ENTRY_SPACE) << "MSize" << " " + << std::setw(ENTRY_SPACE) << "looplength" << " " + << std::setw(ENTRY_SPACE) << "transfer" << " " + << std::setw(ENTRY_SPACE) << "B/s" << std::endl; + + for (const auto& timing : collected_timings) { + std::cout << std::setw(ENTRY_SPACE) << (1 << timing.first) << " " + << std::setw(ENTRY_SPACE) << timing.second.execution_timings.at(0).looplength << " " + << std::setw(ENTRY_SPACE) << timing.second.maxMinCalculationTime << " " + << std::setw(ENTRY_SPACE) << timing.second.maxCalcBW + << std::endl; } + + std::cout << std::endl << "b_eff = " << results.at("b_eff") << std::endl; } std::unique_ptr diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 0fdf8064..2d8e9ee9 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -64,11 +64,26 @@ namespace network { std::vector calculationTimings; }; + struct ExecutionResult { + std::vector execution_timings; + /** + * @brief maximum of minimum calculation time, filled by collectResults + * + */ + double maxMinCalculationTime; + + /** + * @brief maximum of calculated bandwidths, filled by collectResults + * + */ + double maxCalcBW; + }; + /** * @brief The data structure used to store all measurement results * */ - typedef std::map>>> CollectedResultMap; + typedef std::map CollectedTimingsMap; /** * @brief The Network benchmark specific program settings @@ -194,26 +209,11 @@ class NetworkData { }; -/** - * @brief Measured execution timing from the kernel execution - * - */ -class NetworkExecutionTimings { -public: - - /** - * @brief A vector containing the timings for all repetitions for the kernel execution - * - */ - CollectedResultMap timings; - -}; - /** * @brief Implementation of the Network benchmark * */ -class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark { +class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark { protected: @@ -227,6 +227,31 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark timings_json; + for (const auto& execution_timing: timing.second.execution_timings) { + json single_timing_json; + single_timing_json["looplength"] = execution_timing.looplength; + single_timing_json["messageSize"] = execution_timing.messageSize; + single_timing_json["timings"] = execution_timing.calculationTimings; + timings_json.push_back(single_timing_json); + } + timing_json["timings"] = timings_json; + + j[std::to_string(timing.first)] = timing_json; + } + return j; + } + /** * 
@brief Network specific implementation of the data generation * @@ -241,7 +266,7 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark Measured runtimes of the kernel execution */ - std::unique_ptr + void executeKernel(NetworkData &data) override; /** @@ -259,7 +284,10 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(1,1)); - auto result = bm->executeKernel(*data); - EXPECT_NE(result->timings.end(), result->timings.find(1)); - EXPECT_EQ(1, result->timings.find(1)->second->at(0)->looplength); - EXPECT_EQ(1, result->timings.find(1)->second->at(0)->calculationTimings.size()); + bm->executeKernel(*data); + EXPECT_NE(bm->collected_timings.end(), bm->collected_timings.find(1)); + EXPECT_EQ(1, bm->collected_timings.find(1)->second.execution_timings.at(0).looplength); + EXPECT_EQ(1, bm->collected_timings.find(1)->second.execution_timings.at(0).calculationTimings.size()); } /** @@ -64,10 +64,10 @@ TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { bm->getExecutionSettings().programSettings->numRepetitions = 2; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(8,4)); - auto result = bm->executeKernel(*data); - EXPECT_NE(result->timings.end(), result->timings.find(8)); - EXPECT_EQ(4, result->timings.find(8)->second->at(0)->looplength); - EXPECT_EQ(2, result->timings.find(8)->second->at(0)->calculationTimings.size()); + bm->executeKernel(*data); + EXPECT_NE(bm->collected_timings.end(), bm->collected_timings.find(8)); + EXPECT_EQ(4, bm->collected_timings.find(8)->second.execution_timings.at(0).looplength); + EXPECT_EQ(2, bm->collected_timings.find(8)->second.execution_timings.at(0).calculationTimings.size()); } /** @@ -82,7 +82,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -110,7 +110,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize, looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -135,7 +135,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwo const unsigned looplength = 1; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -160,7 +160,7 @@ TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - 
auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[messageSize * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -180,7 +180,7 @@ TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE cvalue = static_cast(messageSize & 255); EXPECT_EQ(cvalue, data->items[0].validationBuffer[0]); bool all_same = true; @@ -195,7 +195,7 @@ TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE cvalue = static_cast(messageSize & 255); EXPECT_EQ(cvalue, data->items[0].validationBuffer[0]); bool all_same = true; @@ -210,7 +210,7 @@ TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); } @@ -219,7 +219,7 @@ TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { const unsigned looplength = 1; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); } @@ -228,7 +228,7 @@ TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { const unsigned looplength = 1; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); EXPECT_EQ(looplength * CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); } @@ -268,7 +268,7 @@ TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); } @@ -281,7 +281,7 @@ TEST_P(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExec data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); } @@ -291,7 +291,7 @@ TEST_P(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); data->items[1].validationBuffer[0] = static_cast(0); EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); } diff --git 
a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 68127eab..3d560ce4 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -527,6 +527,11 @@ class HpccFpgaBenchmark {
         return results_string;
     }
 
+    // override for special benchmarks like b_eff
+    virtual json getTimingsJson() {
+        return timings;
+    }
+
     std::map<std::string, std::string>
     getEnvironmentMap() {
         std::map<std::string, std::string> env;
@@ -584,7 +589,7 @@ class HpccFpgaBenchmark {
         dump["version"] = VERSION;
         dump["device"] = executionSettings->getDeviceName();
         dump["settings"] = jsonifySettingsMap(executionSettings->programSettings->getSettingsMap());
-        dump["timings"] = timings;
+        dump["timings"] = getTimingsJson();
         dump["results"] = getResultsJson();
         dump["environment"] = getEnvironmentMap();
 
From a99cca1c4774690b0b7d835dcd6a62334d30ceb5 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Thu, 10 Nov 2022 19:33:19 +0100
Subject: [PATCH 211/318] add units to timings dump

---
 b_eff/src/host/network_benchmark.hpp |  9 ++++++++-
 shared/include/hpcc_benchmark.hpp    | 13 ++++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index 2d8e9ee9..d86c2f61 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -242,7 +242,14 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark<NetworkProgramSettings, NetworkData> {
                 json single_timing_json;
                 single_timing_json["looplength"] = execution_timing.looplength;
                 single_timing_json["messageSize"] = execution_timing.messageSize;
-                single_timing_json["timings"] = execution_timing.calculationTimings;
+                std::vector<json> calculation_timings;
+                for (const auto& timing: execution_timing.calculationTimings) {
+                    json j;
+                    j["unit"] = "s";
+                    j["value"] = timing;
+                    calculation_timings.push_back(j);
+                }
+                single_timing_json["timings"] = calculation_timings;
                 timings_json.push_back(single_timing_json);
             }
             timing_json["timings"] = timings_json;
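For a single message size, the override above yields entries of roughly the following shape (a sketch with made-up values, here for one rank and two repetitions):

    "10": {
        "timings": [
            {
                "looplength": 512,
                "messageSize": 10,
                "timings": [
                    { "unit": "s", "value": 0.00213 },
                    { "unit": "s", "value": 0.00209 }
                ]
            }
        ]
    }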
validation") ("h,help", "Print this help"); From 3bd17f8d2a7bd538ff33fc4412e5e030a0ea7f0e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 16 Nov 2022 18:35:18 +0100 Subject: [PATCH 213/318] Fix output parser for GEMM, RA, STREAM, HPL --- scripts/evaluation/parse_raw_to_csv.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index 03dfa2f4..833d5391 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -10,12 +10,12 @@ # Regular expressions for the raw output of all fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\n(.*\n)FFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" +gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" +ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+total\\s\\[s\\]\\s+transfer\\s\\[s\\]\\s+calc\\s\\[s\\]\\s+calc\\s+FLOPS\\s+Mem\\s+\\[B/s\\]\\s+PCIe\\s+\\[B/s\\]\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)" -stream_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(\\s*\n)\\s+GEFA\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(\\s*\n)\\s+GESL\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" - +stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\s+Avg\\stime\\s+Min\\stime\\s+Max\\stime\n\\s+PCI_write\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+PCI_read\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" +linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)(\\s+.+\\s*\n)\\s+GEFA\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)(\\s+.+\\s*\n)\\s+GESL\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" + def parse_network(file_content): ''' From 9f99017c590e5ddbf56069933dd01364675d7a17 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 23 Nov 2022 16:03:14 +0100 Subject: [PATCH 214/318] fix map:at panics --- FFT/src/host/fft_benchmark.cpp | 2 +- PTRANS/src/host/transpose_benchmark.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index ff0710ef..4bfed7d2 100644 --- 
a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -123,7 +123,7 @@ fft::FFTBenchmark::printResults() { std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << std::setw(ENTRY_SPACE) << results.at("t_avg") << std::setw(ENTRY_SPACE) << results.at("t_min") << std::endl; std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << std::setw(ENTRY_SPACE) << results.at("gflops_avg") - << std::setw(ENTRY_SPACE) << results.at("gflop_min") << std::endl; + << std::setw(ENTRY_SPACE) << results.at("gflops_min") << std::endl; } std::unique_ptr diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 1c2682e3..0a7e6bc7 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -144,7 +144,7 @@ transpose::TransposeBenchmark::printResults() { << std::endl; std::cout << "best: " << results.at("min_t") << " " << results.at("min_transfer_t") - << " " << results.at("min_calculation_t") + << " " << results.at("min_calc_t") << " " << results.at("max_calc_flops") << " " << results.at("max_mem_bandwidth") << " " << results.at("max_transfer_bandwidth") From 2231e19f0df04f3c87b99bfd7e861958489fad9f Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 23 Nov 2022 21:00:54 +0100 Subject: [PATCH 215/318] add test for json dump feature --- shared/tests/hpcc_base_benchmark_test.cpp | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp index c3cc7c2f..b6378840 100644 --- a/shared/tests/hpcc_base_benchmark_test.cpp +++ b/shared/tests/hpcc_base_benchmark_test.cpp @@ -8,6 +8,7 @@ #include "test_program_settings.h" #include "gmock/gmock.h" #include "hpcc_benchmark.hpp" +#include "nlohmann/json.hpp" // Dirty GoogleTest and static library hack @@ -264,3 +265,31 @@ TEST(SetupTest, BenchmarkSetupFails) { delete [] tmp_argv; delete [] name_str; } + +using json = nlohmann::json; + +/** + * + * Check if dump-json flag produces valid json output + */ +TEST(SetupTest, BenchmarkJsonDump) { + std::unique_ptr bm = std::unique_ptr(new MinimalBenchmark()); + bm->setupBenchmark(global_argc, global_argv); + bm->getExecutionSettings().programSettings->dumpfilePath = "out.json"; + bm->executeBenchmark(); + std::FILE *f = std::fopen("out.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + // json::parse will panic if f is nullptr + json j = json::parse(f); + // check if the expected keys are there + EXPECT_TRUE(j.contains("config_time")); + EXPECT_TRUE(j.contains("device")); + EXPECT_TRUE(j.contains("environment")); + EXPECT_TRUE(j.contains("git_commit")); + EXPECT_TRUE(j.contains("results")); + EXPECT_TRUE(j.contains("settings")); + EXPECT_TRUE(j.contains("timings")); + EXPECT_TRUE(j.contains("version")); + } +} From d52aca158a42a33c6e44f17636e22ff6e94a9307 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 25 Nov 2022 14:23:50 +0100 Subject: [PATCH 216/318] Fix raw parsing for FFT and PTRANS --- PTRANS/src/host/transpose_benchmark.cpp | 14 +++++++------- scripts/evaluation/parse_raw_to_csv.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 0a7e6bc7..9b37f162 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -124,17 +124,17 @@ transpose::TransposeBenchmark::collectResults() { results.emplace("avg_t", hpcc_base::HpccResult(avgCalculationTime + avgTransferTime, 
"s")); results.emplace("min_t", hpcc_base::HpccResult(minCalculationTime + minTransferTime, "s")); - results.emplace("avg_calc_flops", hpcc_base::HpccResult(flops / avgCalculationTime, "GFLOP/s")); - results.emplace("max_calc_flops", hpcc_base::HpccResult(flops / minCalculationTime, "GFLOP/s")); - results.emplace("avg_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime, "GB/s")); - results.emplace("max_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime, "GB/s")); - results.emplace("avg_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime, "GB/s")); - results.emplace("max_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime, "GB/s")); + results.emplace("avg_calc_flops", hpcc_base::HpccResult(flops / avgCalculationTime * 1.0e9, "GFLOP/s")); + results.emplace("max_calc_flops", hpcc_base::HpccResult(flops / minCalculationTime * 1.0e9, "GFLOP/s")); + results.emplace("avg_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime * 1.0e9, "GB/s")); + results.emplace("max_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime * 1.0e9, "GB/s")); + results.emplace("avg_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime * 1.0e9, "GB/s")); + results.emplace("max_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime * 1.0e9, "GB/s")); } void transpose::TransposeBenchmark::printResults() { - std::cout << " total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s]" << std::endl; + std::cout << " total time transfer time calc time calc FLOPS Memory Bandwidth PCIe Bandwidth" << std::endl; std::cout << "avg: " << results.at("avg_t") << " " << results.at("avg_transfer_t") << " " << results.at("avg_calc_t") diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index 833d5391..e5306dc7 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -9,10 +9,10 @@ import sys # Regular expressions for the raw output of all -fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\n(.*\n)FFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" +fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\n(.*\n)FFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" ra_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" -trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+total\\s\\[s\\]\\s+transfer\\s\\[s\\]\\s+calc\\s\\[s\\]\\s+calc\\s+FLOPS\\s+Mem\\s+\\[B/s\\]\\s+PCIe\\s+\\[B/s\\]\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)" +trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+total\\s+time\\s+transfer\\s+time\\s+calc\\s+time\\s+calc\\s+FLOPS\\s+Memory\\s+Bandwidth\\s+PCIe\\s+Bandwidth\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e|inf)+)\\s+.+\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e|inf)+)" stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\s+Avg\\stime\\s+Min\\stime\\s+Max\\stime\n\\s+PCI_write\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+PCI_read\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)(\\s+.+\\s*\n)\\s+GEFA\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)(\\s+.+\\s*\n)\\s+GESL\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" From 6c09f06627a91c43e6b2ae35136de760663e7764 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 25 Nov 2022 14:26:21 +0100 Subject: [PATCH 217/318] Fix metric conversion --- PTRANS/src/host/transpose_benchmark.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 9b37f162..decc9b85 100644 --- 
a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -124,12 +124,12 @@ transpose::TransposeBenchmark::collectResults() { results.emplace("avg_t", hpcc_base::HpccResult(avgCalculationTime + avgTransferTime, "s")); results.emplace("min_t", hpcc_base::HpccResult(minCalculationTime + minTransferTime, "s")); - results.emplace("avg_calc_flops", hpcc_base::HpccResult(flops / avgCalculationTime * 1.0e9, "GFLOP/s")); - results.emplace("max_calc_flops", hpcc_base::HpccResult(flops / minCalculationTime * 1.0e9, "GFLOP/s")); - results.emplace("avg_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime * 1.0e9, "GB/s")); - results.emplace("max_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime * 1.0e9, "GB/s")); - results.emplace("avg_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime * 1.0e9, "GB/s")); - results.emplace("max_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime * 1.0e9, "GB/s")); + results.emplace("avg_calc_flops", hpcc_base::HpccResult(flops / avgCalculationTime * 1.0e-9, "GFLOP/s")); + results.emplace("max_calc_flops", hpcc_base::HpccResult(flops / minCalculationTime * 1.0e-9, "GFLOP/s")); + results.emplace("avg_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime * 1.0e-9, "GB/s")); + results.emplace("max_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime * 1.0e-9, "GB/s")); + results.emplace("avg_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime * 1.0e-9, "GB/s")); + results.emplace("max_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime * 1.0e-9, "GB/s")); } void From 1e3dc1e2a7790d9463c66071b5f2c765078045cf Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 25 Nov 2022 14:35:01 +0100 Subject: [PATCH 218/318] Fix unit tests for output parsing --- PTRANS/tests/test_host_functionality.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PTRANS/tests/test_host_functionality.cpp b/PTRANS/tests/test_host_functionality.cpp index c65019a6..1c671f2b 100644 --- a/PTRANS/tests/test_host_functionality.cpp +++ b/PTRANS/tests/test_host_functionality.cpp @@ -39,7 +39,7 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatHeader) { std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex("(\\s+)total\\s\\[s\\](\\s+)transfer\\s\\[s\\](\\s+)calc\\s\\[s\\](\\s+)calc\\sFLOPS(\\s+)Mem\\s\\[B/s\\](\\s+)PCIe\\s\\[B/s\\]\n.*")); + ::testing::MatchesRegex("(\\s+)total\\stime(\\s+)transfer\\stime(\\s+)calc\\s+time(\\s+)calc\\sFLOPS(\\s+)Memory\\sBandwidth(\\s+)PCIe\\sBandwidth\n.*")); } /** @@ -66,7 +66,7 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatValues) { std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex(".*\navg:\\s+2\\.00000e\\+00\\s+1\\.00000e\\+00\\s+1\\.00000e\\+00.*\n.*\n")); + ::testing::MatchesRegex(".*\navg:\\s+2\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s.*\n.*\n")); } /** From 9ad73dbefad83aa3f36278f63ba729b3acb24694 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 5 Dec 2022 10:29:28 +0100 Subject: [PATCH 219/318] add json tests for all benchmarks --- FFT/tests/test_fft_functionality.cpp | 27 +++++++++- ...nel_functionality_and_host_integration.cpp | 24 +++++++++ 
...nel_functionality_and_host_integration.cpp | 33 +++++++++++- ...nel_functionality_and_host_integration.cpp | 34 ++++++++++++- ...nel_functionality_and_host_integration.cpp | 25 ++++++++- ...nel_functionality_and_host_integration.cpp | 51 ++++++++++++++++++- b_eff/src/host/network_benchmark.hpp | 2 +- ...nel_functionality_and_host_integration.cpp | 34 +++++++++++++ 8 files changed, 224 insertions(+), 6 deletions(-) diff --git a/FFT/tests/test_fft_functionality.cpp b/FFT/tests/test_fft_functionality.cpp index f5818814..df5dba9f 100644 --- a/FFT/tests/test_fft_functionality.cpp +++ b/FFT/tests/test_fft_functionality.cpp @@ -6,6 +6,7 @@ #include "fft_benchmark.hpp" #include "parameters.h" #include "test_program_settings.h" +#include "nlohmann/json.hpp" struct FFTHostTest : testing::Test { @@ -119,4 +120,28 @@ TEST_F(FFTHostTest, FFTandiFFTProduceResultCloseToSource) { for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { EXPECT_NEAR(std::abs(data->data[i]), std::abs(verify_data->data[i]), 0.001); } -} \ No newline at end of file +} + +using json = nlohmann::json; + +TEST_F(FFTHostTest, JsonDump) { + bm->executeKernel(*data); + bm->collectResults(); + bm->dumpConfigurationAndResults("fft.json"); + std::FILE *f = std::fopen("fft.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + json j = json::parse(f); + EXPECT_TRUE(j.contains("timings")); + if (j.contains("timings")) { + EXPECT_TRUE(j["timings"].contains("calculation")); + } + EXPECT_TRUE(j.contains("results")); + if (j.contains("results")) { + EXPECT_TRUE(j["results"].contains("gflops_avg")); + EXPECT_TRUE(j["results"].contains("gflops_min")); + EXPECT_TRUE(j["results"].contains("t_avg")); + EXPECT_TRUE(j["results"].contains("t_min")); + } + } +} diff --git a/GEMM/tests/test_kernel_functionality_and_host_integration.cpp b/GEMM/tests/test_kernel_functionality_and_host_integration.cpp index c3d9723e..41ead85e 100755 --- a/GEMM/tests/test_kernel_functionality_and_host_integration.cpp +++ b/GEMM/tests/test_kernel_functionality_and_host_integration.cpp @@ -7,6 +7,7 @@ #include "gemm_benchmark.hpp" #include "parameters.h" #include "test_program_settings.h" +#include "nlohmann/json.hpp" void ref_matmul(HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C, int size) { @@ -179,6 +180,29 @@ TEST_P(GEMMKernelTest, FPGACorrectbetaCplusalphaAB) { } } +using json = nlohmann::json; + +TEST_P(GEMMKernelTest, JsonDump) { + bm->executeKernel(*data); + bm->collectResults(); + bm->dumpConfigurationAndResults("gemm.json"); + std::FILE *f = std::fopen("gemm.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + json j = json::parse(f); + EXPECT_TRUE(j.contains("timings")); + if (j.contains("timings")) { + EXPECT_TRUE(j["timings"].contains("execution")); + } + EXPECT_TRUE(j.contains("results")); + if (j.contains("results")) { + EXPECT_TRUE(j["results"].contains("gflops")); + EXPECT_TRUE(j["results"].contains("t_mean")); + EXPECT_TRUE(j["results"].contains("t_min")); + } + } +} + INSTANTIATE_TEST_CASE_P(Default, GEMMKernelTest, testing::Values(1,2)); diff --git a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp index 77c0fd70..2dbd21f0 100644 --- a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp +++ b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp @@ -5,6 +5,7 @@ #include "parameters.h" #include "test_program_settings.h" #include "linpack_benchmark.hpp" +#include "nlohmann/json.hpp" #ifdef _LAPACK_ #ifdef _DP @@ -94,7 +95,37 @@ 
TEST_P(LinpackKernelTest, DISABLED_ValidationWorksForMKL) {
 #endif
+using json = nlohmann::json;
+
+TEST_P(LinpackKernelTest, JsonDump) {
+ bm->executeKernel(*data);
+ bm->collectResults();
+ bm->dumpConfigurationAndResults("linpack.json");
+ std::FILE *f = std::fopen("linpack.json", "r");
+ EXPECT_NE(f, nullptr);
+ if (f != nullptr) {
+ json j = json::parse(f);
+ EXPECT_TRUE(j.contains("timings"));
+ if (j.contains("timings")) {
+ EXPECT_TRUE(j["timings"].contains("gefa"));
+ EXPECT_TRUE(j["timings"].contains("gesl"));
+ }
+ EXPECT_TRUE(j.contains("results"));
+ if (j.contains("results")) {
+ EXPECT_TRUE(j["results"].contains("gflops"));
+ EXPECT_TRUE(j["results"].contains("gflops_lu"));
+ EXPECT_TRUE(j["results"].contains("gflops_sl"));
+ EXPECT_TRUE(j["results"].contains("t_mean"));
+ EXPECT_TRUE(j["results"].contains("t_min"));
+ EXPECT_TRUE(j["results"].contains("tlu_mean"));
+ EXPECT_TRUE(j["results"].contains("tlu_min"));
+ EXPECT_TRUE(j["results"].contains("tsl_mean"));
+ EXPECT_TRUE(j["results"].contains("tsl_min"));
+ }
+ }
+}
+
 INSTANTIATE_TEST_CASE_P(
 LinpackKernelParametrizedTests, LinpackKernelTest,
- ::testing::Values(1, 2, 3));
\ No newline at end of file
+ ::testing::Values(1, 2, 3));
diff --git a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp
index 985a0698..eaff5c42 100644
--- a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp
@@ -6,7 +6,7 @@
 #include "gtest/gtest.h"
 #include "parameters.h"
 #include "test_program_settings.h"
-
+#include "nlohmann/json.hpp"
 struct TransposeKernelTest : testing::Test {
 std::shared_ptr data;
@@ -204,3 +204,35 @@ TEST_F(TransposeKernelTest, FPGATimingsMeasuredForEveryIteration) {
 }
 }
+using json = nlohmann::json;
+
+TEST_F(TransposeKernelTest, JsonDump) {
+ bm->executeKernel(*data);
+ bm->collectResults();
+ bm->dumpConfigurationAndResults("ptrans.json");
+ std::FILE *f = std::fopen("ptrans.json", "r");
+ EXPECT_NE(f, nullptr);
+ if (f != nullptr) {
+ json j = json::parse(f);
+ EXPECT_TRUE(j.contains("timings"));
+ if (j.contains("timings")) {
+ EXPECT_TRUE(j["timings"].contains("calculation"));
+ EXPECT_TRUE(j["timings"].contains("transfer"));
+ }
+ EXPECT_TRUE(j.contains("results"));
+ if (j.contains("results")) {
+ EXPECT_TRUE(j["results"].contains("avg_calc_flops"));
+ EXPECT_TRUE(j["results"].contains("avg_calc_t"));
+ EXPECT_TRUE(j["results"].contains("avg_mem_bandwidth"));
+ EXPECT_TRUE(j["results"].contains("avg_t"));
+ EXPECT_TRUE(j["results"].contains("avg_transfer_bandwidth"));
+ EXPECT_TRUE(j["results"].contains("avg_transfer_t"));
+ EXPECT_TRUE(j["results"].contains("max_calc_flops"));
+ EXPECT_TRUE(j["results"].contains("max_mem_bandwidth"));
+ EXPECT_TRUE(j["results"].contains("max_transfer_bandwidth"));
+ EXPECT_TRUE(j["results"].contains("min_calc_t"));
+ EXPECT_TRUE(j["results"].contains("min_t"));
+ EXPECT_TRUE(j["results"].contains("min_transfer_t"));
+ }
+ }
+}
diff --git a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
index 0cb30dd1..35c9f229 100644
--- a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
@@ -5,7 +5,7 @@
 #include "parameters.h"
 #include "random_access_benchmark.hpp"
 #include "test_program_settings.h"
-
+#include "nlohmann/json.hpp"
 struct RandomAccessKernelTest : testing::Test {
 std::unique_ptr data;
@@ -49,3 +49,26 @@ TEST_F(RandomAccessKernelTest, FPGAErrorBelow1Percent) {
 bool success = bm->validateOutputAndPrintError(*data);
 EXPECT_TRUE(success);
 }
+
+using json = nlohmann::json;
+
+TEST_F(RandomAccessKernelTest, JsonDump) {
+ bm->executeKernel(*data);
+ bm->collectResults();
+ bm->dumpConfigurationAndResults("random_access.json");
+ std::FILE *f = std::fopen("random_access.json", "r");
+ EXPECT_NE(f, nullptr);
+ if (f != nullptr) {
+ json j = json::parse(f);
+ EXPECT_TRUE(j.contains("timings"));
+ if (j.contains("timings")) {
+ EXPECT_TRUE(j["timings"].contains("execution"));
+ }
+ EXPECT_TRUE(j.contains("results"));
+ if (j.contains("results")) {
+ EXPECT_TRUE(j["results"].contains("guops"));
+ EXPECT_TRUE(j["results"].contains("t_mean"));
+ EXPECT_TRUE(j["results"].contains("t_min"));
+ }
+ }
+}
diff --git a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp
index ec78075e..1aae4c2a 100644
--- a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp
@@ -5,7 +5,7 @@
 #include "parameters.h"
 #include "test_program_settings.h"
 #include "stream_benchmark.hpp"
-
+#include "nlohmann/json.hpp"
 struct StreamKernelTest :public ::testing::Test {
 std::shared_ptr data;
@@ -49,3 +49,52 @@ TEST_F(StreamKernelTest, FPGACorrectResultsThreeRepetition) {
 EXPECT_FLOAT_EQ(data->C[i], 1800.0);
 }
 }
+
+using json = nlohmann::json;
+
+TEST_F(StreamKernelTest, JsonDump) {
+ bm->executeKernel(*data);
+ bm->collectResults();
+ bm->dumpConfigurationAndResults("stream.json");
+ std::FILE *f = std::fopen("stream.json", "r");
+ EXPECT_NE(f, nullptr);
+ if (f != nullptr) {
+ json j = json::parse(f);
+ EXPECT_TRUE(j.contains("timings"));
+ if (j.contains("timings")) {
+ EXPECT_TRUE(j["timings"].contains("Add"));
+ EXPECT_TRUE(j["timings"].contains("Copy"));
+ EXPECT_TRUE(j["timings"].contains("PCI_read"));
+ EXPECT_TRUE(j["timings"].contains("PCI_write"));
+ EXPECT_TRUE(j["timings"].contains("Scale"));
+ EXPECT_TRUE(j["timings"].contains("Triad"));
+ }
+ EXPECT_TRUE(j.contains("results"));
+ if (j.contains("results")) {
+ EXPECT_TRUE(j["results"].contains("Add_avg_t"));
+ EXPECT_TRUE(j["results"].contains("Add_best_rate"));
+ EXPECT_TRUE(j["results"].contains("Add_max_t"));
+ EXPECT_TRUE(j["results"].contains("Add_min_t"));
+ EXPECT_TRUE(j["results"].contains("Copy_avg_t"));
+ EXPECT_TRUE(j["results"].contains("Copy_best_rate"));
+ EXPECT_TRUE(j["results"].contains("Copy_max_t"));
+ EXPECT_TRUE(j["results"].contains("Copy_min_t"));
+ EXPECT_TRUE(j["results"].contains("PCI_read_avg_t"));
+ EXPECT_TRUE(j["results"].contains("PCI_read_best_rate"));
+ EXPECT_TRUE(j["results"].contains("PCI_read_max_t"));
+ EXPECT_TRUE(j["results"].contains("PCI_read_min_t"));
+ EXPECT_TRUE(j["results"].contains("PCI_write_avg_t"));
+ EXPECT_TRUE(j["results"].contains("PCI_write_best_rate"));
+ EXPECT_TRUE(j["results"].contains("PCI_write_max_t"));
+ EXPECT_TRUE(j["results"].contains("PCI_write_min_t"));
+ EXPECT_TRUE(j["results"].contains("Scale_avg_t"));
+ EXPECT_TRUE(j["results"].contains("Scale_best_rate"));
+ EXPECT_TRUE(j["results"].contains("Scale_max_t"));
+ EXPECT_TRUE(j["results"].contains("Scale_min_t"));
+ EXPECT_TRUE(j["results"].contains("Triad_avg_t"));
+ EXPECT_TRUE(j["results"].contains("Triad_best_rate"));
+ EXPECT_TRUE(j["results"].contains("Triad_max_t"));
+ EXPECT_TRUE(j["results"].contains("Triad_min_t"));
+ }
+ }
+}
diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index d86c2f61..e1b77bc9 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -247,7 +247,7 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmarkvalidateOutputAndPrintError(*data)); }
+TEST_P(NetworkKernelTest, JsonDump) {
+ data->items.clear();
+ data->items.push_back(network::NetworkData::NetworkDataItem(8,4));
+ bm->executeKernel(*data);
+ bm->collectResults();
+ bm->dumpConfigurationAndResults("b_eff.json");
+ std::FILE *f = std::fopen("b_eff.json", "r");
+ EXPECT_NE(f, nullptr);
+ if (f != nullptr) {
+ json j = json::parse(f);
+ EXPECT_TRUE(j.contains("timings"));
+ if (j.contains("timings")) {
+ EXPECT_TRUE(j["timings"].size() > 0);
+ if (j["timings"].size() > 0) {
+ for (const auto& timing: j["timings"].items()) {
+ EXPECT_TRUE(timing.value().contains("maxCalcBW"));
+ EXPECT_TRUE(timing.value().contains("maxMinCalculationTime"));
+ EXPECT_TRUE(timing.value().contains("timings"));
+ if (timing.value().contains("timings")) {
+ for (const auto& timing: timing.value()["timings"]) {
+ EXPECT_TRUE(timing.contains("looplength"));
+ EXPECT_TRUE(timing.contains("messageSize"));
+ EXPECT_TRUE(timing.contains("timings"));
+ }
+ }
+ }
+ }
+ }
+ EXPECT_TRUE(j.contains("results"));
+ if (j.contains("results")) {
+ EXPECT_TRUE(j["results"].contains("b_eff"));
+ }
+ }
+}
 INSTANTIATE_TEST_CASE_P(

From fcdab10ba8295c93b6291772935bfa940900df28 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Mon, 5 Dec 2022 10:29:52 +0100
Subject: [PATCH 220/318] add documentation for json-dump feature

---
 README.md | 1 +
 .../Host Input Parameters/index.rst | 6 +-
 .../json_output/available_keys.csv | 58 ++++
 .../technical_support/json_output/index.rst | 284 ++++++++++++++++++
 4 files changed, 348 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/technical_support/json_output/available_keys.csv
 create mode 100644 docs/source/technical_support/json_output/index.rst

diff --git a/README.md b/README.md
index 1f830da8..1814c5a0 100755
--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@ Moreover, additional libraries are fetched by the build system during configuration
 - [cxxopts](https://github.com/jarro2783/cxxopts) for option parsing
 - [hlslib](https://github.com/definelicht/hlslib) for CMake FindPackages
 - [Googletest](https://github.com/google/googletest) for unit testing
+- [json](https://github.com/nlohmann/json) for JSON output

 These dependencies will be downloaded automatically when configuring a benchmark for the first time. The exact versions that are used can be found in the `CMakeLists.txt` located in the `extern` directory where all extern dependencies are defined.
diff --git a/docs/source/technical_support/Host Input Parameters/index.rst b/docs/source/technical_support/Host Input Parameters/index.rst
index 50121964..550e8f19 100644
--- a/docs/source/technical_support/Host Input Parameters/index.rst
+++ b/docs/source/technical_support/Host Input Parameters/index.rst
@@ -1,3 +1,4 @@
+.. _execution:
 ========================
 Execution of a Benchmark
 ========================
@@ -46,9 +47,12 @@ Input parameters (or options) can be appended to the host execution call like this
``--comm-type COMM``:
 This parameter chooses the communication strategy which will be used. Current options are "IEC" for using the Intel External Channel, "PCIE" for using the host-to-host communication, and "CPU" for calculating on the CPU.

+``--dump-json PATH``:
+ This parameter enables dumping of the benchmark configuration, settings, timings, and results in a machine-readable JSON format. PATH specifies the file the dump is written to. If the option is not given, no dump is created.
+
``--test``:
 This option will also skip the execution of the benchmark. It can be used to test different data generation schemes or the benchmark summary before the actual execution. Please note that the host will exit with a non-zero exit code because it will not be able to validate the output.

Additionally, every benchmark will have several options to define the size and type of the used input data.
-These options vary between the benchmarks. An easy way to find out more about these options is to use the ``-h`` option with the host.
\ No newline at end of file
+These options vary between the benchmarks. An easy way to find out more about these options is to use the ``-h`` option with the host.
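+
+For illustration, a hypothetical host call that combines some of the options above could look like the following sketch. The binary name, bitstream file, repetition count, and output path are placeholders and depend on the benchmark and the build configuration:
+
+.. code-block:: bash
+
+   ./Transpose_intel -f ./transpose_emulate.aocx -n 10 --comm-type PCIE --dump-json ptrans.json
+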
diff --git a/docs/source/technical_support/json_output/available_keys.csv b/docs/source/technical_support/json_output/available_keys.csv
new file mode 100644
index 00000000..070a21da
--- /dev/null
+++ b/docs/source/technical_support/json_output/available_keys.csv
@@ -0,0 +1,58 @@
+Benchmark,timings,results,settings
+:ref:`FFT `,calculation,gflops_avg,FFT Size
+,,gflops_min,Batch Size
+,,t_avg,
+,,t_min,
+:ref:`GEMM `,execution,gflops,Matrix Size
+,,t_mean,Replicate Inputs
+,,t_min,
+:ref:`LINPACK `,gefa,gflops,Matrix Size
+,gesl,gflops_lu,Block Size
+,,gflops_sl,Emulate
+,,t_mean,Data Type
+,,t_min,FPGA Torus
+,,tlu_mean,
+,,tlu_min,
+,,tsl_mean,
+,,tsl_min,
+:ref:`PTRANS `,calculation,avg_calc_flops,Matrix Size
+,transfer,avg_calc_t,Block Size
+,,avg_mem_bandwidth,Dist. Buffers
+,,avg_t,Data Handler
+,,avg_transfer_bandwidth,
+,,avg_transfer_t,
+,,max_calc_flops,
+,,max_mem_bandwidth,
+,,max_transfer_bandwidth,
+,,min_calc_t,
+,,min_t,
+,,min_transfer_t,
+:ref:`RandomAccess `,execution,guops,Array Size
+,,t_mean,#RNGs
+,,t_min,
+:ref:`STREAM `,Add,Add_avg_t,Data Type
+,,Add_best_rate,Array Size
+,,Add_max_t,Kernel Type
+,,Add_min_t,
+,Copy,Copy_avg_t,
+,,Copy_best_rate,
+,,Copy_max_t,
+,,Copy_min_t,
+,PCI_read,PCI_read_avg_t,
+,,PCI_read_best_rate,
+,,PCI_read_max_t,
+,,PCI_read_min_t,
+,PCI_write,PCI_write_avg_t,
+,,PCI_write_best_rate,
+,,PCI_write_max_t,
+,,PCI_write_min_t,
+,Scale,Scale_avg_t,
+,,Scale_best_rate,
+,,Scale_max_t,
+,,Scale_min_t,
+,Triad,Triad_avg_t,
+,,Triad_best_rate,
+,,Triad_max_t,
+,,Triad_min_t,
+:ref:`b_eff `,**special syntax - see below**,b_eff,Loop Length
+,,,Message Sizes
diff --git a/docs/source/technical_support/json_output/index.rst b/docs/source/technical_support/json_output/index.rst
new file mode 100644
index 00000000..37aa1f68
--- /dev/null
+++ b/docs/source/technical_support/json_output/index.rst
@@ -0,0 +1,284 @@
+===========
+JSON Output
+===========
+
+The output of the configuration, settings, timings, and results in machine-readable JSON format can be enabled as described in :ref:`Execution of a Benchmark `.
+
+When enabled, this creates a JSON file that contains some information for every benchmark. In the following example the benchmark-specific entries are left out, so the keys shown here are the same for all benchmarks.
+
+.. code-block:: javascript
+
+   {
+   "config_time": "Mon Dec 05 15:09:08 UTC 2022",
+   "device": "Intel(R) FPGA Emulation Device",
+   "environment": {
+   "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib"
+   },
+   "git_commit": "c7f3890-dirty",
+   "mpi": {
+   "subversion": 1,
+   "version": 3
+   },
+   "name": "effective bandwidth",
+   "results": {
+   },
+   "settings": {
+   "Communication Type": "IEC",
+   "Kernel File": "./communication_bw520n_IEC_emulate.aocx",
+   "Kernel Replications": 2,
+   "MPI Ranks": 1,
+   "Repetitions": 10,
+   "Test Mode": "No"
+   },
+   "timings": {
+   },
+   "version": "1.3"
+   }
+
+If a benchmark has additional settings, they are added under the ``settings`` key. Every benchmark can track different categories of timings and different results. The following table shows which keys are available for which benchmark.
+
+.. csv-table:: Available keys
+   :file: available_keys.csv
+   :header-rows: 1
+   :class: longtable
+   :widths: 1 1 1 1
+
+The results and timings are given in a special format, which consists of the value and the unit.
+
+.. code-block:: javascript
+
+   {
+   "results": {
+   "b_eff": {
+   "unit": "B/s",
+   "value": 14806691.755972749
+   }
+   }
+   }
+
+The timings are a vector of all the timings which were measured, except for b_eff, where a special format is used. For every message size used in the benchmark, the interim results are saved in the following way.
+
+..
code-block:: javascript + + { + "6": { + "maxCalcBW": 9225059.007945802, + "maxMinCalculationTime": 5.5501e-05, + "timings": [ + { + "looplength": 4, + "messageSize": 6, + "timings": [ + { + "unit": "s", + "value": 0.008889638 + }, + { + "unit": "s", + "value": 0.000115271 + }, + { + "unit": "s", + "value": 0.000149272 + }, + { + "unit": "s", + "value": 0.000163372 + }, + { + "unit": "s", + "value": 7.5731e-05 + }, + { + "unit": "s", + "value": 5.5501e-05 + }, + { + "unit": "s", + "value": 0.000162132 + }, + { + "unit": "s", + "value": 8.2091e-05 + }, + { + "unit": "s", + "value": 6.7621e-05 + }, + { + "unit": "s", + "value": 0.000126891 + } + ] + } + ] + }, + "7": { + "maxCalcBW": 12222341.581026724, + "maxMinCalculationTime": 8.3781e-05, + "timings": [ + { + "looplength": 4, + "messageSize": 7, + "timings": [ + { + "unit": "s", + "value": 0.000296573 + }, + { + "unit": "s", + "value": 0.000136292 + }, + { + "unit": "s", + "value": 0.000320834 + }, + { + "unit": "s", + "value": 0.000130881 + }, + { + "unit": "s", + "value": 8.3781e-05 + }, + { + "unit": "s", + "value": 0.000247252 + }, + { + "unit": "s", + "value": 0.000430356 + }, + { + "unit": "s", + "value": 0.000281403 + }, + { + "unit": "s", + "value": 0.000421565 + }, + { + "unit": "s", + "value": 0.000266754 + } + ] + } + ] + }, + "8": { + "maxCalcBW": 38030862.93662141, + "maxMinCalculationTime": 5.3851e-05, + "timings": [ + { + "looplength": 4, + "messageSize": 8, + "timings": [ + { + "unit": "s", + "value": 0.000157722 + }, + { + "unit": "s", + "value": 0.000121611 + }, + { + "unit": "s", + "value": 0.000217192 + }, + { + "unit": "s", + "value": 9.7101e-05 + }, + { + "unit": "s", + "value": 6.6931e-05 + }, + { + "unit": "s", + "value": 8.6791e-05 + }, + { + "unit": "s", + "value": 0.000145572 + }, + { + "unit": "s", + "value": 0.000143042 + }, + { + "unit": "s", + "value": 8.5281e-05 + }, + { + "unit": "s", + "value": 5.3851e-05 + } + ] + } + ] + } + } + +A full example for FFT looks like this. + +.. 
code-block:: javascript + + { + "config_time": "Mon Dec 05 17:39:57 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "git_commit": "c7f3890-dirty", + "name": "FFT", + "results": { + "gflops_avg": { + "unit": "GFLOP/s", + "value": 0.27772734580591407 + }, + "gflops_min": { + "unit": "GFLOP/s", + "value": 0.28466663597913383 + }, + "t_avg": { + "unit": "s", + "value": 0.0008848966575 + }, + "t_min": { + "unit": "s", + "value": 0.00086332562 + } + }, + "settings": { + "Batch Size": 100, + "Communication Type": "UNSUPPORTED", + "FFT Size": 4096, + "Kernel File": "fft1d_float_8_emulate.aocx", + "Kernel Replications": 1, + "MPI Ranks": "None", + "Repetitions": 4, + "Test Mode": "No" + }, + "timings": { + "calculation": [ + { + "unit": "s", + "value": 0.090789326 + }, + { + "unit": "s", + "value": 0.086332562 + }, + { + "unit": "s", + "value": 0.090089428 + }, + { + "unit": "s", + "value": 0.086747347 + } + ] + }, + "version": "1.4" + } + From afecda016c1db4d163659c40522e1e212cf8af7c Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Sat, 10 Dec 2022 11:56:22 +0100 Subject: [PATCH 221/318] add errors to json output --- FFT/src/host/execution_default.cpp | 2 +- FFT/src/host/fft_benchmark.cpp | 31 ++++--- FFT/src/host/fft_benchmark.hpp | 9 +- FFT/tests/test_execution_functionality.cpp | 4 +- FFT/tests/test_fft_functionality.cpp | 2 +- GEMM/src/host/gemm_benchmark.cpp | 26 +++--- GEMM/src/host/gemm_benchmark.hpp | 9 +- LINPACK/src/host/linpack_benchmark.cpp | 48 +++++----- 
LINPACK/src/host/linpack_benchmark.hpp | 9 +- .../test_host_reference_implementations.cpp | 9 +- LINPACK/tests/test_kernel_communication.cpp | 3 +- ...nel_functionality_and_host_integration.cpp | 4 +- PTRANS/src/host/transpose_benchmark.cpp | 57 +++++++----- PTRANS/src/host/transpose_benchmark.hpp | 16 +++- PTRANS/tests/test_host_functionality.cpp | 6 +- .../src/host/random_access_benchmark.cpp | 22 +++-- .../src/host/random_access_benchmark.hpp | 9 +- RandomAccess/tests/test_host_code.cpp | 10 +-- ...nel_functionality_and_host_integration.cpp | 4 +- STREAM/src/host/stream_benchmark.cpp | 89 +++++++++++++------ STREAM/src/host/stream_benchmark.hpp | 9 +- b_eff/src/host/network_benchmark.cpp | 20 +++-- b_eff/src/host/network_benchmark.hpp | 15 +++- ...nel_functionality_and_host_integration.cpp | 18 ++-- shared/include/hpcc_benchmark.hpp | 66 ++++++++++---- shared/tests/hpcc_base_benchmark_test.cpp | 10 ++- 26 files changed, 334 insertions(+), 173 deletions(-) diff --git a/FFT/src/host/execution_default.cpp b/FFT/src/host/execution_default.cpp index d0d565da..a1ae245a 100644 --- a/FFT/src/host/execution_default.cpp +++ b/FFT/src/host/execution_default.cpp @@ -212,7 +212,7 @@ namespace bm_execution { } std::map> timings; - timings["calculation"] = calculationTimings; + timings["execution"] = calculationTimings; return timings; } diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index 4bfed7d2..fde7c01c 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -96,7 +96,7 @@ void fft::FFTBenchmark::collectResults() { double gflop = static_cast(5 * (1 << LOG_FFT_SIZE) * LOG_FFT_SIZE) * executionSettings->programSettings->iterations * 1.0e-9 * mpi_comm_size; - uint number_measurements = timings["calculation"].size(); + uint number_measurements = timings["execution"].size(); std::vector avg_measures(number_measurements); #ifdef _USE_MPI_ // Copy the object variable to a local variable to make it accessible to the lambda function @@ -104,7 +104,7 @@ fft::FFTBenchmark::collectResults() { MPI_Reduce(timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); std::for_each(avg_measures.begin(),avg_measures.end(), [mpi_size](double& x) {x /= mpi_size;}); #else - std::copy(timings["calculation"].begin(), timings["calculation"].end(), avg_measures.begin()); + std::copy(timings["execution"].begin(), timings["execution"].end(), avg_measures.begin()); #endif if (mpi_comm_rank == 0) { double minTime = *min_element(avg_measures.begin(), avg_measures.end()); @@ -118,12 +118,10 @@ fft::FFTBenchmark::collectResults() { void fft::FFTBenchmark::printResults() { - std::cout << std::setw(ENTRY_SPACE) << " " << std::setw(ENTRY_SPACE) << "avg" - << std::setw(ENTRY_SPACE) << "best" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << std::setw(ENTRY_SPACE) << results.at("t_avg") - << std::setw(ENTRY_SPACE) << results.at("t_min") << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << std::setw(ENTRY_SPACE) << results.at("gflops_avg") - << std::setw(ENTRY_SPACE) << results.at("gflops_min") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << " " << std::left << std::setw(ENTRY_SPACE) << " avg" + << std::setw(ENTRY_SPACE) << " best" << std::right << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << results.at("t_avg") << results.at("t_min") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << results.at("gflops_avg") << results.at("gflops_min") << std::endl; } 
std::unique_ptr @@ -141,7 +139,7 @@ fft::FFTBenchmark::generateInputData() { } bool -fft::FFTBenchmark::validateOutputAndPrintError(fft::FFTData &data) { +fft::FFTBenchmark::validateOutput(fft::FFTData &data) { double residual_max = 0; for (int i = 0; i < executionSettings->programSettings->iterations; i++) { // we have to bit reverse the output data of the FPGA kernel, since it will be provided in bit-reversed order. @@ -159,17 +157,22 @@ fft::FFTBenchmark::validateOutputAndPrintError(fft::FFTData &data) { residual_max = residual_max > tmp_error ? residual_max : tmp_error; } } + // Calculate residual according to paper considering also the used iterations double error = residual_max / (std::numeric_limits::epsilon() * LOG_FFT_SIZE); + + errors.emplace("residual", hpcc_base::HpccResult(error, "")); + errors.emplace("epsilon", hpcc_base::HpccResult(std::numeric_limits::epsilon(), "")); - std::cout << std::setw(ENTRY_SPACE) << "res. error" << std::setw(ENTRY_SPACE) << "mach. eps" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << error << std::setw(ENTRY_SPACE) - << std::numeric_limits::epsilon() << std::endl << std::endl; - - // Calculate residual according to paper considering also the used iterations return error < 1.0; } +void fft::FFTBenchmark::printError() { + std::cout << std::left << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; + std::cout << errors.at("residual") << errors.at("epsilon") << std::endl << std::endl; + +} + void fft::bit_reverse(std::complex *data, unsigned iterations) { auto *tmp = new std::complex[(1 << LOG_FFT_SIZE)]; diff --git a/FFT/src/host/fft_benchmark.hpp b/FFT/src/host/fft_benchmark.hpp index 99fd3458..33ee832a 100644 --- a/FFT/src/host/fft_benchmark.hpp +++ b/FFT/src/host/fft_benchmark.hpp @@ -176,7 +176,14 @@ class FFTBenchmark : public hpcc_base::HpccFpgaBenchmarkgetExecutionSettings().programSettings->numRepetitions = 1; data = bm->generateInputData(); bm->executeKernel(*data); - EXPECT_EQ(1, bm->getTimingsMap().at("calculation").size()); + EXPECT_EQ(1, bm->getTimingsMap().at("execution").size()); } /** @@ -45,7 +45,7 @@ TEST_F(FFTKernelTest, CalculateReturnsCorrectExecutionResultFor24True) { bm->getExecutionSettings().programSettings->numRepetitions = 2; data = bm->generateInputData(); bm->executeKernel(*data); - EXPECT_EQ(2, bm->getTimingsMap().at("calculation").size()); + EXPECT_EQ(2, bm->getTimingsMap().at("execution").size()); } /** diff --git a/FFT/tests/test_fft_functionality.cpp b/FFT/tests/test_fft_functionality.cpp index df5dba9f..4453a695 100644 --- a/FFT/tests/test_fft_functionality.cpp +++ b/FFT/tests/test_fft_functionality.cpp @@ -134,7 +134,7 @@ TEST_F(FFTHostTest, JsonDump) { json j = json::parse(f); EXPECT_TRUE(j.contains("timings")); if (j.contains("timings")) { - EXPECT_TRUE(j["timings"].contains("calculation")); + EXPECT_TRUE(j["timings"].contains("execution")); } EXPECT_TRUE(j.contains("results")); if (j.contains("results")) { diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp index 141ea160..21b8fd99 100644 --- a/GEMM/src/host/gemm_benchmark.cpp +++ b/GEMM/src/host/gemm_benchmark.cpp @@ -140,13 +140,12 @@ gemm::GEMMBenchmark::collectResults() { void gemm::GEMMBenchmark::printResults() { - std::cout << std::setw(ENTRY_SPACE) - << "best" << std::setw(ENTRY_SPACE) << "mean" - << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; + std::cout << std::left << std::setw(ENTRY_SPACE) + << " best" << std::setw(ENTRY_SPACE) << " mean" + << 
std::setw(ENTRY_SPACE) << " GFLOPS" << std::right << std::endl; std::cout << std::setw(ENTRY_SPACE) - << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean") - << std::setw(ENTRY_SPACE) << results.at("gflops") + << results.at("t_min") << results.at("t_mean") << results.at("gflops") << std::endl; } @@ -170,7 +169,7 @@ gemm::GEMMBenchmark::generateInputData() { } bool -gemm::GEMMBenchmark::validateOutputAndPrintError(gemm::GEMMData &data) { +gemm::GEMMBenchmark::validateOutput(gemm::GEMMData &data) { auto ref_data = generateInputData(); gemm_ref(ref_data->A, ref_data->B, ref_data->C, executionSettings->programSettings->matrixSize, OPTIONAL_CAST(0.5), OPTIONAL_CAST(2.0)); @@ -195,19 +194,22 @@ gemm::GEMMBenchmark::validateOutputAndPrintError(gemm::GEMMData &data) { double eps = std::numeric_limits::epsilon(); double residn = resid / (executionSettings->programSettings->matrixSize*executionSettings->programSettings->matrixSize*ref_data->normtotal*normx*eps); - std::cout << " norm. resid resid "\ - "machep" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE) - << resid << std::setw(ENTRY_SPACE) << eps - << std::endl; + errors.emplace("epsilon", hpcc_base::HpccResult(eps, "")); + errors.emplace("residual", hpcc_base::HpccResult(resid, "")); + errors.emplace("residual_norm", hpcc_base::HpccResult(residn, "")); return residn < 1.0; } - // All other ranks are always reporting success of the validation return true; } +void +gemm::GEMMBenchmark::printError() { + std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; + std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; +} + void gemm::gemm_ref(HOST_DATA_TYPE* a,HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, int n, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta) { diff --git a/GEMM/src/host/gemm_benchmark.hpp b/GEMM/src/host/gemm_benchmark.hpp index 534a5bab..c77a212f 100644 --- a/GEMM/src/host/gemm_benchmark.hpp +++ b/GEMM/src/host/gemm_benchmark.hpp @@ -213,7 +213,14 @@ class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark @@ -295,7 +293,7 @@ linpack::LinpackBenchmark::generateInputData() { } bool -linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &data) { +linpack::LinpackBenchmark::validateOutput(linpack::LinpackData &data) { uint n= executionSettings->programSettings->matrixSize; uint matrix_width = data.matrix_width; uint matrix_height = data.matrix_height; @@ -420,19 +418,23 @@ linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &dat } #endif + errors.emplace("epsilon", hpcc_base::HpccResult(eps, "")); + errors.emplace("residual", hpcc_base::HpccResult(resid, "")); + errors.emplace("residual_norm", hpcc_base::HpccResult(residn, "")); + if (mpi_comm_rank == 0) { - //std::cout << resid << ", " << norma << ", " << normx << std::endl; - std::cout << " norm. resid resid "\ - "machep " << std::endl; - std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE) - << resid << std::setw(ENTRY_SPACE) << eps << std::endl; return residn < 1; - } - else { + } else { return true; } } +void +linpack::LinpackBenchmark::printError() { + std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. 
eps" << std::right << std::endl; + std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; +} + void linpack::LinpackBenchmark::distributed_gesl_nopvt_ref(linpack::LinpackData& data) { uint global_matrix_size = executionSettings->programSettings->matrixSize; diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index 7c7ce315..6178230d 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -257,7 +257,14 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmarkb[i] = static_cast(x[i]); } - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } #endif @@ -83,7 +84,8 @@ TEST_F(LinpackHostTest, ReferenceSolveWithPivoting) { data = bm->generateInputData(); linpack::gefa_ref(data->A, array_size, array_size, data->ipvt); linpack::gesl_ref(data->A, data->b, data->ipvt, array_size, array_size); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } @@ -91,7 +93,8 @@ TEST_F(LinpackHostTest, ReferenceSolveWithoutPivoting) { data = bm->generateInputData(); linpack::gefa_ref_nopvt(data->A, array_size, array_size); linpack::gesl_ref_nopvt(data->A, data->b, array_size, array_size); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } diff --git a/LINPACK/tests/test_kernel_communication.cpp b/LINPACK/tests/test_kernel_communication.cpp index 603bedef..2182de1e 100644 --- a/LINPACK/tests/test_kernel_communication.cpp +++ b/LINPACK/tests/test_kernel_communication.cpp @@ -920,7 +920,8 @@ TEST_F(LinpackKernelCommunicationTestLU, LUBlockExternalResultisSameAsRef) { TEST_F(LinpackKernelCommunicationTestLU, LUBlockExternalResultisCorrect) { linpack::gesl_ref_nopvt(data->A, data->b, bm->getExecutionSettings().programSettings->matrixSize,bm->getExecutionSettings().programSettings->matrixSize); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } diff --git a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp index 2dbd21f0..0200a017 100644 --- a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp +++ b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp @@ -88,8 +88,8 @@ TEST_P(LinpackKernelTest, DISABLED_ValidationWorksForMKL) { #else dgesv_(&s, &lrhs, data_cpu->A, &s, data_cpu->ipvt, data_cpu->b, &s, &info); #endif - bool success = bm->validateOutputAndPrintError(*data_cpu); - EXPECT_TRUE(success); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index decc9b85..e0e45c11 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -134,21 +134,30 @@ transpose::TransposeBenchmark::collectResults() { void transpose::TransposeBenchmark::printResults() { - std::cout << " total time transfer time calc time calc FLOPS Memory Bandwidth PCIe Bandwidth" << std::endl; - std::cout << "avg: " << results.at("avg_t") - << " " << results.at("avg_transfer_t") - << " " << results.at("avg_calc_t") - << " " << results.at("avg_calc_flops") - << " " << results.at("avg_mem_bandwidth") - << " " << results.at("avg_transfer_bandwidth") - << std::endl; - std::cout << "best: " << 
results.at("min_t") - << " " << results.at("min_transfer_t") - << " " << results.at("min_calc_t") - << " " << results.at("max_calc_flops") - << " " << results.at("max_mem_bandwidth") - << " " << results.at("max_transfer_bandwidth") - << std::endl; + std::cout << std::setw(ENTRY_SPACE) << " " + << std::left << std::setw(ENTRY_SPACE) << "total time" + << std::setw(ENTRY_SPACE) << "transfer time" + << std::setw(ENTRY_SPACE) << "calc time" + << std::setw(ENTRY_SPACE) << "calc FLOPS" + << std::setw(ENTRY_SPACE) << "Memory Bandwidth" + << std::setw(ENTRY_SPACE) << "PCIe Bandwidth" + << std::right << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "avg: " + << results.at("avg_t") + << results.at("avg_transfer_t") + << results.at("avg_calc_t") + << results.at("avg_calc_flops") + << results.at("avg_mem_bandwidth") + << results.at("avg_transfer_bandwidth") + << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "best: " + << results.at("min_t") + << results.at("min_transfer_t") + << results.at("min_calc_t") + << results.at("max_calc_flops") + << results.at("max_mem_bandwidth") + << results.at("max_transfer_bandwidth") + << std::endl; } std::unique_ptr @@ -157,8 +166,7 @@ return dataHandler->generateData(*executionSettings); } bool -transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeData &data) { - +transpose::TransposeBenchmark::validateOutput(transpose::TransposeData &data) { // exchange the data using MPI depending on the chosen distribution scheme dataHandler->exchangeData(data); @@ -172,14 +180,19 @@ transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeD double global_max_error = 0; MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - if (mpi_comm_rank == 0) { - std::cout << "Maximum error: " << global_max_error << " < " << 100 * std::numeric_limits::epsilon() << std::endl; - std::cout << "Mach. Epsilon: " << std::numeric_limits::epsilon() << std::endl; - } + errors.emplace("epsilon", hpcc_base::HpccResult(std::numeric_limits::epsilon(), "")); + errors.emplace("max_error", hpcc_base::HpccResult(global_max_error, "")); return static_cast(global_max_error) < 100 * std::numeric_limits::epsilon(); } +void +transpose::TransposeBenchmark::printError() { + std::cout << "Maximum error: " << errors.at("epsilon") << " < " << 100 * errors.at("epsilon").value << std::endl; + std::cout << "Mach. 
Epsilon: " << errors.at("epsilon") << std::endl; + +} + void transpose::TransposeBenchmark::setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier) { switch (dataHandlerIdentifier) { @@ -187,6 +200,4 @@ transpose::TransposeBenchmark::setTransposeDataHandler(transpose::data_handler:: case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); } - - } diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index cd595637..57cd0231 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -93,16 +93,26 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmarkvalidateOutputAndPrintError(*data); + bool success = bm->validateOutput(*data); + bm->printError(); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); @@ -128,7 +129,8 @@ TEST_F(TransposeHostTest, ValidationIsSuccess) { std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - bool success = bm->validateOutputAndPrintError(*data); + bool success = bm->validateOutput(*data); + bm->printError(); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp index a5f06303..2c5c449b 100644 --- a/RandomAccess/src/host/random_access_benchmark.cpp +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -122,9 +122,9 @@ random_access::RandomAccessBenchmark::collectResults() { } void random_access::RandomAccessBenchmark::printResults() { - std::cout << std::setw(ENTRY_SPACE) + std::cout << std::left << std::setw(ENTRY_SPACE) << "best" << std::setw(ENTRY_SPACE) << "mean" - << std::setw(ENTRY_SPACE) << "GUOPS" << std::endl; + << std::setw(ENTRY_SPACE) << "GUOPS" << std::right << std::endl; std::cout << std::setw(ENTRY_SPACE) << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean") @@ -159,7 +159,7 @@ random_access::RandomAccessBenchmark::generateInputData() { } bool -random_access::RandomAccessBenchmark::validateOutputAndPrintError(random_access::RandomAccessData &data) { +random_access::RandomAccessBenchmark::validateOutput(random_access::RandomAccessData &data) { HOST_DATA_TYPE* rawdata; if (mpi_comm_size > 1) { @@ -190,19 +190,18 @@ random_access::RandomAccessBenchmark::validateOutputAndPrintError(random_access: rawdata[(temp >> 3) & (executionSettings->programSettings->dataSize * mpi_comm_size - 1)] ^= temp; } - double errors = 0; -#pragma omp parallel for reduction(+:errors) + double error_count = 0; +#pragma omp parallel for reduction(+:error_count) for (HOST_DATA_TYPE i=0; i< executionSettings->programSettings->dataSize * mpi_comm_size; i++) { if (rawdata[i] != i) { // If the array at index i does not contain i, it differs from the initial value and is counted as an error - errors++; + error_count++; } } // The overall error is calculated in percent of the overall array size - double error_ratio = static_cast(errors) / (executionSettings->programSettings->dataSize * mpi_comm_size); - std::cout << "Error: " << error_ratio * 100 - << "%" << std::endl; + double error_ratio = static_cast(error_count) / 
(executionSettings->programSettings->dataSize * mpi_comm_size);
+ errors.emplace("ratio", hpcc_base::HpccResult(error_ratio, ""));
 #ifdef _USE_MPI_
 if (mpi_comm_rank == 0 && mpi_comm_size > 1) {
@@ -216,3 +215,8 @@ random_access::RandomAccessBenchmark::validateOutputAndPrintError(random_access:
 // All other ranks skip validation and always return true
 return true;
 }
+
+void
+random_access::RandomAccessBenchmark::printError() {
+ std::cout << "Error: " << errors.at("ratio") << std::endl;
+}
diff --git a/RandomAccess/src/host/random_access_benchmark.hpp b/RandomAccess/src/host/random_access_benchmark.hpp
index 56c7ff40..3a1eebaa 100644
--- a/RandomAccess/src/host/random_access_benchmark.hpp
+++ b/RandomAccess/src/host/random_access_benchmark.hpp
@@ -156,7 +156,14 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmarkgenerateInputData();
 // do random accesses
- bm->validateOutputAndPrintError(*data);
+ bm->validateOutput(*data);
 // check correctness of random accesses
- bool success = bm->validateOutputAndPrintError(*data);
- EXPECT_TRUE(success);
+ EXPECT_TRUE(bm->validateOutput(*data));
+ bm->printError();
 }
 /**
@@ -53,6 +53,6 @@ TEST_F(RandomAccessHostCodeTest, ValidDataSizeAreDetected) {
 TEST_F(RandomAccessHostCodeTest, ResultValidationWorksForWrongUpdates) {
 auto data = bm->generateInputData();
 // check correctness of random accesses
- bool success = bm->validateOutputAndPrintError( *data);
- EXPECT_FALSE(success);
+ EXPECT_FALSE(bm->validateOutput(*data));
+ bm->printError();
 }
diff --git a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
index 35c9f229..a52ce55f 100644
--- a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
@@ -46,8 +46,8 @@ TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements3Rep) {
 */
 TEST_F(RandomAccessKernelTest, FPGAErrorBelow1Percent) {
 bm->executeKernel(*data);
- bool success = bm->validateOutputAndPrintError(*data);
- EXPECT_TRUE(success);
+ EXPECT_TRUE(bm->validateOutput(*data));
+ bm->printError();
 }
 using json = nlohmann::json;
diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp
index 07da82b3..bdba29e8 100644
--- a/STREAM/src/host/stream_benchmark.cpp
+++ b/STREAM/src/host/stream_benchmark.cpp
@@ -143,18 +143,19 @@ stream::StreamBenchmark::collectResults() {
 void
 stream::StreamBenchmark::printResults() {
- std::cout << std::setw(ENTRY_SPACE) << "Function";
+ std::cout << std::left << std::setw(ENTRY_SPACE) << "Function";
 std::cout << std::setw(ENTRY_SPACE) << "Best Rate";
 std::cout << std::setw(ENTRY_SPACE) << "Avg time";
 std::cout << std::setw(ENTRY_SPACE) << "Min time" ;
- std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::endl;
+ std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::right << std::endl;
 for (auto key : keys) {
- std::cout << std::setw(ENTRY_SPACE) << key;
- std::cout << std::setw(ENTRY_SPACE) << results.at(key + "_best_rate")
- << std::setw(ENTRY_SPACE) << results.at(key + "_avg_t")
- << std::setw(ENTRY_SPACE) << results.at(key + "_min_t")
- << std::setw(ENTRY_SPACE) << results.at(key + "_max_t") << std::endl;
+ std::cout << std::left << std::setw(ENTRY_SPACE) << key
+ << results.at(key + "_best_rate")
+ << results.at(key + "_avg_t")
+ << results.at(key + "_min_t")
+ << results.at(key + "_max_t")
+ << std::right << std::endl;
 }
 }
@@ -171,7 +172,7 @@
stream::StreamBenchmark::generateInputData() {
 }
 bool
-stream::StreamBenchmark::validateOutputAndPrintError(stream::StreamData &data) {
+stream::StreamBenchmark::validateOutput(stream::StreamData &data) {
 HOST_DATA_TYPE aj,bj,cj,scalar;
 double aSumErr,bSumErr,cSumErr;
 double aAvgErr,bAvgErr,cAvgErr;
@@ -220,54 +221,84 @@ stream::StreamBenchmark::validateOutputAndPrintError(stream::StreamData &data) {
 bAvgErr = totalBAvgErr / mpi_comm_size;
 #endif
+ bool success = true;
 if (mpi_comm_rank == 0) {
+ errors.emplace("a_expected_value", hpcc_base::HpccResult(aj, ""));
+ errors.emplace("a_average_error", hpcc_base::HpccResult(aAvgErr, ""));
+ errors.emplace("a_average_relative_error", hpcc_base::HpccResult(abs(aAvgErr)/aj, ""));
+
+ errors.emplace("b_expected_value", hpcc_base::HpccResult(bj, ""));
+ errors.emplace("b_average_error", hpcc_base::HpccResult(bAvgErr, ""));
+ errors.emplace("b_average_relative_error", hpcc_base::HpccResult(abs(bAvgErr)/bj, ""));
+
+ errors.emplace("c_expected_value", hpcc_base::HpccResult(cj, ""));
+ errors.emplace("c_average_error", hpcc_base::HpccResult(cAvgErr, ""));
+ errors.emplace("c_average_relative_error", hpcc_base::HpccResult(abs(cAvgErr)/cj, ""));
 epsilon = std::numeric_limits::epsilon();
+ errors.emplace("epsilon", hpcc_base::HpccResult(epsilon, ""));
- err = 0;
 if (abs(aAvgErr/aj) > epsilon) {
- err++;
- printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
- printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
+ success = false;
 ierr = 0;
 for (j=0; jprogramSettings->streamArraySize; j++) {
 if (abs(data.A[j]/aj-1.0) > epsilon) {
 ierr++;
 }
 }
- printf(" For array a[], %d errors were found.\n",ierr);
+ errors.emplace("a_error_count", hpcc_base::HpccResult(ierr, ""));
+ ierr = 0;
 }
 if (abs(bAvgErr/bj) > epsilon) {
- err++;
- printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
- printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
- printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+ success = false;
 ierr = 0;
 for (j=0; jprogramSettings->streamArraySize; j++) {
 if (abs(data.B[j]/bj-1.0) > epsilon) {
 ierr++;
 }
 }
- printf(" For array b[], %d errors were found.\n",ierr);
+ errors.emplace("b_error_count", hpcc_base::HpccResult(ierr, ""));
 }
 if (abs(cAvgErr/cj) > epsilon) {
- err++;
- printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
- printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
- printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+ success = false;
 ierr = 0;
 for (j=0; jprogramSettings->streamArraySize; j++) {
 if (abs(data.C[j]/cj-1.0) > epsilon) {
 ierr++;
 }
 }
- printf(" For array c[], %d errors were found.\n",ierr);
+ errors.emplace("c_error_count", hpcc_base::HpccResult(ierr, ""));
 }
- if (err == 0) {
- printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
- return true;
- }
- return false;
 }
- return true;
+ return success;
+}
+
+void
+stream::StreamBenchmark::printError() {
+ int err = 0;
+ double epsilon = errors.at("epsilon").value;
+ if (errors.at("a_average_relative_error").value > epsilon) {
+ err++;
+ printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value);
+ printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("a_expected_value").value, errors.at("a_average_error").value, errors.at("a_average_relative_error").value);
+ printf(" For array a[], %d errors were found.\n", (int) errors.at("a_error_count").value);
a[], %d errors were found.\n", errors.at("a_error_count")); + } + + if (errors.at("b_average_relative_error").value > epsilon) { + err++; + printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("b_expected_value").value, errors.at("b_average_error").value, errors.at("b_average_relative_error").value); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); + printf(" For array b[], %d errors were found.\n", errors.at("b_error_count").value); + } + if (errors.at("c_average_relative_error").value > epsilon) { + err++; + printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("c_expected_value").value, errors.at("c_average_error").value, errors.at("c_average_relative_error").value); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); + printf(" For array c[], %d errors were found.\n", errors.at("c_error_count").value); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n", errors.at("epsilon").value); + } } diff --git a/STREAM/src/host/stream_benchmark.hpp b/STREAM/src/host/stream_benchmark.hpp index 8377b744..50f24b88 100644 --- a/STREAM/src/host/stream_benchmark.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -169,7 +169,14 @@ class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark(item.messageSize & 255u); - unsigned errors = 0; + unsigned error_count = 0; HOST_DATA_TYPE failing_entry = 0; for (const auto& v: item.validationBuffer) { if (v != expected_value) { - errors++; + error_count++; failing_entry = v; } } - total_error += errors; - if (errors > 0) { - std::cerr << "Validation data invalid for message size " << (1 << item.messageSize) << " in " << errors << " cases! Expected: " - << static_cast(expected_value) << ", Value: " << static_cast(failing_entry) << std::endl; + if (error_count > 0) { + errors.emplace(std::to_string(item.messageSize), hpcc_base::HpccResult(error_count, "")); } + total_error += error_count; } // success only, if no error occured return total_error == 0; } +void +network::NetworkBenchmark::printError() { + for (const auto& error: errors) { + std::cerr << "Validation data invalid for message size " << (1 << stoi(error.first)) << " in " << int(error.second.value) << " cases!" 
<< std::endl; + } +} + diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index e1b77bc9..4d47c392 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -283,16 +283,27 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmarkitems.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data;}); data->items[0].validationBuffer[looplength] = expected_data + 1; - EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); + EXPECT_FALSE(bm->validateOutput(*data)); + bm->printError(); } TEST_P(NetworkKernelTest, ValidationDataWrongCheckFails) { @@ -250,7 +251,8 @@ TEST_P(NetworkKernelTest, ValidationDataWrongCheckFails) { data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data - 1;}); - EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); + EXPECT_FALSE(bm->validateOutput(*data)); + bm->printError(); } TEST_P(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { @@ -260,7 +262,8 @@ TEST_P(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data;}); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { @@ -269,7 +272,8 @@ TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); bm->executeKernel(*data); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } // This test is disabled because it does not work with the current implementation of the @@ -282,7 +286,8 @@ TEST_P(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExec data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); bm->executeKernel(*data); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } TEST_P(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { @@ -293,7 +298,8 @@ TEST_P(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); bm->executeKernel(*data); data->items[1].validationBuffer[0] = static_cast(0); - EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); + EXPECT_FALSE(bm->validateOutput(*data)); + bm->printError(); } TEST_P(NetworkKernelTest, JsonDump) { diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index ec611c3d..a2f7de95 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -45,7 +45,9 @@ SOFTWARE. 
#define STR_EXPAND(tok) #tok #define STR(tok) STR_EXPAND(tok) -#define ENTRY_SPACE 15 +#define VALUE_SPACE 11 +#define UNIT_SPACE 8 +#define ENTRY_SPACE (VALUE_SPACE + UNIT_SPACE + 1) using json = nlohmann::json; @@ -64,7 +66,7 @@ class HpccResult { HpccResult(double value, std::string unit): value(value), unit(unit) {} friend std::ostream &operator<<(std::ostream &os, const HpccResult &result) { - os << result.value << " " << result.unit; + os << std::setw(VALUE_SPACE) << result.value << " " << std::left << std::setw(UNIT_SPACE) << result.unit << std::right; return os; } @@ -73,6 +75,7 @@ class HpccResult { oss << *this; return oss.str(); } + // TODO: to_json function }; /** @@ -345,7 +348,13 @@ class HpccFpgaBenchmark { * */ std::map results; - + + /** + * + * @brief map containing the errors of the benchmark + * + */ + std::map errors; public: @@ -374,7 +383,13 @@ class HpccFpgaBenchmark { * @return false If the validation failed */ virtual bool - validateOutputAndPrintError(TData &data) = 0; + validateOutput(TData &data) = 0; + + /** + * @brief Print the error after validating output + */ + virtual void + printError() = 0; /** * @brief Collects the measurment results from all MPI ranks and @@ -515,18 +530,6 @@ class HpccFpgaBenchmark { timings.emplace(key, value); } - std::map getResultsJson() { - // TODO: nested maps, recursive? - std::map results_string; - for (auto const &result: results) { - json j; - j["unit"] = result.second.unit; - j["value"] = result.second.value; - results_string[result.first] = j; - } - return results_string; - } - // override for special benchmarks like b_eff virtual json getTimingsJson() { json j; @@ -542,6 +545,28 @@ class HpccFpgaBenchmark { } return j; } + + std::map getResultsJson() { + std::map results_string; + for (auto const &result: results) { + json j; + j["unit"] = result.second.unit; + j["value"] = result.second.value; + results_string[result.first] = j; + } + return results_string; + } + + std::map getErrorsJson() { + std::map errors_string; + for (auto const &error: errors) { + json j; + j["unit"] = error.second.unit; + j["value"] = error.second.value; + errors_string[error.first] = j; + } + return errors_string; + } std::map getEnvironmentMap() { @@ -602,6 +627,7 @@ class HpccFpgaBenchmark { dump["settings"] = jsonifySettingsMap(executionSettings->programSettings->getSettingsMap()); dump["timings"] = getTimingsJson(); dump["results"] = getResultsJson(); + dump["errors"] = getErrorsJson(); dump["environment"] = getEnvironmentMap(); fs << dump; @@ -738,13 +764,15 @@ class HpccFpgaBenchmark { if (!executionSettings->programSettings->skipValidation) { auto eval_start = std::chrono::high_resolution_clock::now(); - validateSuccess = validateOutputAndPrintError(*data); + validateSuccess = validateOutput(*data); + printError(); std::chrono::duration eval_time = std::chrono::high_resolution_clock::now() - eval_start; if (mpi_comm_rank == 0) { std::cout << "Validation Time: " << eval_time.count() << " s" << std::endl; } } + std::cout << HLINE << "Collect results..." << std::endl << HLINE; collectResults(); if (mpi_comm_rank == 0) { @@ -755,10 +783,10 @@ class HpccFpgaBenchmark { printResults(); if (!validateSuccess) { - std::cerr << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" << std::endl; + std::cerr << HLINE << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" << std::endl; } else { - std::cout << "Validation: SUCCESS!" << std::endl; + std::cout << HLINE << "Validation: SUCCESS!" 
<< std::endl; } } diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp index b6378840..194c8920 100644 --- a/shared/tests/hpcc_base_benchmark_test.cpp +++ b/shared/tests/hpcc_base_benchmark_test.cpp @@ -40,7 +40,10 @@ class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark Date: Sat, 10 Dec 2022 11:56:48 +0100 Subject: [PATCH 222/318] update documentation and READMEs --- FFT/README.md | 157 +++++- GEMM/Readme.md | 181 +++++-- LINPACK/Readme.md | 190 ++++++- PTRANS/README.md | 248 +++++++-- RandomAccess/README.md | 136 ++++- STREAM/README.md | 486 +++++++++++++++++- b_eff/README.md | 293 +++++++++-- docs/source/FFT/index.rst | 1 + docs/source/conf.py | 1 + docs/source/index.rst | 7 + .../json_output/available_keys.csv | 58 --- .../technical_support/json_output/index.rst | 72 +-- 12 files changed, 1518 insertions(+), 312 deletions(-) delete mode 100644 docs/source/technical_support/json_output/available_keys.csv diff --git a/FFT/README.md b/FFT/README.md index 1d14663d..2926a5ac 100644 --- a/FFT/README.md +++ b/FFT/README.md @@ -59,31 +59,36 @@ For execution of the benchmark run: For more information on available input parameters run - $./FFT_intel -h + ./FFT_intel -h Implementation of the FFT benchmark proposed in the HPCC benchmark suite for FPGA. - Version: 1.2 + Version: 1.4 Usage: ./FFT_intel [OPTION...] - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will - speed up execution and helps when working with special - data types. - --device arg Index of the device that has to be used. If not - given you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there - are multiple platforms available. (default: -1) - -h, --help Print this help - -b, arg Number of batched FFT calculations (iterations) - (default: 100) - --inverse If set, the inverse FFT is calculated instead - -r, arg Number of kernel replications used for calculation - (default: 1) + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 1) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -b, arg Number of batched FFT calculations (iterations) + (default: 100) + --inverse If set, the inverse FFT is calculated instead To execute the unit and integration tests run @@ -96,12 +101,13 @@ It will run an emulation of the kernel and execute some functionality tests. The benchmark will print the following two tables to standard output after execution: - res. error mach. 
eps - 2.67000e-01 1.19209e-07 - - avg best - Time in s: 7.56801e-03 7.07241e-03 - GFLOPS: 3.24735e-02 3.47491e-02 + res. error mach. eps + 2.63523e-01 1.19209e-07 + + avg best + Time in s: 8.93261e-04 s 8.73572e-04 s + GFLOPS: 2.75127e-01 GFLOP/s 2.81328e-01 GFLOP/s + The first table contains the maximum residual error of the calculation and the machine epsilon that was used to calculate the residual error. @@ -118,3 +124,102 @@ In the second table the measured execution times and calculated FLOPs are given. It gives the average and bast for both. The time gives the averaged execution time for a single FFT in case of a batched execution (an execution with more than one iteration). They are also used to calculate the FLOPs. + +The json output looks like the following. + +```json + +{ + "config_time": "Thu Dec 08 10:39:10 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "epsilon": { + "unit": "", + "value": 1.1920928955078125e-07 + }, + "residual": { + "unit": "", + "value": 0.2635231415430705 + } + }, + "git_commit": "86e0064-dirty", + "name": "FFT", + "results": { + "gflops_avg": { + "unit": "GFLOP/s", + "value": 0.2751268094908118 + }, + "gflops_min": { + "unit": "GFLOP/s", + "value": 0.2813275822966743 + }, + "t_avg": { + "unit": "s", + "value": 0.0008932608220000002 + }, + "t_min": { + "unit": "s", + "value": 0.0008735723600000001 + } + }, + "settings": { + "Batch Size": 100, + "Communication Type": "UNSUPPORTED", + "FFT Size": 4096, + "Kernel File": 
"./bin/fft1d_float_8_emulate.aocx", + "Kernel Replications": 1, + "MPI Ranks": "None", + "Repetitions": 10, + "Test Mode": "No" + }, + "timings": { + "calculation": [ + { + "unit": "s", + "value": 0.090378907 + }, + { + "unit": "s", + "value": 0.089294969 + }, + { + "unit": "s", + "value": 0.08941156 + }, + { + "unit": "s", + "value": 0.089993811 + }, + { + "unit": "s", + "value": 0.087884474 + }, + { + "unit": "s", + "value": 0.087357236 + }, + { + "unit": "s", + "value": 0.089228888 + }, + { + "unit": "s", + "value": 0.089401591 + }, + { + "unit": "s", + "value": 0.089537203 + }, + { + "unit": "s", + "value": 0.090772183 + } + ] + }, + "version": "1.4" +} + +``` diff --git a/GEMM/Readme.md b/GEMM/Readme.md index 831194bd..33f0419b 100755 --- a/GEMM/Readme.md +++ b/GEMM/Readme.md @@ -75,36 +75,43 @@ For execution of the benchmark run: For more information on available input parameters run ./GEMM_intel -h - + Implementation of the GEMM benchmark proposed in the HPCC benchmark adapted for FPGA + Version: 1.3 + + MPI Version: 3.1 + Config. Time: Thu Dec 08 10:39:51 UTC 2022 + Git Commit: 86e0064-dirty + Usage: - ./GEMM_intel [OPTION...] - -Implementation of the GEMM benchmark proposed in the HPCC benchmark adapted for FPGA -Version: 1.0 - -Usage: - bin/GEMM_intel [OPTION...] - - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will - speed up execution and helps when working with special - data types. - --device arg Index of the device that has to be used. If not - given you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there - are multiple platforms available. (default: -1) - -h, --help Print this help - -m, arg Matrix size in number of blocks in a single - dimension (default: 8) - -b, arg Block size in number of values in one dimension - (default: 256) - -r, arg Number of used kernel replications (default: 4) - + ./bin/GEMM_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 4) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -m, arg Matrix size in number of blocks in a single + dimension (default: 8) + -b, arg Block size in number of values in one dimension + (default: 32) + --replicate-inputs Also replicates the input buffer for each kernel + To execute the unit and integration tests run ./GEMM_test_intel -f KERNEL_FILE_NAME @@ -116,16 +123,17 @@ It will run an emulation of the kernel and execute some functionality tests. 
An example output from an emulation is given below: - norm. resid resid machep - 1.45417e-05 4.76837e-05 1.19209e-07 - best mean GFLOPS - 6.89168e-03 6.89168e-03 1.03868e+02 + norm. residual res. error mach. eps + 8.08345e-05 7.62939e-06 1.19209e-07 + + best mean GFLOPS + 6.50672e-03 s 1.06789e-02 s 5.15689e+00 GFLOP/s The first two rows give information about the calculation error. -- `norm. resid`: The normalized residual error based on the used matrix size and used values -- `resid`: The maximum residual error of the calculation -- `machep`: The machine epsilon +- `norm. residual`: The normalized residual error based on the used matrix size and used values +- `res. error`: The maximum residual error of the calculation +- `mach. epsilon`: The machine epsilon The last two columns contain the time measurements and based on that the achieved FLOPS of the calculation. @@ -133,3 +141,106 @@ of the calculation. - `best`: The shortest execution time in all runs - `mean`: Arithmetic mean of all execution times - `GFLOPS`: GFLOPS calculated from the shortest execution time + +The json output looks like the following. + +```json + +{ + "config_time": "Thu Dec 08 10:39:51 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "epsilon": { + "unit": "", + "value": 1.1920928955078125e-07 + }, + "residual": { + "unit": "", + "value": 7.62939453125e-06 + }, + "residual_norm": { + "unit": "", + "value": 8.08345175162664e-05 + } + }, + "git_commit": "86e0064-dirty", + "mpi": { + "subversion": 1, + 
"version": 3 + }, + "name": "GEMM", + "results": { + "gflops": { + "unit": "GFLOP/s", + "value": 5.347517549652832 + }, + "t_mean": { + "unit": "s", + "value": 0.009541589199999999 + }, + "t_min": { + "unit": "s", + "value": 0.006274768 + } + }, + "settings": { + "Communication Type": "UNSUPPORTED", + "Kernel File": "./bin/gemm_base_emulate.aocx", + "Kernel Replications": 4, + "MPI Ranks": 1, + "Matrix Size": 256, + "Repetitions": 10, + "Replicate Inputs": false, + "Test Mode": "No" + }, + "timings": { + "execution": [ + { + "unit": "s", + "value": 0.012631986 + }, + { + "unit": "s", + "value": 0.012796959 + }, + { + "unit": "s", + "value": 0.012527344 + }, + { + "unit": "s", + "value": 0.012579805 + }, + { + "unit": "s", + "value": 0.0064457 + }, + { + "unit": "s", + "value": 0.006274768 + }, + { + "unit": "s", + "value": 0.00642924 + }, + { + "unit": "s", + "value": 0.012808459 + }, + { + "unit": "s", + "value": 0.006587663 + }, + { + "unit": "s", + "value": 0.006333968 + } + ] + }, + "version": "1.3" +} + +``` diff --git a/LINPACK/Readme.md b/LINPACK/Readme.md index a8a07566..62162c43 100644 --- a/LINPACK/Readme.md +++ b/LINPACK/Readme.md @@ -127,14 +127,13 @@ It will run an emulation of the kernel and execute some functionality tests. The host code will print the results of the execution to the standard output. The result summary looks similar to this: - norm. resid resid machep - 3.25054e-08 5.88298e-05 1.19209e-07 - Validation Time: 4.55059e+01 s - Method best mean GFLOPS - total 5.87510e+01 5.87510e+01 2.10546e+04 - GEFA 5.87510e+01 5.87510e+01 2.10541e+04 - GESL 4.70000e-08 4.70000e-08 6.42532e+08 - Validation: SUCCESS! + norm. residual res. error mach. eps + 4.35451e-03 5.96046e-07 1.19209e-07 + + Method best mean GFLOPS + total 1.12152e-01 s 1.16113e-01 s 2.13045e-04 GFLOP/s + GEFA 1.12152e-01 s 1.16113e-01 s 1.94784e-04 GFLOP/s + GESL 2.00000e-08 s 3.97000e-08 s 1.02400e+02 GFLOP/s The first row contains data from the correctness check that is done once when executing the benchmark: @@ -155,3 +154,178 @@ The columns of the table contain the following information: The last row of the output will always contain `Validation: SUCCESS!`, if the norm. residual is below 1. This will be interpreted as successful validation. In this case, the executable will return 0 as exit code, 1 otherwise. + +The json output looks like the following. 
+ +```json + +{ + "config_time": "Thu Dec 08 10:41:13 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "epsilon": { + "unit": "", + "value": 1.1920928955078125e-07 + }, + "residual": { + "unit": "", + "value": 5.960464477539062e-07 + }, + "residual_norm": { + "unit": "", + "value": 0.004354506590071576 + } + }, + "git_commit": "86e0064-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "LINPACK", + "results": { + "gflops": { + "unit": "GFLOP/s", + "value": 0.000213044786995575 + }, + "gflops_lu": { + "unit": "GFLOP/s", + "value": 0.00019478383998887983 + }, + "gflops_sl": { + "unit": "GFLOP/s", + "value": 102.4 + }, + "t_mean": { + "unit": "s", + "value": 0.1161132923 + }, + "t_min": { + "unit": "s", + "value": 0.112151692 + }, + "tlu_mean": { + "unit": "s", + "value": 0.11611325259999998 + }, + "tlu_min": { + "unit": "s", + "value": 0.112151672 + }, + "tsl_mean": { + "unit": "s", + "value": 3.97e-08 + }, + "tsl_min": { + "unit": "s", + "value": 2e-08 + } + }, + "settings": { + "Block Size": 16, + "Communication Type": "IEC", + "Data Type": "cl_float", + "Emulate": false, + "FPGA Torus": { + "P": 1, + "Q": 1 + }, + "Kernel File": "./bin/hpl_torus_IEC_emulate.aocx", + "Kernel Replications": 3, + "MPI Ranks": 1, + "Matrix Size": 32, + "Repetitions": 10, + "Test Mode": "No" + }, + "timings": { + "gefa": [ + { + "unit": "s", + "value": 0.112151672 + }, + { + "unit": "s", + "value": 0.112186842 + }, + { + "unit": "s", + "value": 0.114559183 + }, + { + "unit": "s", 
+ "value": 0.114920089 + }, + { + "unit": "s", + "value": 0.113395783 + }, + { + "unit": "s", + "value": 0.113512676 + }, + { + "unit": "s", + "value": 0.118974459 + }, + { + "unit": "s", + "value": 0.11378015 + }, + { + "unit": "s", + "value": 0.131815478 + }, + { + "unit": "s", + "value": 0.115836194 + } + ], + "gesl": [ + { + "unit": "s", + "value": 2e-08 + }, + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 2.9e-08 + }, + { + "unit": "s", + "value": 1.5e-07 + }, + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 2e-08 + }, + { + "unit": "s", + "value": 2.9e-08 + }, + { + "unit": "s", + "value": 2.9e-08 + }, + { + "unit": "s", + "value": 3e-08 + } + ] + }, + "version": "2.6" +} + +``` diff --git a/PTRANS/README.md b/PTRANS/README.md index 55dfd8c4..9350e1de 100644 --- a/PTRANS/README.md +++ b/PTRANS/README.md @@ -69,58 +69,58 @@ For the execution of the benchmark run: For more information on available input parameters run - $./Transpose_xilinx -h - ------------------------------------------------------------- - General setup: - C++ high resolution clock is used. - The clock precision seems to be 1.00000e+01ns - ------------------------------------------------------------- + ./Transpose_xilinx -h + Implementation of the matrix transposition benchmark proposed in the HPCC benchmark suite for FPGA. Version: 1.7 MPI Version: 3.1 - Config. Time: Fri Mar 04 10:31:13 UTC 2022 - Git Commit: caebda4-dirty + Config. Time: Thu Dec 08 10:41:51 UTC 2022 + Git Commit: 86e0064-dirty Usage: - bin/Transpose_intel [OPTION...] + ./bin/Transpose_intel [OPTION...] - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will speed up execution and helps when working with special data types. - --device arg Index of the device that has to be used. If not + --device arg Index of the device that has to be used. If not given you will be asked which device to use if - there are multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not + there are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not given you will be asked which platform to use if there are multiple platforms available. 
(default: - -1) - -r, arg Number of used kernel replications (default: 2) - --comm-type arg Used communication type for inter-FPGA + 0) + --platform_str arg Name of the platform that has to be used + (default: ) + -r, arg Number of used kernel replications (default: 2) + --comm-type arg Used communication type for inter-FPGA communication (default: AUTO) - --test Only test given configuration and skip execution + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution and validation - -h, --help Print this help - -m, arg Matrix size in number of blocks in one dimension - (default: 8) - -b, arg Block size in number of values in one dimension + -h, --help Print this help + -m, arg Matrix size in number of blocks in one dimension (default: 8) - -p, arg Value of P that equals the width of the PQ grid + -b, arg Block size in number of values in one dimension + (default: 512) + -p, arg Value of P that equals the width of the PQ grid of FPGAs. Q is determined by the world size. (default: 1) - --distribute-buffers Distribute buffers over memory banks. This will + --distribute-buffers Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs. - --handler arg Specify the used data handler that distributes + --handler arg Specify the used data handler that distributes the data over devices and memory banks (default: AUTO) - + Available options for `--comm-type`: - `CPU`: CPU only execution. MKL required. @@ -142,16 +142,12 @@ It will run an emulation of the kernel and execute some functionality tests. An example output from an emulation is given below: - ------------------------------------------------------------- - Validate output... - ------------------------------------------------------------- - Maximum error: 7.62939e-06 < 1.19209e-05 + Maximum error: 1.19209e-07 < 1.19209e-05 Mach. Epsilon: 1.19209e-07 - Validation Time: 4.66312e+00 s - total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s] - avg: 1.15886e+00 1.04112e+00 1.17743e-01 9.11940e+09 1.09433e+11 1.23760e+10 - best: 1.13323e+00 1.02481e+00 1.08424e-01 9.90319e+09 1.18838e+11 1.25730e+10 - Validation: SUCCESS! + + total time transfer time calc time calc FLOPS Memory Bandwidth PCIe Bandwidth + avg: 6.05723e-02 s 1.30980e-02 s 4.74743e-02 s 3.53396e-01 GFLOP/s 4.24075e+00 GB/s 1.53708e+01 GB/s + best: 4.69977e-02 s 1.05343e-02 s 3.64633e-02 s 4.60112e-01 GFLOP/s 5.52134e+00 GB/s 1.91115e+01 GB/s The output gives the average and best calculation time for the transposition and important derived metrics based on these times. For the average and best timings, we have the following columns: @@ -171,3 +167,181 @@ The machine epsilon is given in the row below with `Mach. Epsilon`. Moreover, the total time that was needed for the validation of the result is given, which is just a debug information. The very last column summarizes the result: The last row will show `Validation: SUCCESS!` if the validation succeeded and the error is below the tolerated threshold. +The json output looks like the following. 
+ +```json + +{ + "config_time": "Thu Dec 08 10:41:51 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "epsilon": { + "unit": "", + "value": 1.1920928955078125e-07 + }, + "max_error": { + "unit": "", + "value": 7.62939453125e-06 + } + }, + "git_commit": "86e0064-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "matrix transposition", + "results": { + "avg_calc_flops": { + "unit": "GFLOP/s", + "value": 0.36102157111728794 + }, + "avg_calc_t": { + "unit": "s", + "value": 0.0464715057 + }, + "avg_mem_bandwidth": { + "unit": "GB/s", + "value": 4.332258853407454 + }, + "avg_t": { + "unit": "s", + "value": 0.061001096899999996 + }, + "avg_transfer_bandwidth": { + "unit": "GB/s", + "value": 13.856314966383914 + }, + "avg_transfer_t": { + "unit": "s", + "value": 0.0145295912 + }, + "max_calc_flops": { + "unit": "GFLOP/s", + "value": 0.4431353845559759 + }, + "max_mem_bandwidth": { + "unit": "GB/s", + "value": 5.31762461467171 + }, + "max_transfer_bandwidth": { + "unit": "GB/s", + "value": 17.8236830498358 + }, + "min_calc_t": { + "unit": "s", + "value": 0.037860249 + }, + "min_t": { + "unit": "s", + "value": 0.049155702999999995 + }, + "min_transfer_t": { + "unit": "s", + "value": 0.011295454 + } + }, + "settings": { + "Block Size": 512, + "Communication Type": "PCIE", + "Data Handler": "PQ", + "Dist. 
Buffers": "No", + "Kernel File": "./bin/transpose_PQ_PCIE_emulate.aocx", + "Kernel Replications": 2, + "MPI Ranks": 1, + "Matrix Size": 4096, + "Repetitions": 10, + "Test Mode": "No" + }, + "timings": { + "calculation": [ + { + "unit": "s", + "value": 0.054139988 + }, + { + "unit": "s", + "value": 0.05014593 + }, + { + "unit": "s", + "value": 0.037867809 + }, + { + "unit": "s", + "value": 0.037973641 + }, + { + "unit": "s", + "value": 0.046004999 + }, + { + "unit": "s", + "value": 0.037860249 + }, + { + "unit": "s", + "value": 0.056381497 + }, + { + "unit": "s", + "value": 0.050036547 + }, + { + "unit": "s", + "value": 0.048048414 + }, + { + "unit": "s", + "value": 0.046255983 + } + ], + "transfer": [ + { + "unit": "s", + "value": 0.025985196 + }, + { + "unit": "s", + "value": 0.012733798000000001 + }, + { + "unit": "s", + "value": 0.012989071999999999 + }, + { + "unit": "s", + "value": 0.011295454 + }, + { + "unit": "s", + "value": 0.013326449 + }, + { + "unit": "s", + "value": 0.012952722 + }, + { + "unit": "s", + "value": 0.014228134 + }, + { + "unit": "s", + "value": 0.013149265 + }, + { + "unit": "s", + "value": 0.014597321 + }, + { + "unit": "s", + "value": 0.014038500999999998 + } + ] + }, + "version": "1.7" +} + +``` diff --git a/RandomAccess/README.md b/RandomAccess/README.md index 12e665d7..a852b630 100644 --- a/RandomAccess/README.md +++ b/RandomAccess/README.md @@ -76,6 +76,40 @@ For more information on available input parameters run ./RandomAccess_intel -h + Implementation of the random access benchmark proposed in the HPCC benchmark suite for FPGA. + Version: 2.5 + + MPI Version: 3.1 + Config. Time: Thu Dec 08 10:42:40 UTC 2022 + Git Commit: 86e0064-dirty + + Usage: + ./bin/RandomAccess_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 4) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -d, arg Log2 of the size of the data array (default: 29) + -g, arg Log2 of the number of random number generators + (default: 5) + To execute the unit and integration tests for Intel devices run CL_CONTEXT_EMULATOR_DEVICE=1 ./RandomAccess_test_intel -f KERNEL_FILE_NAME @@ -88,9 +122,10 @@ It will run an emulation of the kernel and execute some functionality tests. The host code will print the results of the execution to the standard output. The result summary looks similar to this: - Error: 9.87137e-03% - best mean GUPS - 1.73506e+01 1.73507e+01 2.47540e-01 + Error: 3.90625e-03 + + best mean GUOPS + 5.04258e-04 s 7.85656e-04 s 2.03071e-03 GUOP/s - `best` and `mean` are the fastest and the mean kernel execution time. 
The pure kernel execution time is measured without transferring the buffer @@ -105,3 +140,98 @@ The result summary looks similar to this: Benchmark results can be found in the `results` folder in this repository. + +The json output looks like the following. + +```json + +{ + "config_time": "Thu Dec 08 10:42:40 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "ratio": { + "unit": "", + "value": 0.00390625 + } + }, + "git_commit": "86e0064-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "random access", + "results": { + "guops": { + "unit": "GUOP/s", + "value": 0.0022880227372259515 + }, + "t_mean": { + "unit": "s", + "value": 0.0005729401999999999 + }, + "t_min": { + "unit": "s", + "value": 0.000447548 + } + }, + "settings": { + "#RNGs": 32, + "Array Size": 256, + "Communication Type": "UNSUPPORTED", + "Kernel File": "./bin/random_access_kernels_single_emulate.aocx", + "Kernel Replications": 4, + "MPI Ranks": 1, + "Repetitions": 10, + "Test Mode": "No" + }, + "timings": { + "execution": [ + { + "unit": "s", + "value": 0.000672612 + }, + { + "unit": "s", + "value": 0.00058854 + }, + { + "unit": "s", + "value": 0.00058064 + }, + { + "unit": "s", + "value": 0.00057064 + }, + { + "unit": "s", + "value": 0.00053845 + }, + { + "unit": "s", + "value": 0.00055827 + }, + { + "unit": "s", + "value": 0.00056768 + }, + { + "unit": "s", + "value": 0.000649792 + }, + { + "unit": "s", + "value": 0.00055523 + }, + { + "unit": "s", + "value": 0.000447548 + } + ] + }, + 
"version": "2.5" +} + +``` diff --git a/STREAM/README.md b/STREAM/README.md index 4c5fa5ff..10980aad 100644 --- a/STREAM/README.md +++ b/STREAM/README.md @@ -73,24 +73,40 @@ For execution of the benchmark run: For more information on available input parameters run $./STREAM_FPGA_intel -h + Implementation of the STREAM benchmark proposed in the HPCC benchmark suite for FPGA. + Version: 2.6 + + MPI Version: 3.1 + Config. Time: Thu Dec 08 10:43:26 UTC 2022 + Git Commit: 86e0064-dirty + Usage: - ./STREAM_FPGA_xilinx [OPTION...] - - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -s, arg Size of the data arrays (default: 134217728) - -r, arg Number of kernel replications used (default: 1) - --multi-kernel Use the legacy multi-kernel implementation - --device arg Index of the device that has to be used. If not given - you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there are - multiple platforms available. (default: -1) - -h, --help Print this help + ./bin/STREAM_FPGA_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 4) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -s, arg Size of the data arrays (default: 134217728) + --multi-kernel Use the legacy multi kernel implementation - To execute the unit and integration tests for Intel devices run CL_CONTEXT_EMULATOR_DEVICE=1 ./STREAM_FPGA_test_intel -f KERNEL_FILE_NAME @@ -102,13 +118,13 @@ It will run an emulation of the kernel and execute some functionality tests. The output of the host application is similar to the original STREAM benchmark: - Function Best Rate MB/s Avg time Min time Max time - Copy: 30875.9 0.025914 0.025910 0.025919 - Scale: 30885.6 0.025905 0.025902 0.025911 - Add: 46289.2 0.025928 0.025924 0.025935 - Triad: 45613.4 0.026310 0.026308 0.026312 - PCI Write: 6324.0 0.189800 0.189753 0.189862 - PCI Read: 5587.3 0.214869 0.214773 0.214943 + Function Best Rate Avg time Min time Max time + PCI_write 2.68152e+04 MB/s 6.36535e-02 s 6.00633e-02 s 8.45139e-02 s + PCI_read 2.47220e+04 MB/s 6.72553e-02 s 6.51490e-02 s 6.82519e-02 s + Copy 4.75583e+04 MB/s 2.32275e-02 s 2.25774e-02 s 2.55071e-02 s + Scale 5.35745e+04 MB/s 2.13423e-02 s 2.00420e-02 s 2.42722e-02 s + Add 5.36221e+04 MB/s 3.33479e-02 s 3.00364e-02 s 3.68116e-02 s + Triad 4.84564e+04 MB/s 3.46477e-02 s 3.32384e-02 s 3.70085e-02 s In addition it also measures the bandwidth of the connection between host and device. 
It is distinguished between writing to and reading from the devices @@ -143,4 +159,428 @@ The raw data of these runs can be found in the folder `csv_result_export`. ![Single precision results](csv_result_export/sp_global_ring_plot.jpeg) ##### Double Precision -![Double precision results](csv_result_export/dp_global_ring_plot.jpeg) \ No newline at end of file +![Double precision results](csv_result_export/dp_global_ring_plot.jpeg) + +```json + +{ + "config_time": "Thu Dec 08 10:43:26 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "a_average_error": { + "unit": "", + "value": 0 + }, + "a_average_relative_error": { + "unit": "", + "value": 0 + }, + "a_expected_value": { + "unit": "", + "value": 1153300692992 + }, + "b_average_error": { + "unit": "", + "value": 0 + }, + "b_average_relative_error": { + "unit": "", + "value": 0 + }, + "b_expected_value": { + "unit": "", + "value": 230660145152 + }, + "c_average_error": { + "unit": "", + "value": 0 + }, + "c_average_relative_error": { + "unit": "", + "value": 0 + }, + "c_expected_value": { + "unit": "", + "value": 307546849280 + }, + "epsilon": { + "unit": "", + "value": 1.1920928955078125e-07 + } + }, + "git_commit": "86e0064-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "STREAM", + "results": { + "Add_avg_t": { + "unit": "s", + "value": 0.033347886300000004 + }, + "Add_best_rate": { + "unit": "MB/s", + "value": 53622.07621998581 + }, + "Add_max_t": { + "unit": "s", + "value": 0.03681156 + }, + "Add_min_t": { 
+ "unit": "s", + "value": 0.030036374 + }, + "Copy_avg_t": { + "unit": "s", + "value": 0.0232275248 + }, + "Copy_best_rate": { + "unit": "MB/s", + "value": 47558.26475478994 + }, + "Copy_max_t": { + "unit": "s", + "value": 0.025507117 + }, + "Copy_min_t": { + "unit": "s", + "value": 0.022577397 + }, + "PCI_read_avg_t": { + "unit": "s", + "value": 0.0672552576 + }, + "PCI_read_best_rate": { + "unit": "MB/s", + "value": 24721.98479896992 + }, + "PCI_read_max_t": { + "unit": "s", + "value": 0.06825187 + }, + "PCI_read_min_t": { + "unit": "s", + "value": 0.065149006 + }, + "PCI_write_avg_t": { + "unit": "s", + "value": 0.0636534559 + }, + "PCI_write_best_rate": { + "unit": "MB/s", + "value": 26815.238093906166 + }, + "PCI_write_max_t": { + "unit": "s", + "value": 0.084513938 + }, + "PCI_write_min_t": { + "unit": "s", + "value": 0.060063339 + }, + "Scale_avg_t": { + "unit": "s", + "value": 0.021342261699999997 + }, + "Scale_best_rate": { + "unit": "MB/s", + "value": 53574.52309080775 + }, + "Scale_max_t": { + "unit": "s", + "value": 0.024272246 + }, + "Scale_min_t": { + "unit": "s", + "value": 0.020042023 + }, + "Triad_avg_t": { + "unit": "s", + "value": 0.0346477169 + }, + "Triad_best_rate": { + "unit": "MB/s", + "value": 48456.4004453886 + }, + "Triad_max_t": { + "unit": "s", + "value": 0.037008534 + }, + "Triad_min_t": { + "unit": "s", + "value": 0.03323839 + } + }, + "settings": { + "Array Size": 134217728, + "Communication Type": "UNSUPPORTED", + "Data Type": "cl_float", + "Kernel File": "./bin/stream_kernels_single_emulate.aocx", + "Kernel Replications": 4, + "Kernel Type": "Single", + "MPI Ranks": 1, + "Repetitions": 10, + "Test Mode": "No" + }, + "timings": { + "Add": [ + { + "unit": "s", + "value": 0.03681156 + }, + { + "unit": "s", + "value": 0.030148826 + }, + { + "unit": "s", + "value": 0.034179315 + }, + { + "unit": "s", + "value": 0.03443528 + }, + { + "unit": "s", + "value": 0.030036374 + }, + { + "unit": "s", + "value": 0.03498338 + }, + { + "unit": "s", + "value": 0.033383682 + }, + { + "unit": "s", + "value": 0.03149675 + }, + { + "unit": "s", + "value": 0.035128302 + }, + { + "unit": "s", + "value": 0.032875394 + } + ], + "Copy": [ + { + "unit": "s", + "value": 0.023277928 + }, + { + "unit": "s", + "value": 0.023061445 + }, + { + "unit": "s", + "value": 0.022577397 + }, + { + "unit": "s", + "value": 0.025507117 + }, + { + "unit": "s", + "value": 0.022904103 + }, + { + "unit": "s", + "value": 0.023076385 + }, + { + "unit": "s", + "value": 0.022585516 + }, + { + "unit": "s", + "value": 0.023018084 + }, + { + "unit": "s", + "value": 0.023126956 + }, + { + "unit": "s", + "value": 0.023140317 + } + ], + "PCI_read": [ + { + "unit": "s", + "value": 0.066263925 + }, + { + "unit": "s", + "value": 0.065149006 + }, + { + "unit": "s", + "value": 0.06823823 + }, + { + "unit": "s", + "value": 0.067614649 + }, + { + "unit": "s", + "value": 0.068157828 + }, + { + "unit": "s", + "value": 0.06825187 + }, + { + "unit": "s", + "value": 0.068159038 + }, + { + "unit": "s", + "value": 0.066694763 + }, + { + "unit": "s", + "value": 0.067605659 + }, + { + "unit": "s", + "value": 0.066417608 + } + ], + "PCI_write": [ + { + "unit": "s", + "value": 0.084513938 + }, + { + "unit": "s", + "value": 0.060253183 + }, + { + "unit": "s", + "value": 0.060325944 + }, + { + "unit": "s", + "value": 0.064254031 + }, + { + "unit": "s", + "value": 0.060529077 + }, + { + "unit": "s", + "value": 0.063792623 + }, + { + "unit": "s", + "value": 0.060357565 + }, + { + "unit": "s", + "value": 0.060063339 + }, + { + "unit": 
"s", + "value": 0.060287283 + }, + { + "unit": "s", + "value": 0.062157576 + } + ], + "Scale": [ + { + "unit": "s", + "value": 0.021235864 + }, + { + "unit": "s", + "value": 0.020608554 + }, + { + "unit": "s", + "value": 0.020822067 + }, + { + "unit": "s", + "value": 0.020042023 + }, + { + "unit": "s", + "value": 0.021288745 + }, + { + "unit": "s", + "value": 0.020088374 + }, + { + "unit": "s", + "value": 0.021096531 + }, + { + "unit": "s", + "value": 0.021525769 + }, + { + "unit": "s", + "value": 0.024272246 + }, + { + "unit": "s", + "value": 0.022442444 + } + ], + "Triad": [ + { + "unit": "s", + "value": 0.037008534 + }, + { + "unit": "s", + "value": 0.036020228 + }, + { + "unit": "s", + "value": 0.033424273 + }, + { + "unit": "s", + "value": 0.033462613 + }, + { + "unit": "s", + "value": 0.033843901 + }, + { + "unit": "s", + "value": 0.033447893 + }, + { + "unit": "s", + "value": 0.03323839 + }, + { + "unit": "s", + "value": 0.036342203 + }, + { + "unit": "s", + "value": 0.03446487 + }, + { + "unit": "s", + "value": 0.035224264 + } + ] + }, + "version": "2.6" +} + +``` diff --git a/b_eff/README.md b/b_eff/README.md index ad2a9c27..157b0a67 100644 --- a/b_eff/README.md +++ b/b_eff/README.md @@ -71,38 +71,51 @@ For execution of the benchmark run: For more information on available input parameters run - $./Network_intel -h + ./Network_intel -h Implementation of the effective bandwidth benchmark proposed in the HPCC benchmark suite for FPGA. Version: 1.3 + MPI Version: 3.1 + Config. Time: Thu Dec 08 10:38:28 UTC 2022 + Git Commit: 86e0064-dirty + Usage: - bin/Network_intel [OPTION...] - - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will - speed up execution and helps when working with special - data types. - --device arg Index of the device that has to be used. If not - given you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there - are multiple platforms available. (default: -1) - -h, --help Print this help - -u, --upper arg Maximum number of repetitions per data size - (default: 32768) - -l, --lower arg Minimum number of repetitions per data size - (default: 1) - --min-size arg Minimum Message Size (default: 0) - -m, arg Maximum message size (default: 20) - -o, arg Offset used before reducing repetitions (default: 1) - -d, arg Number os steps the repetitions are decreased to its - minimum (default: 5) + ./bin/Network_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. 
(default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 2) + --comm-type arg Used communication type for inter-FPGA + communication (default: AUTO) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -u, --upper arg Maximum number of repetitions per data size + (default: 65536) + -l, --lower arg Minimum number of repetitions per data size + (default: 256) + --min-size arg Minimum Message Size (default: 0) + -m, arg Maximum message size (default: 20) + -o, arg Offset used before reducing repetitions (default: + 11) + -d, arg Number of steps the repetitions are decreased to + its minimum (default: 7) - To execute the unit and integration tests run ./Network_test_intel -f KERNEL_FILE_NAME @@ -140,30 +153,12 @@ This might still lead to inaccuracies in the time measurements depending on the The benchmark will output a result table to the standard output after execution. This is an example output using a single rank in emulation: - MSize looplength time B/s - 1 16384 5.46779e-02 5.99292e+05 - 2 8192 5.19651e-02 6.30578e+05 - 4 4096 2.58565e-02 1.26730e+06 - 8 2048 7.51376e-03 4.36107e+06 - 16 1024 3.01288e-03 1.08760e+07 - 32 512 1.66958e-03 1.96265e+07 - 64 256 4.60622e-03 7.11386e+06 - 128 128 1.86568e-03 1.75636e+07 - 256 64 3.75094e-03 8.73594e+06 - 512 32 3.81549e-03 8.58814e+06 - 1024 16 3.44074e-03 9.52354e+06 - 2048 8 3.83420e-03 8.54624e+06 - 4096 4 3.34786e-03 9.78775e+06 - 16384 2 7.84717e-03 8.35154e+06 - 32768 1 7.42386e-03 8.82775e+06 - 65536 1 1.40822e-02 9.30761e+06 - 131072 1 1.28135e-02 2.04585e+07 - 262144 1 5.52680e-02 9.48628e+06 - 524288 1 9.99676e-02 1.04892e+07 - 1048576 1 1.21861e-01 1.72094e+07 - 2097152 1 4.20120e-01 9.98360e+06 - - b_eff = 9.58731e+06 B/s + MSize looplength transfer B/s + 64 5 4.38310e-05 1.46015e+07 + 128 5 7.07010e-05 1.81044e+07 + 256 5 7.73410e-05 3.31002e+07 + + b_eff = 2.19354e+07 B/s The table contains the measurements for all tested message sizes. It is split into the following four columns: @@ -177,4 +172,200 @@ It is possible to set the number of repetitions of the experiment. In this case, the best measured time will be used to calculate the bandwidth. Under the table the calculated effective bandwidth is printed. -It is the mean of the achieved bandwidths for all used message sizes. \ No newline at end of file +It is the mean of the achieved bandwidths for all used message sizes.
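As a worked check of that calculation, the following minimal C++ sketch reproduces the `b_eff` line from the example table above. It is reconstructed from the printed numbers, not taken from the benchmark sources; in particular the factor of 2 is an assumption inferred from the B/s column (each loop iteration apparently counts the message once in each direction).

```cpp
// Sketch: recompute b_eff from the example table (MSize, looplength, transfer).
#include <iostream>
#include <vector>

struct Row { double msize; double looplength; double transfer_s; };

int main() {
    std::vector<Row> rows = {{64, 5, 4.38310e-05},
                             {128, 5, 7.07010e-05},
                             {256, 5, 7.73410e-05}};
    double sum_bw = 0.0;
    for (const auto &r : rows) {
        // B/s column: assumed 2 * MSize * looplength bytes per measured transfer
        sum_bw += 2.0 * r.msize * r.looplength / r.transfer_s;
    }
    // b_eff is the arithmetic mean over all message sizes: ~2.19354e+07 B/s
    std::cout << "b_eff = " << sum_bw / rows.size() << " B/s" << std::endl;
    return 0;
}
```

Averaging the three per-size rates reproduces the `b_eff = 2.19354e+07 B/s` reported above.

+ +The json output looks like the following.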
+ +```json + +{ + "config_time": "Thu Dec 08 10:38:28 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": {}, + "git_commit": "86e0064-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "effective bandwidth", + "results": { + "b_eff": { + "unit": "B/s", + "value": 21935372.01805185 + } + }, + "settings": { + "Communication Type": "IEC", + "Kernel File": "./bin/communication_bw520n_IEC_emulate.aocx", + "Kernel Replications": 2, + "Loop Length": 5, + "MPI Ranks": 1, + "Message Sizes": 2, + "Repetitions": 10, + "Test Mode": "No" + }, + "timings": { + "6": { + "maxCalcBW": 14601537.724441605, + "maxMinCalculationTime": 4.3831e-05, + "timings": [ + { + "looplength": 5, + "messageSize": 6, + "timings": [ + { + "unit": "s", + "value": 0.013389739 + }, + { + "unit": "s", + "value": 6.2761e-05 + }, + { + "unit": "s", + "value": 4.9321e-05 + }, + { + "unit": "s", + "value": 4.3831e-05 + }, + { + "unit": "s", + "value": 4.951e-05 + }, + { + "unit": "s", + "value": 4.7561e-05 + }, + { + "unit": "s", + "value": 5.2311e-05 + }, + { + "unit": "s", + "value": 5.0441e-05 + }, + { + "unit": "s", + "value": 4.6901e-05 + }, + { + "unit": "s", + "value": 5.4401e-05 + } + ] + } + ] + }, + "7": { + "maxCalcBW": 18104411.535904724, + "maxMinCalculationTime": 7.0701e-05, + "timings": [ + { + "looplength": 5, + "messageSize": 7, + "timings": [ + { + "unit": "s", + "value": 0.000104852 + }, + { + "unit": "s", + "value": 0.000125222 + }, + { + "unit": "s", + "value": 7.9731e-05 + }, + { + 
"unit": "s", + "value": 0.000151442 + }, + { + "unit": "s", + "value": 9.3052e-05 + }, + { + "unit": "s", + "value": 0.000193763 + }, + { + "unit": "s", + "value": 8.4472e-05 + }, + { + "unit": "s", + "value": 0.000116562 + }, + { + "unit": "s", + "value": 8.2471e-05 + }, + { + "unit": "s", + "value": 7.0701e-05 + } + ] + } + ] + }, + "8": { + "maxCalcBW": 33100166.79380923, + "maxMinCalculationTime": 7.7341e-05, + "timings": [ + { + "looplength": 5, + "messageSize": 8, + "timings": [ + { + "unit": "s", + "value": 0.000711343 + }, + { + "unit": "s", + "value": 0.000378606 + }, + { + "unit": "s", + "value": 0.000280195 + }, + { + "unit": "s", + "value": 0.000107392 + }, + { + "unit": "s", + "value": 0.000203963 + }, + { + "unit": "s", + "value": 0.000122193 + }, + { + "unit": "s", + "value": 8.2151e-05 + }, + { + "unit": "s", + "value": 8.6861e-05 + }, + { + "unit": "s", + "value": 0.000167473 + }, + { + "unit": "s", + "value": 7.7341e-05 + } + ] + } + ] + } + }, + "version": "1.3" +} + +``` diff --git a/docs/source/FFT/index.rst b/docs/source/FFT/index.rst index 2fda355a..4f54398b 100644 --- a/docs/source/FFT/index.rst +++ b/docs/source/FFT/index.rst @@ -13,6 +13,7 @@ It is possible to specify the size of the FFT and the number of kernel replicati :glob: */index + ../../../FFT/README.md ------------------------ Configuration Parameters diff --git a/docs/source/conf.py b/docs/source/conf.py index 73c3c248..99328fa6 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,6 +43,7 @@ 'sphinx.ext.githubpages', # 'breathe', 'sphinx_rtd_theme', + 'myst_parser' ] # Enable Figure numbering and referencing diff --git a/docs/source/index.rst b/docs/source/index.rst index 13f1de5c..8139915b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -64,6 +64,13 @@ Further optimized implementations that use such device-specific communication ap :glob: */results/index + +.. toctree:: + :maxdepth: 1 + :caption: Benchmark Results: + :glob: + + ../../../*/README.md ---------- diff --git a/docs/source/technical_support/json_output/available_keys.csv b/docs/source/technical_support/json_output/available_keys.csv deleted file mode 100644 index 070a21da..00000000 --- a/docs/source/technical_support/json_output/available_keys.csv +++ /dev/null @@ -1,58 +0,0 @@ -Benchmark,timings,results,settings -:ref:`FFT `,calculation,gflops_avg,FFT Size -,,gflops_min,Batch Size -,,t_avg, -,,t_min, -:ref:`GEMM `,execution,gflops,Matrix Size -,,t_mean,Replicate Inputs -,,t_min, -:ref:`LINPACK `,gefa,gflops,Matrix Size -,gesl,gflops_lu,Block Size -,,gflops_sl,Emulate -,,t_mean,Data Type -,,t_min,FPGA Torus -,,tlu_mean, -,,tlu_min, -,,tsl_mean, -,,tsl_min, -:ref:`PTRANS `,calculation,avg_calc_flops,Matrix Size -,transfer,avg_calc_t,Block Size -,,avg_mem_bandwidth,Dist. 
Buffers -,,avg_t,Data Handler -,,avg_transfer_bandwidth, -,,avg_transfer_t, -,,max_calc_flops, -,,max_mem_bandwidth, -,,max_transfer_bandwidth, -,,min_calc_t, -,,min_t, -,,min_transfer_t, -:ref:`RandomAccess `,execution,guops,Array Size -,,t_mean,#RNGs -,,t_min, -:ref:`STREAM `,Add,Add_avg_t,Data Type -,,Add_best_rate,Array Size -,,Add_max_t,Kernel Type -,,Add_min_t, -,,Copy_avg_t, -,,Copy_best_rate, -,,Copy_max_t, -,,Copy_min_t, -,PCI_read,PCI_read_avg_t, -,,PCI_read_best_rate, -,,PCI_read_max_t, -,,PCI_read_min_t, -,PCI_write,PCI_write_avg_t, -,,PCI_write_best_rate, -,,PCI_write_max_t, -,,PCI_write_min_t, -,Scale,Scale_avg_t, -,,Scale_best_rate, -,,Scale_max_t, -,,Scale_min_t, -,Triad,Triad_avg_t, -,,Triad_best_rate, -,,Triad_max_t, -,,Triad_min_t, -:ref:`b_eff `,**special syntax - see below**,b_eff,Loop Length -,,,Message Sizes diff --git a/docs/source/technical_support/json_output/index.rst b/docs/source/technical_support/json_output/index.rst index 37aa1f68..08ca9ab7 100644 --- a/docs/source/technical_support/json_output/index.rst +++ b/docs/source/technical_support/json_output/index.rst @@ -35,13 +35,7 @@ When enabled, this creates a json file which will have some information for all "version": "1.3" } -If a benchmark has more settings, they will be added to the settings-key. Every benchmark can track different categories of timings and different results. The following table shows which keys are available for which benchmark. - -.. csv-table:: Available keys - :file: available_keys.csv - :header-rows: 1 - :class: longtable - :widths: 1 1 1 1 +If a benchmark has more settings, they will be added to the settings-key. Every benchmark can track different categories of timings, different results and errors. To see a full example and which keys are available, have a look at the README.md of the individual benchmarks in the [git repository](https://git.uni-paderborn.de/pc2/HPCC_FPGA). The results and timings are in a special format, which consists of the value and the unit. @@ -218,67 +212,3 @@ The timings are a vector of all the timings which were measured, expect for b_ef ] } } - -A full example for FFT looks like this. - -.. 
code-block:: javascript - - { - "config_time": "Mon Dec 05 17:39:57 UTC 2022", - "device": "Intel(R) FPGA Emulation Device", - "environment": { - "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" - }, - "git_commit": "c7f3890-dirty", - "name": "FFT", - "results": { - "gflops_avg": { - "unit": "GFLOP/s", - "value": 0.27772734580591407 - }, - "gflops_min": { - "unit": "GFLOP/s", - "value": 0.28466663597913383 - }, - "t_avg": { - "unit": "s", - "value": 0.0008848966575 - }, - "t_min": { - "unit": "s", - "value": 0.00086332562 - } - }, - "settings": { - "Batch Size": 100, - "Communication Type": "UNSUPPORTED", - "FFT Size": 4096, - "Kernel File": "fft1d_float_8_emulate.aocx", - "Kernel Replications": 1, - "MPI Ranks": "None", - "Repetitions": 4, - "Test Mode": "No" - }, - "timings": { - "calculation": [ - { - "unit": "s", - "value": 0.090789326 - }, - { - "unit": "s", - "value": 0.086332562 - }, - { - "unit": "s", - "value": 0.090089428 - }, - { - "unit": "s", - "value": 0.086747347 - } - ] - }, - "version": "1.4" - } - From b2769e51412d8ba6dff20f9bfd50c49586919cfd Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 12 Dec 2022 10:33:31 +0100 Subject: [PATCH 223/318] only print on rank0 --- FFT/src/host/fft_benchmark.cpp | 12 ++- GEMM/src/host/gemm_benchmark.cpp | 20 +++-- LINPACK/src/host/linpack_benchmark.cpp | 38 +++++---- PTRANS/src/host/transpose_benchmark.cpp | 57 +++++++------- .../src/host/random_access_benchmark.cpp | 6 +- RandomAccess/tests/test_host_code.cpp | 2 +- STREAM/src/host/stream_benchmark.cpp | 78 ++++++++++--------- 7 files changed, 118 insertions(+), 95 
deletions(-) diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index fde7c01c..c7042add 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -118,10 +118,12 @@ fft::FFTBenchmark::collectResults() { void fft::FFTBenchmark::printResults() { + if (mpi_comm_rank == 0) { std::cout << std::setw(ENTRY_SPACE) << " " << std::left << std::setw(ENTRY_SPACE) << " avg" << std::setw(ENTRY_SPACE) << " best" << std::right << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << results.at("t_avg") << results.at("t_min") << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << results.at("gflops_avg") << results.at("gflops_min") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "Time in s: " << results.at("t_avg") << results.at("t_min") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "GFLOPS: " << results.at("gflops_avg") << results.at("gflops_min") << std::endl; + } } std::unique_ptr @@ -168,8 +170,10 @@ fft::FFTBenchmark::validateOutput(fft::FFTData &data) { } void fft::FFTBenchmark::printError() { - std::cout << std::left << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; - std::cout << errors.at("residual") << errors.at("epsilon") << std::endl << std::endl; + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; + std::cout << errors.at("residual") << errors.at("epsilon") << std::endl << std::endl; + } } diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp index 21b8fd99..5f6dfec6 100644 --- a/GEMM/src/host/gemm_benchmark.cpp +++ b/GEMM/src/host/gemm_benchmark.cpp @@ -140,13 +140,15 @@ gemm::GEMMBenchmark::collectResults() { void gemm::GEMMBenchmark::printResults() { - std::cout << std::left << std::setw(ENTRY_SPACE) - << " best" << std::setw(ENTRY_SPACE) << " mean" - << std::setw(ENTRY_SPACE) << " GFLOPS" << std::right << std::endl; + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) + << " best" << std::setw(ENTRY_SPACE) << " mean" + << std::setw(ENTRY_SPACE) << " GFLOPS" << std::right << std::endl; - std::cout << std::setw(ENTRY_SPACE) - << results.at("t_min") << results.at("t_mean") << results.at("gflops") - << std::endl; + std::cout << std::setw(ENTRY_SPACE) + << results.at("t_min") << results.at("t_mean") << results.at("gflops") + << std::endl; + } } std::unique_ptr @@ -206,8 +208,10 @@ gemm::GEMMBenchmark::validateOutput(gemm::GEMMData &data) { void gemm::GEMMBenchmark::printError() { - std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; - std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. 
eps" << std::right << std::endl; + std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; + } } void diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index ec3d900e..66265a04 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -185,23 +185,25 @@ linpack::LinpackBenchmark::collectResults() { void linpack::LinpackBenchmark::printResults() { - std::cout << std::left << std::setw(ENTRY_SPACE) << " Method" - << std::setw(ENTRY_SPACE) << " best" - << std::setw(ENTRY_SPACE) << " mean" - << std::setw(ENTRY_SPACE) << " GFLOPS" - << std::endl; - - std::cout << std::left << std::setw(ENTRY_SPACE) << " total" - << results.at("t_min") << results.at("t_mean") << results.at("gflops") - << std::endl; - - std::cout << std::left << std::setw(ENTRY_SPACE) << " GEFA" - << results.at("tlu_min") << results.at("tlu_mean") << results.at("gflops_lu") + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) << " Method" + << std::setw(ENTRY_SPACE) << " best" + << std::setw(ENTRY_SPACE) << " mean" + << std::setw(ENTRY_SPACE) << " GFLOPS" << std::endl; - std::cout << std::left << std::setw(ENTRY_SPACE) << " GESL" - << results.at("tsl_min") << results.at("tsl_mean") << results.at("gflops_sl") - << std::right << std::endl; + std::cout << std::left << std::setw(ENTRY_SPACE) << " total" + << results.at("t_min") << results.at("t_mean") << results.at("gflops") + << std::endl; + + std::cout << std::left << std::setw(ENTRY_SPACE) << " GEFA" + << results.at("tlu_min") << results.at("tlu_mean") << results.at("gflops_lu") + << std::endl; + + std::cout << std::left << std::setw(ENTRY_SPACE) << " GESL" + << results.at("tsl_min") << results.at("tsl_mean") << results.at("gflops_sl") + << std::right << std::endl; + } } std::unique_ptr @@ -431,8 +433,10 @@ linpack::LinpackBenchmark::validateOutput(linpack::LinpackData &data) { void linpack::LinpackBenchmark::printError() { - std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; - std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. 
eps" << std::right << std::endl; + std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; + } } void diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index e0e45c11..782b4680 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -134,30 +134,32 @@ transpose::TransposeBenchmark::collectResults() { void transpose::TransposeBenchmark::printResults() { - std::cout << std::setw(ENTRY_SPACE) << " " - << std::left << std::setw(ENTRY_SPACE) << "total time" - << std::setw(ENTRY_SPACE) << "transfer time" - << std::setw(ENTRY_SPACE) << "calc time" - << std::setw(ENTRY_SPACE) << "calc FLOPS" - << std::setw(ENTRY_SPACE) << "Memory Bandwidth" - << std::setw(ENTRY_SPACE) << "PCIe Bandwidth" - << std::right << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "avg: " - << results.at("avg_t") - << results.at("avg_transfer_t") - << results.at("avg_calc_t") - << results.at("avg_calc_flops") - << results.at("avg_mem_bandwidth") - << results.at("avg_transfer_bandwidth") - << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "best: " - << results.at("min_t") - << results.at("min_transfer_t") - << results.at("min_calc_t") - << results.at("max_calc_flops") - << results.at("max_mem_bandwidth") - << results.at("max_transfer_bandwidth") - << std::endl; + if (mpi_comm_rank == 0) { + std::cout << std::setw(ENTRY_SPACE) << " " + << std::left << std::setw(ENTRY_SPACE) << "total time" + << std::setw(ENTRY_SPACE) << "transfer time" + << std::setw(ENTRY_SPACE) << "calc time" + << std::setw(ENTRY_SPACE) << "calc FLOPS" + << std::setw(ENTRY_SPACE) << "Memory Bandwidth" + << std::setw(ENTRY_SPACE) << "PCIe Bandwidth" + << std::right << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "avg: " + << results.at("avg_t") + << results.at("avg_transfer_t") + << results.at("avg_calc_t") + << results.at("avg_calc_flops") + << results.at("avg_mem_bandwidth") + << results.at("avg_transfer_bandwidth") + << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "best: " + << results.at("min_t") + << results.at("min_transfer_t") + << results.at("min_calc_t") + << results.at("max_calc_flops") + << results.at("max_mem_bandwidth") + << results.at("max_transfer_bandwidth") + << std::endl; + } } std::unique_ptr @@ -188,9 +190,10 @@ transpose::TransposeBenchmark::validateOutput(transpose::TransposeData &data) { void transpose::TransposeBenchmark::printError() { - std::cout << "Maximum error: " << errors.at("epsilon") << " < " << 100 * errors.at("epsilon").value << std::endl; - std::cout << "Mach. Epsilon: " << errors.at("epsilon") << std::endl; - + if (mpi_comm_rank == 0) { + std::cout << "Maximum error: " << errors.at("epsilon") << " < " << 100 * errors.at("epsilon").value << std::endl; + std::cout << "Mach. 
Epsilon: " << errors.at("epsilon") << std::endl; + } } void diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp index 2c5c449b..512ab354 100644 --- a/RandomAccess/src/host/random_access_benchmark.cpp +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -122,6 +122,7 @@ random_access::RandomAccessBenchmark::collectResults() { } void random_access::RandomAccessBenchmark::printResults() { + if (mpi_comm_rank == 0) { std::cout << std::left << std::setw(ENTRY_SPACE) << "best" << std::setw(ENTRY_SPACE) << "mean" << std::setw(ENTRY_SPACE) << "GUOPS" << std::right << std::endl; @@ -130,6 +131,7 @@ void random_access::RandomAccessBenchmark::printResults() { << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean") << std::setw(ENTRY_SPACE) << results.at("guops") << std::endl; + } } bool @@ -218,5 +220,7 @@ random_access::RandomAccessBenchmark::validateOutput(random_access::RandomAccess void random_access::RandomAccessBenchmark::printError() { - std::cout << "Error: " << errors.at("ratio") << std::endl; + if (mpi_comm_rank == 0) { + std::cout << "Error: " << errors.at("ratio") << std::endl; + } } diff --git a/RandomAccess/tests/test_host_code.cpp b/RandomAccess/tests/test_host_code.cpp index 675c6979..59d1a27c 100644 --- a/RandomAccess/tests/test_host_code.cpp +++ b/RandomAccess/tests/test_host_code.cpp @@ -53,6 +53,6 @@ TEST_F(RandomAccessHostCodeTest, ValidDataSizeAreDetected) { TEST_F(RandomAccessHostCodeTest, ResultValidationWorksForWrongUpdates) { auto data = bm->generateInputData(); // check correctness of random accesses - EXPECT_TRUE(bm->validateOutput(*data)); + EXPECT_FALSE(bm->validateOutput(*data)); bm->printError(); } diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index bdba29e8..e8328dc8 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -143,19 +143,21 @@ stream::StreamBenchmark::collectResults() { void stream::StreamBenchmark::printResults() { - std::cout << std::left << std::setw(ENTRY_SPACE) << "Function"; - std::cout << std::setw(ENTRY_SPACE) << "Best Rate"; - std::cout << std::setw(ENTRY_SPACE) << "Avg time"; - std::cout << std::setw(ENTRY_SPACE) << "Min time" ; - std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::right << std::endl; - - for (auto key : keys) { - std::cout << std::left << std::setw(ENTRY_SPACE) << key - << results.at(key + "_best_rate") - << results.at(key + "_avg_t") - << results.at(key + "_min_t") - << results.at(key + "_max_t") - << std::right << std::endl; + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) << "Function"; + std::cout << std::setw(ENTRY_SPACE) << "Best Rate"; + std::cout << std::setw(ENTRY_SPACE) << "Avg time"; + std::cout << std::setw(ENTRY_SPACE) << "Min time" ; + std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::right << std::endl; + + for (auto key : keys) { + std::cout << std::left << std::setw(ENTRY_SPACE) << key + << results.at(key + "_best_rate") + << results.at(key + "_avg_t") + << results.at(key + "_min_t") + << results.at(key + "_max_t") + << std::right << std::endl; + } } } @@ -275,30 +277,32 @@ stream::StreamBenchmark::validateOutput(stream::StreamData &data) { void stream::StreamBenchmark::printError() { - int err = 0; - double epsilon = errors.at("epsilon").value; - if (errors.at("a_average_relative_error").value > epsilon) { - err++; - printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", 
errors.at("epsilon").value); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("a_expected_value").value, errors.at("a_average_error").value, errors.at("a_average_relative_error").value); - printf(" For array a[], %d errors were found.\n", errors.at("a_error_count")); - } + if (mpi_comm_rank == 0) { + int err = 0; + double epsilon = errors.at("epsilon").value; + if (errors.at("a_average_relative_error").value > epsilon) { + err++; + printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("a_expected_value").value, errors.at("a_average_error").value, errors.at("a_average_relative_error").value); + printf(" For array a[], %d errors were found.\n", errors.at("a_error_count")); + } - if (errors.at("b_average_relative_error").value > epsilon) { - err++; - printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("b_expected_value").value, errors.at("b_average_error").value, errors.at("b_average_relative_error").value); - printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); - printf(" For array b[], %d errors were found.\n", errors.at("b_error_count").value); - } - if (errors.at("c_average_relative_error").value > epsilon) { - err++; - printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("c_expected_value").value, errors.at("c_average_error").value, errors.at("c_average_relative_error").value); - printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); - printf(" For array c[], %d errors were found.\n", errors.at("c_error_count").value); - } - if (err == 0) { - printf ("Solution Validates: avg error less than %e on all three arrays\n", errors.at("epsilon").value); + if (errors.at("b_average_relative_error").value > epsilon) { + err++; + printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("b_expected_value").value, errors.at("b_average_error").value, errors.at("b_average_relative_error").value); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); + printf(" For array b[], %d errors were found.\n", errors.at("b_error_count").value); + } + if (errors.at("c_average_relative_error").value > epsilon) { + err++; + printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("c_expected_value").value, errors.at("c_average_error").value, errors.at("c_average_relative_error").value); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); + printf(" For array c[], %d errors were found.\n", errors.at("c_error_count").value); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n", errors.at("epsilon").value); + } } } From c3dcd9b80605df5d621f743503b19aa44099c8b1 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 12 Dec 2022 17:04:40 +0100 Subject: [PATCH 224/318] add more settings to SettingsMap --- FFT/src/host/fft_benchmark.cpp | 2 +- FFT/src/host/fft_benchmark.hpp | 6 ---- GEMM/src/host/gemm_benchmark.cpp | 4 +-- GEMM/src/host/gemm_benchmark.hpp | 6 ---- 
LINPACK/src/host/linpack_benchmark.cpp | 1 + PTRANS/src/host/transpose_data.cpp | 11 +++--- .../src/host/random_access_benchmark.cpp | 2 -- .../src/host/random_access_benchmark.hpp | 6 ---- STREAM/src/host/stream_benchmark.cpp | 2 -- STREAM/src/host/stream_benchmark.hpp | 6 ---- shared/include/hpcc_benchmark.hpp | 35 +++++++++++++++++-- 11 files changed, 43 insertions(+), 38 deletions(-) diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index c7042add..3e783685 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -44,7 +44,7 @@ fft::FFTProgramSettings::getSettingsMap() { auto map = hpcc_base::BaseSettings::getSettingsMap(); map["FFT Size"] = std::to_string(1 << LOG_FFT_SIZE); map["Batch Size"] = std::to_string(iterations); - map["Kernel Replications"] = std::to_string(kernelReplications); + map["Inverse"] = inverse ? "Yes" : "No"; return map; } diff --git a/FFT/src/host/fft_benchmark.hpp b/FFT/src/host/fft_benchmark.hpp index 33ee832a..6307275a 100644 --- a/FFT/src/host/fft_benchmark.hpp +++ b/FFT/src/host/fft_benchmark.hpp @@ -55,12 +55,6 @@ class FFTProgramSettings : public hpcc_base::BaseSettings { */ bool inverse; - /** - * @brief The number of used kernel replications - * - */ - uint kernelReplications; - /** * @brief Construct a new FFT Program Settings object * diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp index 5f6dfec6..21896b4d 100644 --- a/GEMM/src/host/gemm_benchmark.cpp +++ b/GEMM/src/host/gemm_benchmark.cpp @@ -35,7 +35,7 @@ SOFTWARE. #include "parameters.h" gemm::GEMMProgramSettings::GEMMProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), - matrixSize(results["b"].as() * results["m"].as()), blockSize(results["b"].as()), kernelReplications(results["r"].as()), + matrixSize(results["b"].as() * results["m"].as()), blockSize(results["b"].as()), replicateInputBuffers(results["replicate-inputs"].count() > 0) { } @@ -44,7 +44,7 @@ std::map gemm::GEMMProgramSettings::getSettingsMap() { auto map = hpcc_base::BaseSettings::getSettingsMap(); map["Matrix Size"] = std::to_string(matrixSize); - map["Kernel Replications"] = std::to_string(kernelReplications); + map["Block Size"] = std::to_string(blockSize); map["Replicate Inputs"] = replicateInputBuffers ? "Yes" : "No"; return map; } diff --git a/GEMM/src/host/gemm_benchmark.hpp b/GEMM/src/host/gemm_benchmark.hpp index c77a212f..a17a29f7 100644 --- a/GEMM/src/host/gemm_benchmark.hpp +++ b/GEMM/src/host/gemm_benchmark.hpp @@ -71,12 +71,6 @@ class GEMMProgramSettings : public hpcc_base::BaseSettings { */ uint blockSize; - /** - * @brief Number of times the kernel is replicated - * - */ - uint kernelReplications; - /** * @brief If True, replicate input buffers for each kernel replication */ diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index 66265a04..ce4fba22 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -58,6 +58,7 @@ linpack::LinpackProgramSettings::getSettingsMap() { map["Matrix Size"] = std::to_string(matrixSize); map["Block Size"] = std::to_string(blockSize); map["Emulate"] = (isEmulationKernel) ? "Yes" : "No"; + map["Diagonally Dominant"] = isDiagonallyDominant ? 
"Yes" : "No"; map["Data Type"] = STR(HOST_DATA_TYPE); map["FPGA Torus"] = "P=" + std::to_string(torus_width) + ", Q=" + std::to_string(torus_height); return map; diff --git a/PTRANS/src/host/transpose_data.cpp b/PTRANS/src/host/transpose_data.cpp index af794f30..3e8e2ba4 100644 --- a/PTRANS/src/host/transpose_data.cpp +++ b/PTRANS/src/host/transpose_data.cpp @@ -26,14 +26,17 @@ transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResu std::map transpose::TransposeProgramSettings::getSettingsMap() { auto map = hpcc_base::BaseSettings::getSettingsMap(); - int mpi_size; -#ifdef _USE_MPI_ - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); -#endif + int mpi_comm_size; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size); + // calculate the row and column of the MPI rank in the torus + if (mpi_comm_size % p != 0) { + throw std::runtime_error("MPI Comm size not dividable by P=" + std::to_string(p) + "!"); + } map["Matrix Size"] = std::to_string(matrixSize); map["Block Size"] = std::to_string(blockSize); map["Dist. Buffers"] = distributeBuffers ? "Yes" : "No"; map["Data Handler"] = transpose::data_handler::handlerToString(dataHandlerIdentifier); + map["FPGA Torus"] = "P=" + std::to_string(p) + " ,Q=" + std::to_string(mpi_comm_size / p); return map; } diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp index 512ab354..1fe1142a 100644 --- a/RandomAccess/src/host/random_access_benchmark.cpp +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -36,7 +36,6 @@ SOFTWARE. random_access::RandomAccessProgramSettings::RandomAccessProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), dataSize((1UL << results["d"].as())), - kernelReplications(results["r"].as()), numRngs((1UL << results["g"].as())) { } @@ -51,7 +50,6 @@ random_access::RandomAccessProgramSettings::getSettingsMap() { std::stringstream ss; ss << dataSize << " (" << static_cast(dataSize * sizeof(HOST_DATA_TYPE) * mpi_size) << " Byte )"; map["Array Size"] = ss.str(); - map["Kernel Replications"] = std::to_string(kernelReplications); map["#RNGs"] = std::to_string(numRngs); return map; } diff --git a/RandomAccess/src/host/random_access_benchmark.hpp b/RandomAccess/src/host/random_access_benchmark.hpp index 3a1eebaa..0bbd02e2 100644 --- a/RandomAccess/src/host/random_access_benchmark.hpp +++ b/RandomAccess/src/host/random_access_benchmark.hpp @@ -50,12 +50,6 @@ class RandomAccessProgramSettings : public hpcc_base::BaseSettings { */ size_t dataSize; - /** - * @brief The number of used kernel replications - * - */ - uint kernelReplications; - /** * @brief Number of random number generators that are used per kernel replication * diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index e8328dc8..a47ab743 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -36,7 +36,6 @@ SOFTWARE. stream::StreamProgramSettings::StreamProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), streamArraySize(results["s"].as()), - kernelReplications(results["r"].as()), useSingleKernel(!static_cast(results.count("multi-kernel"))) { } @@ -48,7 +47,6 @@ stream::StreamProgramSettings::getSettingsMap() { std::stringstream ss; ss << streamArraySize << " (" << static_cast(streamArraySize * sizeof(HOST_DATA_TYPE)) << " Byte )"; map["Array Size"] = ss.str(); - map["Kernel Replications"] = std::to_string(kernelReplications); map["Kernel Type"] = (useSingleKernel ? 
"Single" : "Separate"); return map; } diff --git a/STREAM/src/host/stream_benchmark.hpp b/STREAM/src/host/stream_benchmark.hpp index 50f24b88..638868da 100644 --- a/STREAM/src/host/stream_benchmark.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -52,12 +52,6 @@ class StreamProgramSettings : public hpcc_base::BaseSettings { */ uint streamArraySize; - /** - * @brief The number of used kernel replications - * - */ - uint kernelReplications; - /** * @brief Indicator if the single kernel or the legacy kernel are used for execution * diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index a2f7de95..766fbc13 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -200,7 +200,11 @@ class BaseSettings { } return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)}, {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? "Yes" : "No"}, - {"Communication Type", commToString(communicationType)}}; + {"Communication Type", commToString(communicationType)} +#ifdef INTEL_FPGA + ,{"Memory Interleaving", useMemoryInterleaving ? "Yes" : "No"} +#endif + }; } }; @@ -275,6 +279,19 @@ class ExecutionSettings { } return device_name; } + + /* + std::string + getPlatformName() const { + std::string platform_name; + if (!programSettings->testOnly) { + platform->getInfo(CL_PLATFORM_NAME, &platform_name); + } else { + platform_name = "TEST RUN: Not selected!"; + } + return platform_name; + } + */ }; @@ -585,6 +602,15 @@ class HpccFpgaBenchmark { j["Q"] = stoi(q_str.substr(q_str.find("=") + 1, q_str.size())); return j; } + + std::string + getCurrentTime() { + time_t time = std::time(0); + const tm *local_time = std::localtime(&time); + std::ostringstream oss; + oss << std::put_time(local_time, "%a %b %d %T %Z %Y"); + return oss.str(); + } std::map jsonifySettingsMap(std::map settings_map) { @@ -598,7 +624,7 @@ class HpccFpgaBenchmark { } catch (std::invalid_argument const &ex) { if (key == "FPGA Torus") { j[key] = parseFPGATorusString(value); - } else if (key == "Emulate" || key == "Replicate Inputs") { + } else if (key == "Emulate" || key == "Test Mode" || key == "Memory Interleaving" || key == "Replicate Inputs" || key == "Inverse" || key == "Diagonally Dominant" || "Dist. 
Buffers") { j[key] = value == "Yes"; } else { j[key] = value; @@ -621,6 +647,7 @@ class HpccFpgaBenchmark { dump["mpi"] ={{"version", MPI_VERSION}, {"subversion", MPI_SUBVERSION}}; #endif dump["config_time"] = CONFIG_TIME; + dump["execution_time"] = getCurrentTime(); dump["git_commit"] = GIT_COMMIT_HASH; dump["version"] = VERSION; dump["device"] = executionSettings->getDeviceName(); @@ -765,7 +792,9 @@ class HpccFpgaBenchmark { if (!executionSettings->programSettings->skipValidation) { auto eval_start = std::chrono::high_resolution_clock::now(); validateSuccess = validateOutput(*data); - printError(); + if (mpi_comm_rank == 0) { + printError(); + } std::chrono::duration eval_time = std::chrono::high_resolution_clock::now() - eval_start; if (mpi_comm_rank == 0) { From 911bc0569c2fad4b4e0ceba79c30150686dec233 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 14 Dec 2022 10:12:56 +0100 Subject: [PATCH 225/318] add validate flag to json-dump --- shared/include/hpcc_benchmark.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 766fbc13..081b7a72 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -373,6 +373,13 @@ class HpccFpgaBenchmark { */ std::map errors; + /** + * @brief This flag indicates whether the validation was successful + * + */ + bool validated = false; + + public: /** @@ -655,6 +662,7 @@ class HpccFpgaBenchmark { dump["timings"] = getTimingsJson(); dump["results"] = getResultsJson(); dump["errors"] = getErrorsJson(); + dump["validated"] = validated; dump["environment"] = getEnvironmentMap(); fs << dump; @@ -773,7 +781,6 @@ class HpccFpgaBenchmark { << HLINE; } - bool validateSuccess = false; auto exe_start = std::chrono::high_resolution_clock::now(); executeKernel(*data); @@ -791,7 +798,7 @@ class HpccFpgaBenchmark { if (!executionSettings->programSettings->skipValidation) { auto eval_start = std::chrono::high_resolution_clock::now(); - validateSuccess = validateOutput(*data); + validated = validateOutput(*data); if (mpi_comm_rank == 0) { printError(); } @@ -811,7 +818,7 @@ class HpccFpgaBenchmark { printResults(); - if (!validateSuccess) { + if (!validated) { std::cerr << HLINE << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" 
<< std::endl; } else { @@ -819,7 +826,7 @@ class HpccFpgaBenchmark { } } - return validateSuccess; + return validated; } catch (const std::exception& e) { std::cerr << "An error occured while executing the benchmark: " << std::endl; From 778f729f3a364ac9aa4d4a40e19d7bd95bb28800 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 14 Dec 2022 10:13:31 +0100 Subject: [PATCH 226/318] set both config and execution time to UTC --- shared/include/hpcc_benchmark.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 081b7a72..8262fa80 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -613,9 +613,9 @@ class HpccFpgaBenchmark { std::string getCurrentTime() { time_t time = std::time(0); - const tm *local_time = std::localtime(&time); + const tm *utc_time = std::gmtime(&time); std::ostringstream oss; - oss << std::put_time(local_time, "%a %b %d %T %Z %Y"); + oss << std::put_time(utc_time, "%a %b %d %T UTC %Y"); return oss.str(); } From bdfdf4ee0532fac79ea3e2b564487ed57aef3589 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 14 Dec 2022 11:01:52 +0100 Subject: [PATCH 227/318] remove empty unit from errors --- FFT/src/host/fft_benchmark.cpp | 8 +-- GEMM/src/host/gemm_benchmark.cpp | 10 ++-- LINPACK/src/host/linpack_benchmark.cpp | 10 ++-- PTRANS/src/host/transpose_benchmark.cpp | 6 +- .../src/host/random_access_benchmark.cpp | 4 +- STREAM/src/host/stream_benchmark.cpp | 56 +++++++++---------- b_eff/src/host/network_benchmark.cpp | 4 +- shared/include/hpcc_benchmark.hpp | 15 +---- 8 files changed, 51 insertions(+), 62 deletions(-) diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index 3e783685..8610b121 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -163,16 +163,16 @@ fft::FFTBenchmark::validateOutput(fft::FFTData &data) { double error = residual_max / (std::numeric_limits::epsilon() * LOG_FFT_SIZE); - errors.emplace("residual", hpcc_base::HpccResult(error, "")); - errors.emplace("epsilon", hpcc_base::HpccResult(std::numeric_limits::epsilon(), "")); + errors.emplace("residual", error); + errors.emplace("epsilon", std::numeric_limits::epsilon()); return error < 1.0; } void fft::FFTBenchmark::printError() { if (mpi_comm_rank == 0) { - std::cout << std::left << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; - std::cout << errors.at("residual") << errors.at("epsilon") << std::endl << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "res. error" << std::setw(ENTRY_SPACE) << "mach. 
eps" << std::endl; + std::cout << std::setw(ENTRY_SPACE) << errors.at("residual") << std::setw(ENTRY_SPACE) << errors.at("epsilon") << std::endl << std::endl; } } diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp index 21896b4d..f624ea57 100644 --- a/GEMM/src/host/gemm_benchmark.cpp +++ b/GEMM/src/host/gemm_benchmark.cpp @@ -196,9 +196,9 @@ gemm::GEMMBenchmark::validateOutput(gemm::GEMMData &data) { double eps = std::numeric_limits::epsilon(); double residn = resid / (executionSettings->programSettings->matrixSize*executionSettings->programSettings->matrixSize*ref_data->normtotal*normx*eps); - errors.emplace("epsilon", hpcc_base::HpccResult(eps, "")); - errors.emplace("residual", hpcc_base::HpccResult(resid, "")); - errors.emplace("residual_norm", hpcc_base::HpccResult(residn, "")); + errors.emplace("epsilon", eps); + errors.emplace("residual", resid); + errors.emplace("residual_norm", residn); return residn < 1.0; } @@ -209,8 +209,8 @@ gemm::GEMMBenchmark::validateOutput(gemm::GEMMData &data) { void gemm::GEMMBenchmark::printError() { if (mpi_comm_rank == 0) { - std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; - std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::endl; + std::cout << std::setw(ENTRY_SPACE) << errors.at("residual_norm") << std::setw(ENTRY_SPACE) << errors.at("residual") << std::setw(ENTRY_SPACE) << errors.at("epsilon") << std::endl; } } diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index ce4fba22..16d35e02 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -421,9 +421,9 @@ linpack::LinpackBenchmark::validateOutput(linpack::LinpackData &data) { } #endif - errors.emplace("epsilon", hpcc_base::HpccResult(eps, "")); - errors.emplace("residual", hpcc_base::HpccResult(resid, "")); - errors.emplace("residual_norm", hpcc_base::HpccResult(residn, "")); + errors.emplace("epsilon", eps); + errors.emplace("residual", resid); + errors.emplace("residual_norm", residn); if (mpi_comm_rank == 0) { return residn < 1; @@ -435,8 +435,8 @@ linpack::LinpackBenchmark::validateOutput(linpack::LinpackData &data) { void linpack::LinpackBenchmark::printError() { if (mpi_comm_rank == 0) { - std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; - std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. 
eps" << std::endl; + std::cout << std::setw(ENTRY_SPACE) << errors.at("residual_norm") << std::setw(ENTRY_SPACE) << errors.at("residual") << std::setw(ENTRY_SPACE) << errors.at("epsilon") << std::endl; } } diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 782b4680..213f6c7e 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -182,8 +182,8 @@ transpose::TransposeBenchmark::validateOutput(transpose::TransposeData &data) { double global_max_error = 0; MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - errors.emplace("epsilon", hpcc_base::HpccResult(std::numeric_limits::epsilon(), "")); - errors.emplace("max_error", hpcc_base::HpccResult(global_max_error, "")); + errors.emplace("epsilon", std::numeric_limits::epsilon()); + errors.emplace("max_error", global_max_error); return static_cast(global_max_error) < 100 * std::numeric_limits::epsilon(); } @@ -191,7 +191,7 @@ transpose::TransposeBenchmark::validateOutput(transpose::TransposeData &data) { void transpose::TransposeBenchmark::printError() { if (mpi_comm_rank == 0) { - std::cout << "Maximum error: " << errors.at("epsilon") << " < " << 100 * errors.at("epsilon").value << std::endl; + std::cout << "Maximum error: " << errors.at("epsilon") << " < " << 100 * errors.at("epsilon") << std::endl; std::cout << "Mach. Epsilon: " << errors.at("epsilon") << std::endl; } } diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp index 1fe1142a..94c63d0a 100644 --- a/RandomAccess/src/host/random_access_benchmark.cpp +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -201,7 +201,7 @@ random_access::RandomAccessBenchmark::validateOutput(random_access::RandomAccess // The overall error is calculated in percent of the overall array size double error_ratio = static_cast(error_count) / (executionSettings->programSettings->dataSize * mpi_comm_size); - errors.emplace("ratio", hpcc_base::HpccResult(error_ratio, "")); + errors.emplace("ratio", error_ratio); #ifdef _USE_MPI_ if (mpi_comm_rank == 0 && mpi_comm_size > 1) { @@ -219,6 +219,6 @@ random_access::RandomAccessBenchmark::validateOutput(random_access::RandomAccess void random_access::RandomAccessBenchmark::printError() { if (mpi_comm_rank == 0) { - std::cout << "Error: " << errors.at("ratio") << std::endl; + std::cout << "Error: " << errors.at("ratio") * 100 << " %" << std::endl; } } diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index a47ab743..f0cc01f3 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -223,20 +223,20 @@ stream::StreamBenchmark::validateOutput(stream::StreamData &data) { bool success = true; if (mpi_comm_rank == 0) { - errors.emplace("a_expected_value", hpcc_base::HpccResult(aj, "")); - errors.emplace("a_average_error", hpcc_base::HpccResult(aAvgErr, "")); - errors.emplace("a_average_relative_error", hpcc_base::HpccResult(abs(aAvgErr)/aj, "")); + errors.emplace("a_expected", aj); + errors.emplace("a_average_error", aAvgErr); + errors.emplace("a_average_relative_error", abs(aAvgErr)/aj); - errors.emplace("b_expected_value", hpcc_base::HpccResult(bj, "")); - errors.emplace("b_average_error", hpcc_base::HpccResult(bAvgErr, "")); - errors.emplace("b_average_relative_error", hpcc_base::HpccResult(abs(bAvgErr)/bj, "")); + errors.emplace("b_expected", bj); + errors.emplace("b_average_error", bAvgErr); + 
errors.emplace("b_average_relative_error", abs(bAvgErr)/bj); - errors.emplace("c_expected_value", hpcc_base::HpccResult(cj, "")); - errors.emplace("c_average_error", hpcc_base::HpccResult(cAvgErr, "")); - errors.emplace("c_average_relative_error", hpcc_base::HpccResult(abs(cAvgErr)/cj, "")); + errors.emplace("c_expected", cj); + errors.emplace("c_average_error", cAvgErr); + errors.emplace("c_average_relative_error", abs(cAvgErr)/cj); epsilon = std::numeric_limits::epsilon(); - errors.emplace("epsilon", hpcc_base::HpccResult(epsilon, "")); + errors.emplace("epsilon", epsilon); if (abs(aAvgErr/aj) > epsilon) { success = false; @@ -246,7 +246,7 @@ stream::StreamBenchmark::validateOutput(stream::StreamData &data) { ierr++; } } - errors.emplace("a_error_count", hpcc_base::HpccResult(ierr, "")); + errors.emplace("a_error_count", ierr); ierr = 0; } if (abs(bAvgErr/bj) > epsilon) { @@ -257,7 +257,7 @@ stream::StreamBenchmark::validateOutput(stream::StreamData &data) { ierr++; } } - errors.emplace("b_error_count", hpcc_base::HpccResult(ierr, "")); + errors.emplace("b_error_count", ierr); } if (abs(cAvgErr/cj) > epsilon) { success = false; @@ -267,7 +267,7 @@ stream::StreamBenchmark::validateOutput(stream::StreamData &data) { ierr++; } } - errors.emplace("b_error_count", hpcc_base::HpccResult(ierr, "")); + errors.emplace("b_error_count", ierr); } } return success; @@ -277,30 +277,30 @@ void stream::StreamBenchmark::printError() { if (mpi_comm_rank == 0) { int err = 0; - double epsilon = errors.at("epsilon").value; - if (errors.at("a_average_relative_error").value > epsilon) { + double epsilon = errors.at("epsilon"); + if (errors.at("a_average_relative_error") > epsilon) { err++; - printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("a_expected_value").value, errors.at("a_average_error").value, errors.at("a_average_relative_error").value); + printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon")); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("a_expected"), errors.at("a_average_error"), errors.at("a_average_relative_error")); printf(" For array a[], %d errors were found.\n", errors.at("a_error_count")); } - if (errors.at("b_average_relative_error").value > epsilon) { + if (errors.at("b_average_relative_error") > epsilon) { err++; - printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("b_expected_value").value, errors.at("b_average_error").value, errors.at("b_average_relative_error").value); - printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); - printf(" For array b[], %d errors were found.\n", errors.at("b_error_count").value); + printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon")); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("b_expected"), errors.at("b_average_error"), errors.at("b_average_relative_error")); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon")); + printf(" For array b[], %d errors were found.\n", errors.at("b_error_count")); } - if (errors.at("c_average_relative_error").value > epsilon) { + if (errors.at("c_average_relative_error") > epsilon) { err++; - printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); - printf(" 
Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("c_expected_value").value, errors.at("c_average_error").value, errors.at("c_average_relative_error").value); - printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); - printf(" For array c[], %d errors were found.\n", errors.at("c_error_count").value); + printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon")); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("c_expected"), errors.at("c_average_error"), errors.at("c_average_relative_error")); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon")); + printf(" For array c[], %d errors were found.\n", errors.at("c_error_count")); } if (err == 0) { - printf ("Solution Validates: avg error less than %e on all three arrays\n", errors.at("epsilon").value); + printf ("Solution Validates: avg error less than %e on all three arrays\n", errors.at("epsilon")); } } } diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 2986465d..dbe1f610 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -250,7 +250,7 @@ network::NetworkBenchmark::validateOutput(network::NetworkData &data) { } } if (error_count > 0) { - errors.emplace(std::to_string(item.messageSize), hpcc_base::HpccResult(error_count, "")); + errors.emplace(std::to_string(item.messageSize), error_count); } total_error += error_count; } @@ -262,7 +262,7 @@ network::NetworkBenchmark::validateOutput(network::NetworkData &data) { void network::NetworkBenchmark::printError() { for (const auto& error: errors) { - std::cerr << "Validation data invalid for message size " << (1 << stoi(error.first)) << " in " << int(error.second.value) << " cases!" << std::endl; + std::cerr << "Validation data invalid for message size " << (1 << stoi(error.first)) << " in " << int(error.second) << " cases!" 
<< std::endl; } } diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 8262fa80..69da2bfe 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -371,7 +371,7 @@ class HpccFpgaBenchmark { * @brief map containing the errors of the benchmark * */ - std::map errors; + std::map errors; /** * @brief This flag indicates whether the validation was successful @@ -581,17 +581,6 @@ class HpccFpgaBenchmark { return results_string; } - std::map getErrorsJson() { - std::map errors_string; - for (auto const &error: errors) { - json j; - j["unit"] = error.second.unit; - j["value"] = error.second.value; - errors_string[error.first] = j; - } - return errors_string; - } - std::map getEnvironmentMap() { std::map env; @@ -661,7 +650,7 @@ class HpccFpgaBenchmark { dump["settings"] = jsonifySettingsMap(executionSettings->programSettings->getSettingsMap()); dump["timings"] = getTimingsJson(); dump["results"] = getResultsJson(); - dump["errors"] = getErrorsJson(); + dump["errors"] = errors; dump["validated"] = validated; dump["environment"] = getEnvironmentMap(); From 86b9642e89cb46bab084bdbb075f7817953c178e Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 14 Dec 2022 11:02:38 +0100 Subject: [PATCH 228/318] update READMEs --- FFT/README.md | 55 ++++---- GEMM/{Readme.md => README.md} | 54 ++++--- LINPACK/{Readme.md => README.md} | 80 +++++------ PTRANS/README.md | 136 +++++------------- RandomAccess/README.md | 43 +++--- STREAM/README.md | 234 ++++++++++++++----------------- b_eff/README.md | 86 ++++++------ 7 files changed, 287 insertions(+), 401 deletions(-) rename GEMM/{Readme.md => README.md} (92%) rename LINPACK/{Readme.md => README.md} (90%) diff --git a/FFT/README.md b/FFT/README.md index 2926a5ac..52269e94 100644 --- a/FFT/README.md +++ b/FFT/README.md @@ -130,95 +130,92 @@ The json output looks like the following. 
```json { - "config_time": "Thu Dec 08 10:39:10 UTC 2022", + "config_time": "Wed Dec 14 08:40:17 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": { - "epsilon": { - "unit": "", - "value": 1.1920928955078125e-07 - }, - "residual": { - "unit": "", - "value": 0.2635231415430705 - } + "epsilon": 1.1920928955078125e-07, + "residual": 0.2635231415430705 }, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 08:55:51 GMT 2022", + "git_commit": "be1a4e9-dirty", "name": "FFT", "results": { "gflops_avg": { "unit": "GFLOP/s", - "value": 0.2751268094908118 + "value": 0.2573525536079919 }, "gflops_min": { "unit": "GFLOP/s", - "value": 0.2813275822966743 + "value": 0.2842073122577159 }, "t_avg": { "unit": "s", - "value": 0.0008932608220000002 + "value": 0.0009549545810000001 }, "t_min": { "unit": "s", - "value": 0.0008735723600000001 + "value": 0.00086472089 } }, "settings": { "Batch Size": 100, - "Communication Type": "UNSUPPORTED", + "Communication Type": false, "FFT Size": 4096, - "Kernel File": "./bin/fft1d_float_8_emulate.aocx", + "Inverse": false, + "Kernel File": false, "Kernel Replications": 1, - "MPI Ranks": "None", + "MPI Ranks": false, "Repetitions": 10, - "Test Mode": "No" + "Test Mode": false }, "timings": { - "calculation": [ + "execution": [ { "unit": "s", - "value": 0.090378907 + "value": 0.151814849 }, { "unit": "s", - "value": 0.089294969 + "value": 0.086472089 }, { "unit": "s", - "value": 0.08941156 + "value": 0.089654183 }, { "unit": "s", - "value": 0.089993811 + "value": 
0.09003793 }, { "unit": "s", - "value": 0.087884474 + "value": 0.089870966 }, { "unit": "s", - "value": 0.087357236 + "value": 0.089802216 }, { "unit": "s", - "value": 0.089228888 + "value": 0.089816195 }, { "unit": "s", - "value": 0.089401591 + "value": 0.089979618 }, { "unit": "s", - "value": 0.089537203 + "value": 0.090762352 }, { "unit": "s", - "value": 0.090772183 + "value": 0.086744183 } ] }, + "validated": true, "version": "1.4" } diff --git a/GEMM/Readme.md b/GEMM/README.md similarity index 92% rename from GEMM/Readme.md rename to GEMM/README.md index 33f0419b..8ac117df 100755 --- a/GEMM/Readme.md +++ b/GEMM/README.md @@ -147,26 +147,18 @@ The json output looks like the following. ```json { - "config_time": "Thu Dec 08 10:39:51 UTC 2022", + "config_time": "Wed Dec 14 08:40:52 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": { - "epsilon": { - "unit": "", - "value": 1.1920928955078125e-07 - }, - "residual": { - "unit": "", - "value": 7.62939453125e-06 - }, - "residual_norm": { - "unit": "", - "value": 8.08345175162664e-05 - } + "epsilon": 1.1920928955078125e-07, + "residual": 7.62939453125e-06, + "residual_norm": 8.08345175162664e-05 }, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 09:14:09 UTC 2022", + "git_commit": "be1a4e9-dirty", "mpi": { "subversion": 1, "version": 3 @@ -175,71 +167,73 @@ The json output looks like the following. 
"results": { "gflops": { "unit": "GFLOP/s", - "value": 5.347517549652832 + "value": 5.297554068962992 }, "t_mean": { "unit": "s", - "value": 0.009541589199999999 + "value": 0.010202154299999999 }, "t_min": { "unit": "s", - "value": 0.006274768 + "value": 0.006333948 } }, "settings": { - "Communication Type": "UNSUPPORTED", - "Kernel File": "./bin/gemm_base_emulate.aocx", + "Block Size": 32, + "Communication Type": false, + "Kernel File": false, "Kernel Replications": 4, "MPI Ranks": 1, "Matrix Size": 256, "Repetitions": 10, "Replicate Inputs": false, - "Test Mode": "No" + "Test Mode": false }, "timings": { "execution": [ { "unit": "s", - "value": 0.012631986 + "value": 0.012732567 }, { "unit": "s", - "value": 0.012796959 + "value": 0.006511861 }, { "unit": "s", - "value": 0.012527344 + "value": 0.006333948 }, { "unit": "s", - "value": 0.012579805 + "value": 0.012710817 }, { "unit": "s", - "value": 0.0064457 + "value": 0.006552662 }, { "unit": "s", - "value": 0.006274768 + "value": 0.006600733 }, { "unit": "s", - "value": 0.00642924 + "value": 0.012673167 }, { "unit": "s", - "value": 0.012808459 + "value": 0.012720237 }, { "unit": "s", - "value": 0.006587663 + "value": 0.012608296 }, { "unit": "s", - "value": 0.006333968 + "value": 0.012577255 } ] }, + "validated": true, "version": "1.3" } diff --git a/LINPACK/Readme.md b/LINPACK/README.md similarity index 90% rename from LINPACK/Readme.md rename to LINPACK/README.md index 62162c43..7135b511 100644 --- a/LINPACK/Readme.md +++ b/LINPACK/README.md @@ -160,26 +160,18 @@ The json output looks like the following. ```json { - "config_time": "Thu Dec 08 10:41:13 UTC 2022", + "config_time": "Wed Dec 14 08:41:58 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/
10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": { - "epsilon": { - "unit": "", - "value": 1.1920928955078125e-07 - }, - "residual": { - "unit": "", - "value": 5.960464477539062e-07 - }, - "residual_norm": { - "unit": "", - "value": 0.004354506590071576 - } + "epsilon": 1.1920928955078125e-07, + "residual": 5.960464477539062e-07, + "residual_norm": 0.004354506590071576 }, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 09:20:49 UTC 2022", + "git_commit": "be1a4e9-dirty", "mpi": { "subversion": 1, "version": 3 @@ -188,104 +180,105 @@ The json output looks like the following. "results": { "gflops": { "unit": "GFLOP/s", - "value": 0.000213044786995575 + "value": 0.0006047108051562395 }, "gflops_lu": { "unit": "GFLOP/s", - "value": 0.00019478383998887983 + "value": 0.0005528788702090362 }, "gflops_sl": { "unit": "GFLOP/s", - "value": 102.4 + "value": 68.26666666666668 }, "t_mean": { "unit": "s", - "value": 0.1161132923 + "value": 0.041533081799999996 }, "t_min": { "unit": "s", - "value": 0.112151692 + "value": 0.039512 }, "tlu_mean": { "unit": "s", - "value": 0.11611325259999998 + "value": 0.041533051599999996 }, "tlu_min": { "unit": "s", - "value": 0.112151672 + "value": 0.03951197 }, "tsl_mean": { "unit": "s", - "value": 3.97e-08 + "value": 3.019999999999999e-08 }, "tsl_min": { "unit": "s", - "value": 2e-08 + "value": 3e-08 } }, "settings": { "Block Size": 16, - "Communication Type": "IEC", - "Data Type": "cl_float", + "Communication Type": false, + "Data Type": false, + "Diagonally Dominant": true, "Emulate": false, "FPGA Torus": { "P": 1, "Q": 1 }, - "Kernel File": "./bin/hpl_torus_IEC_emulate.aocx", + "Kernel File": false, "Kernel Replications": 3, "MPI Ranks": 1, "Matrix Size": 32, "Repetitions": 10, - "Test Mode": "No" + "Test Mode": false }, "timings": { "gefa": [ { "unit": "s", - "value": 0.112151672 + "value": 0.040978706 }, { "unit": "s", - "value": 0.112186842 + "value": 0.041104108 }, { "unit": "s", - "value": 0.114559183 + "value": 0.040878394 }, { "unit": "s", - "value": 0.114920089 + "value": 0.040391036 }, { "unit": "s", - "value": 0.113395783 + "value": 0.044723132 }, { "unit": "s", - "value": 0.113512676 + "value": 0.03951197 }, { "unit": "s", - "value": 0.118974459 + "value": 0.043374308 }, { "unit": "s", - "value": 0.11378015 + "value": 0.04179909 }, { "unit": "s", - "value": 0.131815478 + "value": 0.041162129 }, { "unit": "s", - "value": 0.115836194 + "value": 0.041407643 } ], "gesl": [ { "unit": "s", - "value": 2e-08 + "value": 3e-08 }, { "unit": "s", @@ -297,11 +290,11 @@ The json output looks like the following. }, { "unit": "s", - "value": 2.9e-08 + "value": 3e-08 }, { "unit": "s", - "value": 1.5e-07 + "value": 3e-08 }, { "unit": "s", @@ -309,15 +302,15 @@ The json output looks like the following. }, { "unit": "s", - "value": 2e-08 + "value": 3.1e-08 }, { "unit": "s", - "value": 2.9e-08 + "value": 3.1e-08 }, { "unit": "s", - "value": 2.9e-08 + "value": 3e-08 }, { "unit": "s", @@ -325,6 +318,7 @@ The json output looks like the following. } ] }, + "validated": true, "version": "2.6" } diff --git a/PTRANS/README.md b/PTRANS/README.md index 9350e1de..521389a0 100644 --- a/PTRANS/README.md +++ b/PTRANS/README.md @@ -172,22 +172,17 @@ The json output looks like the following. 
```json { - "config_time": "Thu Dec 08 10:41:51 UTC 2022", + "config_time": "Wed Dec 14 08:42:29 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { - "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + "LD_LIBRARY_PATH": 
"/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": { - "epsilon": { - "unit": "", - "value": 1.1920928955078125e-07 - }, - "max_error": { - "unit": "", - "value": 7.62939453125e-06 - } + "epsilon": 1.1920928955078125e-07, + "max_error": 199.96849060058594 }, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 09:57:30 UTC 2022", + "git_commit": "be1a4e9-dirty", "mpi": { "subversion": 1, "version": 3 @@ -196,151 +191,84 @@ The json output looks like the following. 
"results": { "avg_calc_flops": { "unit": "GFLOP/s", - "value": 0.36102157111728794 + "value": 0.011002914958427963 }, "avg_calc_t": { "unit": "s", - "value": 0.0464715057 + "value": 1.524797389 }, "avg_mem_bandwidth": { "unit": "GB/s", - "value": 4.332258853407454 + "value": 0.13203497950113555 }, "avg_t": { "unit": "s", - "value": 0.061001096899999996 + "value": 1.5332141689999998 }, "avg_transfer_bandwidth": { "unit": "GB/s", - "value": 13.856314966383914 + "value": 23.919669042080226 }, "avg_transfer_t": { "unit": "s", - "value": 0.0145295912 + "value": 0.00841678 }, "max_calc_flops": { "unit": "GFLOP/s", - "value": 0.4431353845559759 + "value": 0.011002914958427963 }, "max_mem_bandwidth": { "unit": "GB/s", - "value": 5.31762461467171 + "value": 0.13203497950113555 }, "max_transfer_bandwidth": { "unit": "GB/s", - "value": 17.8236830498358 + "value": 23.919669042080226 }, "min_calc_t": { "unit": "s", - "value": 0.037860249 + "value": 1.524797389 }, "min_t": { "unit": "s", - "value": 0.049155702999999995 + "value": 1.5332141689999998 }, "min_transfer_t": { "unit": "s", - "value": 0.011295454 + "value": 0.00841678 } }, "settings": { "Block Size": 512, - "Communication Type": "PCIE", - "Data Handler": "PQ", - "Dist. Buffers": "No", - "Kernel File": "./bin/transpose_PQ_PCIE_emulate.aocx", + "Communication Type": false, + "Data Handler": false, + "Dist. Buffers": false, + "FPGA Torus": { + "P": 1, + "Q": 3 + }, + "Kernel File": false, "Kernel Replications": 2, - "MPI Ranks": 1, + "MPI Ranks": 3, "Matrix Size": 4096, - "Repetitions": 10, - "Test Mode": "No" + "Repetitions": 1, + "Test Mode": false }, "timings": { "calculation": [ { "unit": "s", - "value": 0.054139988 - }, - { - "unit": "s", - "value": 0.05014593 - }, - { - "unit": "s", - "value": 0.037867809 - }, - { - "unit": "s", - "value": 0.037973641 - }, - { - "unit": "s", - "value": 0.046004999 - }, - { - "unit": "s", - "value": 0.037860249 - }, - { - "unit": "s", - "value": 0.056381497 - }, - { - "unit": "s", - "value": 0.050036547 - }, - { - "unit": "s", - "value": 0.048048414 - }, - { - "unit": "s", - "value": 0.046255983 + "value": 1.523696949 } ], "transfer": [ { "unit": "s", - "value": 0.025985196 - }, - { - "unit": "s", - "value": 0.012733798000000001 - }, - { - "unit": "s", - "value": 0.012989071999999999 - }, - { - "unit": "s", - "value": 0.011295454 - }, - { - "unit": "s", - "value": 0.013326449 - }, - { - "unit": "s", - "value": 0.012952722 - }, - { - "unit": "s", - "value": 0.014228134 - }, - { - "unit": "s", - "value": 0.013149265 - }, - { - "unit": "s", - "value": 0.014597321 - }, - { - "unit": "s", - "value": 0.014038500999999998 + "value": 0.008189295 } ] }, + "validated": false, "version": "1.7" } diff --git a/RandomAccess/README.md b/RandomAccess/README.md index a852b630..ede6a47d 100644 --- a/RandomAccess/README.md +++ b/RandomAccess/README.md @@ -146,18 +146,16 @@ The json output looks like the following. 
```json { - "config_time": "Thu Dec 08 10:42:40 UTC 2022", + "config_time": "Wed Dec 14 08:43:07 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": { - "ratio": { - "unit": "", - "value": 0.00390625 - } + "ratio": 0.00390625 }, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 09:54:47 UTC 2022", + "git_commit": "be1a4e9-dirty", "mpi": { "subversion": 1, "version": 3 @@ -166,71 +164,72 @@ The json output looks like the following. 
"results": { "guops": { "unit": "GUOP/s", - "value": 0.0022880227372259515 + "value": 0.0021329867229908477 }, "t_mean": { "unit": "s", - "value": 0.0005729401999999999 + "value": 0.0005428726000000001 }, "t_min": { "unit": "s", - "value": 0.000447548 + "value": 0.000480078 } }, "settings": { "#RNGs": 32, "Array Size": 256, - "Communication Type": "UNSUPPORTED", - "Kernel File": "./bin/random_access_kernels_single_emulate.aocx", + "Communication Type": false, + "Kernel File": false, "Kernel Replications": 4, "MPI Ranks": 1, "Repetitions": 10, - "Test Mode": "No" + "Test Mode": false }, "timings": { "execution": [ { "unit": "s", - "value": 0.000672612 + "value": 0.000643471 }, { "unit": "s", - "value": 0.00058854 + "value": 0.000516849 }, { "unit": "s", - "value": 0.00058064 + "value": 0.000606361 }, { "unit": "s", - "value": 0.00057064 + "value": 0.00058182 }, { "unit": "s", - "value": 0.00053845 + "value": 0.00060401 }, { "unit": "s", - "value": 0.00055827 + "value": 0.000485259 }, { "unit": "s", - "value": 0.00056768 + "value": 0.000484699 }, { "unit": "s", - "value": 0.000649792 + "value": 0.00053713 }, { "unit": "s", - "value": 0.00055523 + "value": 0.000489049 }, { "unit": "s", - "value": 0.000447548 + "value": 0.000480078 } ] }, + "validated": true, "version": "2.5" } diff --git a/STREAM/README.md b/STREAM/README.md index 10980aad..298777b3 100644 --- a/STREAM/README.md +++ b/STREAM/README.md @@ -164,54 +164,25 @@ The raw data of these runs can be found in the folder `csv_result_export`. ```json { - "config_time": "Thu Dec 08 10:43:26 UTC 2022", + "config_time": "Wed Dec 14 08:43:42 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/I
ntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": { - "a_average_error": { - "unit": "", - "value": 0 - }, - "a_average_relative_error": { - "unit": "", - "value": 0 - }, - "a_expected_value": { - "unit": "", - "value": 1153300692992 - }, - "b_average_error": { - "unit": "", - "value": 0 - }, - "b_average_relative_error": { - "unit": "", - "value": 0 - }, - "b_expected_value": { - "unit": "", - "value": 230660145152 - }, - "c_average_error": { - "unit": "", - "value": 0 - }, - "c_average_relative_error": { - "unit": "", - "value": 0 - }, - "c_expected_value": { - "unit": "", - "value": 307546849280 - }, - "epsilon": { - "unit": "", - "value": 1.1920928955078125e-07 - } + "a_average_error": 0, + "a_average_relative_error": 0, + "a_expected": 1153300692992, + "b_average_error": 0, + "b_average_relative_error": 0, + "b_expected": 230660145152, + "c_average_error": 0, + "c_average_relative_error": 0, + "c_expected": 307546849280, + "epsilon": 1.1920928955078125e-07 }, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 09:29:17 UTC 2022", + "git_commit": "be1a4e9-dirty", "mpi": { "subversion": 1, "version": 3 @@ -220,366 +191,367 @@ The raw data of these runs can be found in the folder `csv_result_export`. "results": { "Add_avg_t": { "unit": "s", - "value": 0.033347886300000004 + "value": 0.0530118015 }, "Add_best_rate": { "unit": "MB/s", - "value": 53622.07621998581 + "value": 30506.44534004568 }, "Add_max_t": { "unit": "s", - "value": 0.03681156 + "value": 0.053374228 }, "Add_min_t": { "unit": "s", - "value": 0.030036374 + "value": 0.052795818 }, "Copy_avg_t": { "unit": "s", - "value": 0.0232275248 + "value": 0.0389517071 }, "Copy_best_rate": { "unit": "MB/s", - "value": 47558.26475478994 + "value": 27731.67753145461 }, "Copy_max_t": { "unit": "s", - "value": 0.025507117 + "value": 0.040187928 }, "Copy_min_t": { "unit": "s", - "value": 0.022577397 + "value": 0.038718964 }, "PCI_read_avg_t": { "unit": "s", - "value": 0.0672552576 + "value": 0.0597715322 }, "PCI_read_best_rate": { "unit": "MB/s", - "value": 24721.98479896992 + "value": 27479.82304062059 }, "PCI_read_max_t": { "unit": "s", - "value": 0.06825187 + "value": 0.063351971 }, "PCI_read_min_t": { "unit": "s", - "value": 0.065149006 + "value": 0.058610739 }, "PCI_write_avg_t": { "unit": "s", - "value": 0.0636534559 + "value": 0.0685080558 }, "PCI_write_best_rate": { "unit": "MB/s", - "value": 26815.238093906166 + "value": 25765.843668891466 }, "PCI_write_max_t": { "unit": "s", - "value": 0.084513938 + "value": 0.120777629 }, "PCI_write_min_t": { "unit": "s", - "value": 0.060063339 + "value": 0.062509606 }, "Scale_avg_t": { "unit": "s", - "value": 0.021342261699999997 + "value": 0.03978323250000001 }, "Scale_best_rate": { "unit": "MB/s", - "value": 53574.52309080775 + "value": 27084.469403573872 }, "Scale_max_t": { "unit": "s", - "value": 0.024272246 + "value": 0.039983335 }, "Scale_min_t": { "unit": "s", - "value": 0.020042023 + "value": 0.039644189 }, "Triad_avg_t": { "unit": "s", - "value": 0.0346477169 + "value": 0.052600337100000005 }, "Triad_best_rate": { "unit": "MB/s", - "value": 48456.4004453886 + "value": 30701.997665172144 }, "Triad_max_t": { "unit": "s", - "value": 0.037008534 + "value": 0.052735936 }, "Triad_min_t": { "unit": "s", - "value": 0.03323839 + "value": 0.052459542 } }, "settings": { "Array Size": 134217728, - "Communication Type": "UNSUPPORTED", - "Data Type": "cl_float", - "Kernel File": 
"./bin/stream_kernels_single_emulate.aocx", + "Communication Type": false, + "Data Type": false, + "Kernel File": false, "Kernel Replications": 4, - "Kernel Type": "Single", + "Kernel Type": false, "MPI Ranks": 1, "Repetitions": 10, - "Test Mode": "No" + "Test Mode": false }, "timings": { "Add": [ { "unit": "s", - "value": 0.03681156 + "value": 0.052848008 }, { "unit": "s", - "value": 0.030148826 + "value": 0.052795818 }, { "unit": "s", - "value": 0.034179315 + "value": 0.053294617 }, { "unit": "s", - "value": 0.03443528 + "value": 0.053374228 }, { "unit": "s", - "value": 0.030036374 + "value": 0.052812528 }, { "unit": "s", - "value": 0.03498338 + "value": 0.053091652 }, { "unit": "s", - "value": 0.033383682 + "value": 0.052962381 }, { "unit": "s", - "value": 0.03149675 + "value": 0.052992892 }, { "unit": "s", - "value": 0.035128302 + "value": 0.052880469 }, { "unit": "s", - "value": 0.032875394 + "value": 0.053065422 } ], "Copy": [ { "unit": "s", - "value": 0.023277928 + "value": 0.040187928 }, { "unit": "s", - "value": 0.023061445 + "value": 0.038718964 }, { "unit": "s", - "value": 0.022577397 + "value": 0.038728084 }, { "unit": "s", - "value": 0.025507117 + "value": 0.038760534 }, { "unit": "s", - "value": 0.022904103 + "value": 0.038793734 }, { "unit": "s", - "value": 0.023076385 + "value": 0.039005018 }, { "unit": "s", - "value": 0.022585516 + "value": 0.038862845 }, { "unit": "s", - "value": 0.023018084 + "value": 0.038731043 }, { "unit": "s", - "value": 0.023126956 + "value": 0.038891176 }, { "unit": "s", - "value": 0.023140317 + "value": 0.038837745 } ], "PCI_read": [ { "unit": "s", - "value": 0.066263925 + "value": 0.058610739 }, { "unit": "s", - "value": 0.065149006 + "value": 0.059211539 }, { "unit": "s", - "value": 0.06823823 + "value": 0.059094178 }, { "unit": "s", - "value": 0.067614649 + "value": 0.063351971 }, { "unit": "s", - "value": 0.068157828 + "value": 0.059738369 }, { "unit": "s", - "value": 0.06825187 + "value": 0.059645487 }, { "unit": "s", - "value": 0.068159038 + "value": 0.059697218 }, { "unit": "s", - "value": 0.066694763 + "value": 0.059381852 }, { "unit": "s", - "value": 0.067605659 + "value": 0.059468254 }, { "unit": "s", - "value": 0.066417608 + "value": 0.059515715 } ], "PCI_write": [ { "unit": "s", - "value": 0.084513938 + "value": 0.120777629 }, { "unit": "s", - "value": 0.060253183 + "value": 0.062600188 }, { "unit": "s", - "value": 0.060325944 + "value": 0.062606179 }, { "unit": "s", - "value": 0.064254031 + "value": 0.062711891 }, { "unit": "s", - "value": 0.060529077 + "value": 0.062509606 }, { "unit": "s", - "value": 0.063792623 + "value": 0.062803592 }, { "unit": "s", - "value": 0.060357565 + "value": 0.062787151 }, { "unit": "s", - "value": 0.060063339 + "value": 0.062679419 }, { "unit": "s", - "value": 0.060287283 + "value": 0.06271488 }, { "unit": "s", - "value": 0.062157576 + "value": 0.062890023 } ], "Scale": [ { "unit": "s", - "value": 0.021235864 + "value": 0.039983335 }, { "unit": "s", - "value": 0.020608554 + "value": 0.039644189 }, { "unit": "s", - "value": 0.020822067 + "value": 0.039831532 }, { "unit": "s", - "value": 0.020042023 + "value": 0.039766591 }, { "unit": "s", - "value": 0.021288745 + "value": 0.039660679 }, { "unit": "s", - "value": 0.020088374 + "value": 0.039933614 }, { "unit": "s", - "value": 0.021096531 + "value": 0.039789862 }, { "unit": "s", - "value": 0.021525769 + "value": 0.03967413 }, { "unit": "s", - "value": 0.024272246 + "value": 0.039722601 }, { "unit": "s", - "value": 0.022442444 + "value": 0.039825792 } ], 
"Triad": [ { "unit": "s", - "value": 0.037008534 + "value": 0.052583184 }, { "unit": "s", - "value": 0.036020228 + "value": 0.052564403 }, { "unit": "s", - "value": 0.033424273 + "value": 0.052735936 }, { "unit": "s", - "value": 0.033462613 + "value": 0.052644865 }, { "unit": "s", - "value": 0.033843901 + "value": 0.052699956 }, { "unit": "s", - "value": 0.033447893 + "value": 0.052459542 }, { "unit": "s", - "value": 0.03323839 + "value": 0.052657585 }, { "unit": "s", - "value": 0.036342203 + "value": 0.052493212 }, { "unit": "s", - "value": 0.03446487 + "value": 0.052600984 }, { "unit": "s", - "value": 0.035224264 + "value": 0.052563704 } ] }, + "validated": true, "version": "2.6" } diff --git a/b_eff/README.md b/b_eff/README.md index 157b0a67..cdbb8c92 100644 --- a/b_eff/README.md +++ b/b_eff/README.md @@ -179,13 +179,14 @@ The json output looks like the following. ```json { - "config_time": "Thu Dec 08 10:38:28 UTC 2022", + "config_time": "Wed Dec 14 08:39:42 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": {}, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 09:56:29 UTC 2022", + "git_commit": "be1a4e9-dirty", "mpi": { "subversion": 1, "version": 3 @@ -194,23 +195,23 @@ The json output looks like the following. 
"results": { "b_eff": { "unit": "B/s", - "value": 21935372.01805185 + "value": 22061624.19637537 } }, "settings": { - "Communication Type": "IEC", - "Kernel File": "./bin/communication_bw520n_IEC_emulate.aocx", + "Communication Type": false, + "Kernel File": false, "Kernel Replications": 2, "Loop Length": 5, "MPI Ranks": 1, "Message Sizes": 2, "Repetitions": 10, - "Test Mode": "No" + "Test Mode": false }, "timings": { "6": { - "maxCalcBW": 14601537.724441605, - "maxMinCalculationTime": 4.3831e-05, + "maxCalcBW": 9880812.696844315, + "maxMinCalculationTime": 6.4772e-05, "timings": [ { "looplength": 5, @@ -218,51 +219,51 @@ The json output looks like the following. "timings": [ { "unit": "s", - "value": 0.013389739 + "value": 0.010991125 }, { "unit": "s", - "value": 6.2761e-05 + "value": 8.8202e-05 }, { "unit": "s", - "value": 4.9321e-05 + "value": 0.000133323 }, { "unit": "s", - "value": 4.3831e-05 + "value": 8.5442e-05 }, { "unit": "s", - "value": 4.951e-05 + "value": 0.000272905 }, { "unit": "s", - "value": 4.7561e-05 + "value": 0.000168784 }, { "unit": "s", - "value": 5.2311e-05 + "value": 6.4772e-05 }, { "unit": "s", - "value": 5.0441e-05 + "value": 0.000171733 }, { "unit": "s", - "value": 4.6901e-05 + "value": 0.000163393 }, { "unit": "s", - "value": 5.4401e-05 + "value": 8.0391e-05 } ] } ] }, "7": { - "maxCalcBW": 18104411.535904724, - "maxMinCalculationTime": 7.0701e-05, + "maxCalcBW": 19143908.348538782, + "maxMinCalculationTime": 6.6862e-05, "timings": [ { "looplength": 5, @@ -270,51 +271,51 @@ The json output looks like the following. "timings": [ { "unit": "s", - "value": 0.000104852 + "value": 0.000135662 }, { "unit": "s", - "value": 0.000125222 + "value": 0.000119343 }, { "unit": "s", - "value": 7.9731e-05 + "value": 0.000178914 }, { "unit": "s", - "value": 0.000151442 + "value": 7.7691e-05 }, { "unit": "s", - "value": 9.3052e-05 + "value": 9.1922e-05 }, { "unit": "s", - "value": 0.000193763 + "value": 0.000259545 }, { "unit": "s", - "value": 8.4472e-05 + "value": 0.000143233 }, { "unit": "s", - "value": 0.000116562 + "value": 0.000149763 }, { "unit": "s", - "value": 8.2471e-05 + "value": 6.6862e-05 }, { "unit": "s", - "value": 7.0701e-05 + "value": 7.2351e-05 } ] } ] }, "8": { - "maxCalcBW": 33100166.79380923, - "maxMinCalculationTime": 7.7341e-05, + "maxCalcBW": 37160151.543743014, + "maxMinCalculationTime": 6.8891e-05, "timings": [ { "looplength": 5, @@ -322,49 +323,50 @@ The json output looks like the following. 
"timings": [ { "unit": "s", - "value": 0.000711343 + "value": 0.000159723 }, { "unit": "s", - "value": 0.000378606 + "value": 0.000104432 }, { "unit": "s", - "value": 0.000280195 + "value": 0.000166953 }, { "unit": "s", - "value": 0.000107392 + "value": 7.7492e-05 }, { "unit": "s", - "value": 0.000203963 + "value": 7.8241e-05 }, { "unit": "s", - "value": 0.000122193 + "value": 9.5762e-05 }, { "unit": "s", - "value": 8.2151e-05 + "value": 0.000235084 }, { "unit": "s", - "value": 8.6861e-05 + "value": 0.000280265 }, { "unit": "s", - "value": 0.000167473 + "value": 0.000130013 }, { "unit": "s", - "value": 7.7341e-05 + "value": 6.8891e-05 } ] } ] } }, + "validated": true, "version": "1.3" } From 7bbdeeab19a4b638cf304c6c4c6fcd2abd0cc921 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 14 Dec 2022 17:30:00 +0100 Subject: [PATCH 229/318] fix FFT build --- FFT/src/host/fft_benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index 8610b121..e9b86e12 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -35,7 +35,7 @@ SOFTWARE. #include "parameters.h" fft::FFTProgramSettings::FFTProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), - iterations(results["b"].as()), inverse(results.count("inverse")), kernelReplications(results["r"].as()) { + iterations(results["b"].as()), inverse(results.count("inverse")) { } From a5d6c9eb5fb2a7acae9c8cf91a37e67a430e477c Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 16 Dec 2022 13:43:43 +0100 Subject: [PATCH 230/318] fix output parsing --- scripts/evaluation/parse_raw_to_csv.py | 64 +++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 6 deletions(-) diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index e5306dc7..743b5410 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -9,12 +9,64 @@ import sys # Regular expressions for the raw output of all -fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\n(.*\n)FFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" -gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" -ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" -trans_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+total\\s+time\\s+transfer\\s+time\\s+calc\\s+time\\s+calc\\s+FLOPS\\s+Memory\\s+Bandwidth\\s+PCIe\\s+Bandwidth\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e|inf)+)\\s+.+\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e|inf)+)" -stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\s+Avg\\stime\\s+Min\\stime\\s+Max\\stime\n\\s+PCI_write\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+PCI_read\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" -linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)(\\s+.+\\s*\n)\\s+GEFA\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)(\\s+.+\\s*\n)\\s+GESL\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" +fft_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Batch\\sSize\\s+(?P\d+)\n" + "(.*\n)FFT\\sSize\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n" + "\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+\\s+avg\\s+best\\s+\n" + "\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n" + "\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\sGFLOP") + +gemm_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Matrix\\sSize\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+\\s+norm\.\\sresidual\\s+res\.\\serror\\s+mach\.\\seps\n" + "\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+\\s+best\\s+mean\\s+GFLOPS\\s+\n" + "(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+GFLOP") + +ra_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+Kernel\\sReplications\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+best\\s+mean\\s+GUOPS\\s+\n" + "(?P(\d|\.|\+|-|e)+)\\s.\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\sGUOP") + +#TODO +trans_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Matrix\\sSize\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + 
"(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+\\s+total\\stime\\s+transfer\\stime\\s+calc\\s+time\\s+calc\\sFLOPS\\s+Memory\\sBandwidth\\s+PCIe\\sBandwidth\\s+\n" + "\\s+avg:\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e|inf)+)\\s.+\\s+\n" + "\\s+best:\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e|inf)+)\\s.+\\s.\n") + +stream_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)" + "(.*\n)+Data\\sType\\s+(?P.+)\n" + "(.*\n)+Kernel\\sReplications\\s+(?P\d+)" + "(.*\n)+Kernel\\sType\\s+(?P.+)\n" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+Function\\s+Best\\sRate\\s+Avg\\stime\\s+Min\\stime\\s+Max\\stime\\s+\n" + "PCI_write\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "PCI_read\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+\n" + "Copy\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+\n" + "Scale\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "Add\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "Triad\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n") + +linpack_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Matrix\\sSize\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+\\s+norm\.\\sresidual\\s+res\.\\serror\\s+mach\.\\seps\n" + "\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)\n" + "(.*\n)+\\sMethod\\s+best\\s+mean\\s+GFLOPS\\s+\n" + "\\stotal\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "\\sGEFA\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "\\sGESL\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n") def parse_network(file_content): From 8652cbb5f6d0d09f19ec938109edd6116c1398d6 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 16 Dec 2022 14:16:32 +0100 Subject: [PATCH 231/318] fix PTRANS unit tests --- PTRANS/src/host/transpose_benchmark.cpp | 2 +- PTRANS/tests/test_host_functionality.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 213f6c7e..9b16e38d 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -191,7 +191,7 @@ transpose::TransposeBenchmark::validateOutput(transpose::TransposeData &data) { void transpose::TransposeBenchmark::printError() { if (mpi_comm_rank == 0) { - std::cout << "Maximum error: " << errors.at("epsilon") << " < " << 100 * errors.at("epsilon") << std::endl; + std::cout << "Maximum error: " << errors.at("max_error") << " < " << 100 * errors.at("epsilon") << std::endl; std::cout << "Mach. 
Epsilon: " << errors.at("epsilon") << std::endl; } } diff --git a/PTRANS/tests/test_host_functionality.cpp b/PTRANS/tests/test_host_functionality.cpp index 4f7ebed6..486b178b 100644 --- a/PTRANS/tests/test_host_functionality.cpp +++ b/PTRANS/tests/test_host_functionality.cpp @@ -39,7 +39,7 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatHeader) { std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex("(\\s+)total\\stime(\\s+)transfer\\stime(\\s+)calc\\s+time(\\s+)calc\\sFLOPS(\\s+)Memory\\sBandwidth(\\s+)PCIe\\sBandwidth\n.*")); + ::testing::MatchesRegex("(\\s+)total\\stime(\\s+)transfer\\stime(\\s+)calc\\s+time(\\s+)calc\\sFLOPS(\\s+)Memory\\sBandwidth(\\s+)PCIe\\sBandwidth(\\s+)\n.*")); } /** @@ -66,7 +66,7 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatValues) { std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex(".*\navg:\\s+2\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s.*\n.*\n")); + ::testing::MatchesRegex(".*\n\\s+avg:\\s+2\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s.*\n.*\n")); } /** From 73b73cade692f8245f8d6fe8e0813b75a167534e Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 16 Dec 2022 14:33:00 +0100 Subject: [PATCH 232/318] remove myst_parser dependency from sphinx --- docs/source/conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 99328fa6..73c3c248 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,7 +43,6 @@ 'sphinx.ext.githubpages', # 'breathe', 'sphinx_rtd_theme', - 'myst_parser' ] # Enable Figure numbering and referencing From ea66d52433bb96327884390de5d66506ca296540 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 9 Dec 2022 16:01:09 +0100 Subject: [PATCH 233/318] Delete unused header file --- b_eff/src/host/execution.h | 52 -------------------------------------- 1 file changed, 52 deletions(-) delete mode 100644 b_eff/src/host/execution.h diff --git a/b_eff/src/host/execution.h b/b_eff/src/host/execution.h deleted file mode 100644 index f43c31de..00000000 --- a/b_eff/src/host/execution.h +++ /dev/null @@ -1,52 +0,0 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-*/ -#ifndef SRC_HOST_EXECUTION_H_ -#define SRC_HOST_EXECUTION_H_ - -/* C++ standard library headers */ -#include -#include -#include - -/* External library headers */ -#include "parameters.h" -#include "network_benchmark.hpp" - - -namespace bm_execution { - -/** -The actual execution of the benchmark. -This method can be implemented in multiple *.cpp files. This header enables -simple exchange of the different calculation methods. - -@param config struct that contains all necessary information to execute the kernel on the FPGA - - -@return The resulting matrix -*/ - network::ExecutionTimings - calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData); - -} // namespace bm_execution - -#endif // SRC_HOST_EXECUTION_H_ From a69c22b0cabf73ee1425a18fb9e9d9f218301d2c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 9 Dec 2022 16:41:32 +0100 Subject: [PATCH 234/318] Update hlslib --- extern/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index d54a37c5..ae08a768 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -28,7 +28,7 @@ FetchContent_Declare( # unfortunately they do not use releases, so the latest commit was used GIT_REPOSITORY https://github.com/definelicht/hlslib.git - GIT_TAG v1.2.1) + GIT_TAG v1.4.3) FetchContent_GetProperties(extern_hlslib) if(NOT extern_hlslib_POPULATED) From c0209c8588fa44d96f11fd3870392b74faf40d3e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 9 Dec 2022 17:36:30 +0100 Subject: [PATCH 235/318] Remove Intel FPGA limitation from beff --- b_eff/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt index b894bb48..b613e3d4 100755 --- a/b_eff/CMakeLists.txt +++ b/b_eff/CMakeLists.txt @@ -23,7 +23,3 @@ set(DATA_TYPE char) include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake) unset(DATA_TYPE CACHE) find_package(MPI REQUIRED) - -if (NOT INTELFPGAOPENCL_FOUND) - message(ERROR "Benchmark does only support the Intel OpenCL SDK") -endif() From cba6581a40663306233ea8851335118f83347202 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 9 Dec 2022 17:42:10 +0100 Subject: [PATCH 236/318] Add PCIE dummy kernel --- b_eff/src/device/CMakeLists.txt | 30 ++++++++++++++------- b_eff/src/device/communication_PCIE.cl | 37 ++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 9 deletions(-) create mode 100644 b_eff/src/device/communication_PCIE.cl diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index 8316a884..0a15211e 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -3,12 +3,24 @@ set(KERNEL_REPLICATION_ENABLED Yes CACHE INTERNAL "Enables kernel replication in set(NUM_REPLICATIONS 2) include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) -generate_kernel_targets_intel(communication_bw520n_IEC) -add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_emulation_pcie_intel COMMAND 
${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +if (INTELFPGAOPENCL_FOUND) + generate_kernel_targets_intel(communication_bw520n_IEC communication_PCIE) + add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +endif() + +if (Vitis_FOUND) + generate_kernel_targets_xilinx(communication_PCIE) + add_test(NAME test_emulation_pcie_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_cpu_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin --comm-type CPU -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 1 -m 20 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +endif() diff --git a/b_eff/src/device/communication_PCIE.cl b/b_eff/src/device/communication_PCIE.cl new file mode 100644 index 00000000..dfae7ca8 --- /dev/null +++ b/b_eff/src/device/communication_PCIE.cl @@ -0,0 +1,37 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "parameters.h"
+
+
+/**
+ * Minimal kernel only used to measure the startup latency of a kernel and to provide a
+ * memory buffer for Xilinx FPGAs to measure PCIe read and write performance
+ *
+ * @param input Dummy input
+ */
+__kernel
+__attribute__ ((max_global_work_dim(0)))
+void dummyKernel(__global char *input) {
+    // Minimal kernel only used to measure the startup latency of a kernel and to provide a
+    // memory buffer for Xilinx FPGAs to measure PCIe read and write performance
+}

From c640a141e954b35011246c612d690075fa681953 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Fri, 9 Dec 2022 17:43:31 +0100
Subject: [PATCH 237/318] Add Xilinx compatibility for host and add PCIe
 reverse execution

---
 b_eff/src/host/CMakeLists.txt                |  14 ++
 b_eff/src/host/execution_types/execution.hpp |   5 +-
 .../execution_pcie_reverse.hpp               | 142 ++++++++++++++++++
 b_eff/src/host/network_benchmark.cpp         |  23 ++-
 b_eff/src/host/network_benchmark.hpp         |  55 +++++++
 5 files changed, 234 insertions(+), 5 deletions(-)
 create mode 100644 b_eff/src/host/execution_types/execution_pcie_reverse.hpp

diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt
index fb08281f..deaf1aae 100755
--- a/b_eff/src/host/CMakeLists.txt
+++ b/b_eff/src/host/CMakeLists.txt
@@ -17,3 +17,17 @@ if (INTELFPGAOPENCL_FOUND)
     target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}")
     add_test(NAME test_intel_host_executable COMMAND $ -h)
 endif()
+
+if (Vitis_FOUND)
+    add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE})
+    target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS})
+    target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host)
+    add_executable(${HOST_EXE_NAME}_xilinx main.cpp)
+    target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}")
+    target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base)
+    target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx)
+    target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA)
+    target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA)
+    target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")
+    add_test(NAME test_xilinx_host_executable COMMAND $ -h)
+endif()

diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp
index df630838..f1e0600c 100644
--- a/b_eff/src/host/execution_types/execution.hpp
+++ b/b_eff/src/host/execution_types/execution.hpp
@@ -22,4 +22,7 @@ SOFTWARE.
#include "execution_types/execution_cpu.hpp" #include "execution_types/execution_pcie.hpp" -#include "execution_types/execution_iec.hpp" \ No newline at end of file +#include "execution_types/execution_pcie_reverse.hpp" +#if INTEL_FPGA +#include "execution_types/execution_iec.hpp" +#endif \ No newline at end of file diff --git a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp new file mode 100644 index 00000000..5f44522e --- /dev/null +++ b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp @@ -0,0 +1,142 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_PCIE_REVERSE_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_PCIE_REVERSE_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" + +/* Project's headers */ + +namespace network::execution_types::pcie_reverse { + + /* + Implementation for the single kernel. + @copydoc bm_execution::calculate() + */ + std::shared_ptr + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector sendQueues; + std::vector dummyBuffers; + std::vector> dummyBufferContents; + std::vector dummyKernels; + + cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + sendQueues.clear(); + dummyBuffers.clear(); + dummyBufferContents.clear(); + dummyKernels.clear(); + + // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); + ASSERT_CL(err) + + dummyKernels.push_back(cl::Kernel(*config.program, + "dummyKernel", &err)); + + err = dummyKernels[r].setArg(0, dummyBuffers[r]); + ASSERT_CL(err); + + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + + cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data()); + + sendQueues.push_back(sendQueue); + + } + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int l = 0; l < looplength; l++) { + if (config.programSettings->pcie_reverse_write_pcie) { + sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + sendQueues[i].finish(); + } + if (config.programSettings->pcie_reverse_execute_kernel) { + sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1)); + sendQueues[i].finish(); + } + if (config.programSettings->pcie_reverse_read_pcie) { + sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + sendQueues[i].finish(); + } + } + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + std::cout << validationData.size() << std::endl; + err = sendQueues[r].enqueueReadBuffer( + dummyBuffers[r], CL_TRUE, 0, + sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, + &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); + ASSERT_CL(err); + sendQueues[r].finish(); + } + std::shared_ptr result(new network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }); + return result; + } + +} // namespace bm_execution + +#endif diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index dbe1f610..5b9d4f2c 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -36,7 +36,11 @@ SOFTWARE. 
network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results),
     maxLoopLength(results["u"].as()), minLoopLength(results["l"].as()), maxMessageSize(results["m"].as()),
-    minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()) {
+    minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()),
+    pcie_reverse_write_pcie(results["pcie-read"].count()), pcie_reverse_read_pcie(results["pcie-write"].count()),
+    pcie_reverse_execute_kernel(results["kernel-latency"].count()) {
+
+    pcie_reverse = pcie_reverse_execute_kernel | pcie_reverse_read_pcie | pcie_reverse_write_pcie;
 
 }
 
@@ -49,7 +53,7 @@ network::NetworkProgramSettings::getSettingsMap() {
 }
 
 network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize, unsigned int _loopLength) : messageSize(_messageSize), loopLength(_loopLength),
-                                                                    validationBuffer(CHANNEL_WIDTH * 2 * 2, 0) {
+                                                                    validationBuffer((1 << _messageSize) * 2 * 2, 0) {
     // TODO: fix the validation buffer size to use the variable number of kernel replications and channels
     // Validation data buffer should be big enough to fit the data of two channels
     // for every repetition. The number of kernel replications is fixed to 2, which
@@ -86,7 +90,10 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
         ("o", "Offset used before reducing repetitions",
             cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_OFFSET)))
         ("d", "Number of steps the repetitions are decreased to its minimum",
-            cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE)));
+            cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE)))
+            ("pcie-read", "Use reverse PCIe experiment and measure PCIe read performance from device")
+            ("pcie-write", "Use reverse PCIe experiment and measure PCIe write performance from device")
+            ("kernel-latency", "Use reverse PCIe experiment and measure kernel execution latency");
 }
 
 void
@@ -108,8 +115,16 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) {
         network::ExecutionTimings timing;
         switch (executionSettings->programSettings->communicationType) {
             case hpcc_base::CommunicationType::cpu_only: timing = execution_types::cpu::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
-            case hpcc_base::CommunicationType::pcie_mpi: timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
+            case hpcc_base::CommunicationType::pcie_mpi:
+                if (executionSettings->programSettings->pcie_reverse) {
+                    timing = execution_types::pcie_reverse::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                } else {
+                    timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                }
+                break;
+#if INTEL_FPGA
             case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
+#endif
             default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType));
         }
         timing_results.push_back(timing);
diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index 4d47c392..1eb1825d 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++
b/b_eff/src/host/network_benchmark.hpp
@@ -31,6 +31,33 @@ SOFTWARE.
 #include "hpcc_benchmark.hpp"
 #include "parameters.h"
 
+#ifdef XILINX_FPGA
+template
+struct aligned_allocator {
+
+    // typedefs
+    typedef T value_type;
+    typedef value_type* pointer;
+    typedef const value_type* const_pointer;
+
+    pointer allocate(size_t pCount, const_pointer = 0){
+        T* mem = 0;
+        if (posix_memalign(reinterpret_cast(&mem), 4096, sizeof(T) * pCount) != 0) {
+            throw std::bad_alloc();
+        }
+        return mem;
+    }
+
+    void deallocate(pointer pPtr, size_t pCount) {
+        free(pPtr);
+    }
+};
+
+namespace cl {
+    template using vector = std::vector>;
+}
+#endif
+
 /**
  * @brief Contains all classes and methods needed by the Network benchmark
  *
@@ -128,6 +155,34 @@ class NetworkProgramSettings : public hpcc_base::BaseSettings {
      */
     uint llDecrease;
 
+    /**
+     * @brief This is automatically set to true if one of pcie_reverse_write_pcie, pcie_reverse_read_pcie,
+     * or pcie_reverse_execute_kernel is set to true. The reverse PCIe experiment will be executed in that case.
+     *
+     */
+    bool pcie_reverse;
+
+    /**
+     * @brief If true, the benchmark will execute the reverse PCIe benchmark instead. It will write data to the FPGA.
+     * The other pcie_reverse flags can be set to do additional operations within the measurement.
+     *
+     */
+    bool pcie_reverse_write_pcie;
+
+    /**
+     * @brief If true, the benchmark will execute the reverse PCIe benchmark instead. It will execute an empty kernel.
+     * The other pcie_reverse flags can be set to do additional operations within the measurement.
+     *
+     */
+    bool pcie_reverse_execute_kernel;
+
+    /**
+     * @brief If true, the benchmark will execute the reverse PCIe benchmark instead. It will read data from the FPGA.
+     * The other pcie_reverse flags can be set to do additional operations within the measurement.
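+     * For example, setting pcie_reverse_write_pcie together with this flag should time a full
+     * host-to-device-to-host round trip per loop iteration, since the write is enqueued before the read.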
+ *
+     */
+    bool pcie_reverse_read_pcie;
+
     /**
      * @brief Construct a new Network Program Settings object
      *

From 2df2fca858011bab50d73864094dfe44e57bdb47 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Fri, 9 Dec 2022 17:44:44 +0100
Subject: [PATCH 238/318] Fix test CMakeLists

---
 b_eff/tests/CMakeLists.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/b_eff/tests/CMakeLists.txt b/b_eff/tests/CMakeLists.txt
index 2a00ea83..6604c769 100755
--- a/b_eff/tests/CMakeLists.txt
+++ b/b_eff/tests/CMakeLists.txt
@@ -6,4 +6,9 @@ set(TEST_SOURCES test_kernel_functionality_and_host_integration.cpp)
 
 include(${CMAKE_SOURCE_DIR}/../cmake/unitTestTargets.cmake)
 
-target_link_libraries(${LIB_NAME}_intel ${MPI_LIBRARIES})
+if (INTELFPGAOPENCL_FOUND)
+    target_link_libraries(${LIB_NAME}_intel ${MPI_LIBRARIES})
+endif()
+if (Vitis_FOUND)
+    target_link_libraries(${LIB_NAME}_xilinx ${MPI_LIBRARIES})
+endif()
\ No newline at end of file

From a8c871c2783b07e67c60c5f231c1885019f44c38 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Fri, 9 Dec 2022 17:47:01 +0100
Subject: [PATCH 239/318] Add config for U280

---
 b_eff/configs/Xilinx_U280_DDR.cmake              | 14 ++++++++++++++
 b_eff/settings/settings.compile.xilinx.u280.ini  |  0
 b_eff/settings/settings.link.xilinx.u280.ddr.ini |  4 ++++
 3 files changed, 18 insertions(+)
 create mode 100644 b_eff/configs/Xilinx_U280_DDR.cmake
 create mode 100644 b_eff/settings/settings.compile.xilinx.u280.ini
 create mode 100644 b_eff/settings/settings.link.xilinx.u280.ddr.ini

diff --git a/b_eff/configs/Xilinx_U280_DDR.cmake b/b_eff/configs/Xilinx_U280_DDR.cmake
new file mode 100644
index 00000000..61d9003b
--- /dev/null
+++ b/b_eff/configs/Xilinx_U280_DDR.cmake
@@ -0,0 +1,14 @@
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.u280.ddr.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.u280.ini CACHE FILEPATH "" FORCE)
+
+# b_eff specific options
+# The channel width is not used by the PCIe dummy kernel
+set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
\ No newline at end of file
diff --git a/b_eff/settings/settings.compile.xilinx.u280.ini b/b_eff/settings/settings.compile.xilinx.u280.ini
new file mode 100644
index 00000000..e69de29b
diff --git a/b_eff/settings/settings.link.xilinx.u280.ddr.ini b/b_eff/settings/settings.link.xilinx.u280.ddr.ini
new file mode 100644
index 00000000..4d8fb9bd
--- /dev/null
+++ b/b_eff/settings/settings.link.xilinx.u280.ddr.ini
@@ -0,0 +1,4 @@
+[connectivity]
+nk=dummyKernel:1:dummyKernel
+
+sp=dummyKernel.m_axi_gmem:DDR[0]
\ No newline at end of file

From 6b48b26328042b1c1055fbc83c373fbf4fa62859 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Mon, 12 Dec 2022 08:36:32 +0100
Subject: [PATCH 240/318] Add Intel PCIE config

---
 b_eff/configs/Bittware_520N_PCIE.cmake | 17 +++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 b_eff/configs/Bittware_520N_PCIE.cmake

diff --git a/b_eff/configs/Bittware_520N_PCIE.cmake b/b_eff/configs/Bittware_520N_PCIE.cmake
new file mode 100644
index 00000000..b5fb6dad
--- /dev/null
+++
b/b_eff/configs/Bittware_520N_PCIE.cmake
@@ -0,0 +1,17 @@
+# This file contains the default configuration for the Nallatech 520N board
+# for use with the b_eff benchmark.
+# To use this configuration file, call cmake with the parameter
+#
+#     cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "p520_hpc_sg280l" CACHE STRING "" FORCE)
+set(AOC_FLAGS "-fpc -fp-relaxed -seed=7" CACHE STRING "" FORCE)
+
+# b_eff specific options
+set(CHANNEL_WIDTH 32 CACHE STRING "Width of a single external channel in Byte" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications" FORCE)
\ No newline at end of file

From fbebd06a3ae06d08e1953b91d0f1b993812eeed5 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Mon, 12 Dec 2022 16:41:00 +0100
Subject: [PATCH 241/318] Clean up CPU only code

---
 .../host/execution_types/execution_cpu.hpp | 33 ++++++-------------
 1 file changed, 10 insertions(+), 23 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp
index ec37dcb6..6fcf636c 100644
--- a/b_eff/src/host/execution_types/execution_cpu.hpp
+++ b/b_eff/src/host/execution_types/execution_cpu.hpp
@@ -43,9 +43,8 @@ namespace network::execution_types::cpu {
         cl::vector &validationData) {
 
         int err;
-        std::vector sendQueues;
-        std::vector dummyBuffers;
-        std::vector> dummyBufferContents;
+        std::vector> dummyBufferReadContents;
+        std::vector> dummyBufferWriteContents;
 
         cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize));
 
@@ -57,32 +56,20 @@ namespace network::execution_types::cpu {
 
         std::vector calculationTimings;
         for (uint r =0; r < config.programSettings->numRepetitions; r++) {
-            sendQueues.clear();
-            dummyBuffers.clear();
-            dummyBufferContents.clear();
+            dummyBufferReadContents.clear();
+            dummyBufferWriteContents.clear();
 
             // Create all kernels and buffers.
The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { - - dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); - ASSERT_CL(err) - - dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); - - cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); - ASSERT_CL(err) - - sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data()); - - sendQueues.push_back(sendQueue); - + dummyBufferReadContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + dummyBufferWriteContents.emplace_back(size_in_bytes, static_cast(0)); } double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); for (int l = 0; l < looplength; l++) { - MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, - dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Sendrecv(dummyBufferReadContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, + dummyBufferWriteContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); @@ -102,8 +89,8 @@ namespace network::execution_types::cpu { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! 
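        // Illustrative sketch (assuming 2 kernel replications and size_in_bytes == validationData.size() == 8):
        // r == 0 copies the first 4 bytes of dummyBufferWriteContents[0] to validationData[0..3],
        // r == 1 copies the first 4 bytes of dummyBufferWriteContents[1] to validationData[4..7].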
for (int r = 0; r < config.programSettings->kernelReplications; r++) {
-            err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]);
-            ASSERT_CL(err);
+            std::copy(dummyBufferWriteContents[r].begin(),dummyBufferWriteContents[r].begin() + dummyBufferWriteContents[r].size() / config.programSettings->kernelReplications,
+                        &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]);
         }
         return network::ExecutionTimings{
                 looplength,

From 3107f22f5fe8c7bed6fc1653abc6f82b605b70ba Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Mon, 12 Dec 2022 16:41:23 +0100
Subject: [PATCH 242/318] Fix message size bug (message size was four times too
 large)

---
 b_eff/src/host/network_benchmark.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp
index 5b9d4f2c..274f2006 100644
--- a/b_eff/src/host/network_benchmark.cpp
+++ b/b_eff/src/host/network_benchmark.cpp
@@ -53,7 +53,7 @@ network::NetworkProgramSettings::getSettingsMap() {
 }
 
 network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize, unsigned int _loopLength) : messageSize(_messageSize), loopLength(_loopLength),
-                                                                    validationBuffer((1 << _messageSize) * 2 * 2, 0) {
+                                                                    validationBuffer((1 << _messageSize), 0) {
     // TODO: fix the validation buffer size to use the variable number of kernel replications and channels
     // Validation data buffer should be big enough to fit the data of two channels
    // for every repetition. The number of kernel replications is fixed to 2, which

From a922f9bf30c901539a5845b54cc06ccd920ffd5a Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Tue, 13 Dec 2022 11:36:52 +0100
Subject: [PATCH 243/318] Improve validation scheme for b_eff

---
 b_eff/src/device/communication_PCIE.cl       | 11 +++--
 .../host/execution_types/execution_cpu.hpp   |  6 +--
 .../host/execution_types/execution_iec.hpp   |  2 +-
 .../host/execution_types/execution_pcie.hpp  |  4 +-
 .../execution_pcie_reverse.hpp               | 41 ++++++++++---------
 b_eff/src/host/network_benchmark.cpp         | 11 ++---
 b_eff/src/host/network_benchmark.hpp         |  7 +++-
 7 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/b_eff/src/device/communication_PCIE.cl b/b_eff/src/device/communication_PCIE.cl
index dfae7ca8..af4f4f81 100644
--- a/b_eff/src/device/communication_PCIE.cl
+++ b/b_eff/src/device/communication_PCIE.cl
@@ -27,11 +27,14 @@ SOFTWARE.
* Minimal kernel only used to measure the startup latency of a kernel and to provide a
  * memory buffer for Xilinx FPGAs to measure PCIe read and write performance
  *
- * @param input Dummy input
+ * @param output Output buffer that will be used to write the verification data into
+ * @param verification Verification value that will be written to the buffer
+ * @param messageSize Size of the output buffer
  */
 __kernel
 __attribute__ ((max_global_work_dim(0)))
-void dummyKernel(__global char *input) {
-    // Minimal kernel only used to measure the startup latency of a kernel and to provide a
-    // memory buffer for Xilinx FPGAs to measure PCIe read and write performance
+void dummyKernel(__global DEVICE_DATA_TYPE *output, DEVICE_DATA_TYPE verification, int messageSize) {
+    for (int m=0; m < messageSize; m++) {
+        output[m] = verification;
+    }
 }
diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp
index 6fcf636c..bb125b59 100644
--- a/b_eff/src/host/execution_types/execution_cpu.hpp
+++ b/b_eff/src/host/execution_types/execution_cpu.hpp
@@ -46,7 +46,7 @@ namespace network::execution_types::cpu {
for (int r = 0; r < config.programSettings->kernelReplications; r++) { - err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); + err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * (1 << messageSize), &validationData.data()[r * (1 << messageSize)]); ASSERT_CL(err); } return network::ExecutionTimings{ diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp index 50d357e6..cc3e5548 100644 --- a/b_eff/src/host/execution_types/execution_pcie.hpp +++ b/b_eff/src/host/execution_types/execution_pcie.hpp @@ -47,7 +47,7 @@ namespace network::execution_types::pcie { std::vector dummyBuffers; std::vector> dummyBufferContents; - cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + cl_uint size_in_bytes = (1 << messageSize); int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); @@ -108,7 +108,7 @@ namespace network::execution_types::pcie { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { - err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); + err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, &validationData.data()[r * size_in_bytes]); ASSERT_CL(err); } return network::ExecutionTimings{ diff --git a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp index 5f44522e..a606f891 100644 --- a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp +++ b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp @@ -48,7 +48,7 @@ namespace network::execution_types::pcie_reverse { std::vector> dummyBufferContents; std::vector dummyKernels; - cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + cl_uint size_in_bytes = (1 << messageSize); int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); @@ -74,6 +74,10 @@ namespace network::execution_types::pcie_reverse { err = dummyKernels[r].setArg(0, dummyBuffers[r]); ASSERT_CL(err); + err = dummyKernels[r].setArg(1, (HOST_DATA_TYPE)(messageSize & 255)); + ASSERT_CL(err); + err = dummyKernels[r].setArg(2, (1 << messageSize)); + ASSERT_CL(err); dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); @@ -90,19 +94,17 @@ namespace network::execution_types::pcie_reverse { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); for (int l = 0; l < looplength; l++) { - if (config.programSettings->pcie_reverse_write_pcie) { - sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); - sendQueues[i].finish(); - } - if (config.programSettings->pcie_reverse_execute_kernel) { - sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1)); - sendQueues[i].finish(); - } - if 
(config.programSettings->pcie_reverse_read_pcie) { - sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); - sendQueues[i].finish(); - } + if (config.programSettings->pcie_reverse_write_pcie) { + sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + } + if (config.programSettings->pcie_reverse_execute_kernel) { + sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1)); + } + if (config.programSettings->pcie_reverse_read_pcie) { + sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + } } + sendQueues[i].finish(); auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); #ifndef NDEBUG @@ -121,13 +123,12 @@ namespace network::execution_types::pcie_reverse { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { - std::cout << validationData.size() << std::endl; - err = sendQueues[r].enqueueReadBuffer( - dummyBuffers[r], CL_TRUE, 0, - sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, - &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); - ASSERT_CL(err); - sendQueues[r].finish(); + if (!config.programSettings->pcie_reverse_read_pcie) { + err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[r].data()); + err = sendQueues[r].finish(); + ASSERT_CL(err) + } + std::copy(dummyBufferContents[r].begin(), dummyBufferContents[r].end(), &validationData.data()[r * size_in_bytes]); } std::shared_ptr result(new network::ExecutionTimings{ looplength, diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 274f2006..86e1dd15 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -52,8 +52,8 @@ network::NetworkProgramSettings::getSettingsMap() { return map; } -network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize, unsigned int _loopLength) : messageSize(_messageSize), loopLength(_loopLength), - validationBuffer((1 << _messageSize), 0) { +network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize, unsigned int _loopLength, unsigned int replications) : messageSize(_messageSize), loopLength(_loopLength), + validationBuffer((1 << _messageSize) * replications, 0) { // TODO: fix the validation buffer size to use the variable number of kernel replications and channels // Validation data buffer should be big enough to fit the data of two channels // for every repetition. 
The number of kernel replications is fixed to 2, which @@ -61,13 +61,13 @@ network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize } network::NetworkData::NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize, - unsigned int offset, unsigned int decrease) { + unsigned int offset, unsigned int decrease, unsigned int replications) { uint decreasePerStep = (max_looplength - min_looplength) / decrease; for (uint i = min_messagesize; i <= max_messagesize; i++) { uint messageSizeDivOffset = (i > offset) ? i - offset : 0u; uint newLooplength = (max_looplength > messageSizeDivOffset * decreasePerStep) ? max_looplength - messageSizeDivOffset * decreasePerStep : 0u; uint looplength = std::max(newLooplength, min_looplength); - this->items.push_back(NetworkDataItem(i, looplength)); + this->items.push_back(NetworkDataItem(i, looplength, replications)); } } @@ -244,7 +244,8 @@ network::NetworkBenchmark::generateInputData() { executionSettings->programSettings->minMessageSize, executionSettings->programSettings->maxMessageSize, executionSettings->programSettings->llOffset, - executionSettings->programSettings->llDecrease)); + executionSettings->programSettings->llDecrease, + executionSettings->programSettings->kernelReplications)); return d; } diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 1eb1825d..27481194 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -239,8 +239,9 @@ class NetworkData { * * @param messageSize The message size in bytes * @param loopLength The number of repetitions in the kernel + * @param replications The number of kernel replications */ - NetworkDataItem(unsigned int messageSize, unsigned int loopLength); + NetworkDataItem(unsigned int messageSize, unsigned int loopLength, unsigned int replications); }; @@ -259,8 +260,10 @@ class NetworkData { * @param max_messagesize The maximum message size * @param offset The used offset to scale the loop length. 
The higher the offset, the later the loop length will be decreased
      * @param decrease Number of steps the looplength will be decreased to the minimum
+     * @param replications The number of kernel replications
      */
-    NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize, unsigned int offset, unsigned int decrease);
+    NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize,
+                unsigned int offset, unsigned int decrease, unsigned int replications);
 
 };

From 3802ee4d59315e6a51cb8e01124651e2625b42d1 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Tue, 13 Dec 2022 13:08:02 +0100
Subject: [PATCH 244/318] Add PCIe reverse batch option

---
 .../host/execution_types/execution_pcie_reverse.hpp | 13 ++++++++++++-
 b_eff/src/host/network_benchmark.cpp                |  6 ++++--
 b_eff/src/host/network_benchmark.hpp                |  6 ++++++
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp
index a606f891..4146912f 100644
--- a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp
+++ b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp
@@ -96,15 +96,26 @@ namespace network::execution_types::pcie_reverse {
                 for (int l = 0; l < looplength; l++) {
                     if (config.programSettings->pcie_reverse_write_pcie) {
                         sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data());
+                        if (!config.programSettings->pcie_reverse_batch) {
+                            sendQueues[i].finish();
+                        }
                     }
                     if (config.programSettings->pcie_reverse_execute_kernel) {
                         sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
+                        if (!config.programSettings->pcie_reverse_batch) {
+                            sendQueues[i].finish();
+                        }
                     }
                     if (config.programSettings->pcie_reverse_read_pcie) {
                         sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data());
+                        if (!config.programSettings->pcie_reverse_batch) {
+                            sendQueues[i].finish();
+                        }
                     }
                 }
-            sendQueues[i].finish();
+            if (config.programSettings->pcie_reverse_batch) {
+                sendQueues[i].finish();
+            }
                 auto endCalculation = std::chrono::high_resolution_clock::now();
                 calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count();
 #ifndef NDEBUG
diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp
index 86e1dd15..a1f86b12 100644
--- a/b_eff/src/host/network_benchmark.cpp
+++ b/b_eff/src/host/network_benchmark.cpp
@@ -38,7 +38,8 @@ network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &re
     maxLoopLength(results["u"].as()), minLoopLength(results["l"].as()), maxMessageSize(results["m"].as()),
     minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()),
     pcie_reverse_write_pcie(results["pcie-read"].count()), pcie_reverse_read_pcie(results["pcie-write"].count()),
-    pcie_reverse_execute_kernel(results["kernel-latency"].count()) {
+    pcie_reverse_execute_kernel(results["kernel-latency"].count()),
+    pcie_reverse_batch(results["pcie-batch"].count()) {
 
     pcie_reverse = pcie_reverse_execute_kernel | pcie_reverse_read_pcie | pcie_reverse_write_pcie;
 
@@ -93,7 +94,8 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
             cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE)))
("pcie-read", "Use reverse PCIe experiment and measure PCIe read performance from device") ("pcie-write", "Use reverse PCIe experiment and measure PCIe write performance from device") - ("kernel-latency", "Use reverse PCIe experiment and measure kernel execution latency"); + ("kernel-latency", "Use reverse PCIe experiment and measure kernel execution latency") + ("pcie-batch", "Execute the reverse PCIe experiments in batch mode to make use of the queues of the schedulers"); } void diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 27481194..cb488686 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -183,6 +183,12 @@ class NetworkProgramSettings : public hpcc_base::BaseSettings { */ bool pcie_reverse_read_pcie; + /** + * @brief If true, the reverse experiments are executed in batch mode per looplength to make use of the scheduling queues + * + */ + bool pcie_reverse_batch; + /** * @brief Construct a new Network Program Settings object * From ea874c3e79d28ba60db6e43ef487de7d4281df1b Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 13 Dec 2022 16:10:14 +0100 Subject: [PATCH 245/318] Fix unit tests and add reverse tests --- b_eff/src/device/CMakeLists.txt | 4 + ...nel_functionality_and_host_integration.cpp | 113 ++++++++++-------- 2 files changed, 64 insertions(+), 53 deletions(-) diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index 0a15211e..146a4407 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -23,4 +23,8 @@ if (Vitis_FOUND) WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 1 -m 20 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_pcie_reverse_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1 --kernel-latency --pcie-write --pcie-read + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_pcie_reverse_batch_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1 --kernel-latency --pcie-write --pcie-read --pcie-batch + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp index 4e1cdb62..e7a51712 100644 --- a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp +++ b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp @@ -10,7 +10,7 @@ #include "test_program_settings.h" #include -struct NetworkKernelTest : testing::TestWithParam { +struct NetworkKernelTest : testing::Test { std::unique_ptr bm; std::unique_ptr data; unsigned numberOfChannels = 4; @@ -22,7 +22,6 @@ struct NetworkKernelTest : testing::TestWithParam void SetUp() override { bm = std::unique_ptr(new network::NetworkBenchmark(global_argc, global_argv)); bm->getExecutionSettings().programSettings->numRepetitions = 1; - bm->getExecutionSettings().programSettings->communicationType = GetParam(); data = bm->generateInputData(); createChannelFilesAndSymbolicLinks(); } @@ -48,9 +47,9 @@ struct NetworkKernelTest : testing::TestWithParam /** * Tests if calculate returns 
the correct execution results */ -TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { +TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(1,1)); + data->items.push_back(network::NetworkData::NetworkDataItem(1,1, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); EXPECT_NE(bm->collected_timings.end(), bm->collected_timings.find(1)); EXPECT_EQ(1, bm->collected_timings.find(1)->second.execution_timings.at(0).looplength); @@ -60,10 +59,10 @@ TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { /** * Tests if calculate returns the correct execution results for multiple repetitions */ -TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { +TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { bm->getExecutionSettings().programSettings->numRepetitions = 2; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(8,4)); + data->items.push_back(network::NetworkData::NetworkDataItem(8,4, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); EXPECT_NE(bm->collected_timings.end(), bm->collected_timings.find(8)); EXPECT_EQ(4, bm->collected_timings.find(8)->second.execution_timings.at(0).looplength); @@ -73,7 +72,7 @@ TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { /** * Tests if data is written to the channels for small message sizes */ -TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { +TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files GTEST_SKIP(); @@ -81,7 +80,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) const unsigned messageSize = std::log2(CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { @@ -101,7 +100,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) /** * Tests if data is written to the channels for small message sizes filling two channels */ -TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { +TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files GTEST_SKIP(); @@ -109,7 +108,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize, looplength)); + 
data->items.push_back(network::NetworkData::NetworkDataItem(messageSize, looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { @@ -126,7 +125,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels /** * Tests if data is written to the channels for message sizes filling more than two channels */ -TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { +TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files GTEST_SKIP(); @@ -134,7 +133,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwo const unsigned messageSize = std::log2(8 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { @@ -151,7 +150,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwo /** * Tests if correct data is written to the channels */ -TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { +TEST_F(NetworkKernelTest, CorrectDataIsWrittenToChannel) { if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files GTEST_SKIP(); @@ -159,7 +158,7 @@ TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[messageSize * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { @@ -175,11 +174,11 @@ TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { delete [] buffer; } -TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { +TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); HOST_DATA_TYPE cvalue = static_cast(messageSize & 255); EXPECT_EQ(cvalue, data->items[0].validationBuffer[0]); @@ -190,11 +189,11 @@ TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { 
EXPECT_TRUE(all_same); } -TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { +TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { const unsigned messageSize = 0; const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); HOST_DATA_TYPE cvalue = static_cast(messageSize & 255); EXPECT_EQ(cvalue, data->items[0].validationBuffer[0]); @@ -205,72 +204,86 @@ TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { EXPECT_TRUE(all_same); } -TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { +TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); + bm->getExecutionSettings().programSettings->kernelReplications = 1; const unsigned looplength = 4; + const unsigned replications = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - bm->executeKernel(*data); - EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); + auto result = bm->executeKernel(*data); + EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } -TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { +TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); + bm->getExecutionSettings().programSettings->kernelReplications = 1; const unsigned looplength = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - bm->executeKernel(*data); - EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); + auto result = bm->executeKernel(*data); + EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } -TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { +TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { const unsigned messageSize = 0; + bm->getExecutionSettings().programSettings->kernelReplications = 1; const unsigned looplength = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - bm->executeKernel(*data); - EXPECT_EQ(looplength * CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); + auto result = bm->executeKernel(*data); + EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } -TEST_P(NetworkKernelTest, ValidationDataSingleItemWrongCheckFails) { - const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); +TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForReplication2) { + const unsigned messageSize = 4; + const unsigned looplength = 2; + bm->getExecutionSettings().programSettings->kernelReplications = 2; + data->items.clear(); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 2)); + auto result = 
bm->executeKernel(*data); + EXPECT_EQ((1 << messageSize) * 2, data->items[0].validationBuffer.size()); +} + +TEST_F(NetworkKernelTest, ValidationDataSingleItemWrongCheckFails) { + const unsigned messageSize = 4; const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data;}); data->items[0].validationBuffer[looplength] = expected_data + 1; EXPECT_FALSE(bm->validateOutput(*data)); bm->printError(); } -TEST_P(NetworkKernelTest, ValidationDataWrongCheckFails) { +TEST_F(NetworkKernelTest, ValidationDataWrongCheckFails) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data - 1;}); EXPECT_FALSE(bm->validateOutput(*data)); bm->printError(); } -TEST_P(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { +TEST_F(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data;}); EXPECT_TRUE(bm->validateOutput(*data)); bm->printError(); } -TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { +TEST_F(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); EXPECT_TRUE(bm->validateOutput(*data)); bm->printError(); @@ -279,32 +292,32 @@ TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { // This test is disabled because it does not work with the current implementation of the // external channels in software emulation. The different kernel executions will read // the old data from the channel file, which will lead to a failing validation! 
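// A minimal sketch of the failure mode described above (an illustration only;
// it assumes the IEC software emulation backs each external channel with a
// plain file, and the file name used here is hypothetical):
//
//     std::ifstream ch("channel_file_0", std::ios::binary);      // leftovers from the previous run
//     ch.read(reinterpret_cast<char*>(buffer), size_in_bytes);   // stale bytes are consumed first
//
// so the validation of a second execution compares against old channel contents.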
-TEST_P(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExecution) { +TEST_F(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); EXPECT_TRUE(bm->validateOutput(*data)); bm->printError(); } -TEST_P(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { +TEST_F(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); data->items[1].validationBuffer[0] = static_cast(0); EXPECT_FALSE(bm->validateOutput(*data)); bm->printError(); } -TEST_P(NetworkKernelTest, JsonDump) { +TEST_F(NetworkKernelTest, JsonDump) { data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(8,4)); + data->items.push_back(network::NetworkData::NetworkDataItem(8,4, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); bm->collectResults(); bm->dumpConfigurationAndResults("b_eff.json"); @@ -336,9 +349,3 @@ TEST_P(NetworkKernelTest, JsonDump) { } } } - - -INSTANTIATE_TEST_CASE_P( - NetworkKernelParametrizedTests, - NetworkKernelTest, - ::testing::Values(hpcc_base::CommunicationType::intel_external_channels,hpcc_base::CommunicationType::cpu_only, hpcc_base::CommunicationType::pcie_mpi)); From a13ef51210286ebb2ec727c057ea0c0efb177c38 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 16 Dec 2022 10:45:38 +0100 Subject: [PATCH 246/318] Additionally enqueue kernels to correctly measure for Xilinx FPGAs --- .../src/host/execution_types/execution_pcie.hpp | 17 +++++++++++++++-- .../execution_types/execution_pcie_reverse.hpp | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp index cc3e5548..de79379c 100644 --- a/b_eff/src/host/execution_types/execution_pcie.hpp +++ b/b_eff/src/host/execution_types/execution_pcie.hpp @@ -45,6 +45,7 @@ namespace network::execution_types::pcie { int err; std::vector sendQueues; std::vector dummyBuffers; + std::vector dummyKernels; std::vector> dummyBufferContents; cl_uint size_in_bytes = (1 << messageSize); @@ -66,6 +67,16 @@ namespace network::execution_types::pcie { dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * 
size_in_bytes,0,&err));
         ASSERT_CL(err)
+        dummyKernels.push_back(cl::Kernel(*config.program,
+                        "dummyKernel", &err));
+
+        err = dummyKernels[r].setArg(0, dummyBuffers[r]);
+        ASSERT_CL(err);
+        err = dummyKernels[r].setArg(1, (HOST_DATA_TYPE)(messageSize & 255));
+        ASSERT_CL(err);
+        err = dummyKernels[r].setArg(2, 1);
+        ASSERT_CL(err);
+
         dummyBufferContents.emplace_back(size_in_bytes, static_cast<HOST_DATA_TYPE>(messageSize & (255)));
         cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err);
@@ -81,14 +92,16 @@
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
             for (int l = 0; l < looplength; l++) {
-
+                sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
                 sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data());
+                sendQueues[i].finish();
                 MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0,
                              dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                 sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data());
-
+                sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
+                sendQueues[i].finish();
             }
             auto endCalculation = std::chrono::high_resolution_clock::now();
             calculationTime += std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation - startCalculation).count();
diff --git a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp
index 4146912f..434fb95a 100644
--- a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp
+++ b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp
@@ -76,7 +76,7 @@ namespace network::execution_types::pcie_reverse {
         ASSERT_CL(err);
         err = dummyKernels[r].setArg(1, (HOST_DATA_TYPE)(messageSize & 255));
         ASSERT_CL(err);
-        err = dummyKernels[r].setArg(2, (1 << messageSize));
+        err = dummyKernels[r].setArg(2, 1);
         ASSERT_CL(err);

         dummyBufferContents.emplace_back(size_in_bytes, static_cast<HOST_DATA_TYPE>(messageSize & (255)));

From ec02f72da07dcf11e702a4e69bda20084ed4782c Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 16 Dec 2022 14:32:45 +0100
Subject: [PATCH 247/318] Exclude too short messages for IEC test

---
 b_eff/src/device/CMakeLists.txt                         | 8 ++++----
 .../test_kernel_functionality_and_host_integration.cpp  | 7 +++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt
index 146a4407..4ee0f8a3 100644
--- a/b_eff/src/device/CMakeLists.txt
+++ b/b_eff/src/device/CMakeLists.txt
@@ -5,13 +5,13 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake)

 if (INTELFPGAOPENCL_FOUND)
     generate_kernel_targets_intel(communication_bw520n_IEC communication_PCIE)
-    add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1
+    add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 --min-size 6 -m 6 -n 1
             WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
-
add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1
+    add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_PCIE_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1
            WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
-    add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1
+    add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_PCIE_emulate.aocx -l 1 -u 10 -m 0 -n 1
            WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
-    add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1
+    add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_PCIE_emulate.aocx -l 1 -u 1 -m 20 -n 1
            WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
 endif()
diff --git a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp
index e7a51712..613f1b13 100644
--- a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp
@@ -48,6 +48,10 @@ struct NetworkKernelTest : testing::Test {
  * Tests if calculate returns the correct execution results
  */
 TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) {
+    if (bm->getExecutionSettings().programSettings->communicationType == hpcc_base::CommunicationType::intel_external_channels) {
+        // Skip this test if IEC is used, because the IEC emulation cannot transfer messages smaller than the channel width
+        GTEST_SKIP() << "Intel external channel needs at least message size of 64 byte to fill channel!";
+    }
     data->items.clear();
     data->items.push_back(network::NetworkData::NetworkDataItem(1,1, bm->getExecutionSettings().programSettings->kernelReplications));
     bm->executeKernel(*data);
@@ -190,6 +194,9 @@ TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) {
 }

 TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) {
+    if (bm->getExecutionSettings().programSettings->communicationType == hpcc_base::CommunicationType::intel_external_channels) {
+        GTEST_SKIP() << "Intel external channel needs at least message size of 64 byte to fill channel!";
+    }
     const unsigned messageSize = 0;
     const unsigned looplength = 4;
     data->items.clear();

From 1cdcc0b8627efb0dfda8a81321cba1558dc3d9f6 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 16 Dec 2022 14:33:07 +0100
Subject: [PATCH 248/318] Update IEC implementation to store whole message

---
 b_eff/src/device/communication_bw520n_IEC.cl | 25 ++++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/b_eff/src/device/communication_bw520n_IEC.cl b/b_eff/src/device/communication_bw520n_IEC.cl
index 8f43756b..26379080 100644
--- a/b_eff/src/device/communication_bw520n_IEC.cl
+++ b/b_eff/src/device/communication_bw520n_IEC.cl
@@ -119,6 +119,21 @@ void recv{{ i }}(__global DEVICE_DATA_TYPE* validation_buffer,
     for
(unsigned k=0; k < send_iterations; k++) { recv_part1 = read_channel_intel(ch_in_{{ 2*i+1 }}); recv_part2 = read_channel_intel(ch_in_{{ 2*i+2 }}); + + DEVICE_DATA_TYPE mem_buffer[2 * ITEMS_PER_CHANNEL]; + // Store the last received data chunks in global memory for later validation + __attribute__((opencl_unroll_hint(ITEMS_PER_CHANNEL))) + for (DEVICE_DATA_TYPE d = 0; d < ITEMS_PER_CHANNEL; d++) { + mem_buffer[d] = recv_part1.values[d]; + } + __attribute__((opencl_unroll_hint(ITEMS_PER_CHANNEL))) + for (DEVICE_DATA_TYPE d = 0; d < ITEMS_PER_CHANNEL; d++) { + mem_buffer[ITEMS_PER_CHANNEL + d] = recv_part2.values[d]; + } + __attribute__((opencl_unroll_hint(2*ITEMS_PER_CHANNEL))) + for (DEVICE_DATA_TYPE d = 0; d < 2*ITEMS_PER_CHANNEL; d++) { + validation_buffer[k * (2 * ITEMS_PER_CHANNEL) + d] = mem_buffer[d]; + } } #ifndef EMULATE // Introduce data dependency between loop iterations to prevent coalescing of loop @@ -127,16 +142,6 @@ void recv{{ i }}(__global DEVICE_DATA_TYPE* validation_buffer, write_channel_intel(ch_exchange{{ 2*i+2 }}, recv_part2); #endif } - - // Store the last received data chunks in global memory for later validation - __attribute__((opencl_unroll_hint(ITEMS_PER_CHANNEL))) - for (DEVICE_DATA_TYPE d = 0; d < ITEMS_PER_CHANNEL; d++) { - validation_buffer[d] = recv_part1.values[d]; - } - __attribute__((opencl_unroll_hint(ITEMS_PER_CHANNEL))) - for (DEVICE_DATA_TYPE d = 0; d < ITEMS_PER_CHANNEL; d++) { - validation_buffer[ITEMS_PER_CHANNEL + d] = recv_part2.values[d]; - } } {% endfor %} From 512c4c68b31f209d85d00096e341761496b403a8 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 16 Dec 2022 15:18:26 +0100 Subject: [PATCH 249/318] Fix tests for updated IEC --- .../test_kernel_functionality_and_host_integration.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp index 613f1b13..4cc30e25 100644 --- a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp +++ b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp @@ -218,7 +218,6 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { const unsigned replications = 1; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); - auto result = bm->executeKernel(*data); EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } @@ -228,7 +227,6 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { const unsigned looplength = 1; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); - auto result = bm->executeKernel(*data); EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } @@ -237,8 +235,7 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { bm->getExecutionSettings().programSettings->kernelReplications = 1; const unsigned looplength = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); - auto result = bm->executeKernel(*data); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } @@ -248,7 +245,6 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForReplication2) { 
bm->getExecutionSettings().programSettings->kernelReplications = 2; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 2)); - auto result = bm->executeKernel(*data); EXPECT_EQ((1 << messageSize) * 2, data->items[0].validationBuffer.size()); } @@ -299,7 +295,7 @@ TEST_F(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { // This test is disabled because it does not work with the current implementation of the // external channels in software emulation. The different kernel executions will read // the old data from the channel file, which will lead to a failing validation! -TEST_F(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExecution) { +TEST_F(NetworkKernelTest, ValidationDataCorrectTwoMessageSizesAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); From 6df822b0e18914987aa72d24f777ef268102bafb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 16 Dec 2022 15:37:55 +0100 Subject: [PATCH 250/318] Fix PCIe reverse signature --- b_eff/src/host/execution_types/execution_pcie_reverse.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp index 434fb95a..2395b7bd 100644 --- a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp +++ b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp @@ -38,7 +38,7 @@ namespace network::execution_types::pcie_reverse { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { @@ -141,12 +141,11 @@ namespace network::execution_types::pcie_reverse { } std::copy(dummyBufferContents[r].begin(), dummyBufferContents[r].end(), &validationData.data()[r * size_in_bytes]); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution From 25134b6820164dae8e45cef83cea32cb602e5a14 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 16 Dec 2022 17:06:05 +0100 Subject: [PATCH 251/318] add comments to json dump helper functions --- shared/include/hpcc_benchmark.hpp | 64 +++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 69da2bfe..9328a251 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -544,17 +544,33 @@ class HpccFpgaBenchmark { std::cout << *executionSettings << std::endl; } + /* + * @brief Returns the map of the timings + * + * @return The timings map + */ std::map> getTimingsMap() { return timings; } + /* + * @brief adds a timing to the timings map + * + * @param key The key + */ void addTimings(std::string key, std::vector value) { timings.emplace(key, value); } - // override for special benchmarks like b_eff + /* + * @brief Returns the timings map as json + * + * @return The json object + * + * It should be overwritten for benchmarks with special timings format, like b_eff + */ virtual json getTimingsJson() { json j; for (auto const &key: timings) { @@ -570,6 +586,12 @@ class HpccFpgaBenchmark { return j; } + /** + * @brief Returns the results map 
as json
+     *
+     * @return The results map
+     *
+     */
     std::map getResultsJson() {
         std::map results_string;
         for (auto const &result: results) {
@@ -581,13 +603,27 @@ class HpccFpgaBenchmark {
         return results_string;
     }

+    /**
+     * @brief Returns the map of the dumped environment variables
+     *
+     * @return The environment map
+     *
+     * Can be extended as needed
+     */
     std::map<std::string, std::string> getEnvironmentMap() {
         std::map<std::string, std::string> env;
         env["LD_LIBRARY_PATH"] = std::string(std::getenv("LD_LIBRARY_PATH"));
         return env;
     }
-
+    /**
+     * @brief Format the FPGA Torus setting string
+     *
+     * @param str The setting string
+     *
+     * @return The parsed json object
+     *
+     */
     json parseFPGATorusString(std::string str) {
         json j;
@@ -599,6 +635,13 @@ class HpccFpgaBenchmark {
         return j;
     }

+    /**
+     * @brief Get current time as string
+     *
+     * @return The time string
+     *
+     * Has the same format as CONFIG_TIME
+     */
     std::string getCurrentTime() {
         time_t time = std::time(0);
@@ -608,6 +651,15 @@ class HpccFpgaBenchmark {
         return oss.str();
     }

+    /**
+     * @brief Convert the settings map to json
+     *
+     * @param settings_map The settings map
+     *
+     * @return The json object
+     *
+     * This function checks for settings which are not strings and converts them
+     */
     std::map jsonifySettingsMap(std::map settings_map) {
         json j;
@@ -629,7 +681,13 @@ class HpccFpgaBenchmark {
         }
         return j;
     }
-
+
+    /**
+     * @brief Dumps the benchmark configuration and results to a json file
+     *
+     * @param file_path Path where the json will be saved
+     *
+     */
     void dumpConfigurationAndResults(std::string file_path) {
         std::fstream fs;

From a0ccd3a45a4385af17c0c28aad1253dbf7327459 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 6 Jan 2023 16:14:45 +0100
Subject: [PATCH 252/318] Re-add emulation kernels as unit test build dependency

---
 cmake/unitTestTargets.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/unitTestTargets.cmake b/cmake/unitTestTargets.cmake
index 263d4033..4e949a9d 100644
--- a/cmake/unitTestTargets.cmake
+++ b/cmake/unitTestTargets.cmake
@@ -29,6 +29,7 @@ if (Vitis_FOUND)
         add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES})
         target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}")
         target_link_libraries(${HOST_EXE_NAME}_test_xilinx hpcc_fpga_base_test)
+        add_dependencies(${HOST_EXE_NAME}_test_xilinx ${kernel_emulation_targets_xilinx})
         target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA)
         target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")
         if (USE_ACCL)

From de85402d74afdeec8e1338821699db85719a8b8e Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 19 Jan 2023 10:11:32 +0100
Subject: [PATCH 253/318] Update references in README

---
 README.md | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 1814c5a0..fefd250b 100755
--- a/README.md
+++ b/README.md
@@ -296,14 +296,31 @@ If you are using one of the benchmarks contained in the HPCC FPGA benchmark suit
         doi={10.1109/H2RC51942.2020.00007}
     }

-If the focus is on multi-FPGA execution and inter-FPGA communication, you may rather want to cite
-    @misc{hpcc_multi_fpga,
-    doi = {10.48550/ARXIV.2202.13995},
-    url = {https://arxiv.org/abs/2202.13995},
-    author = {Meyer, Marius and Kenter, Tobias and Plessl, Christian},
-    title = {Multi-FPGA Designs and Scaling of HPC Challenge Benchmarks via MPI and Circuit-Switched Inter-FPGA Networks},
-    publisher = {arXiv},
+    @article{hpcc_fpga_in_depth,
+    author = {Marius
Meyer and Tobias Kenter and Christian Plessl}, + doi = {https://doi.org/10.1016/j.jpdc.2021.10.007}, + issn = {0743-7315}, + journal = {Journal of Parallel and Distributed Computing}, + keywords = {FPGA, OpenCL, High level synthesis, HPC benchmarking}, + pages = {79-89}, + title = {In-depth FPGA accelerator performance evaluation with single node benchmarks from the HPC challenge benchmark suite for Intel and Xilinx FPGAs using OpenCL}, + url = {https://www.sciencedirect.com/science/article/pii/S0743731521002057}, + volume = {160}, year = {2022} } + +If the focus is on multi-FPGA execution and inter-FPGA communication, you may rather want to cite + + @article{hpcc_multi_fpga, + author = {Meyer, Marius and Kenter, Tobias and Plessl, Christian}, + title = {Multi-FPGA Designs and Scaling of HPC Challenge Benchmarks via MPI and Circuit-Switched Inter-FPGA Networks}, + year = {2023}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + issn = {1936-7406}, + url = {https://doi.org/10.1145/3576200}, + doi = {10.1145/3576200} + } + From a0477d02d6a9621951686773f599a7dbd3de1f23 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 16 Jan 2023 17:50:30 +0100 Subject: [PATCH 254/318] Fix IEC execution code --- b_eff/src/host/execution_types/execution_iec.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/b_eff/src/host/execution_types/execution_iec.hpp b/b_eff/src/host/execution_types/execution_iec.hpp index 2d0cec0e..471a3547 100644 --- a/b_eff/src/host/execution_types/execution_iec.hpp +++ b/b_eff/src/host/execution_types/execution_iec.hpp @@ -39,8 +39,9 @@ namespace network::execution_types::iec { Implementation for the single kernel. @copydoc bm_execution::calculate() */ + template network::ExecutionTimings - calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { int err; From 2fc4c98d078225e4cf228039fc6a4e7c0b0dbb96 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 16 Jan 2023 17:51:15 +0100 Subject: [PATCH 255/318] Add step size and PCIe reverse flag --- b_eff/src/host/network_benchmark.cpp | 22 ++++++++++++---------- b_eff/src/host/network_benchmark.hpp | 9 ++++++++- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index a1f86b12..7b1cdde3 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -36,13 +36,11 @@ SOFTWARE. 
network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), maxLoopLength(results["u"].as()), minLoopLength(results["l"].as()), maxMessageSize(results["m"].as()), - minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()), - pcie_reverse_write_pcie(results["pcie-read"].count()), pcie_reverse_read_pcie(results["pcie-write"].count()), - pcie_reverse_execute_kernel(results["kernel-latency"].count()), - pcie_reverse_batch(results["pcie-batch"].count()) { - - pcie_reverse = pcie_reverse_execute_kernel | pcie_reverse_read_pcie | pcie_reverse_write_pcie; - + minMessageSize(results["min-size"].as()), stepSize(results["step-size"].as()), llOffset(results["o"].as()), + llDecrease(results["d"].as()), pcie_reverse_write_pcie(results["pcie-read"].count()), + pcie_reverse_read_pcie(results["pcie-write"].count()), pcie_reverse_execute_kernel(results["kernel-latency"].count()), + pcie_reverse_batch(results["pcie-batch"].count()), pcie_reverse(results["pcie-reverse"].count()) +{ } std::map @@ -62,9 +60,9 @@ network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize } network::NetworkData::NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize, - unsigned int offset, unsigned int decrease, unsigned int replications) { + unsigned int stepsize, unsigned int offset, unsigned int decrease, unsigned int replications) { uint decreasePerStep = (max_looplength - min_looplength) / decrease; - for (uint i = min_messagesize; i <= max_messagesize; i++) { + for (uint i = min_messagesize; i <= max_messagesize; i += stepsize) { uint messageSizeDivOffset = (i > offset) ? i - offset : 0u; uint newLooplength = (max_looplength > messageSizeDivOffset * decreasePerStep) ? 
max_looplength - messageSizeDivOffset * decreasePerStep : 0u;
         uint looplength = std::max(newLooplength, min_looplength);
@@ -88,6 +86,8 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
             ("min-size", "Minimum Message Size", cxxopts::value<uint>()->default_value(std::to_string(0)))
             ("m", "Maximum message size", cxxopts::value<uint>()->default_value(std::to_string(DEFAULT_MAX_MESSAGE_SIZE)))
+            ("step-size", "Step size to generate message sizes in the specified range",
+             cxxopts::value<uint>()->default_value(std::to_string(1)))
             ("o", "Offset used before reducing repetitions", cxxopts::value<uint>()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_OFFSET)))
             ("d", "Number of steps the repetitions are decreased to its minimum",
@@ -95,7 +95,8 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
             ("pcie-read", "Use reverse PCIe experiment and measure PCIe read performance from device")
             ("pcie-write", "Use reverse PCIe experiment and measure PCIe write performance from device")
             ("kernel-latency", "Use reverse PCIe experiment and measure kernel execution latency")
-            ("pcie-batch", "Execute the reverse PCIe experiments in batch mode to make use of the queues of the schedulers");
+            ("pcie-batch", "Execute the reverse PCIe experiments in batch mode to make use of the queues of the schedulers")
+            ("pcie-reverse", "Execute the reverse PCIe experiments");
 }

 void
@@ -245,6 +246,7 @@ network::NetworkBenchmark::generateInputData() {
                     executionSettings->programSettings->minLoopLength,
                     executionSettings->programSettings->minMessageSize,
                     executionSettings->programSettings->maxMessageSize,
+                    executionSettings->programSettings->stepSize,
                     executionSettings->programSettings->llOffset,
                     executionSettings->programSettings->llDecrease,
                     executionSettings->programSettings->kernelReplications));
diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index cb488686..814075c0 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -143,6 +143,12 @@ class NetworkProgramSettings : public hpcc_base::BaseSettings {
      */
     uint minMessageSize;

+    /**
+     * @brief Step size for tested message sizes
+     *
+     */
+    uint stepSize;
+
     /**
      * @brief Offset that is used before the loop length will be reduced for higher message sizes
      *
@@ -264,12 +270,13 @@ class NetworkData {
      * @param min_looplength The minimum number of iterations that should be done for a message size
      * @param min_messagesize The minimum message size
      * @param max_messagesize The maximum message size
+     * @param stepSize Step size used to generate tested message sizes
      * @param offset The used offset to scale the loop length.
The higher the offset, the later the loop length will be decreased
      * @param decrease Number of steps the looplength will be decreased to the minimum
      * @param replications The number of kernel replications
      */
     NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize,
-                unsigned int offset, unsigned int decrease, unsigned int replications);
+                unsigned int stepSize, unsigned int offset, unsigned int decrease, unsigned int replications);

 };

From 24e1f540e0145851c17657c5127484ed48f489a6 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 16 Jan 2023 09:53:08 +0100
Subject: [PATCH 256/318] Make kernel execution in base PCIE version optional

---
 b_eff/src/host/execution_types/execution_pcie.hpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp
index de79379c..b13f8e4f 100644
--- a/b_eff/src/host/execution_types/execution_pcie.hpp
+++ b/b_eff/src/host/execution_types/execution_pcie.hpp
@@ -92,7 +92,9 @@ namespace network::execution_types::pcie {
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
             for (int l = 0; l < looplength; l++) {
-                sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
+                if(config.programSettings->pcie_reverse_execute_kernel) {
+                    sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
+                }
                 sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data());
                 sendQueues[i].finish();
@@ -100,7 +102,9 @@ namespace network::execution_types::pcie {
                          dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                 sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data());
-                sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
+                if(config.programSettings->pcie_reverse_execute_kernel) {
+                    sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
+                }
                 sendQueues[i].finish();
             }
             auto endCalculation = std::chrono::high_resolution_clock::now();

From fadc681f151cc1ba0825d62aa3bfbac2ca895a2c Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 17 Jan 2023 16:12:08 +0100
Subject: [PATCH 257/318] Update HPL DDR link settings to jinja2

---
 ....link.xilinx.hpl_torus_pcie.ddr.generator.ini | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini
index e032e407..e419e22e 100644
--- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini
@@ -9,9 +9,9 @@ nk=inner_update_mm0:$PY_CODE_GEN num_replications$
 slr=lu_1:SLR0
 slr=left_update_1:SLR0
 slr=top_update_1:SLR0
-# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
-slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +1) % 3$
-# PY_CODE_GEN block_end
+{% for i in range(num_replications) %}
+slr=inner_update_mm0_{{ i+1 }}:SLR{{ (i+1) % 3 }}
+{% endfor %}

 # matrix ports
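# For illustration only: with num_replications=3, the {% for %} block above is
# expected to render to the following assignments (assuming standard Jinja2
# semantics in the code generator):
#
#   slr=inner_update_mm0_1:SLR1
#   slr=inner_update_mm0_2:SLR2
#   slr=inner_update_mm0_3:SLR0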
sp=lu_1.m_axi_gmem0:DDR[0] @@ -26,9 +26,9 @@ sp=left_update_1.m_axi_gmem0:DDR[0] sp=left_update_1.m_axi_gmem1:DDR[1] sp=left_update_1.m_axi_gmem2:DDR[1] -# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] -sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:DDR[0] -sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:DDR[1] -sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:DDR[0] -# PY_CODE_GEN block_end +{% for i in range(num_replications) %} +sp=inner_update_mm0_{{ i+1 }}.m_axi_gmem0:DDR[0] +sp=inner_update_mm0_{{ i+1 }}.m_axi_gmem1:DDR[1] +sp=inner_update_mm0_{{ i+1 }}.m_axi_gmem2:DDR[0] +{% endfor %} From 22bb76eb68cf97d2095729b9d249a879168449ac Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 17 Jan 2023 16:14:29 +0100 Subject: [PATCH 258/318] Update Xilinx kernel build for jinja2 --- cmake/kernelTargets.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 1d7e667f..7542d4d4 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -57,7 +57,7 @@ function(generate_kernel_targets_xilinx) ) if (XILINX_GENERATE_LINK_SETTINGS) add_custom_command(OUTPUT ${xilinx_link_settings} - COMMAND ${Python3_EXECUTABLE} ${CODE_GENERATOR} -o ${xilinx_link_settings} -p num_replications=${NUM_REPLICATIONS} --comment "\"#\"" --comment-ml-start "\"$$\"" --comment-ml-end "\"$$\"" ${gen_xilinx_link_settings} + COMMAND ${Python3_EXECUTABLE} ${CODE_GENERATOR} -o ${xilinx_link_settings} -p num_replications=${NUM_REPLICATIONS} ${gen_xilinx_link_settings} MAIN_DEPENDENCY ${gen_xilinx_link_settings} ) else() From 5db53122c0997c3f4d3ec6a2c22e7402f6f9bbbc Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 20 Jan 2023 13:37:56 +0100 Subject: [PATCH 259/318] update to Sphinx 4.0.0 --- docs/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index c675a279..f705e859 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,2 +1,2 @@ -Sphinx==3.0.3 -sphinx-rtd-theme==0.5.0 +Sphinx==4.0.0 +sphinx-rtd-theme==1.1.1 From d91bee60527d3cb52601c40c2c7d7b8e0d3e22df Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 20 Jan 2023 13:38:13 +0100 Subject: [PATCH 260/318] remove some warnings from sphinx html build --- docs/source/FFT/index.rst | 6 +++--- docs/source/GEMM/index.rst | 1 + docs/source/LINPACK/index.rst | 1 + docs/source/PTRANS/index.rst | 1 + docs/source/RandomAccess/index.rst | 1 + docs/source/STREAM/index.rst | 1 + docs/source/b_eff/index.rst | 1 + docs/source/index.rst | 2 -- .../technical_support/Host Input Parameters/index.rst | 1 + 9 files changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/source/FFT/index.rst b/docs/source/FFT/index.rst index 4f54398b..353691bd 100644 --- a/docs/source/FFT/index.rst +++ b/docs/source/FFT/index.rst @@ -1,7 +1,8 @@ .. _fft: -====== + +====== FFT -====== +====== This section contains all information related to the FFT benchmark. The benchmark executes a batched calculation of 1d FFTs on a single FPGA. @@ -13,7 +14,6 @@ It is possible to specify the size of the FFT and the number of kernel replicati :glob: */index - ../../../FFT/README.md ------------------------ Configuration Parameters diff --git a/docs/source/GEMM/index.rst b/docs/source/GEMM/index.rst index 14f597ed..df3899ed 100644 --- a/docs/source/GEMM/index.rst +++ b/docs/source/GEMM/index.rst @@ -1,4 +1,5 @@ .. 
_gemm:
+
 ======
 GEMM
 ======
diff --git a/docs/source/LINPACK/index.rst b/docs/source/LINPACK/index.rst
index 7ce28dd4..440616bd 100644
--- a/docs/source/LINPACK/index.rst
+++ b/docs/source/LINPACK/index.rst
@@ -1,4 +1,5 @@
 .. _hpl:
+
 =======
 LINPACK
 =======
diff --git a/docs/source/PTRANS/index.rst b/docs/source/PTRANS/index.rst
index b5a9c93d..07bf00c2 100644
--- a/docs/source/PTRANS/index.rst
+++ b/docs/source/PTRANS/index.rst
@@ -1,4 +1,5 @@
 .. _ptrans:
+
 ======
 PTRANS
 ======
diff --git a/docs/source/RandomAccess/index.rst b/docs/source/RandomAccess/index.rst
index 607b311a..02b510d4 100644
--- a/docs/source/RandomAccess/index.rst
+++ b/docs/source/RandomAccess/index.rst
@@ -1,4 +1,5 @@
 .. _randomaccess:
+
 ============
 RandomAccess
 ============
diff --git a/docs/source/STREAM/index.rst b/docs/source/STREAM/index.rst
index 7b4f41ff..26dbc1c8 100644
--- a/docs/source/STREAM/index.rst
+++ b/docs/source/STREAM/index.rst
@@ -1,4 +1,5 @@
 .. _stream:
+
 =======
 STREAM
 =======
diff --git a/docs/source/b_eff/index.rst b/docs/source/b_eff/index.rst
index f8cc2f18..030bee78 100644
--- a/docs/source/b_eff/index.rst
+++ b/docs/source/b_eff/index.rst
@@ -1,4 +1,5 @@
 .. _beff:
+
 =======
 b_eff
 =======
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8139915b..8f3cd6bb 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -70,8 +70,6 @@ Further optimized implementations that use such device-specific communication ap
    :caption: Benchmark Results:
    :glob:

-   ../../../*/README.md
-
 ----------
 References
diff --git a/docs/source/technical_support/Host Input Parameters/index.rst b/docs/source/technical_support/Host Input Parameters/index.rst
index 550e8f19..b45e8a36 100644
--- a/docs/source/technical_support/Host Input Parameters/index.rst
+++ b/docs/source/technical_support/Host Input Parameters/index.rst
@@ -1,4 +1,5 @@
 .. _execution:
+
 ========================
 Execution of a Benchmark
 ========================

From 4e58959fc108c71fa580d874ad7ed08f7a39a62f Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Mon, 6 Feb 2023 09:18:26 +0100
Subject: [PATCH 261/318] update host input parameters page

---
 .../technical_support/Host Input Parameters/index.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/source/technical_support/Host Input Parameters/index.rst b/docs/source/technical_support/Host Input Parameters/index.rst
index b45e8a36..c201524e 100644
--- a/docs/source/technical_support/Host Input Parameters/index.rst
+++ b/docs/source/technical_support/Host Input Parameters/index.rst
@@ -28,10 +28,16 @@ Input parameters (or options) can be appended to the host execution call like th
     The number of repetitions can be given with this parameter as a positive integer. The benchmark experiment will be repeated the given number of times.
     The benchmark will show the aggregated results for all runs, but only validate the output of the last run.

+``-i``:
+    Use `Intel memory interleaving `_.
+
 ``--platform INT``:
     Also an integer. It can be used to specify the index of the OpenCL platform that should be used for execution. By default, it is set to -1. This will make the host code ask you to select a platform if multiple platforms are available. This option can come in handy if you want to automate the execution of your benchmark.

+``--platform_str arg``:
+    A string which can be used to specify the desired platform independently of the index. The exact platform name needs to be specified. When given, the value of the platform index specified by the flag above will be ignored.
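    A hypothetical invocation combining these options could look as follows (the
    bitstream file and the platform string below are illustrative placeholders,
    not taken from this patch)::

        ./Network_intel -f communication_PCIE_emulate.aocx -n 10 \
            --platform_str "Intel(R) FPGA Emulation Platform for OpenCL(TM)" --device 0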
+ ``--device INT``: Also an integer. It can be used to specify the index of the OpenCL device that should be used for execution. By default, it is set to -1. This will make the host code ask you to select a device if multiple devices are available. This option can become handy if you want to automize the execution of your benchmark. From 47ba61dc6eeffdc3fb3e0ed79084214275f7c101 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 6 Feb 2023 09:18:41 +0100 Subject: [PATCH 262/318] add noctua2 experiments to results --- docs/source/FFT/results/fft-1-1.csv | 43 +++++++-------- docs/source/FFT/results/index.rst | 2 +- docs/source/GEMM/results/gemm-1-0.csv | 48 ++++++++--------- .../RandomAccess/results/randomaccess-2-2.csv | 40 +++++++------- docs/source/STREAM/results/stream-2-3.csv | 54 +++++++++---------- 5 files changed, 94 insertions(+), 93 deletions(-) diff --git a/docs/source/FFT/results/fft-1-1.csv b/docs/source/FFT/results/fft-1-1.csv index c98312bc..7099394e 100644 --- a/docs/source/FFT/results/fft-1-1.csv +++ b/docs/source/FFT/results/fft-1-1.csv @@ -1,21 +1,22 @@ -FPGA board,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,HBM2,SVM -SDK,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua `_,,, -LOG_FFT_SIZE,17,9,5,17 -NUM_REPLICATIONS,2,1,15,1 -LUT,276676,83494,602125,192189 -LUT percent,36.0,7.39,54.13,22.0 -Register,724790,168150,941404,480285 -Register percent,36.0,7.19,42.18,22.0 -BRAM,4177,39,405,2147 -BRAM percent,36.0,2.28,22.35,18.0 -DSP,1414,672,5280,707 -DSP percent,25.0,7.46,58.58,12.0 -Frequency,413.34,248.00,254.00,348.00 -GFLOPs,349.45,78.26,576.00,119.66 -GBs,65.78,27.83,368.77,22.54 -Error,7.1e-1,3.9e-1,5.4e-1,7.1e-1 +Version,1.4,1.1,1.1,1.1,1.1 +FPGA board,BittWare 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,HBM2,SVM +SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,20.4.0,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +LOG_FFT_SIZE,12,17,9,5,17 +NUM_REPLICATIONS,2,2,1,15,1 +LUT,280105,276676,83494,602125,192189 +LUT percent,30,36.0,7.39,54.13,22.0 +Register,611446,724790,168150,941404,480285 +Register percent,,36.0,7.19,42.18,22.0 +BRAM,1811,4177,39,405,2147 +BRAM percent,15,36.0,2.28,22.35,18.0 +DSP,1560,1414,672,5280,707 +DSP percent,27,25.0,7.46,58.58,12.0 +Frequency,402.41,413.34,248.00,254.00,348.00 +GFLOPs,239.598,349.45,78.26,576.00,119.66 +GBs,,65.78,27.83,368.77,22.54 +Error,3.00463e-1,7.1e-1,3.9e-1,5.4e-1,7.1e-1 \ No newline at end of file diff --git a/docs/source/FFT/results/index.rst b/docs/source/FFT/results/index.rst index e2f705db..8672be27 100644 --- a/docs/source/FFT/results/index.rst +++ b/docs/source/FFT/results/index.rst @@ -9,7 +9,7 @@ The measurements were executed 10 times and the best result is published. The results and the used configuration is given in :numref:`tbl_fft_1_1_results` and are also available as :download:`CSV `. .. _tbl_fft_1_1_results: -.. csv-table:: FFT FPGA Benchmark Results for version 1.1 +.. 
csv-table:: FFT FPGA Benchmark Results :file: fft-1-1.csv :stub-columns: 1 diff --git a/docs/source/GEMM/results/gemm-1-0.csv b/docs/source/GEMM/results/gemm-1-0.csv index 6b36ebc3..c8142d6b 100644 --- a/docs/source/GEMM/results/gemm-1-0.csv +++ b/docs/source/GEMM/results/gemm-1-0.csv @@ -1,24 +1,24 @@ -FPGA board,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,HBM2,SVM -SDK,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua `_,,, -BLOCK_SIZE,512,256,256,512 -GEMM_SIZE,8,8,8,8 -GLOBAL_MEM_UNROLL,16,16,16,16 -DATA_TYPE,float,float,float,float -NUM_REPLICATIONS,5,3,3,5 -LUT,275754,568558,499002,299427 -LUT percent,36.0,51.87,42.64,33.0 -Register,861277,441602,920127,829802 -Register percent,36.0,19.43,38.7,33.0 -BRAM,8860,666,666,9041 -BRAM percent,76.0,43.11,36.71,77.0 -DSP,3398,7683,7683,3398 -DSP percent,59.0,85.23,85.18,59.0 -Frequency,160.42,100.00,236.00,225.00 -GFLOPs,708.95,266.91,603.86,739.59 -GFLOPs norm,88.39,85.29,88.97,65.74 -Error,6.0e-7,2.0e-6,2.0e-6,6.0e-7 +FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,HBM2,SVM +SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_, +BLOCK_SIZE,"512 ? ",512,256,256,512 +GEMM_SIZE,8,8,8,8,8 +GLOBAL_MEM_UNROLL,8,16,16,16,16 +DATA_TYPE,float,float,float,float,float +NUM_REPLICATIONS,5,5,3,3,5 +LUT,310564,275754,568558,499002,299427 +LUT percent,33,36.0,51.87,42.64,33.0 +Register,793535,861277,441602,920127,829802 +Register percent,,36.0,19.43,38.7,33.0 +BRAM,8321,8860,666,666,9041 +BRAM percent,71,76.0,43.11,36.71,77.0 +DSP,3318,3398,7683,7683,3398 +DSP percent,58,59.0,85.23,85.18,59.0 +Frequency,273.07,160.42,100.00,236.00,225.00 +GFLOPs,1232.50,708.95,266.91,603.86,739.59 +GFLOPs norm,90.27,88.39,85.29,88.97,65.74 +Error,9.15527e-5,6.0e-7,2.0e-6,2.0e-6,6.0e-7 \ No newline at end of file diff --git a/docs/source/RandomAccess/results/randomaccess-2-2.csv b/docs/source/RandomAccess/results/randomaccess-2-2.csv index 68969c49..698edc3a 100644 --- a/docs/source/RandomAccess/results/randomaccess-2-2.csv +++ b/docs/source/RandomAccess/results/randomaccess-2-2.csv @@ -1,20 +1,20 @@ -FPGA board,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,HBM2,SVM -SDK,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua `_,,, -DEVICE_BUFFER_SIZE,1,1024,1024,1024 -NUM_REPLICATIONS,4,2,32,1 -LUT,115743,7256,116096,103397 -LUT percent,18.0,0.65,10.68,12.0 -Register,253578,11716,187456,225293 -Register percent,18.0,0.5,8.76,12.0 -BRAM,489,38,608,535 -BRAM percent,4.0,2.23,33.55,5.0 -DSP,14,14,224,0 -DSP percent,1.0,0.16,2.48,0.0 -Frequency,329.17,446.0,450.0,322.0 -MUOPs,245.0,40.3,128.1,0.5 -Error,0.0099,0.0106,0.0106,0.0106 +FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx 
XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,HBM2,SVM +SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +DEVICE_BUFFER_SIZE,1,1,1024,1024,1024 +NUM_REPLICATIONS,4,4,2,32,1 +LUT,222405,115743,7256,116096,103397 +LUT percent,24,18.0,0.65,10.68,12.0 +Register,434090,253578,11716,187456,225293 +Register percent,24,18.0,0.5,8.76,12.0 +BRAM,602,489,38,608,535 +BRAM percent,5,4.0,2.23,33.55,5.0 +DSP,14,14,14,224,0 +DSP percent,< 1.0,< 1.0,0.16,2.48,0.0 +Frequency,326.05,329.17,446.0,450.0,322.0 +MUOPs,185.633,245.0,40.3,128.1,0.5 +Error,0.0689179,0.0099,0.0106,0.0106,0.0106 \ No newline at end of file diff --git a/docs/source/STREAM/results/stream-2-3.csv b/docs/source/STREAM/results/stream-2-3.csv index aa9a49ee..25f1e366 100644 --- a/docs/source/STREAM/results/stream-2-3.csv +++ b/docs/source/STREAM/results/stream-2-3.csv @@ -1,27 +1,27 @@ -FPGA board,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,HBM2,SVM -SDK,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua `_,,, -DATA_TYPE,float,float,float,float -VECTOR_COUNT,16,16,16,16 -GLOBAL_MEM_UNROLL,1,1,1,1 -DEVICE_BUFFER_SIZE,4096,16384,2048,1 -NUM_REPLICATIONS,4,2,32,1 -LUT,176396,20832,331904,103628 -LUT percent,25.0,1.9,20.69,12.0 -Register,449231,39002,574976,244354 -Register percent,25.0,1.39,27.24,12.0 -BRAM,4029,558,1408,548 -BRAM percent,34.0,34.19,77.7,5.0 -DSP,128,160,2560,32 -DSP percent,2.0,1.78,28.38,1.0 -Frequency,316.67,300.0,370.0,346.0 -Copy,67.01,33.94,377.42,20.15 -Scale,67.24,33.92,365.8,20.04 -Add,68.9,34.58,374.03,15.04 -Triad,68.9,34.57,378.88,15.12 -PCIe Read,6.41,5.68,6.66,inf -PCIe Write,6.32,5.47,6.03,inf +FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,HBM2,SVM +SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +DATA_TYPE,float,float,float,float,float +VECTOR_COUNT,16,16,16,16,16 +GLOBAL_MEM_UNROLL,1,1,1,1,1 +DEVICE_BUFFER_SIZE,65536,4096,16384,2048,1 +NUM_REPLICATIONS,4,4,2,32,1 +LUT,178268,176396,20832,331904,103628 +LUT percent,19,25.0,1.9,20.69,12.0 +Register,297342,449231,39002,574976,244354 +Register percent,,25.0,1.39,27.24,12.0 +BRAM,3926,4029,558,1408,548 +BRAM percent,33,34.0,34.19,77.7,5.0 +DSP,128,128,160,2560,32 +DSP percent,2,2.0,1.78,28.38,1.0 +Frequency,342.23,316.67,300.0,370.0,346.0 +Copy,65.63,67.01,33.94,377.42,20.15 +Scale,65.63,67.24,33.92,365.8,20.04 +Add,67.78,68.9,34.58,374.03,15.04 +Triad,67.80,68.9,34.57,378.88,15.12 +PCIe Read,6.28,6.41,5.68,6.66,inf +PCIe Write,5.87,6.32,5.47,6.03,inf \ No newline at end of file From b0d2fa79143b2a487a78779667c339a69e5ca0cc Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 6 Feb 2023 15:56:38 +0100 Subject: [PATCH 263/318] update results for intel boards --- docs/source/GEMM/results/gemm-1-0.csv | 3 
++- docs/source/GEMM/results/index.rst | 2 +- docs/source/RandomAccess/results/index.rst | 2 +- docs/source/RandomAccess/results/randomaccess-2-2.csv | 1 + docs/source/STREAM/results/index.rst | 2 +- docs/source/STREAM/results/stream-2-3.csv | 3 ++- 6 files changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/source/GEMM/results/gemm-1-0.csv b/docs/source/GEMM/results/gemm-1-0.csv index c8142d6b..211d4e9f 100644 --- a/docs/source/GEMM/results/gemm-1-0.csv +++ b/docs/source/GEMM/results/gemm-1-0.csv @@ -1,3 +1,4 @@ +Version,1.4,1.0,1.0,1.0,1.0 FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX Memory Type,DDR,DDR,DDR,HBM2,SVM @@ -5,7 +6,7 @@ SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_, -BLOCK_SIZE,"512 ? ",512,256,256,512 +BLOCK_SIZE,512,512,256,256,512 GEMM_SIZE,8,8,8,8,8 GLOBAL_MEM_UNROLL,8,16,16,16,16 DATA_TYPE,float,float,float,float,float diff --git a/docs/source/GEMM/results/index.rst b/docs/source/GEMM/results/index.rst index 923b78d2..7e08adb0 100644 --- a/docs/source/GEMM/results/index.rst +++ b/docs/source/GEMM/results/index.rst @@ -10,7 +10,7 @@ The measurements were executed 10 times and the best result is published. The results and the used configuration is given in :numref:`tbl_gemm_1_0_results` and are also available as :download:`CSV `. .. _tbl_gemm_1_0_results: -.. csv-table:: GEMM FPGA Benchmark Results for version 1.0 +.. csv-table:: GEMM FPGA Benchmark Results :file: gemm-1-0.csv :stub-columns: 1 diff --git a/docs/source/RandomAccess/results/index.rst b/docs/source/RandomAccess/results/index.rst index 52dd983d..a4330c56 100644 --- a/docs/source/RandomAccess/results/index.rst +++ b/docs/source/RandomAccess/results/index.rst @@ -9,7 +9,7 @@ The measurements were executed 10 times and the best result is published. The results and the used configuration is given in :numref:`tbl_randomaccess_2_2_results` and are also available as :download:`CSV `. .. _tbl_randomaccess_2_2_results: -.. csv-table:: RandomAccess FPGA Benchmark Results for version 2.2 +.. csv-table:: RandomAccess FPGA Benchmark Results :file: randomaccess-2-2.csv :stub-columns: 1 diff --git a/docs/source/RandomAccess/results/randomaccess-2-2.csv b/docs/source/RandomAccess/results/randomaccess-2-2.csv index 698edc3a..766a6287 100644 --- a/docs/source/RandomAccess/results/randomaccess-2-2.csv +++ b/docs/source/RandomAccess/results/randomaccess-2-2.csv @@ -1,3 +1,4 @@ +Version,2.5,2.2,2.2,2.2,2.2 FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX Memory Type,DDR,DDR,DDR,HBM2,SVM diff --git a/docs/source/STREAM/results/index.rst b/docs/source/STREAM/results/index.rst index 4b0d8d4a..b529fcee 100644 --- a/docs/source/STREAM/results/index.rst +++ b/docs/source/STREAM/results/index.rst @@ -18,7 +18,7 @@ The results and the used configuration is given in :numref:`tbl_stream_2_3_resul .. _tbl_stream_2_3_results: -.. csv-table:: STREAM FPGA Benchmark Results for version 2.3 +.. 
csv-table:: STREAM FPGA Benchmark Results :file: stream-2-3.csv :stub-columns: 1 diff --git a/docs/source/STREAM/results/stream-2-3.csv b/docs/source/STREAM/results/stream-2-3.csv index 25f1e366..7ee2c8d7 100644 --- a/docs/source/STREAM/results/stream-2-3.csv +++ b/docs/source/STREAM/results/stream-2-3.csv @@ -1,3 +1,4 @@ +Version,2.6,2.3,2.3,2.3,2.3 FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX Memory Type,DDR,DDR,DDR,HBM2,SVM @@ -8,7 +9,7 @@ System,`Noctua 2 Date: Sat, 11 Feb 2023 11:15:42 +0100 Subject: [PATCH 264/318] add u280 results for stream and ra --- .../RandomAccess/results/randomaccess-2-2.csv | 42 +++++++------- docs/source/STREAM/results/stream-2-3.csv | 56 +++++++++---------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/docs/source/RandomAccess/results/randomaccess-2-2.csv b/docs/source/RandomAccess/results/randomaccess-2-2.csv index 766a6287..59685a9d 100644 --- a/docs/source/RandomAccess/results/randomaccess-2-2.csv +++ b/docs/source/RandomAccess/results/randomaccess-2-2.csv @@ -1,21 +1,21 @@ -Version,2.5,2.2,2.2,2.2,2.2 -FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,DDR,HBM2,SVM -SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ -DEVICE_BUFFER_SIZE,1,1,1024,1024,1024 -NUM_REPLICATIONS,4,4,2,32,1 -LUT,222405,115743,7256,116096,103397 -LUT percent,24,18.0,0.65,10.68,12.0 -Register,434090,253578,11716,187456,225293 -Register percent,24,18.0,0.5,8.76,12.0 -BRAM,602,489,38,608,535 -BRAM percent,5,4.0,2.23,33.55,5.0 -DSP,14,14,14,224,0 -DSP percent,< 1.0,< 1.0,0.16,2.48,0.0 -Frequency,326.05,329.17,446.0,450.0,322.0 -MUOPs,185.633,245.0,40.3,128.1,0.5 -Error,0.0689179,0.0099,0.0106,0.0106,0.0106 \ No newline at end of file +Version,2.5,2.5,2.2,2.2,2.2,2.2 +FPGA board,Alveo U280,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Xilinx XCU280,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,DDR,HBM2,SVM +SDK,2019.2,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,2019.2.3,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +DEVICE_BUFFER_SIZE,1024,1,1,1024,1024,1024 +NUM_REPLICATIONS,2,4,4,2,32,1 +LUT,,222405,115743,7256,116096,103397 +LUT percent,,24,18.0,0.65,10.68,12.0 +Register,,434090,253578,11716,187456,225293 +Register percent,,24,18.0,0.5,8.76,12.0 +BRAM,,602,489,38,608,535 +BRAM percent,,5,4.0,2.23,33.55,5.0 +DSP,,14,14,14,224,0 +DSP percent,,< 1.0,< 1.0,0.16,2.48,0.0 +Frequency,411.015198,326.05,329.17,446.0,450.0,322.0 +MUOPs,39.7888,185.633,245.0,40.3,128.1,0.5 +Error,0.00662282,0.0689179,0.0099,0.0106,0.0106,0.0106 \ No newline at end of file diff --git a/docs/source/STREAM/results/stream-2-3.csv b/docs/source/STREAM/results/stream-2-3.csv index 7ee2c8d7..643ae5ed 100644 --- a/docs/source/STREAM/results/stream-2-3.csv +++ 
b/docs/source/STREAM/results/stream-2-3.csv @@ -1,28 +1,28 @@ -Version,2.6,2.3,2.3,2.3,2.3 -FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,DDR,HBM2,SVM -SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ -DATA_TYPE,float,float,float,float,float -VECTOR_COUNT,16,16,16,16,16 -GLOBAL_MEM_UNROLL,1,1,1,1,1 -DEVICE_BUFFER_SIZE,32768,4096,16384,2048,1 -NUM_REPLICATIONS,4,4,2,32,1 -LUT,178268,176396,20832,331904,103628 -LUT percent,19,25.0,1.9,20.69,12.0 -Register,297342,449231,39002,574976,244354 -Register percent,,25.0,1.39,27.24,12.0 -BRAM,3926,4029,558,1408,548 -BRAM percent,33,34.0,34.19,77.7,5.0 -DSP,128,128,160,2560,32 -DSP percent,2,2.0,1.78,28.38,1.0 -Frequency,342.23,316.67,300.0,370.0,346.0 -Copy,65.63,67.01,33.94,377.42,20.15 -Scale,65.63,67.24,33.92,365.8,20.04 -Add,67.78,68.9,34.58,374.03,15.04 -Triad,67.80,68.9,34.57,378.88,15.12 -PCIe Read,6.28,6.41,5.68,6.66,inf -PCIe Write,5.87,6.32,5.47,6.03,inf \ No newline at end of file +Version,2.6,2.6,2.3,2.3,2.3,2.3 +FPGA board,Alveo U280,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Xilinx XCU280,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,DDR,HBM2,SVM +SDK,2019.2,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,2019.2.3,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +DATA_TYPE,,float,float,float,float,float +VECTOR_COUNT,,16,16,16,16,16 +GLOBAL_MEM_UNROLL,,1,1,1,1,1 +DEVICE_BUFFER_SIZE,,32768,4096,16384,2048,1 +NUM_REPLICATIONS,,4,4,2,32,1 +LUT,,178268,176396,20832,331904,103628 +LUT percent,,19,25.0,1.9,20.69,12.0 +Register,,297342,449231,39002,574976,244354 +Register percent,,,25.0,1.39,27.24,12.0 +BRAM,,3926,4029,558,1408,548 +BRAM percent,,33,34.0,34.19,77.7,5.0 +DSP,,128,128,160,2560,32 +DSP percent,,2,2.0,1.78,28.38,1.0 +Frequency,,342.23,316.67,300.0,370.0,346.0 +Copy (GB/s),32.98,65.63,67.01,33.94,377.42,20.15 +Scale (GB/s),32.98,65.63,67.24,33.92,365.8,20.04 +Add (GB/s),33.88,67.78,68.9,34.58,374.03,15.04 +Triad,33.89,67.80,68.9,34.57,378.88,15.12 +PCIe Read,6.35,6.28,6.41,5.68,6.66,inf +PCIe Write,4.00,5.87,6.32,5.47,6.03,inf \ No newline at end of file From 5bc22eb234f72c9aa690d4cb790b7a66b53bdc61 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 17 Feb 2023 13:45:41 +0100 Subject: [PATCH 265/318] add data for stream and ra --- .../RandomAccess/results/randomaccess-2-2.csv | 16 +++++------ docs/source/STREAM/results/stream-2-3.csv | 28 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/source/RandomAccess/results/randomaccess-2-2.csv b/docs/source/RandomAccess/results/randomaccess-2-2.csv index 59685a9d..b101cbc6 100644 --- a/docs/source/RandomAccess/results/randomaccess-2-2.csv +++ b/docs/source/RandomAccess/results/randomaccess-2-2.csv @@ -8,14 +8,14 @@ CPU,AMD EPYC Milan 7763,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold System,`Noctua 2 `_,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 
`_,`Noctua 1 `_ DEVICE_BUFFER_SIZE,1024,1,1,1024,1024,1024 NUM_REPLICATIONS,2,4,4,2,32,1 -LUT,,222405,115743,7256,116096,103397 -LUT percent,,24,18.0,0.65,10.68,12.0 -Register,,434090,253578,11716,187456,225293 -Register percent,,24,18.0,0.5,8.76,12.0 -BRAM,,602,489,38,608,535 -BRAM percent,,5,4.0,2.23,33.55,5.0 -DSP,,14,14,14,224,0 -DSP percent,,< 1.0,< 1.0,0.16,2.48,0.0 +LUT,184888,222405,115743,7256,116096,103397 +LUT percent,14.19,24,18.0,0.65,10.68,12.0 +Register,288566,434090,253578,11716,187456,225293 +Register percent,11.08,24,18.0,0.5,8.76,12.0 +BRAM,349.5,602,489,38,608,535 +BRAM percent,17.34,5,4.0,2.23,33.55,5.0 +DSP,24,14,14,14,224,0 +DSP percent,0.27,< 1.0,< 1.0,0.16,2.48,0.0 Frequency,411.015198,326.05,329.17,446.0,450.0,322.0 MUOPs,39.7888,185.633,245.0,40.3,128.1,0.5 Error,0.00662282,0.0689179,0.0099,0.0106,0.0106,0.0106 \ No newline at end of file diff --git a/docs/source/STREAM/results/stream-2-3.csv b/docs/source/STREAM/results/stream-2-3.csv index 643ae5ed..bf12dedf 100644 --- a/docs/source/STREAM/results/stream-2-3.csv +++ b/docs/source/STREAM/results/stream-2-3.csv @@ -6,20 +6,20 @@ SDK,2019.2,21.2.0,19.4.0,2019.2,2019.2,19.4.0 BSP/Shell,2019.2.3,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm CPU,AMD EPYC Milan 7763,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 System,`Noctua 2 `_,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ -DATA_TYPE,,float,float,float,float,float -VECTOR_COUNT,,16,16,16,16,16 -GLOBAL_MEM_UNROLL,,1,1,1,1,1 -DEVICE_BUFFER_SIZE,,32768,4096,16384,2048,1 -NUM_REPLICATIONS,,4,4,2,32,1 -LUT,,178268,176396,20832,331904,103628 -LUT percent,,19,25.0,1.9,20.69,12.0 -Register,,297342,449231,39002,574976,244354 -Register percent,,,25.0,1.39,27.24,12.0 -BRAM,,3926,4029,558,1408,548 -BRAM percent,,33,34.0,34.19,77.7,5.0 -DSP,,128,128,160,2560,32 -DSP percent,,2,2.0,1.78,28.38,1.0 -Frequency,,342.23,316.67,300.0,370.0,346.0 +DATA_TYPE,float,float,float,float,float,float +VECTOR_COUNT,16,16,16,16,16,16 +GLOBAL_MEM_UNROLL,1,1,1,1,1,1 +DEVICE_BUFFER_SIZE,16384,32768,4096,16384,2048,1 +NUM_REPLICATIONS,2,4,4,2,32,1 +LUT,188124,178268,176396,20832,331904,103628 +LUT percent,14.44,19,25.0,1.9,20.69,12.0 +Register,298365,297342,449231,39002,574976,244354 +Register percent,11.45,,25.0,1.39,27.24,12.0 +BRAM,853.5,3926,4029,558,1408,548 +BRAM percent,42.43,33,34.0,34.19,77.7,5.0 +DSP,170,128,128,160,2560,32 +DSP percent,1.88,2,2.0,1.78,28.38,1.0 +Frequency,411.015198,342.23,316.67,300.0,370.0,346.0 Copy (GB/s),32.98,65.63,67.01,33.94,377.42,20.15 Scale (GB/s),32.98,65.63,67.24,33.92,365.8,20.04 Add (GB/s),33.88,67.78,68.9,34.58,374.03,15.04 From d138c904b9b9524ea5b137e64aab2854305e0d33 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Apr 2023 18:42:27 +0200 Subject: [PATCH 266/318] Update hlslib to follow master to support XRT 2.14 --- extern/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 197fa734..0e8bed30 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -28,7 +28,7 @@ FetchContent_Declare( # unfortunately they do not use releases, so the latest commit was used GIT_REPOSITORY https://github.com/definelicht/hlslib.git - GIT_TAG v1.4.3) + GIT_TAG master) FetchContent_GetProperties(extern_hlslib) if(NOT extern_hlslib_POPULATED) From 074e9544495215e7dcd8cca917f2ab73ba35475c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Apr 2023 19:52:52 +0200 Subject: [PATCH 267/318] 
Disable TCP bypass in ACCL stack to prevent data loss --- cmake/accl.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index fd29f4ee..7a31a665 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -59,7 +59,7 @@ add_custom_command( COMMAND mkdir build && cd build && cmake .. -DFDEV_NAME=u280 -DVIVADO_HLS_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 -DVIVADO_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 - -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=1 -DTCP_STACK_WINDOW_SCALING_EN=0 && + -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=0 -DTCP_STACK_WINDOW_SCALING_EN=0 && make installip WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR}) From 6a68ca4bb1036fb27dd56341fd621ba81e5dc50e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 25 Apr 2023 09:05:57 +0200 Subject: [PATCH 268/318] Fix ACCL host signatures in b_eff --- b_eff/src/host/execution_types/execution_accl.hpp | 11 +++++------ b_eff/src/host/execution_types/execution_accl_pl.hpp | 7 +++---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp index 2ade570b..998e4d78 100644 --- a/b_eff/src/host/execution_types/execution_accl.hpp +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -40,7 +40,7 @@ namespace network::execution_types::accl { @copydoc bm_execution::calculate() */ template - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { @@ -111,14 +111,13 @@ namespace network::execution_types::accl { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! 
for (int r = 0; r < config.programSettings->kernelReplications; r++) { - std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); } - std::shared_ptr result(new network::ExecutionTimings{ - looplength, + return network::ExecutionTimings{ + looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index eecb552e..d5df937f 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -47,7 +47,7 @@ namespace network::execution_types::accl_pl { @copydoc bm_execution::calculate() */ template - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { @@ -137,12 +137,11 @@ namespace network::execution_types::accl_pl { } std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution From c989680d141f29aebbf99b8a7840cc1ab9e7ff74 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 25 Apr 2023 14:53:15 +0200 Subject: [PATCH 269/318] Fix ACCL PL kernel --- b_eff/src/device/communication_ACCL_pl.cpp | 31 +++++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl.cpp b/b_eff/src/device/communication_ACCL_pl.cpp index 97a21907..4d4548c5 100644 --- a/b_eff/src/device/communication_ACCL_pl.cpp +++ b/b_eff/src/device/communication_ACCL_pl.cpp @@ -22,12 +22,35 @@ SOFTWARE.
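PATCH 268 above replaces the heap-allocated std::shared_ptr result with a plain value return. A minimal sketch of the pattern, assuming a struct shaped like the network::ExecutionTimings aggregate from the diff (field values are illustrative):

    #include <vector>

    struct ExecutionTimings {
        unsigned int looplength;
        unsigned int messageSize;
        std::vector<double> calculationTimings;
    };

    ExecutionTimings calculate() {
        std::vector<double> timings{0.1, 0.2};
        // Aggregate initialization plus copy elision / move semantics make
        // the former shared_ptr indirection unnecessary.
        return ExecutionTimings{10u, 4u, timings};
    }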
#include "accl_hls.h" -void send_recv(const float *read_buffer,float *write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +void send_recv(ap_uint<64> read_buffer,ap_uint<64> write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts) { - accl_hls::ACCLCommand accl_cmd(cmd, sts, communicator_addr, datapath_cfg,0,0); +#pragma HLS INTERFACE s_axilite port=read_buffer +#pragma HLS INTERFACE s_axilite port=write_buffer +#pragma HLS INTERFACE s_axilite port=size +#pragma HLS INTERFACE s_axilite port=num_iterations +#pragma HLS INTERFACE s_axilite port=neighbor_rank +#pragma HLS INTERFACE s_axilite port=communicator_addr +#pragma HLS INTERFACE s_axilite port=datapath_cfg +#pragma HLS INTERFACE axis port=cmd +#pragma HLS INTERFACE axis port=sts +#pragma HLS INTERFACE s_axilite port=return + accl_hls::ACCLCommand accl(cmd, sts); for (int i = 0; i < num_iterations; i++) { - accl_cmd.send(size, 0, neighbor_rank, (ap_uint<64>)read_buffer); - accl_cmd.recv(size, 0, neighbor_rank, (ap_uint<64>)write_buffer); + #pragma HLS protocol fixed + accl.start_call( + ACCL_SEND, size, communicator_addr, neighbor_rank, 0, 0, + datapath_cfg, 0, 0, + read_buffer, 0, 0); + ap_wait(); + accl.finalize_call(); + ap_wait(); + accl.start_call( + ACCL_RECV, size, communicator_addr, neighbor_rank, 0, 0, + datapath_cfg, 0, 0, + 0, write_buffer, 0); + ap_wait(); + accl.finalize_call(); } } + From d81c17736478bfa3f98b7618fc31e081b8768c45 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 25 Apr 2023 14:54:06 +0200 Subject: [PATCH 270/318] Start device indexing at 0 for XRT --- shared/setup/fpga_setup_xrt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp index 1b41f9e0..f5d7ef32 100644 --- a/shared/setup/fpga_setup_xrt.cpp +++ b/shared/setup/fpga_setup_xrt.cpp @@ -42,7 +42,7 @@ namespace fpga_setup { } else { //TODO Use xrt::system::enumerate_devices() in "experimental/xrt_system.h" for future XRT versions // instead of hardcoded number of devices. - current_device = current_device + 1 % 3; + current_device = current_device % 3; } return std::unique_ptr(new xrt::device(current_device)); } From 7d84815146e2caee54e9edd9e3e82736f9e924d0 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 25 Apr 2023 14:55:59 +0200 Subject: [PATCH 271/318] Adjust copying of validation data --- b_eff/src/host/execution_types/execution_accl.hpp | 3 ++- b_eff/src/host/execution_types/execution_accl_pl.hpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp index 998e4d78..32e5a34e 100644 --- a/b_eff/src/host/execution_types/execution_accl.hpp +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -111,7 +111,8 @@ namespace network::execution_types::accl { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! 
for (int r = 0; r < config.programSettings->kernelReplications; r++) { - std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); + acclRecvBuffers[r]->sync_from_device(); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); } return network::ExecutionTimings{ looplength, diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index d5df937f..2fc79956 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -133,9 +133,9 @@ namespace network::execution_types::accl_pl { // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { if (!config.programSettings->useAcclEmulation) { - acclRecvBuffers.back()->sync_from_device(); + acclRecvBuffers[r]->sync_from_device(); } - std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); } return network::ExecutionTimings{ looplength, From 0c0ff70b2f717909cb47bc19389565825e52e916 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Apr 2023 14:56:10 +0200 Subject: [PATCH 272/318] Fix ACCL executor --- .../host/execution_types/execution_accl.hpp | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp index 32e5a34e..b678e6fb 100644 --- a/b_eff/src/host/execution_types/execution_accl.hpp +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -45,11 +45,11 @@ namespace network::execution_types::accl { cl::vector &validationData) { int err; - std::vector> dummyBufferContents; - std::vector> recvBufferContents; - std::vector>> acclSendBuffers; - std::vector>> acclRecvBuffers; - size_t size_in_bytes = std::max(static_cast(validationData.size()), static_cast(1 << messageSize)); + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + size_t size_in_bytes = std::max((1 << messageSize), 4); int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); @@ -66,12 +66,12 @@ namespace network::execution_types::accl { int size_in_values = (size_in_bytes + 3) / 4; // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { - dummyBufferContents.emplace_back(size_in_values, static_cast(messageSize & (255))); - recvBufferContents.emplace_back(size_in_values, static_cast(0)); - acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_values, ACCL::dataType::float32)); - acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_values, ACCL::dataType::float32)); - acclSendBuffers.back()->sync_to_device(); - acclRecvBuffers.back()->sync_to_device(); + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32)); + acclSendBuffers.back()->sync_to_device(); + acclRecvBuffers.back()->sync_to_device(); } double calculationTime = 0.0; @@ -80,15 +80,19 @@ namespace network::execution_types::accl { auto startCalculation = std::chrono::high_resolution_clock::now(); for (int l = 0; l < looplength; l++) { #ifndef NDEBUG - std::cout << "Send " << size_in_values << " bytes to " + std::cout << "Send " << size_in_bytes << " bytes to " << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; #endif - config.context->accl->send(*acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); + config.context->accl->send(*acclSendBuffers[i], size_in_values, + (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + 0, ACCL::GLOBAL_COMM, true); #ifndef NDEBUG - std::cout << "Recv " << size_in_values << " bytes from " + std::cout << "Recv " << size_in_bytes << " bytes from " << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; #endif - config.context->accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); + config.context->accl->recv(*acclRecvBuffers[i], size_in_values, + (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + 0, ACCL::GLOBAL_COMM, true); #ifndef NDEBUG std::cout << "Done" << std::endl; #endif @@ -112,7 +116,7 @@ namespace network::execution_types::accl { // The data order should not matter, because every byte should have the same value! 
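The partner expression used in the send/recv calls of PATCH 272 pairs up the ranks of a ring so that each rank alternately exchanges with its lower and upper neighbor. A self-contained worked example for four ranks (the rank count is assumed for illustration):

    #include <cstdio>

    int main() {
        const int size = 4; // number of MPI ranks
        for (int i = 0; i < 2; i++) {
            for (int rank = 0; rank < size; rank++) {
                int partner = (rank - 1 + 2 * ((rank + i) % 2) + size) % size;
                std::printf("i=%d: rank %d <-> rank %d\n", i, rank, partner);
            }
        }
        // i=0 pairs (0,3) and (1,2); i=1 pairs (0,1) and (2,3), so both
        // ring directions are exercised across the kernel replications.
        return 0;
    }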
for (int r = 0; r < config.programSettings->kernelReplications; r++) { acclRecvBuffers[r]->sync_from_device(); - std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); } return network::ExecutionTimings{ looplength, From c2d022d34818c12c919fc5da0a9aa195bee9c0e5 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Apr 2023 14:58:04 +0200 Subject: [PATCH 273/318] Adjust ACCL PL executor for updated validation scheme --- .../execution_types/execution_accl_pl.hpp | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index 2fc79956..ab765de7 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -35,7 +35,7 @@ SOFTWARE. /* Project's headers */ -extern void send_recv(const float *read_buffer,float *write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +extern void send_recv(ap_uint<64> read_buffer,ap_uint<64> write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts); @@ -54,9 +54,9 @@ namespace network::execution_types::accl_pl { int err; std::vector> dummyBufferContents; std::vector> recvBufferContents; - std::vector>> acclSendBuffers; - std::vector>> acclRecvBuffers; - cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + cl_uint size_in_bytes = (1 << messageSize); int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); @@ -77,8 +77,6 @@ namespace network::execution_types::accl_pl { std::vector calculationTimings; for (uint r =0; r < config.programSettings->numRepetitions; r++) { - dummyBufferContents.clear(); - recvBufferContents.clear(); acclSendBuffers.clear(); acclRecvBuffers.clear(); int size_in_values = (size_in_bytes + 3) / 4; @@ -86,8 +84,8 @@ namespace network::execution_types::accl_pl { for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); - acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32)); acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } @@ -102,12 +100,12 @@ namespace network::execution_types::accl_pl { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); if (!config.programSettings->useAcclEmulation) { - auto run = sendrecvKernel(*(acclSendBuffers[i]->bo()), *(acclRecvBuffers[i]->bo()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + 
current_size) % current_size, - config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); + auto run = sendrecvKernel(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->physical_address(), size_in_bytes, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); run.wait(); } else { - send_recv(reinterpret_cast(acclSendBuffers[i]->buffer()), reinterpret_cast(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, - config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32}), + send_recv(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->physical_address(), size_in_bytes, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), cmd, sts); } auto endCalculation = std::chrono::high_resolution_clock::now(); @@ -135,7 +133,11 @@ namespace network::execution_types::accl_pl { if (!config.programSettings->useAcclEmulation) { acclRecvBuffers[r]->sync_from_device(); } - std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); + for (int c=0; c < size_in_bytes; c++) { + std::cout << int(recvBufferContents[r][c]) << ","; + } + std::cout << std::endl; + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); } return network::ExecutionTimings{ looplength, From 54ea54db9b0d532d0504f4b205e41b036a21eee3 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Apr 2023 14:59:58 +0200 Subject: [PATCH 274/318] Fix bandwidth calculation. Still 0s? --- b_eff/src/host/network_benchmark.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 66cbb2a4..d4412461 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -213,11 +213,11 @@ network::NetworkBenchmark::collectResults() { int messageSize = timing.first; int num_timings = timing.second.execution_timings.size(); // The total sent data in bytes will be: - // #Nodes * message_size * looplength * 2 - // the * 2 is because we have two kernels per bitstream that will send and receive simultaneously. + // #Nodes * message_size * looplength * kernel_replications + // the * kernel_replications is because we have multiple replications per bitstream that will send and receive simultaneously. // This will be divided by half of the maximum of the minimum measured runtime over all ranks. 
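Plugging assumed numbers into the formula this comment describes (the corrected assignment follows below): with timings collected from 2 ranks, 2 kernel replications, 1 MiB messages (messageSize = 20) and looplength = 100, a slowest-rank runtime of 0.5 s gives

    // 2 ranks * 2 replications * 2^20 B * 100 iterations / 0.5 s
    double maxCalcBW = static_cast<double>(2 * 2 * (1 << 20) * 100) / 0.5;
    // -> 838860800 B/s, i.e. roughly 0.84 GB/s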
- timing.second.maxCalcBW = static_cast(num_timings * 2 * (1 << messageSize) * looplength) - / timing.second.maxMinCalculationTime; + timing.second.maxCalcBW = static_cast( num_timings * executionSettings->programSettings->kernelReplications + * (1 << messageSize) * looplength) / timing.second.maxMinCalculationTime; maxBandwidths.push_back(timing.second.maxCalcBW); @@ -231,7 +231,7 @@ network::NetworkBenchmark::collectResults() { void network::NetworkBenchmark::printResults() { std::cout << std::setw(ENTRY_SPACE) << "MSize" << " " << std::setw(ENTRY_SPACE) << "looplength" << " " - << std::setw(ENTRY_SPACE) << "transfer" << " " + << std::setw(ENTRY_SPACE) << "time [s]" << " " << std::setw(ENTRY_SPACE) << "B/s" << std::endl; for (const auto& timing : collected_timings) { From 57ac05745b41e4c5c87d63fc475c99012a73cccb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Apr 2023 16:19:05 +0200 Subject: [PATCH 275/318] Fix b_eff ACCL PL kernel recv --- b_eff/src/device/communication_ACCL_pl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/b_eff/src/device/communication_ACCL_pl.cpp b/b_eff/src/device/communication_ACCL_pl.cpp index 4d4548c5..c32a3af5 100644 --- a/b_eff/src/device/communication_ACCL_pl.cpp +++ b/b_eff/src/device/communication_ACCL_pl.cpp @@ -48,7 +48,7 @@ void send_recv(ap_uint<64> read_buffer,ap_uint<64> write_buffer, ap_uint<32> si accl.start_call( ACCL_RECV, size, communicator_addr, neighbor_rank, 0, 0, datapath_cfg, 0, 0, - 0, write_buffer, 0); + 0, 0, write_buffer); ap_wait(); accl.finalize_call(); } From cbdc76e81c7c04a45c79c2dadd9f7a17d563d05b Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Apr 2023 16:19:49 +0200 Subject: [PATCH 276/318] Fix ACCL PL host code to pass validation --- b_eff/src/host/execution_types/execution_accl_pl.hpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index ab765de7..1e8eee04 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -65,7 +65,7 @@ namespace network::execution_types::accl_pl { MPI_Comm_size(MPI_COMM_WORLD, & current_size); hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); - hlslib::Stream cmd, sts; + hlslib::Stream cmd("cmd"), sts("sts"); std::vector dest = {0}; std::unique_ptr cclo; @@ -100,11 +100,11 @@ namespace network::execution_types::accl_pl { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); if (!config.programSettings->useAcclEmulation) { - auto run = sendrecvKernel(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->physical_address(), size_in_bytes, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + auto run = sendrecvKernel(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->physical_address(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); - run.wait(); + run.wait(); } else { - send_recv(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->physical_address(), size_in_bytes, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + send_recv(acclSendBuffers[i]->physical_address(), 
acclRecvBuffers[i]->physical_address(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), cmd, sts); } @@ -130,9 +130,7 @@ namespace network::execution_types::accl_pl { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { - if (!config.programSettings->useAcclEmulation) { - acclRecvBuffers[r]->sync_from_device(); - } + acclRecvBuffers[r]->sync_from_device(); for (int c=0; c < size_in_bytes; c++) { std::cout << int(recvBufferContents[r][c]) << ","; } From 77db40f6e478e505f4937a043ad7ae39c4409788 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Apr 2023 16:24:32 +0200 Subject: [PATCH 277/318] Remove debug output --- b_eff/src/host/execution_types/execution_accl_pl.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index 1e8eee04..5df86d1e 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -131,10 +131,6 @@ namespace network::execution_types::accl_pl { // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { acclRecvBuffers[r]->sync_from_device(); - for (int c=0; c < size_in_bytes; c++) { - std::cout << int(recvBufferContents[r][c]) << ","; - } - std::cout << std::endl; std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); } return network::ExecutionTimings{ From 881fdc14b49056ca5d5a235368001bd343abccfb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 27 Apr 2023 18:36:50 +0200 Subject: [PATCH 278/318] Change default rxbuf banks to 2,3 --- shared/setup/fpga_setup_accl.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 36561553..4e293910 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -122,10 +122,11 @@ ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program, 64 * 1024 * 1024, ACCL::dataType::int8, device, network_krnl.group_id(4))); configure_tcp(*accl.tx_buf_network, *accl.rx_buf_network, network_krnl, ranks, current_rank); } - std::vector mem(1, 0); + std::vector mem = {2, 3}; std::cout << "Create ACCL" << std::endl; accl.accl = std::unique_ptr( - new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize)); + new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, + mem, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize, programSettings.acclBufferSize)); } else { // TODO: Add start port here. Currenty hardcoded! 
accl.accl = std::unique_ptr( From 39924f967ae7a9bbdce204e427198e0e9e9d7530 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 27 Apr 2023 18:37:34 +0200 Subject: [PATCH 279/318] Set send and recv buffer to different banks ACCL --- b_eff/src/host/execution_types/execution_accl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp index b678e6fb..3d5f41e5 100644 --- a/b_eff/src/host/execution_types/execution_accl.hpp +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -68,8 +68,8 @@ namespace network::execution_types::accl { for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32)); - acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 1)); acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } From d027197d97b076c272eafd5f797567ee49700170 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 27 Apr 2023 18:37:42 +0200 Subject: [PATCH 280/318] Set send and recv buffer to different banks ACCL PL --- b_eff/src/host/execution_types/execution_accl_pl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index 5df86d1e..9135ec84 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -84,8 +84,8 @@ namespace network::execution_types::accl_pl { for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32)); - acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 1)); acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } From b0ca5a8ced121c6d16aa3aa95a88e0c0d48c96fe Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 28 Apr 2023 16:10:40 +0200 Subject: [PATCH 281/318] Fix JSON dump feature --- shared/hpcc_settings.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/shared/hpcc_settings.cpp b/shared/hpcc_settings.cpp index 136534ff..8cbd2319 100644 --- a/shared/hpcc_settings.cpp +++ b/shared/hpcc_settings.cpp @@ -19,6 +19,7 @@ hpcc_base::BaseSettings::BaseSettings(cxxopts::ParseResult 
&results) : numRepeti defaultPlatform(results["platform"].as()), defaultDevice(results["device"].as()), kernelFileName(results["f"].as()), + dumpfilePath(results["dump-json"].as()), #ifdef NUM_REPLICATIONS kernelReplications(results.count("r") > 0 ? results["r"].as() : NUM_REPLICATIONS), #else @@ -57,4 +58,4 @@ hpcc_base::BaseSettings::getSettingsMap() { return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)}, {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? "Yes" : "No"}, {"Communication Type", commToString(communicationType)}}; -} \ No newline at end of file +} From a7d417b24eb311376c3e291e736ed1b781b2dc8e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 5 May 2023 11:40:04 +0200 Subject: [PATCH 282/318] Allow compilation iwth new cl header with Xilinx --- b_eff/src/host/network_benchmark.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 12b1f51f..a017aa2c 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -31,7 +31,7 @@ SOFTWARE. #include "hpcc_benchmark.hpp" #include "parameters.h" -#ifdef XILINX_FPGA +#ifdef USE_DEPRECATED_HPP_HEADER template struct aligned_allocator { From 71b9b575d89b08940951c7ef3aef517e38136704 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 15 May 2023 20:29:21 +0200 Subject: [PATCH 283/318] Convert replication to Jinja2 --- PTRANS/src/device/transpose_PQ_ACCL_stream.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp index 739792e0..15a16edf 100644 --- a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp @@ -15,6 +15,7 @@ const unsigned int block_size = BLOCK_SIZE; const unsigned int channel_width = CHANNEL_WIDTH; // PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** * Read blocks of matrix A and transpose them in memory. 
@@ -34,7 +35,7 @@ const unsigned int channel_width = CHANNEL_WIDTH; * @param width_in_blocks The with of matrix A in blocks * @param height_in_blocks The height of matix A in blocks */ -void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, +void transpose_read{{ i }}( const DEVICE_DATA_TYPE *A, const unsigned int offset_a, const unsigned int number_of_blocks, const unsigned int width_in_blocks, @@ -145,7 +146,7 @@ void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, * @param width_in_blocks The with of matrix A in blocks * @param height_in_blocks The height of matix A in blocks */ -void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, +void transpose_write{{ i }}(const DEVICE_DATA_TYPE *B, DEVICE_DATA_TYPE *A_out, const unsigned int offset_b, const unsigned int number_of_blocks, @@ -194,5 +195,5 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, } } -// PY_CODE_GEN block_end +{% endfor %} From c3bfb08b6072724ce34182e3b886ebd6baab3716 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 23 May 2023 18:53:21 +0200 Subject: [PATCH 284/318] Send data to stream for full-duplex --- .../device/communication_ACCL_pl_stream.cpp | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 b_eff/src/device/communication_ACCL_pl_stream.cpp diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp new file mode 100644 index 00000000..f22913ef --- /dev/null +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -0,0 +1,63 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#include "accl_hls.h" + + +void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &data_in, STREAM &data_out, + STREAM &cmd, STREAM &sts) { +#pragma HLS INTERFACE s_axilite port=read_buffer +#pragma HLS INTERFACE m_axi port=write_buffer +#pragma HLS INTERFACE s_axilite port=size +#pragma HLS INTERFACE s_axilite port=num_iterations +#pragma HLS INTERFACE s_axilite port=neighbor_rank +#pragma HLS INTERFACE s_axilite port=communicator_addr +#pragma HLS INTERFACE s_axilite port=datapath_cfg +#pragma HLS INTERFACE axis port=data_in +#pragma HLS INTERFACE axis port=data_out +#pragma HLS INTERFACE axis port=cmd +#pragma HLS INTERFACE axis port=sts +#pragma HLS INTERFACE s_axilite port=return + accl_hls::ACCLCommand accl(cmd, sts); + for (int i = 0; i < num_iterations; i++) { + #pragma HLS protocol fixed + // Send data from global memory to the remote FPGA. + // Remote FPGA will immediatly move data to stream. + // This will allow overlapping of send and recv. + accl.start_call( + ACCL_SEND, size, communicator_addr, neighbor_rank, 0, 0, + datapath_cfg, 0, 2, + read_buffer, 0, 0); + ap_wait(); + // receive the incoming data while send may still be in progress + for (int chunk = 0; chunk < (size + 15) / 16 ; chunk++) { + #pragma HLS pipeline II=1 + stream_word word = data_in.read(); + write_buffer[chunk] = word.data; + } + // Wait to complete send + accl.finalize_call(); + ap_wait(); + } +} + From 7168b67a97ddb6e3f9d624c98de1bbfec54ad3e6 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 23 May 2023 18:54:18 +0200 Subject: [PATCH 285/318] Use streaming ACCL PL as default --- b_eff/src/host/CMakeLists.txt | 2 +- b_eff/src/host/execution_types/execution.hpp | 2 +- .../execution_accl_pl_stream.hpp | 145 ++++++++++++++++++ 3 files changed, 147 insertions(+), 2 deletions(-) create mode 100644 b_eff/src/host/execution_types/execution_accl_pl_stream.hpp diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index e5e09aed..ac11320e 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -23,7 +23,7 @@ if (USE_ACCL) set(CMAKE_SKIP_BUILD_RPATH No) set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) - list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl.cpp) + list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl_stream.cpp) endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp index 17c3241c..86aec21c 100644 --- a/b_eff/src/host/execution_types/execution.hpp +++ b/b_eff/src/host/execution_types/execution.hpp @@ -29,5 +29,5 @@ SOFTWARE. 
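The chunk loop in the PATCH 284 kernel moves one 512-bit stream_word per cycle; since a word carries 16 32-bit values, (size + 15) / 16 is the ceiling division of the message length in values to full stream words. For example:

    int size = 20;                // message length in 32-bit values
    int words = (size + 15) / 16; // -> 2 stream words (the second word
                                  //    is only partially filled)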
#endif #else #include "execution_types/execution_accl.hpp" -#include "execution_types/execution_accl_pl.hpp" +#include "execution_types/execution_accl_pl_stream.hpp" #endif diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp new file mode 100644 index 00000000..934f0d68 --- /dev/null +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -0,0 +1,145 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_PL_STREAM_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_PL_STREAM_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" +#include "accl.hpp" +#include "cclo_bfm.h" +#include "accl_hls.h" + +/* Project's headers */ + +extern void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &data_in, STREAM &data_out, STREAM &cmd, STREAM &sts); + +namespace network::execution_types::accl_pl { + + + /* + Implementation for the single kernel. + @copydoc bm_execution::calculate() + */ + template + network::ExecutionTimings + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + cl_uint size_in_bytes = (1 << messageSize); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); + hlslib::Stream cmd("cmd"), sts("sts"); + + std::vector dest = {0}; + std::unique_ptr cclo; + if (config.programSettings->useAcclEmulation) { + cclo = std::make_unique(6000, current_rank, current_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + cclo->run(); + } + MPI_Barrier(MPI_COMM_WORLD); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + acclSendBuffers.clear(); + acclRecvBuffers.clear(); + int size_in_values = (size_in_bytes + 3) / 4; + // Create all kernels and buffers. 
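The emulation path above wires the kernel against a CCLO bus-functional model instead of real hardware. A condensed sketch of that harness, with the stream element types assumed (command_word for command/status, stream_word for data; the dest vector type is also an assumption) and port 6000 taken from the host code:

    hlslib::Stream<command_word> cmd("cmd"), sts("sts");
    hlslib::Stream<stream_word> cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo");
    std::vector<unsigned int> dest = {0};
    auto cclo = std::make_unique<CCLO_BFM>(6000, current_rank, current_size,
                                           dest, cmd, sts, cclo2krnl, krnl2cclo);
    cclo->run();
    // ... call send_recv_stream(...) against these streams ...
    cclo->stop();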
The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 1)); + acclSendBuffers.back()->sync_to_device(); + acclRecvBuffers.back()->sync_to_device(); + } + + xrt::kernel sendrecvKernel; + if (!config.programSettings->useAcclEmulation) { + sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv_stream"); + } + + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + if (!config.programSettings->useAcclEmulation) { + auto run = sendrecvKernel(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); + run.wait(); + } else { + send_recv_stream(acclSendBuffers[i]->physical_address(), reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), + cclo2krnl, krnl2cclo, cmd, sts); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + + if (config.programSettings->useAcclEmulation) { + cclo->stop(); + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! 
+ for (int r = 0; r < config.programSettings->kernelReplications; r++) { + acclRecvBuffers[r]->sync_from_device(); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); + } + return network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }; + } + +} // namespace bm_execution + +#endif From b536c3a03050886e4fccddd593bc50e5f2660872 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 23 May 2023 19:09:06 +0200 Subject: [PATCH 286/318] Use stream design for kernel build --- b_eff/src/device/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index e878e9c5..865cb249 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -18,7 +18,8 @@ endif() if (Vitis_FOUND) generate_kernel_targets_xilinx(communication_PCIE) if (USE_ACCL) - generate_kernel_targets_xilinx(communication_ACCL communication_ACCL_pl) + generate_kernel_targets_xilinx(communication_ACCL + communication_ACCL_pl_stream) endif() add_test(NAME test_emulation_pcie_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) From 7ba851d9e25e21e565de59b04beea03d09c08ea1 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 24 May 2023 10:38:50 +0200 Subject: [PATCH 287/318] Force data_out to be master --- b_eff/src/device/communication_ACCL_pl_stream.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index f22913ef..2e3a6089 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -38,6 +38,17 @@ void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_ui #pragma HLS INTERFACE axis port=cmd #pragma HLS INTERFACE axis port=sts #pragma HLS INTERFACE s_axilite port=return + + // This is just dummycode to define data_out as + // master AXI stream. There seems to be no interface pragma to do this + // and if it isn't done, the stream is implemented as slave and throw an + // error during synthesis. 
+ if (false) { + stream_word tmp; + data_out.write(tmp); + } + + accl_hls::ACCLCommand accl(cmd, sts); From b0ca5a8ced121c6d16aa3aa95a88e0c0d48c96fe Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 24 May 2023 13:54:04 +0200 Subject: [PATCH 288/318] Add loopback for reduce kernel --- .../src/device/communication_ACCL_pl_stream.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index 2e3a6089..5db824f5 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -72,3 +72,19 @@ void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_ui } } +void loopback_reduce(STREAM & in0, STREAM & in1, STREAM & out) { +#pragma HLS INTERFACE axis register both port=in0 +#pragma HLS INTERFACE axis register both port=in1 +#pragma HLS INTERFACE axis register both port=out +#pragma HLS INTERFACE ap_ctrl_none port=return + +stream_word tmp; + +do{ +#pragma HLS PIPELINE II=1 + tmp = in0.read(); + tmp = in1.read(); + out.write(tmp); +} while(tmp.last == 0); + +} From d58a6b6e2c37c12f465a0c8c3d5fb1dcec6742a7 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 24 May 2023 15:58:51 +0200 Subject: [PATCH 289/318] Attempt to not optimize away write --- b_eff/src/device/communication_ACCL_pl_stream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index 5db824f5..d88f10d0 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -43,7 +43,7 @@ void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_ui // master AXI stream. There seems to be no interface pragma to do this // and if it isn't done, the stream is implemented as slave and throw an // error during synthesis. - if (false) { + if (num_iterations == 0) { stream_word tmp; data_out.write(tmp); } From b0ff73fb2b39c47fbf041c88c151cf5bf1e1cf6a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 25 May 2023 18:22:55 +0200 Subject: [PATCH 290/318] Modify kernel to read and write over stream --- .../device/communication_ACCL_pl_stream.cpp | 78 +++++++++++-------- .../execution_accl_pl_stream.hpp | 10 ++- 2 files changed, 52 insertions(+), 36 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index d88f10d0..060efbba 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -22,12 +22,50 @@ SOFTWARE. 
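A distilled view of the port-direction trick that PATCH 287 introduces and PATCH 289 repairs, assuming the types from accl_hls.h: HLS infers whether an AXI-Stream port is a master from the presence of a write, and a write guarded by if (false) is constant-folded away before that inference happens.

    void direction_demo(ap_uint<32> num_iterations, STREAM<stream_word> &data_out) {
    #pragma HLS INTERFACE axis port=data_out
        // A runtime condition the compiler cannot fold keeps the write in
        // the IR, so data_out is inferred as a master port even though this
        // branch never executes for real workloads (num_iterations >= 1).
        if (num_iterations == 0) {
            stream_word tmp;
            data_out.write(tmp);
        }
    }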
#include "accl_hls.h" -void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +void +write_data(ap_uint<512>* read_buffer, ap_uint<32> size, STREAM &data_out) { + // receive the incoming data while send may still be in progress + for (int chunk = 0; chunk < (size + 15) / 16; chunk++) { + #pragma HLS pipeline II=1 + stream_word word; + word.last = 1; + word.keep = -1; + word.dest = 0; + word.data = read_buffer[chunk]; + data_out.write(word); + } +} + +void +read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &data_in, ap_uint<32> neighbor_rank, + ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts) { + #pragma HLS protocol fixed + // Send data from stream to the remote FPGA. + // Remote FPGA will immediatly move data to stream. + // This will allow overlapping of send and recv. + accl_hls::ACCLCommand accl(cmd, sts); + accl.start_call( + ACCL_SEND, size, communicator_addr, neighbor_rank, 0, 0, + datapath_cfg, 0, 3, + 0, 0, 0); + ap_wait(); + // receive the incoming data while send may still be in progress + for (int chunk = 0; chunk < (size + 15) / 16 ; chunk++) { + #pragma HLS pipeline II=1 + stream_word word = data_in.read(); + write_buffer[chunk] = word.data; + } + ap_wait(); + accl.finalize_call(); +} + + +void send_recv_stream(ap_uint<512>* read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &data_in, STREAM &data_out, STREAM &cmd, STREAM &sts) { -#pragma HLS INTERFACE s_axilite port=read_buffer -#pragma HLS INTERFACE m_axi port=write_buffer +#pragma HLS INTERFACE m_axi port=read_buffer bundle=read +#pragma HLS INTERFACE m_axi port=write_buffer bundle=write #pragma HLS INTERFACE s_axilite port=size #pragma HLS INTERFACE s_axilite port=num_iterations #pragma HLS INTERFACE s_axilite port=neighbor_rank @@ -38,37 +76,13 @@ void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_ui #pragma HLS INTERFACE axis port=cmd #pragma HLS INTERFACE axis port=sts #pragma HLS INTERFACE s_axilite port=return - - // This is just dummycode to define data_out as - // master AXI stream. There seems to be no interface pragma to do this - // and if it isn't done, the stream is implemented as slave and throw an - // error during synthesis. - if (num_iterations == 0) { - stream_word tmp; - data_out.write(tmp); - } - - accl_hls::ACCLCommand accl(cmd, sts); for (int i = 0; i < num_iterations; i++) { - #pragma HLS protocol fixed - // Send data from global memory to the remote FPGA. - // Remote FPGA will immediatly move data to stream. - // This will allow overlapping of send and recv. 
- accl.start_call( - ACCL_SEND, size, communicator_addr, neighbor_rank, 0, 0, - datapath_cfg, 0, 2, - read_buffer, 0, 0); - ap_wait(); - // receive the incoming data while send may still be in progress - for (int chunk = 0; chunk < (size + 15) / 16 ; chunk++) { - #pragma HLS pipeline II=1 - stream_word word = data_in.read(); - write_buffer[chunk] = word.data; - } - // Wait to complete send - accl.finalize_call(); - ap_wait(); + #pragma HLS dataflow + + write_data(read_buffer, size, data_out); + + read_data(write_buffer, size, data_in, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts); } } diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index 934f0d68..0801a23c 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -35,7 +35,7 @@ SOFTWARE. /* Project's headers */ -extern void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +extern void send_recv_stream(ap_uint<512>* read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &data_in, STREAM &data_out, STREAM &cmd, STREAM &sts); @@ -100,11 +100,11 @@ namespace network::execution_types::accl_pl { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); if (!config.programSettings->useAcclEmulation) { - auto run = sendrecvKernel(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + auto run = sendrecvKernel(*acclSendBuffers[i]->bo(), *acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); run.wait(); } else { - send_recv_stream(acclSendBuffers[i]->physical_address(), reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + send_recv_stream(reinterpret_cast*>(acclSendBuffers[i]->buffer()), reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), cclo2krnl, krnl2cclo, cmd, sts); } @@ -130,7 +130,9 @@ namespace network::execution_types::accl_pl { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! 
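// [Editor's note] The neighbor expression used in the kernel invocation above,
//     (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
// pairs the ranks for replication i: if (current_rank + i) is even the partner
// is current_rank - 1, if it is odd the partner is current_rank + 1, both taken
// modulo the world size. Worked example with current_size = 4 and i = 0:
//     rank 0 -> 3,  rank 1 -> 2,  rank 2 -> 1,  rank 3 -> 0
// so each rank sends to exactly the rank it receives from.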
for (int r = 0; r < config.programSettings->kernelReplications; r++) { - acclRecvBuffers[r]->sync_from_device(); + if (!config.programSettings->useAcclEmulation) { + acclRecvBuffers[r]->sync_from_device(); + } std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); } return network::ExecutionTimings{ From fe2e3aed07b9b517a2ecca72115780accf88faf8 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 26 May 2023 14:48:57 +0200 Subject: [PATCH 291/318] Fix gmem port names --- b_eff/src/device/communication_ACCL_pl_stream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index 060efbba..44e05826 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -64,8 +64,8 @@ void send_recv_stream(ap_uint<512>* read_buffer,ap_uint<512>* write_buffer, ap_ ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &data_in, STREAM &data_out, STREAM &cmd, STREAM &sts) { -#pragma HLS INTERFACE m_axi port=read_buffer bundle=read -#pragma HLS INTERFACE m_axi port=write_buffer bundle=write +#pragma HLS INTERFACE m_axi port=read_buffer bundle=gmem_in +#pragma HLS INTERFACE m_axi port=write_buffer bundle=gmem_out #pragma HLS INTERFACE s_axilite port=size #pragma HLS INTERFACE s_axilite port=num_iterations #pragma HLS INTERFACE s_axilite port=neighbor_rank From da59d220f91319417343459b0a9d452114a4ff23 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 30 May 2023 19:27:51 +0200 Subject: [PATCH 292/318] Set mem bank by group id --- .../device/communication_ACCL_pl_stream.cpp | 10 ++++++---- .../execution_accl_pl_stream.hpp | 20 ++++++++++++------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index 44e05826..bf728f61 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -39,6 +39,7 @@ write_data(ap_uint<512>* read_buffer, ap_uint<32> size, STREAM &dat void read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &data_in, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts) { + issue_and_recv: { #pragma HLS protocol fixed // Send data from stream to the remote FPGA. // Remote FPGA will immediatly move data to stream. 
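// [Editor's note] PATCH 292 above introduces the named block
// "issue_and_recv: { ... }" because Vitis HLS applies "#pragma HLS protocol
// fixed" to its enclosing scope; a labelled sub-scope confines the fixed
// protocol region to the command/receive sequence instead of the whole
// function. Schematically (hypothetical region name, not part of the patch):
//
//     my_region: {
//     #pragma HLS protocol fixed
//         accl.start_call(...);    // issue the ACCL command
//         ap_wait();               // pin ordering before the stream reads
//         // ... drain data_in into write_buffer ...
//         ap_wait();
//         accl.finalize_call();    // retire the command afterwards
//     }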
@@ -57,6 +58,7 @@ read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &dat } ap_wait(); accl.finalize_call(); + } } @@ -71,10 +73,10 @@ void send_recv_stream(ap_uint<512>* read_buffer,ap_uint<512>* write_buffer, ap_ #pragma HLS INTERFACE s_axilite port=neighbor_rank #pragma HLS INTERFACE s_axilite port=communicator_addr #pragma HLS INTERFACE s_axilite port=datapath_cfg -#pragma HLS INTERFACE axis port=data_in -#pragma HLS INTERFACE axis port=data_out -#pragma HLS INTERFACE axis port=cmd -#pragma HLS INTERFACE axis port=sts +#pragma HLS INTERFACE axis register both port=data_in +#pragma HLS INTERFACE axis register both port=data_out +#pragma HLS INTERFACE axis register both port=cmd +#pragma HLS INTERFACE axis register both port=sts #pragma HLS INTERFACE s_axilite port=return for (int i = 0; i < num_iterations; i++) { diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index 0801a23c..c888e6b7 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -80,21 +80,27 @@ namespace network::execution_types::accl_pl { acclSendBuffers.clear(); acclRecvBuffers.clear(); int size_in_values = (size_in_bytes + 3) / 4; + + xrt::kernel sendrecvKernel; + if (!config.programSettings->useAcclEmulation) { + sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv_stream"); + } // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 0)); - acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 1)); + if (!config.programSettings->useAcclEmulation) { + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 1)); + } + else { + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, sendrecvKernel.group_id(0))); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, sendrecvKernel.group_id(1))); + } acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } - xrt::kernel sendrecvKernel; - if (!config.programSettings->useAcclEmulation) { - sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv_stream"); - } - double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { MPI_Barrier(MPI_COMM_WORLD); From b08f31836eaf9d19fd11c9f41a8c438fbc7ffd40 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 31 May 2023 18:40:37 +0200 Subject: [PATCH 293/318] Separate logic into three kernels --- .../device/communication_ACCL_pl_stream.cpp | 81 ++++++++++++++----- .../execution_accl_pl_stream.hpp | 43 +++++++--- 2 files changed, 91 insertions(+), 33 deletions(-) diff --git 
a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index bf728f61..2b9850b0 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -21,6 +21,7 @@ SOFTWARE. */ #include "accl_hls.h" +typedef ap_axiu<1, 0, 0, 0> notify_word; void write_data(ap_uint<512>* read_buffer, ap_uint<32> size, STREAM &data_out) { @@ -37,9 +38,20 @@ write_data(ap_uint<512>* read_buffer, ap_uint<32> size, STREAM &dat } void -read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &data_in, ap_uint<32> neighbor_rank, - ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts) { - issue_and_recv: { +read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &data_in) { + // receive the incoming data while send may still be in progress + for (int chunk = 0; chunk < (size + 15) / 16 ; chunk++) { + #pragma HLS pipeline II=1 + stream_word word = data_in.read(); + write_buffer[chunk] = word.data; + } +} + +void +schedule_send(ap_uint<32> size, ap_uint<32> neighbor_rank, + ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &cmd, STREAM &sts) { + send_fixed: { #pragma HLS protocol fixed // Send data from stream to the remote FPGA. // Remote FPGA will immediatly move data to stream. @@ -49,45 +61,70 @@ read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &dat ACCL_SEND, size, communicator_addr, neighbor_rank, 0, 0, datapath_cfg, 0, 3, 0, 0, 0); - ap_wait(); - // receive the incoming data while send may still be in progress - for (int chunk = 0; chunk < (size + 15) / 16 ; chunk++) { - #pragma HLS pipeline II=1 - stream_word word = data_in.read(); - write_buffer[chunk] = word.data; - } ap_wait(); accl.finalize_call(); } } +void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + STREAM &data_in, + STREAM ¬ify) { +#pragma HLS INTERFACE m_axi port=write_buffer bundle=gmem_out +#pragma HLS INTERFACE s_axilite port=size +#pragma HLS INTERFACE s_axilite port=num_iterations +#pragma HLS INTERFACE s_axilite port=neighbor_rank +#pragma HLS INTERFACE s_axilite port=communicator_addr +#pragma HLS INTERFACE s_axilite port=datapath_cfg +#pragma HLS INTERFACE axis port=data_in +#pragma HLS INTERFACE axis port=cmd +#pragma HLS INTERFACE axis port=sts +#pragma HLS INTERFACE axis port=notify +#pragma HLS INTERFACE s_axilite port=return + + notify_word w; + for (int i = 0; i < num_iterations; i++) { + read_data(write_buffer, size, data_in); + notify.write(w); + } +} -void send_recv_stream(ap_uint<512>* read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, - STREAM &data_in, STREAM &data_out, - STREAM &cmd, STREAM &sts) { -#pragma HLS INTERFACE m_axi port=read_buffer bundle=gmem_in -#pragma HLS INTERFACE m_axi port=write_buffer bundle=gmem_out + STREAM &cmd, STREAM &sts, + STREAM ¬ify) { #pragma HLS INTERFACE s_axilite port=size #pragma HLS INTERFACE s_axilite port=num_iterations #pragma HLS INTERFACE s_axilite port=neighbor_rank #pragma HLS INTERFACE s_axilite port=communicator_addr #pragma HLS INTERFACE s_axilite port=datapath_cfg -#pragma HLS INTERFACE axis register both port=data_in -#pragma HLS INTERFACE axis register both port=data_out -#pragma HLS INTERFACE axis register both port=cmd -#pragma HLS INTERFACE axis register both port=sts +#pragma HLS 
INTERFACE axis port=cmd +#pragma HLS INTERFACE axis port=sts +#pragma HLS INTERFACE axis port=notify #pragma HLS INTERFACE s_axilite port=return for (int i = 0; i < num_iterations; i++) { - #pragma HLS dataflow + schedule_send(size, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts); + notify_word w = notify.read(); + } +} - write_data(read_buffer, size, data_out); +void send_stream(ap_uint<512>* read_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + STREAM &data_out) { +#pragma HLS INTERFACE m_axi port=read_buffer bundle=gmem_in +#pragma HLS INTERFACE s_axilite port=size +#pragma HLS INTERFACE s_axilite port=num_iterations +#pragma HLS INTERFACE s_axilite port=neighbor_rank +#pragma HLS INTERFACE s_axilite port=communicator_addr +#pragma HLS INTERFACE s_axilite port=datapath_cfg +#pragma HLS INTERFACE axis port=data_out +#pragma HLS INTERFACE s_axilite port=return - read_data(write_buffer, size, data_in, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts); + for (int i = 0; i < num_iterations; i++) { + write_data(read_buffer, size, data_out); } } + void loopback_reduce(STREAM & in0, STREAM & in1, STREAM & out) { #pragma HLS INTERFACE axis register both port=in0 #pragma HLS INTERFACE axis register both port=in1 diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index c888e6b7..2b0e9039 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -34,10 +34,17 @@ SOFTWARE. #include "accl_hls.h" /* Project's headers */ +typedef ap_axiu<1, 0, 0, 0> notify_word; -extern void send_recv_stream(ap_uint<512>* read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +extern void send_stream(ap_uint<512>* read_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + STREAM &data_out); + +extern void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + STREAM &data_in, STREAM ¬ify); + +extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, - STREAM &data_in, STREAM &data_out, STREAM &cmd, STREAM &sts); + STREAM &cmd, STREAM &sts, STREAM ¬ify); namespace network::execution_types::accl_pl { @@ -66,6 +73,7 @@ namespace network::execution_types::accl_pl { hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); hlslib::Stream cmd("cmd"), sts("sts"); + hlslib::Stream notify("notify"); std::vector dest = {0}; std::unique_ptr cclo; @@ -81,21 +89,25 @@ namespace network::execution_types::accl_pl { acclRecvBuffers.clear(); int size_in_values = (size_in_bytes + 3) / 4; - xrt::kernel sendrecvKernel; + xrt::kernel sendKernel; + xrt::kernel recvKernel; + xrt::kernel scheduleKernel; if (!config.programSettings->useAcclEmulation) { - sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv_stream"); + sendKernel = xrt::kernel(*config.device, *config.program, "send_stream"); + recvKernel = xrt::kernel(*config.device, *config.program, "recv_stream"); + scheduleKernel = xrt::kernel(*config.device, *config.program, "schedule_stream"); } // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - if (!config.programSettings->useAcclEmulation) { + if (config.programSettings->useAcclEmulation) { acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 0)); acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 1)); } else { - acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, sendrecvKernel.group_id(0))); - acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, sendrecvKernel.group_id(1))); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, sendKernel.group_id(0))); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, recvKernel.group_id(0))); } acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); @@ -106,13 +118,22 @@ namespace network::execution_types::accl_pl { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); if (!config.programSettings->useAcclEmulation) { - auto run = sendrecvKernel(*acclSendBuffers[i]->bo(), *acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength); + auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength); + MPI_Barrier(MPI_COMM_WORLD); + auto run_schedule = scheduleKernel(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); - run.wait(); + run_send.wait(); + run_recv.wait(); + run_schedule.wait(); } else { - send_recv_stream(reinterpret_cast*>(acclSendBuffers[i]->buffer()), reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + send_stream(reinterpret_cast*>(acclSendBuffers[i]->buffer()), size_in_values, looplength, + krnl2cclo); + schedule_stream(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), - cclo2krnl, krnl2cclo, cmd, sts); + cmd, sts, notify); + recv_stream(reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, + cclo2krnl, notify); } auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); From cddae806168fbe399dcdd177c73364b08593d444 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 31 May 2023 22:20:22 +0200 Subject: [PATCH 294/318] Remove unused interface pragmas --- 
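[Editor's note] PATCH 293 above splits the former send_recv_stream kernel into
send_stream, recv_stream and schedule_stream, synchronized through a one-bit
AXI-Stream token (typedef ap_axiu<1, 0, 0, 0> notify_word). Reduced to its
core, the handshake looks like the following sketch (hypothetical names,
assuming ap_axi_sdata.h and hls_stream.h):

    #include <ap_axi_sdata.h>
    #include <hls_stream.h>

    typedef ap_axiu<1, 0, 0, 0> token_t;

    // receiver: emit one token after a complete message has been drained
    void signal_done(hls::stream<token_t> &notify) {
        token_t t;
        notify.write(t);
    }

    // scheduler: block on the token before issuing the next ACCL_SEND;
    // the token's value is irrelevant, its arrival is the event
    void wait_for_recv(hls::stream<token_t> &notify) {
        token_t t = notify.read();
        (void)t;
    }

The token keeps schedule_stream from queueing a new send command before
recv_stream has emptied the cclo2krnl stream for the previous iteration.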
b_eff/src/device/communication_ACCL_pl_stream.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index 2b9850b0..fb1e2ee1 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -72,12 +72,7 @@ void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_ #pragma HLS INTERFACE m_axi port=write_buffer bundle=gmem_out #pragma HLS INTERFACE s_axilite port=size #pragma HLS INTERFACE s_axilite port=num_iterations -#pragma HLS INTERFACE s_axilite port=neighbor_rank -#pragma HLS INTERFACE s_axilite port=communicator_addr -#pragma HLS INTERFACE s_axilite port=datapath_cfg #pragma HLS INTERFACE axis port=data_in -#pragma HLS INTERFACE axis port=cmd -#pragma HLS INTERFACE axis port=sts #pragma HLS INTERFACE axis port=notify #pragma HLS INTERFACE s_axilite port=return @@ -113,9 +108,6 @@ void send_stream(ap_uint<512>* read_buffer, ap_uint<32> size, ap_uint<32> num_i #pragma HLS INTERFACE m_axi port=read_buffer bundle=gmem_in #pragma HLS INTERFACE s_axilite port=size #pragma HLS INTERFACE s_axilite port=num_iterations -#pragma HLS INTERFACE s_axilite port=neighbor_rank -#pragma HLS INTERFACE s_axilite port=communicator_addr -#pragma HLS INTERFACE s_axilite port=datapath_cfg #pragma HLS INTERFACE axis port=data_out #pragma HLS INTERFACE s_axilite port=return From ae34f5c04f18896570b14c477b1550117f2f5ecf Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 1 Jun 2023 11:43:01 +0200 Subject: [PATCH 295/318] Thread emulation --- .../execution_types/execution_accl_pl_stream.hpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index 2b0e9039..f6805bad 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -26,6 +26,7 @@ SOFTWARE. 
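// [Editor's note] PATCH 295 below replaces the sequential emulation calls
// with host threads, passing the hlslib streams via std::ref because
// std::thread copies its arguments by value and the stream objects are not
// copyable. The essential pattern (sketch with the names used in this file):
//
//     std::thread run_send(send_stream, src, size, iters, std::ref(krnl2cclo));
//     std::thread run_recv(recv_stream, dst, size, iters,
//                          std::ref(cclo2krnl), std::ref(notify));
//     run_send.join();
//     run_recv.join();
//
// Run sequentially instead, schedule_stream would block forever on
// notify.read() before recv_stream ever gets the chance to produce a token.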
#include #include #include +#include /* External library headers */ #include "mpi.h" @@ -127,13 +128,16 @@ namespace network::execution_types::accl_pl { run_recv.wait(); run_schedule.wait(); } else { - send_stream(reinterpret_cast*>(acclSendBuffers[i]->buffer()), size_in_values, looplength, - krnl2cclo); - schedule_stream(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + std::thread run_send(send_stream, reinterpret_cast*>(acclSendBuffers[i]->buffer()), size_in_values, looplength, + std::ref(krnl2cclo)); + std::thread run_recv(recv_stream, reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, + std::ref(cclo2krnl), std::ref(notify)); + std::thread run_schedule(schedule_stream,size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), - cmd, sts, notify); - recv_stream(reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, - cclo2krnl, notify); + std::ref(cmd), std::ref(sts), std::ref(notify)); + run_send.join(); + run_recv.join(); + run_schedule.join(); } auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); From 060af8d044478d4ad9781967450dbcc004a1cc22 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 1 Jun 2023 14:25:19 +0200 Subject: [PATCH 296/318] Fix protocol for schedule and recv sync --- b_eff/src/device/communication_ACCL_pl_stream.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index fb1e2ee1..aa2697d0 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -78,7 +78,9 @@ void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_ notify_word w; for (int i = 0; i < num_iterations; i++) { + #pragma HLS protocol fixed read_data(write_buffer, size, data_in); + ap_wait(); notify.write(w); } } @@ -98,7 +100,9 @@ void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, #pragma HLS INTERFACE s_axilite port=return for (int i = 0; i < num_iterations; i++) { + #pragma HLS protocol fixed schedule_send(size, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts); + ap_wait(); notify_word w = notify.read(); } } From 682bff4a327cdeae2e6720b28d85f59b3999a411 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 1 Jun 2023 19:46:59 +0200 Subject: [PATCH 297/318] Fix signatures --- .../execution_types/execution_xrt_accl_pq.hpp | 11 +++++------ .../execution_xrt_accl_stream_pq.hpp | 19 +++++++++---------- .../execution_xrt_accl_stream_pq_sendrecv.hpp | 11 +++++------ .../execution_types/execution_xrt_pcie_pq.hpp | 10 +++++----- 4 files changed, 24 insertions(+), 27 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp index 3a2111f3..13c7c263 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -366,7 +366,7 @@ void accl_exchangeData( * @return std::unique_ptr The measured * execution times */ -static std::unique_ptr calculate( +static std::map> calculate( const hpcc_base::ExecutionSettings &config, 
transpose::TransposeData &data, @@ -586,11 +586,10 @@ static std::unique_ptr calculate( transferTimings.push_back(transferTime.count()); } - std::unique_ptr result( - new transpose::TransposeExecutionTimings{transferTimings, - calculationTimings}); - - return result; + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; } } // namespace accl_pq diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp index 27e240e6..84121480 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -35,7 +35,7 @@ SOFTWARE. #include "Simulation.h" #include "accl.hpp" -extern void transpose_write(const DEVICE_DATA_TYPE *B, +extern void transpose_write0(const DEVICE_DATA_TYPE *B, DEVICE_DATA_TYPE *A_out, const unsigned int offset_b, const unsigned int number_of_blocks, @@ -43,7 +43,7 @@ extern void transpose_write(const DEVICE_DATA_TYPE *B, const unsigned int height_in_blocks, hlslib::Stream &cclo2krnl); -extern void transpose_read( const DEVICE_DATA_TYPE *A, +extern void transpose_read0( const DEVICE_DATA_TYPE *A, const unsigned int offset_a, const unsigned int number_of_blocks, const unsigned int width_in_blocks, @@ -66,7 +66,7 @@ namespace accl_stream_pq { * @return std::unique_ptr The measured * execution times */ -static std::unique_ptr calculate( +static std::map> calculate( const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, @@ -248,7 +248,7 @@ static std::unique_ptr calculate( (bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); } else { - HLSLIB_DATAFLOW_FUNCTION(transpose_read, + HLSLIB_DATAFLOW_FUNCTION(transpose_read0, (config.programSettings->copyA ? 
data.A : data.A), static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), @@ -257,7 +257,7 @@ static std::unique_ptr calculate( (bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)), krnl2cclo); - HLSLIB_DATAFLOW_FUNCTION(transpose_write, + HLSLIB_DATAFLOW_FUNCTION(transpose_write0, data.B, data.result, static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), @@ -348,11 +348,10 @@ static std::unique_ptr calculate( transferTimings.push_back(transferTime.count()); } - std::unique_ptr result( - new transpose::TransposeExecutionTimings{transferTimings, - calculationTimings}); - - return result; + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; } } // namespace accl_pq diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp index 5282b5da..20c9f596 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp @@ -70,7 +70,7 @@ namespace accl_stream_sendrecv_pq { * @return std::unique_ptr The measured * execution times */ -static std::unique_ptr calculate( +static std::map> calculate( const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, @@ -455,11 +455,10 @@ static std::unique_ptr calculate( transferTimings.push_back(transferTime.count()); } - std::unique_ptr result( - new transpose::TransposeExecutionTimings{transferTimings, - calculationTimings}); - - return result; + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; } } // namespace accl_pq diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index 0fa0f9c2..f621394a 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -49,7 +49,7 @@ namespace pcie_pq { * execution times */ template -static std::unique_ptr calculate( +static std::map> calculate( const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, @@ -270,11 +270,11 @@ static std::unique_ptr calculate( transferTimings.push_back(transferTime.count()); } - std::unique_ptr result( - new transpose::TransposeExecutionTimings{transferTimings, - calculationTimings}); + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; - return result; } } // namespace pcie_pq From a512815d6699557c471a4e75983dba3efd1f072f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 1 Jun 2023 19:47:56 +0200 Subject: [PATCH 298/318] Kernel replication for host side --- PTRANS/src/host/CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index b9e0541b..554b4a3e 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -37,7 +37,15 @@ if (Vitis_FOUND) set(CMAKE_SKIP_BUILD_RPATH No) set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) - list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp ${CMAKE_SOURCE_DIR}/src/device/transpose_PQ_ACCL_stream.cpp) + list(APPEND kernel_files transpose_PQ_ACCL_stream_sendrecv.cpp 
 transpose_PQ_ACCL_stream.cpp)
+    foreach (files ${kernel_files})
+        set(source_f "${CMAKE_BINARY_DIR}/src/device/${files}")
+        set(base_file "${CMAKE_SOURCE_DIR}/src/device/${files}")
+        add_custom_command(OUTPUT ${source_f}
+            COMMAND ${Python3_EXECUTABLE} ${CODE_GENERATOR} -o ${source_f} -p num_replications=1 -p num_total_replications=1 ${base_file}
+            MAIN_DEPENDENCY ${base_file})
+        list(APPEND HOST_SOURCE ${source_f})
+    endforeach()
 endif()
 add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE})
 target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS})

From e3c7266870d96fd437716ae0cf59d98696bb2f05 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 1 Jun 2023 19:48:21 +0200
Subject: [PATCH 299/318] Use custom ACCL branch

---
 extern/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt
index 0e8bed30..ebeabdf3 100644
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -62,8 +62,8 @@ message(STATUS "ACCL was selected. Fetch ACCL dependencies")
 FetchContent_Declare(
   extern_accl
 
-  GIT_REPOSITORY https://github.com/Xilinx/ACCL.git
-  GIT_TAG dev)
+  GIT_REPOSITORY https://github.com/Mellich/ACCL.git
+  GIT_TAG modded)
 
 FetchContent_GetProperties(extern_accl)
 if(NOT extern_accl_POPULATED)

From 77894e7bccbb6843ca0e07e9254b07fcf7aaaf4d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 5 Jun 2023 17:57:26 +0200
Subject: [PATCH 300/318] Fix validation for multiple repetitions

---
 b_eff/src/host/execution_types/execution_accl_pl_stream.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
index f6805bad..12d95d5c 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
@@ -88,6 +88,8 @@ namespace network::execution_types::accl_pl {
         for (uint r =0; r < config.programSettings->numRepetitions; r++) {
             acclSendBuffers.clear();
             acclRecvBuffers.clear();
+            dummyBufferContents.clear();
+            recvBufferContents.clear();
             int size_in_values = (size_in_bytes + 3) / 4;
 
             xrt::kernel sendKernel;

From 480c4e19b1496468be373fc6529c6ba4987f6ef8 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 5 Jun 2023 18:59:36 +0200
Subject: [PATCH 301/318] Add default mem bank option for ACCL

---
 cmake/accl.cmake                    |  2 ++
 shared/hpcc_settings.cpp            |  2 ++
 shared/include/base_parameters.h.in |  2 ++
 shared/include/hpcc_settings.hpp    | 10 ++++++++
 shared/setup/fpga_setup_accl.cpp    |  5 ++---
 5 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/cmake/accl.cmake b/cmake/accl.cmake
index 7a31a665..01335805 100644
--- a/cmake/accl.cmake
+++ b/cmake/accl.cmake
@@ -5,6 +5,8 @@ set(ACCL_UDP_ETH_IFS 1 CACHE STRING "Number of Ethernet interfaces to synthesize
 set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform")
 set(DEFAULT_ACCL_BUFFER_SIZE 8192 CACHE STRING "Size of ACCL buffers in KB")
 set(DEFAULT_ACCL_BUFFER_COUNT 16 CACHE STRING "Number of ACCL buffers")
+set(DEFAULT_ACCL_BUFFER_BANK 0 CACHE STRING "Default memory bank for ACCL buffers")
+set(DEFAULT_ACCL_RECV_BUFFER_BANKS 1 CACHE STRING "Memory banks to allocate receive buffers (can be comma-separated list)")
 set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware)
 set(ACCL_CCLO_ADDITIONAL_BUILD_ARGS "" CACHE STRING "Add additional build arguments that will be passed to the CCLO makefile")
 set(ACCL_CCLO_BUILD_ARGS ${ACCL_CCLO_ADDITIONAL_BUILD_ARGS})
diff --git a/shared/hpcc_settings.cpp b/shared/hpcc_settings.cpp
index 8cbd2319..b0f096b8 100644
--- a/shared/hpcc_settings.cpp
+++ b/shared/hpcc_settings.cpp
@@ -30,6 +30,8 @@ hpcc_base::BaseSettings::BaseSettings(cxxopts::ParseResult &results) : numRepeti
     acclProtocol(fpga_setup::acclProtocolStringToEnum(results["accl-protocol"].as())),
     acclBufferSize(results["accl-buffer-size"].as() * 1024),
     acclBufferCount(results["accl-buffer-count"].as()),
+    acclRecvBufferMemBanks(results["accl-recv-banks"].as>()),
+    acclDefaultBank(results["accl-default-bank"].as()),
 #endif
 #ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED
     communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())),
diff --git a/shared/include/base_parameters.h.in b/shared/include/base_parameters.h.in
index 45a1100b..2946e7cb 100644
--- a/shared/include/base_parameters.h.in
+++ b/shared/include/base_parameters.h.in
@@ -13,6 +13,8 @@
 #cmakedefine DEFAULT_ACCL_BUFFER_SIZE @DEFAULT_ACCL_BUFFER_SIZE@
 #cmakedefine DEFAULT_ACCL_BUFFER_COUNT @DEFAULT_ACCL_BUFFER_COUNT@
 #cmakedefine ACCL_STACK_TYPE "@ACCL_STACK_TYPE@"
+#cmakedefine DEFAULT_ACCL_RECV_BUFFER_BANKS @DEFAULT_ACCL_RECV_BUFFER_BANKS@
+#cmakedefine DEFAULT_ACCL_BUFFER_BANK @DEFAULT_ACCL_BUFFER_BANK@
 
 /**
  Output separator
diff --git a/shared/include/hpcc_settings.hpp b/shared/include/hpcc_settings.hpp
index 7597b68d..39836045 100644
--- a/shared/include/hpcc_settings.hpp
+++ b/shared/include/hpcc_settings.hpp
@@ -120,6 +120,16 @@ class BaseSettings {
      *
      */
     uint acclBufferCount;
+
+    /**
+     * @brief Memory banks used to create ACCL receive buffers
+     */
+    std::vector acclRecvBufferMemBanks;
+
+    /**
+     * @brief Default bank for memory buffers created with the ACCL driver
+     */
+    int acclDefaultBank;
 #endif
 
     /**
diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index 4e293910..fdaeaf7f 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -122,11 +122,10 @@ ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program,
                           64 * 1024 * 1024, ACCL::dataType::int8, device, network_krnl.group_id(4)));
         configure_tcp(*accl.tx_buf_network, *accl.rx_buf_network, network_krnl, ranks, current_rank);
     }
-    std::vector mem = {2, 3};
     std::cout << "Create ACCL" << std::endl;
     accl.accl = std::unique_ptr(
-        new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0,
-                       mem, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize, programSettings.acclBufferSize));
+        new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, programSettings.acclDefaultBank,
+                       programSettings.acclRecvBufferMemBanks, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize, programSettings.acclBufferSize));
     } else {
         // TODO: Add start port here. Currently hardcoded!
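// [Editor's note] With PATCH 301/302 applied, the bank list {2, 3} removed
// above is no longer hard-coded. A hypothetical invocation overriding the
// build-time defaults (flag names from PATCH 302, binary name as used in the
// b_eff tests):
//
//     ./Network_xilinx -f communication_ACCL_hw.xclbin \
//         --accl-default-bank 0 --accl-recv-banks 2,3
//
// --accl-recv-banks takes a comma-separated list that cxxopts parses into
// acclRecvBufferMemBanks; --accl-default-bank fills acclDefaultBank.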
         accl.accl = std::unique_ptr(

From e94de6cfd87e63a85a6d2e0ee678b9b679926254 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 5 Jun 2023 19:02:40 +0200
Subject: [PATCH 302/318] Add accl buffer options to parser

---
 shared/include/hpcc_benchmark.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 3b96a4a6..3ffa11b8 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -272,6 +272,10 @@ class HpccFpgaBenchmark {
                 cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_SIZE)))
             ("accl-buffer-count", "Specify the number of ACCL buffers used within the benchmark",
                 cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_COUNT)))
+            ("accl-default-bank", "Default memory bank used by ACCL to create new FPGA buffers",
+                cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_BANK)))
+            ("accl-recv-banks", "Memory banks used by ACCL for receive buffers",
+                cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_RECV_BUFFER_BANKS)))
 #endif
             ("skip-validation", "Skip the validation of the output data. This will speed up execution and helps when working with special data types.")
             ("device", "Index of the device that has to be used. If not given you "\

From b24953224fe36b4132183b75c0c793f8d2a0a1d6 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 12 Jun 2023 11:45:13 +0200
Subject: [PATCH 303/318] Fix ACCL buffer size parameters

---
 shared/hpcc_settings.cpp                 | 17 ++++++++++++++++-
 shared/include/base_parameters.h.in      |  2 +-
 shared/include/hpcc_benchmark.hpp        |  4 ++--
 shared/include/setup/fpga_setup_accl.hpp | 10 ++++++++++
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/shared/hpcc_settings.cpp b/shared/hpcc_settings.cpp
index b0f096b8..e621ffe6 100644
--- a/shared/hpcc_settings.cpp
+++ b/shared/hpcc_settings.cpp
@@ -57,7 +57,22 @@ hpcc_base::BaseSettings::getSettingsMap() {
     if (mpi_size > 0) {
         str_mpi_ranks = std::to_string(mpi_size);
     }
+#ifdef USE_ACCL
+    std::stringstream accl_recv_banks;
+    for (auto& b: acclRecvBufferMemBanks) {
+        accl_recv_banks << b << ",";
+    }
+#endif
     return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)},
             {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? "Yes" : "No"},
-            {"Communication Type", commToString(communicationType)}};
+            {"Communication Type", commToString(communicationType)}
+#ifdef USE_ACCL
+            ,{"ACCL Protocol", fpga_setup::acclEnumToProtocolString(acclProtocol)},
+            {"ACCL Recv. Banks", accl_recv_banks.str()},
+            {"ACCL Default Bank", std::to_string(acclDefaultBank)},
+            {"ACCL Buffer Size", std::to_string(acclBufferSize) + "KB"},
+            {"ACCL Buffer Count", std::to_string(acclBufferCount)},
+            {"ACCL Emulation", useAcclEmulation ? "Yes" : "No"}
+#endif
+            };
 }
diff --git a/shared/include/base_parameters.h.in b/shared/include/base_parameters.h.in
index 2946e7cb..6915a14c 100644
--- a/shared/include/base_parameters.h.in
+++ b/shared/include/base_parameters.h.in
@@ -13,7 +13,7 @@
 #cmakedefine DEFAULT_ACCL_BUFFER_SIZE @DEFAULT_ACCL_BUFFER_SIZE@
 #cmakedefine DEFAULT_ACCL_BUFFER_COUNT @DEFAULT_ACCL_BUFFER_COUNT@
 #cmakedefine ACCL_STACK_TYPE "@ACCL_STACK_TYPE@"
-#cmakedefine DEFAULT_ACCL_RECV_BUFFER_BANKS @DEFAULT_ACCL_RECV_BUFFER_BANKS@
+#cmakedefine DEFAULT_ACCL_RECV_BUFFER_BANKS "@DEFAULT_ACCL_RECV_BUFFER_BANKS@"
 #cmakedefine DEFAULT_ACCL_BUFFER_BANK @DEFAULT_ACCL_BUFFER_BANK@
 
 /**
diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 3ffa11b8..f49edd9b 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -273,9 +273,9 @@ class HpccFpgaBenchmark {
             ("accl-buffer-count", "Specify the number of ACCL buffers used within the benchmark",
                 cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_COUNT)))
             ("accl-default-bank", "Default memory bank used by ACCL to create new FPGA buffers",
-                cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_BANK)))
+                cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_BANK)))
             ("accl-recv-banks", "Memory banks used by ACCL for receive buffers",
-                cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_RECV_BUFFER_BANKS)))
+                cxxopts::value>()->default_value(DEFAULT_ACCL_RECV_BUFFER_BANKS))
 #endif
diff --git a/shared/include/setup/fpga_setup_accl.hpp b/shared/include/setup/fpga_setup_accl.hpp
index 0f451ced..fb7d85b3 100644
--- a/shared/include/setup/fpga_setup_accl.hpp
+++ b/shared/include/setup/fpga_setup_accl.hpp
@@ -50,6 +50,16 @@ static const std::map acclProtocolMap = {
     {"TCP", ACCL::networkProtocol::TCP}
 };
 
+static std::string acclEnumToProtocolString(ACCL::networkProtocol p) {
+    for (const auto& entry: acclProtocolMap) {
+        if (entry.second == p) {
+            return entry.first;
+        }
+    }
+    throw std::runtime_error("ACCL network protocol could not be parsed to string!");
+    return "";
+}
+
 static ACCL::networkProtocol acclProtocolStringToEnum(std::string string_representation) {
     if (acclProtocolMap.count(string_representation)) {
         return acclProtocolMap.at(string_representation);

From 23303389210bd1101516d88c9525cd796aa77f2f Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 29 Jun 2023 18:06:10 +0200
Subject: [PATCH 304/318] Sleep before actual send

---
 b_eff/src/host/execution_types/execution_accl_pl_stream.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
index 12d95d5c..bf7b36fd 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
@@ -122,8 +122,10 @@ namespace network::execution_types::accl_pl {
             auto startCalculation = std::chrono::high_resolution_clock::now();
             if (!config.programSettings->useAcclEmulation) {
                 auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength);
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
                 auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength);
                 MPI_Barrier(MPI_COMM_WORLD);
+                startCalculation = std::chrono::high_resolution_clock::now();
                 auto run_schedule = scheduleKernel(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
                     config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}));
                 run_send.wait();

From d88c5cccaa3ffe70c8055d535fcf5615f5bca004 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 3 Jul 2023 17:26:35 +0200
Subject: [PATCH 305/318] Allow both streaming and buffered

---
 b_eff/src/device/CMakeLists.txt                        |  2 +-
 b_eff/src/host/execution_types/execution.hpp           |  1 +
 .../host/execution_types/execution_accl_pl_stream.hpp  |  2 +-
 b_eff/src/host/network_benchmark.cpp                   | 11 +++++++++--
 b_eff/src/host/network_benchmark.hpp                   |  5 +++++
 5 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt
index 865cb249..e1b372ea 100644
--- a/b_eff/src/device/CMakeLists.txt
+++ b/b_eff/src/device/CMakeLists.txt
@@ -18,7 +18,7 @@ endif()
 if (Vitis_FOUND)
     generate_kernel_targets_xilinx(communication_PCIE)
     if (USE_ACCL)
-        generate_kernel_targets_xilinx(communication_ACCL
+        generate_kernel_targets_xilinx(communication_ACCL communication_ACCL_pl
                                        communication_ACCL_pl_stream)
     endif()
     add_test(NAME test_emulation_pcie_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1
diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp
index 86aec21c..133282ea 100644
--- a/b_eff/src/host/execution_types/execution.hpp
+++ b/b_eff/src/host/execution_types/execution.hpp
@@ -29,5 +29,6 @@ SOFTWARE.
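// [Editor's note] After PATCH 305 the b_eff host dispatches among the ACCL
// executors via two flags (see the network_benchmark.cpp hunk further below):
//
//     --accl-pl unset                    -> execution_types::accl
//     --accl-pl set, --accl-stream unset -> execution_types::accl_pl_stream
//     --accl-pl set, --accl-stream set   -> execution_types::accl_pl
//
// Note that the two PL branches are inverted relative to the flag name;
// PATCH 310 later swaps them and additionally routes --accl-stream without
// --accl-pl to the new execution_types::accl_stream executor.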
 #endif
 #else
 #include "execution_types/execution_accl.hpp"
+#include "execution_types/execution_accl_pl.hpp"
 #include "execution_types/execution_accl_pl_stream.hpp"
 #endif
diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
index bf7b36fd..fad46d4d 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
@@ -47,7 +47,7 @@ extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations,
                 ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg,
                 STREAM &cmd, STREAM &sts, STREAM &notify);
 
-namespace network::execution_types::accl_pl {
+namespace network::execution_types::accl_pl_stream {
 
 
     /*
diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp
index d4412461..a7e07d3a 100644
--- a/b_eff/src/host/network_benchmark.cpp
+++ b/b_eff/src/host/network_benchmark.cpp
@@ -41,7 +41,7 @@ network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &re
     pcie_reverse_read_pcie(results["pcie-write"].count()), pcie_reverse_execute_kernel(results["kernel-latency"].count()),
     pcie_reverse_batch(results["pcie-batch"].count()), pcie_reverse(results["pcie-reverse"].count())
 #ifdef USE_ACCL
-    , accl_from_programable_logic(results["accl-pl"].count())
+    , accl_from_programable_logic(results["accl-pl"].count()), accl_axi_stream(results["accl-stream"].count())
 #endif
     {
 
@@ -100,6 +100,7 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
             cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE)))
 #ifdef USE_ACCL
             ("accl-pl", "Use second ACCL command kernel to schedule sends and receives from PL")
+            ("accl-stream", "Send and receive data to AXI streams instead of global memory")
 #endif
             ("pcie-read", "Use reverse PCIe experiment and measure PCIe read performance from device")
             ("pcie-write", "Use reverse PCIe experiment and measure PCIe write performance from device")
@@ -140,7 +141,13 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) {
 #endif
 #else
         case hpcc_base::CommunicationType::accl: if (!executionSettings->programSettings->accl_from_programable_logic) {
                 timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
-            } else { timing = execution_types::accl_pl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);} break;
+            } else {
+                if (!executionSettings->programSettings->accl_axi_stream) {
+                    timing = execution_types::accl_pl_stream::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                }
+                else {
+                    timing = execution_types::accl_pl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                }} break;
 #endif
         default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType));
     }
diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index a017aa2c..52e2a479 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -167,6 +167,11 @@ class NetworkProgramSettings : public hpcc_base::BaseSettings {
      */
     bool accl_from_programable_logic;
 
+    /**
+     * @brief Forward data to AXI stream instead of global memory to further reduce latency
+     */
+    bool accl_axi_stream;
+
     /**
      * @brief This is automatically set to true if
one of pcie_reverse_write_pcie, pcie_reverse_read_pcie, * or pcie_reverse_execute_kernel is set to true. The reverse PCIe experiment will be executed in that case. From e301a3c4cc4485eb578d77321dd9602bb8974f2d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 6 Jul 2023 16:13:57 +0200 Subject: [PATCH 306/318] Move barrier behind sleep --- b_eff/src/host/execution_types/execution_accl_pl_stream.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index fad46d4d..f5ccd1a8 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -123,8 +123,8 @@ namespace network::execution_types::accl_pl_stream { if (!config.programSettings->useAcclEmulation) { auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength); std::this_thread::sleep_for(std::chrono::milliseconds(100)); - auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength); MPI_Barrier(MPI_COMM_WORLD); + auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength); startCalculation = std::chrono::high_resolution_clock::now(); auto run_schedule = scheduleKernel(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); From c23e36a9bb9b7f7b2556294c0f0672bb62a7e082 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 6 Jul 2023 16:14:18 +0200 Subject: [PATCH 307/318] Enable TCP bypassing --- cmake/accl.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index 01335805..dd00a8b4 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -61,7 +61,7 @@ add_custom_command( COMMAND mkdir build && cd build && cmake .. 
-DFDEV_NAME=u280 -DVIVADO_HLS_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 -DVIVADO_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 - -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=0 -DTCP_STACK_WINDOW_SCALING_EN=0 && + -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=1 -DTCP_STACK_WINDOW_SCALING_EN=0 && make installip WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR}) From 0412449fd431ccc76e89ca5e6f77317517161531 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 7 Jul 2023 14:53:26 +0200 Subject: [PATCH 308/318] Fix host linking --- b_eff/src/host/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index ac11320e..28e92c94 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -23,7 +23,8 @@ if (USE_ACCL) set(CMAKE_SKIP_BUILD_RPATH No) set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) - list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl_stream.cpp) + list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl_stream.cpp + ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl.cpp) endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) From a514628286836d213cfdce451b96dd05c46be4b8 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 11 Aug 2023 16:49:01 +0200 Subject: [PATCH 309/318] Switch back to ACCL dev --- extern/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index ebeabdf3..0e8bed30 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -62,8 +62,8 @@ message(STATUS "ACCL was selected. Fetch ACCL dependencies") FetchContent_Declare( extern_accl - GIT_REPOSITORY https://github.com/Mellich/ACCL.git - GIT_TAG modded) + GIT_REPOSITORY https://github.com/Xilinx/ACCL.git + GIT_TAG dev) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) From f144324481fb42592401a65657cf5f8df0d4fec5 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 14 Aug 2023 19:19:15 +0200 Subject: [PATCH 310/318] Add ACCL stream executor and fix accl-stream flag --- b_eff/src/host/execution_types/execution.hpp | 1 + .../execution_types/execution_accl_stream.hpp | 123 ++++++++++++++++++ b_eff/src/host/network_benchmark.cpp | 11 +- 3 files changed, 132 insertions(+), 3 deletions(-) create mode 100644 b_eff/src/host/execution_types/execution_accl_stream.hpp diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp index 133282ea..0cd828bc 100644 --- a/b_eff/src/host/execution_types/execution.hpp +++ b/b_eff/src/host/execution_types/execution.hpp @@ -29,6 +29,7 @@ SOFTWARE. 
#endif #else #include "execution_types/execution_accl.hpp" +#include "execution_types/execution_accl_stream.hpp" #include "execution_types/execution_accl_pl.hpp" #include "execution_types/execution_accl_pl_stream.hpp" #endif diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp new file mode 100644 index 00000000..d59afeb4 --- /dev/null +++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp @@ -0,0 +1,123 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_STREAM_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_STREAM_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" +#include "accl.hpp" + +/* Project's headers */ + +namespace network::execution_types::accl_stream { + + /* + Implementation for the single kernel. + @copydoc bm_execution::calculate() + */ + template + network::ExecutionTimings + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + size_t size_in_bytes = std::max((1 << messageSize), 4); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + dummyBufferContents.clear(); + recvBufferContents.clear(); + acclSendBuffers.clear(); + acclRecvBuffers.clear(); + int size_in_values = (size_in_bytes + 3) / 4; + // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 1)); + acclSendBuffers.back()->sync_to_device(); + acclRecvBuffers.back()->sync_to_device(); + } + + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int l = 0; l < looplength; l++) { +#ifndef NDEBUG + std::cout << "Stream " << size_in_bytes << " bytes to " + << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; +#endif + config.context->accl->stream_put(*acclSendBuffers[i], size_in_values, + (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + 0, ACCL::GLOBAL_COMM, true); +#ifndef NDEBUG + std::cout << "Done" << std::endl; +#endif + } + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! 
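// [Editor's note] Validation can ignore ordering here because every send
// buffer is filled with one repeated byte, messageSize & 255 (see
// dummyBufferContents above): e.g. messageSize = 10 puts 0x0A into every
// byte, so any permutation of the received words yields the same pattern
// and the host only has to compare values.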
+        for (int r = 0; r < config.programSettings->kernelReplications; r++) {
+            acclRecvBuffers[r]->sync_from_device();
+            std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]);
+        }
+        return network::ExecutionTimings{
+            looplength,
+            messageSize,
+            calculationTimings
+        };
+    }
+
+} // namespace network::execution_types::accl_stream
+
+#endif
diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp
index a7e07d3a..4058c527 100644
--- a/b_eff/src/host/network_benchmark.cpp
+++ b/b_eff/src/host/network_benchmark.cpp
@@ -140,13 +140,18 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) {
         case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
 #endif
 #else
-        case hpcc_base::CommunicationType::accl: if (!executionSettings->programSettings->accl_from_programable_logic) { timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+        case hpcc_base::CommunicationType::accl: if (!executionSettings->programSettings->accl_from_programable_logic) {
+            if (!executionSettings->programSettings->accl_axi_stream) {
+                timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+            } else {
+                timing = execution_types::accl_stream::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+            }
         } else {
             if (!executionSettings->programSettings->accl_axi_stream) {
-                timing = execution_types::accl_pl_stream::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                timing = execution_types::accl_pl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
             } else {
-                timing = execution_types::accl_pl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                timing = execution_types::accl_pl_stream::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
             }} break;
 #endif
         default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType));

From 5d7447c11e94b5bf6a65eccf89922ee3418a0cfc Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 15 Aug 2023 11:28:01 +0200
Subject: [PATCH 311/318] Fix non-PL stream

---
 b_eff/src/device/communication_ACCL_pl_stream.cpp         | 6 ++++--
 .../host/execution_types/execution_accl_pl_stream.hpp     | 6 +++---
 b_eff/src/host/execution_types/execution_accl_stream.hpp  | 9 +++++++++
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp
index aa2697d0..cdf8d3d6 100644
--- a/b_eff/src/device/communication_ACCL_pl_stream.cpp
+++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp
@@ -85,7 +85,7 @@ void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_
     }
 }
 
-void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations,
+void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, uint enable,
                 ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg,
                 STREAM<command_word> &cmd, STREAM<command_word> &sts, STREAM<notify_word> &notify) {
@@ -101,7 +101,9 @@ void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations,
     for (int i = 0; i < num_iterations; i++) {
 #pragma HLS protocol fixed
communicator_addr, datapath_cfg, cmd, sts); + if (enable) { + schedule_send(size, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts); + } ap_wait(); notify_word w = notify.read(); } diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index f5ccd1a8..72732615 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -43,7 +43,7 @@ extern void send_stream(ap_uint<512>* read_buffer, ap_uint<32> size, ap_uint<32> extern void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, STREAM &data_in, STREAM ¬ify); -extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, +extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, uint enable, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts, STREAM ¬ify); @@ -126,7 +126,7 @@ namespace network::execution_types::accl_pl_stream { MPI_Barrier(MPI_COMM_WORLD); auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength); startCalculation = std::chrono::high_resolution_clock::now(); - auto run_schedule = scheduleKernel(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + auto run_schedule = scheduleKernel(size_in_values, looplength, 1, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); run_send.wait(); run_recv.wait(); @@ -136,7 +136,7 @@ namespace network::execution_types::accl_pl_stream { std::ref(krnl2cclo)); std::thread run_recv(recv_stream, reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, std::ref(cclo2krnl), std::ref(notify)); - std::thread run_schedule(schedule_stream,size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + std::thread run_schedule(schedule_stream,size_in_values, looplength, 1, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), std::ref(cmd), std::ref(sts), std::ref(notify)); run_send.join(); diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp index d59afeb4..2bae5d9f 100644 --- a/b_eff/src/host/execution_types/execution_accl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp @@ -77,7 +77,13 @@ namespace network::execution_types::accl_stream { double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { MPI_Barrier(MPI_COMM_WORLD); + auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + MPI_Barrier(MPI_COMM_WORLD); + auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength); auto startCalculation = std::chrono::high_resolution_clock::now(); + auto run_schedule = scheduleKernel(size_in_values, looplength, 0, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), 
config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); for (int l = 0; l < looplength; l++) { #ifndef NDEBUG std::cout << "Stream " << size_in_bytes << " bytes to " @@ -90,6 +96,9 @@ namespace network::execution_types::accl_stream { std::cout << "Done" << std::endl; #endif } + run_send.wait(); + run_recv.wait(); + run_schedule.wait(); auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); #ifndef NDEBUG From f3b56c3afbbe79367ead8e284f74c9db55be3ad2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 15 Aug 2023 16:26:54 +0200 Subject: [PATCH 312/318] Change enable data type --- b_eff/src/device/communication_ACCL_pl_stream.cpp | 2 +- b_eff/src/host/execution_types/execution_accl_pl_stream.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index cdf8d3d6..f57818bf 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -85,7 +85,7 @@ void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_ } } -void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, uint enable, +void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, int enable, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts, STREAM ¬ify) { diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index 72732615..c4027988 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -43,7 +43,7 @@ extern void send_stream(ap_uint<512>* read_buffer, ap_uint<32> size, ap_uint<32> extern void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, STREAM &data_in, STREAM ¬ify); -extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, uint enable, +extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, int enable, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts, STREAM ¬ify); From f6aa8192a00036a3fd67de16b3ec17c1eb32a8d2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 16 Aug 2023 11:45:07 +0200 Subject: [PATCH 313/318] Fix ACCL stream executor --- b_eff/src/host/execution_types/execution_accl_stream.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp index 2bae5d9f..8803217c 100644 --- a/b_eff/src/host/execution_types/execution_accl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp @@ -73,7 +73,12 @@ namespace network::execution_types::accl_stream { acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } - + xrt::kernel sendKernel; + xrt::kernel recvKernel; + xrt::kernel scheduleKernel; + sendKernel = xrt::kernel(*config.device, *config.program, "send_stream"); + recvKernel = xrt::kernel(*config.device, *config.program, "recv_stream"); + scheduleKernel = xrt::kernel(*config.device, *config.program, "schedule_stream"); double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { 
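                 // One timed run per kernel replication; the barrier below aligns all ranks before the kernels start.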
                 MPI_Barrier(MPI_COMM_WORLD);

From 618f5ef6ad7ff9eeb466485fd0e3766c4f68b5dc Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 16 Aug 2023 12:41:19 +0200
Subject: [PATCH 314/318] Set correct memory banks

---
 .../execution_types/execution_accl_stream.hpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp
index 8803217c..32dd58fb 100644
--- a/b_eff/src/host/execution_types/execution_accl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp
@@ -64,21 +64,21 @@ namespace network::execution_types::accl_stream {
             acclSendBuffers.clear();
             acclRecvBuffers.clear();
             int size_in_values = (size_in_bytes + 3) / 4;
+            xrt::kernel sendKernel;
+            xrt::kernel recvKernel;
+            xrt::kernel scheduleKernel;
+            sendKernel = xrt::kernel(*config.device, *config.program, "send_stream");
+            recvKernel = xrt::kernel(*config.device, *config.program, "recv_stream");
+            scheduleKernel = xrt::kernel(*config.device, *config.program, "schedule_stream");
             // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels
             for (int r = 0; r < config.programSettings->kernelReplications; r++) {
                 dummyBufferContents.emplace_back(size_in_bytes, static_cast<char>(messageSize & (255)));
                 recvBufferContents.emplace_back(size_in_bytes, static_cast<char>(0));
-                acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 0));
-                acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 1));
+                acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, sendKernel.group_id(0)));
+                acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, recvKernel.group_id(0)));
                 acclSendBuffers.back()->sync_to_device();
                 acclRecvBuffers.back()->sync_to_device();
             }
-            xrt::kernel sendKernel;
-            xrt::kernel recvKernel;
-            xrt::kernel scheduleKernel;
-            sendKernel = xrt::kernel(*config.device, *config.program, "send_stream");
-            recvKernel = xrt::kernel(*config.device, *config.program, "recv_stream");
-            scheduleKernel = xrt::kernel(*config.device, *config.program, "schedule_stream");
             double calculationTime = 0.0;
             for (int i = 0; i < config.programSettings->kernelReplications; i++) {
                 MPI_Barrier(MPI_COMM_WORLD);

From 34dc2871133059b75a237f6b152d66b8a847ba8a Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 16 Aug 2023 12:59:23 +0200
Subject: [PATCH 315/318] Change call to stream-to-stream

---
 b_eff/src/host/execution_types/execution_accl_stream.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp
index 32dd58fb..a12436c6 100644
--- a/b_eff/src/host/execution_types/execution_accl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp
@@ -94,9 +94,9 @@ namespace network::execution_types::accl_stream {
                 std::cout << "Stream " << size_in_bytes << " bytes to "
                     << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl;
 #endif
-                config.context->accl->stream_put(*acclSendBuffers[i], size_in_values,
+                config.context->accl->stream_put(ACCL::dataType::float32, size_in_values,
                     (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
-                    0, ACCL::GLOBAL_COMM, true);
+                    0);
 #ifndef NDEBUG
                 std::cout << "Done" << std::endl;
 #endif

From 3cdc7321c3bb83f998623c8ba60eca731b8bb9fb Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 24 Aug 2023 17:45:20 +0200
Subject: [PATCH 316/318] Fix some includes for base build

---
 shared/include/hpcc_benchmark.hpp | 4 ++++
 shared/include/hpcc_settings.hpp  | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index f49edd9b..4a1c79e0 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -561,9 +561,13 @@ class HpccFpgaBenchmark {
 #ifndef USE_ACCL
         context = std::unique_ptr<bool>(new bool(false));
 #endif
+#ifdef USE_ACCL
         if (!programSettings->useAcclEmulation) {
+#endif
             program = fpga_setup::fpgaSetup(*usedDevice, programSettings->kernelFileName);
+#ifdef USE_ACCL
         }
+#endif
 #endif
 #ifdef USE_OCL_HOST
         usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform,
diff --git a/shared/include/hpcc_settings.hpp b/shared/include/hpcc_settings.hpp
index 39836045..71c7a290 100644
--- a/shared/include/hpcc_settings.hpp
+++ b/shared/include/hpcc_settings.hpp
@@ -1,11 +1,15 @@
 #ifndef HPCC_BASE_SETTINGS_H_
 #define HPCC_BASE_SETTINGS_H_
 
+#ifdef USE_OCL_HOST
 #ifdef USE_DEPRECATED_HPP_HEADER
 #include "CL/cl.hpp"
 #else
 #include OPENCL_HPP_HEADER
 #endif
+#else
+#include "xrt/xrt_device.h"
+#endif
 #include "cxxopts.hpp"
 #include "parameters.h"
 #include "communication_types.hpp"

From f8b3ce02c377f07e4fb4dfa8c12345e2fcfb2793 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 28 Aug 2023 19:39:05 +0200
Subject: [PATCH 317/318] Change PL scheduling notification

---
 b_eff/src/device/communication_ACCL_pl_stream.cpp      | 12 +++++++-----
 .../execution_types/execution_accl_pl_stream.hpp       |  8 ++++----
 .../host/execution_types/execution_accl_stream.hpp     |  5 +----
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp
index f57818bf..eb68fe8e 100644
--- a/b_eff/src/device/communication_ACCL_pl_stream.cpp
+++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp
@@ -67,11 +67,13 @@ schedule_send(ap_uint<32> size, ap_uint<32> neighbor_rank,
 }
 
 void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations,
+                ap_uint<32> notify_enabled,
                 STREAM<stream_word> &data_in, STREAM<notify_word> &notify) {
 #pragma HLS INTERFACE m_axi port=write_buffer bundle=gmem_out
 #pragma HLS INTERFACE s_axilite port=size
 #pragma HLS INTERFACE s_axilite port=num_iterations
+#pragma HLS INTERFACE s_axilite port=notify_enabled
 #pragma HLS INTERFACE axis port=data_in
 #pragma HLS INTERFACE axis port=notify
 #pragma HLS INTERFACE s_axilite port=return
@@ -81,11 +83,13 @@ void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_
 #pragma HLS protocol fixed
         read_data(write_buffer, size, data_in);
         ap_wait();
-        notify.write(w);
+        if (notify_enabled != 0) {
+            notify.write(w);
+        }
     }
 }
 
-void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, int enable,
+void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations,
                 ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg,
                 STREAM<command_word> &cmd, STREAM<command_word> &sts, STREAM<notify_word> &notify) {
@@ -101,9 +105,7 @@ void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, int enable,
     for (int i = 0; i < num_iterations; i++) {
 #pragma HLS protocol fixed
-        if (enable) {
-            schedule_send(size, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts);
-        }
+        schedule_send(size, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts);
         ap_wait();
         notify_word w = notify.read();
     }
diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
index c4027988..2b12b6d1 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
@@ -121,12 +121,12 @@ namespace network::execution_types::accl_pl_stream {
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
             if (!config.programSettings->useAcclEmulation) {
-                auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength);
+                auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength, 1);
                 std::this_thread::sleep_for(std::chrono::milliseconds(100));
                 MPI_Barrier(MPI_COMM_WORLD);
                 auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength);
                 startCalculation = std::chrono::high_resolution_clock::now();
-                auto run_schedule = scheduleKernel(size_in_values, looplength, 1, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
+                auto run_schedule = scheduleKernel(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
                     config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}));
                 run_send.wait();
                 run_recv.wait();
@@ -134,9 +134,9 @@
             } else {
                 std::thread run_send(send_stream, reinterpret_cast<ap_uint<512>*>(acclSendBuffers[i]->buffer()), size_in_values, looplength,
                     std::ref(krnl2cclo));
-                std::thread run_recv(recv_stream, reinterpret_cast<ap_uint<512>*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength,
+                std::thread run_recv(recv_stream, reinterpret_cast<ap_uint<512>*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, 1,
                     std::ref(cclo2krnl), std::ref(notify));
-                std::thread run_schedule(schedule_stream,size_in_values, looplength, 1, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
+                std::thread run_schedule(schedule_stream,size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
                     config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}),
                     std::ref(cmd), std::ref(sts), std::ref(notify));
             run_send.join();
diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp
index a12436c6..797b8ca7 100644
--- a/b_eff/src/host/execution_types/execution_accl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp
@@ -82,13 +82,11 @@ namespace network::execution_types::accl_stream {
             double calculationTime = 0.0;
             for (int i = 0; i < config.programSettings->kernelReplications; i++) {
                 MPI_Barrier(MPI_COMM_WORLD);
-                auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength);
+                auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength, 0);
                 std::this_thread::sleep_for(std::chrono::milliseconds(100));
                 MPI_Barrier(MPI_COMM_WORLD);
                 auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength);
                 auto startCalculation = std::chrono::high_resolution_clock::now();
-                auto run_schedule = scheduleKernel(size_in_values, looplength, 0, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
-                    config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}));
                 for (int l = 0; l < looplength; l++) {
 #ifndef NDEBUG
                 std::cout << "Stream " << size_in_bytes << " bytes to "
@@ -103,7 +101,6 @@
             }
             run_send.wait();
             run_recv.wait();
-            run_schedule.wait();
             auto endCalculation = std::chrono::high_resolution_clock::now();
             calculationTime += std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation - startCalculation).count();
 #ifndef NDEBUG

From 052ad9e1d4a5e9659480c729a2ddf309133745fd Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 23 Nov 2023 11:41:19 +0100
Subject: [PATCH 318/318] Fix HPL XRT baseline hostcode signature

---
 .../host/execution_types/execution_xrt_pcie.hpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
index 77885103..a4de60ad 100644
--- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
@@ -48,7 +48,7 @@ namespace xrt_pcie {
 @copydoc bm_execution::calculate()
 */
 template <class TDevice, class TContext, class TProgram>
-std::unique_ptr<linpack::LinpackExecutionTimings> inline calculate(
+std::map<std::string, std::vector<double>> inline calculate(
     const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings, TDevice, TContext, TProgram> &config,
     linpack::LinpackData<TContext> &data) {
@@ -459,13 +459,14 @@ std::unique_ptr<linpack::LinpackExecutionTimings> inline calculate(
     Buffer_pivot.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
   }
 
-  std::unique_ptr<linpack::LinpackExecutionTimings> results(
-      new linpack::LinpackExecutionTimings{gefaExecutionTimes,
-                                           geslExecutionTimes});
-
-  MPI_Barrier(MPI_COMM_WORLD);
+  std::map<std::string, std::vector<double>> timings;
+
+  timings["gefa"] = gefaExecutionTimes;
+  timings["gesl"] = geslExecutionTimes;
+
+  MPI_Barrier(MPI_COMM_WORLD);
 
-  return results;
+  return timings;
 }
 
 } // namespace xrt_pcie
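
All stream executors in this series derive the communication partner from the same expression, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size: depending on the parity of current_rank + i, a rank talks to either its left or its right ring neighbor. The following standalone sketch is illustrative only (the helper name pairedNeighbor is hypothetical and not part of these patches); it checks that the pairing is symmetric for an even number of ranks, which is what lets the blocking send and receive sides match up:

#include <cassert>
#include <iostream>

// Mirrors the neighbor expression used by the stream executors above:
// (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size
int pairedNeighbor(int rank, int iteration, int size) {
    return (rank - 1 + 2 * ((rank + iteration) % 2) + size) % size;
}

int main() {
    const int size = 4; // assumes an even number of MPI ranks
    for (int i = 0; i < 2; i++) {
        for (int rank = 0; rank < size; rank++) {
            int partner = pairedNeighbor(rank, i, size);
            // Symmetry check: the partner must pick this rank in the same
            // iteration, otherwise one side would wait on a transfer that
            // is never issued.
            assert(pairedNeighbor(partner, i, size) == rank);
            std::cout << "iteration " << i << ": rank " << rank
                      << " exchanges with rank " << partner << "\n";
        }
    }
    return 0;
}

For an odd rank count the parity trick breaks at the ring wrap-around, so the pairing only matches up when the benchmark runs with an even number of ranks.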