From a030a12798476af0fcb57427f01b14bf8f78cd4a Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 7 Apr 2022 10:51:47 +0100
Subject: [PATCH 001/318] Add ACCL as external dependency

---
 extern/CMakeLists.txt | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt
index 75025b7c..77d5e3ac 100644
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -28,7 +28,7 @@ FetchContent_Declare(
     # unfortunately they do not use releases, so the latest commit was used
     GIT_REPOSITORY  https://github.com/definelicht/hlslib.git
-    GIT_TAG v1.2.1)
+    GIT_TAG v1.4.3)
 
 FetchContent_GetProperties(extern_hlslib)
 if(NOT extern_hlslib_POPULATED)
@@ -54,3 +54,18 @@ if(NOT extern_cxxopts_POPULATED)
                     ${extern_cxxopts_BINARY_DIR} EXCLUDE_FROM_ALL)
 endif()
+
+# -------------------------------------------------------------------------------
+# ACCL Library
+FetchContent_Declare(
+    extern_accl
+
+    GIT_REPOSITORY  https://github.com/Mellich/ACCL.git
+    GIT_TAG dev)
+
+FetchContent_GetProperties(extern_accl)
+if(NOT extern_accl_POPULATED)
+    message(STATUS "Fetching mandatory build dependency ACCL")
+    FetchContent_Populate(extern_accl)
+    set(extern_accl_SOURCE_DIR ${extern_accl_SOURCE_DIR} PARENT_SCOPE)
+endif()

From 37eca9d1ae611038a2f71ba116e4c7c601049af8 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 10:28:10 +0100
Subject: [PATCH 002/318] Make base class more generic

---
 PTRANS/src/host/transpose_benchmark.hpp |  8 ++-
 b_eff/src/host/network_benchmark.hpp    |  6 ++-
 b_eff/tests/CMakeLists.txt              |  4 +-
 cmake/unitTestTargets.cmake             |  4 +-
 shared/include/hpcc_benchmark.hpp       | 27 ++++++----
 shared/include/setup/fpga_setup_xrt.hpp | 66 +++++++++++++++++++++++++
 6 files changed, 100 insertions(+), 15 deletions(-)
 create mode 100644 shared/include/setup/fpga_setup_xrt.hpp

diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp
index 5de333ca..0136c22c 100644
--- a/PTRANS/src/host/transpose_benchmark.hpp
+++ b/PTRANS/src/host/transpose_benchmark.hpp
@@ -46,8 +46,12 @@ namespace transpose {
 * @brief Implementation of the transpose benchmark
 *
 */
-class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark {
-
+class TransposeBenchmark :
+#ifndef USE_XRT_BINDINGS
+public hpcc_base::HpccFpgaBenchmark {
+#else
+// TODO initialize benchmark with XRT bindings
+#endif
 
 protected:

diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index 0fdf8064..df445649 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -213,8 +213,12 @@ class NetworkExecutionTimings {
 * @brief Implementation of the Network benchmark
 *
 */
-class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark {
-
+class NetworkBenchmark :
+#ifndef USE_XRT_BINDINGS
+public hpcc_base::HpccFpgaBenchmark {
+#else
+
+#endif
 
 protected:

diff --git a/b_eff/tests/CMakeLists.txt b/b_eff/tests/CMakeLists.txt
index 2a00ea83..be73f519 100755
--- a/b_eff/tests/CMakeLists.txt
+++ b/b_eff/tests/CMakeLists.txt
@@ -6,4 +6,6 @@ set(TEST_SOURCES test_kernel_functionality_and_host_integration.cpp)
 
 include(${CMAKE_SOURCE_DIR}/../cmake/unitTestTargets.cmake)
 
-target_link_libraries(${LIB_NAME}_intel ${MPI_LIBRARIES})
+if (INTELFPGAOPENCL_FOUND)
+    target_link_libraries(${LIB_NAME}_intel ${MPI_LIBRARIES})
+endif()

diff --git a/cmake/unitTestTargets.cmake b/cmake/unitTestTargets.cmake
index 2597017b..0f36d3da 100644
--- a/cmake/unitTestTargets.cmake
+++ b/cmake/unitTestTargets.cmake
@@ -24,7 +24,9 @@ if (Vitis_FOUND)
     add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES})
     target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}")
     target_link_libraries(${HOST_EXE_NAME}_test_xilinx hpcc_fpga_base_test)
-    add_dependencies(${HOST_EXE_NAME}_test_xilinx ${kernel_emulation_targets_xilinx})
+    if (NOT "${kernel_emulation_targets_xilinx}" STREQUAL "")
+        add_dependencies(${HOST_EXE_NAME}_test_xilinx "${kernel_emulation_targets_xilinx}")
+    endif()
     target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA)
     target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")
     foreach (kernel_target ${kernel_emulation_targets_xilinx})

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 17e17bb9..e85579d2 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -35,6 +35,9 @@ SOFTWARE.
 #endif
 
 /* Project's headers */
+#ifdef USE_XRT_BINDINGS
+#include "setup/fpga_setup_xrt.hpp"
+#endif
 #include "setup/fpga_setup.hpp"
 #include "cxxopts.hpp"
 #include "parameters.h"
@@ -176,7 +179,7 @@ class BaseSettings {
 *
 * @tparam TSettings The program settings class that should be used (Must derive from BaseSettings)
 */
-template
+template
 class ExecutionSettings {
 public:
@@ -190,19 +193,19 @@ class ExecutionSettings {
     * @brief The OpenCL device that should be used for execution
     *
     */
-    std::unique_ptr device;
+    std::unique_ptr device;
 
     /**
     * @brief The OpenCL context that should be used for execution
     *
     */
-    std::unique_ptr context;
+    std::unique_ptr context;
 
     /**
     * @brief The OpenCL program that contains the benchmark kernel
     *
     */
-    std::unique_ptr program;
+    std::unique_ptr program;
 
     /**
     * @brief Construct a new Execution Settings object
@@ -238,7 +241,7 @@
 * @tparam TData Class used to represent the benchmark input and output data
 * @tparam TOutput Class representing the measurements like timings etc
 */
-template
+template
 class HpccFpgaBenchmark {
 
 private:
@@ -258,7 +261,7 @@ class HpccFpgaBenchmark {
     * It should be also used by all other methods to read the current benchmark settings.
     *
     */
-    std::unique_ptr> executionSettings;
+    std::unique_ptr> executionSettings;
 
     /**
     * @brief Add additional options to the program parameter parser
@@ -472,20 +475,24 @@ class HpccFpgaBenchmark {
         std::unique_ptr programSettings = parseProgramParameters(tmp_argc, tmp_argv);
 
-        std::unique_ptr context;
-        std::unique_ptr program;
-        std::unique_ptr usedDevice;
+        std::unique_ptr context;
+        std::unique_ptr program;
+        std::unique_ptr usedDevice;
 
         if (!programSettings->testOnly) {
+#ifndef USE_XRT_BINDINGS
             usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform,
                                                       programSettings->defaultDevice);
 
             context = std::unique_ptr(new cl::Context(*usedDevice));
             program = fpga_setup::fpgaSetup(context.get(), {*usedDevice},
                                             &programSettings->kernelFileName);
+#else
+            // TODO: Select XRT device and program here!
+#endif
         }
 
-        executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice),
+        executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice),
                                                                     std::move(context), std::move(program)));
         if (mpi_comm_rank == 0) {
             if (!checkInputParameters()) {

diff --git a/shared/include/setup/fpga_setup_xrt.hpp b/shared/include/setup/fpga_setup_xrt.hpp
new file mode 100644
index 00000000..73f5c56f
--- /dev/null
+++ b/shared/include/setup/fpga_setup_xrt.hpp
@@ -0,0 +1,66 @@
+/*
+Copyright (c) 2022 Marius Meyer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+#ifndef SRC_HOST_FPGA_SETUP_XRT_H_
+#define SRC_HOST_FPGA_SETUP_XRT_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* External libraries */
+#include "xrt/xrt_device.h"
+
+
+namespace fpga_setup {
+
+/**
+Sets up the given FPGA with the kernel in the provided file.
+
+@param device The device used for the program
+@param usedKernelFile The path to the kernel file
+@return The program that is used to create the benchmark kernels
+*/
+    std::unique_ptr
+    fpgaSetupXRT(xrt::device &device,
+                 const std::string *usedKernelFile);
+
+
+/**
+Searches and selects an FPGA device using the CL library functions.
+If multiple platforms or devices are given, the user will be prompted to
+choose a device.
+
+@param defaultDevice The index of the device that has to be used. If a
+                     value < 0 is given, the device can be chosen
+                     interactively
+
+@return the selected device
+*/
+    std::unique_ptr
+    selectFPGADeviceXRT(int defaultDevice);
+
+} // namespace fpga_setup
+#endif // SRC_HOST_FPGA_SETUP_XRT_H_
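Patch 002 introduces the XRT-specific setup interface that the now-generic base class will use. A minimal sketch of how a host could combine the two declarations from fpga_setup_xrt.hpp (the return types were lost to the angle-bracket stripping above; this sketch assumes std::unique_ptr<xrt::device> for the device and a program handle for the setup call, and the xclbin file name is made up):

#include <memory>
#include <string>

#include "setup/fpga_setup_xrt.hpp"
#include "xrt/xrt_device.h"

int main() {
    // Pick the first XRT device; a value < 0 would trigger interactive selection.
    std::unique_ptr<xrt::device> device = fpga_setup::selectFPGADeviceXRT(0);

    // Program the FPGA with the given xclbin before any kernels are created.
    std::string kernelFile = "communication_PCIE.xclbin"; // illustrative name
    auto program = fpga_setup::fpgaSetupXRT(*device, &kernelFile);
    return program != nullptr ? 0 : 1;
}
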
#include "execution_types/execution_cpu.hpp" #include "execution_types/execution_pcie.hpp" -#include "execution_types/execution_iec.hpp" \ No newline at end of file +#ifdef INTEL_FPGA +#include "execution_types/execution_iec.hpp" +#endif diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp index 73156b7e..9e266cf8 100644 --- a/b_eff/src/host/execution_types/execution_pcie.hpp +++ b/b_eff/src/host/execution_types/execution_pcie.hpp @@ -45,6 +45,9 @@ namespace network::execution_types::pcie { int err; std::vector sendQueues; std::vector dummyBuffers; +#ifdef XILINX_FPGA + std::vector accesskernel; +#endif std::vector> dummyBufferContents; cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); @@ -59,6 +62,9 @@ namespace network::execution_types::pcie { for (uint r =0; r < config.programSettings->numRepetitions; r++) { sendQueues.clear(); dummyBuffers.clear(); +#ifdef XILINX_FPGA + accesskernel.clear(); +#endif dummyBufferContents.clear(); // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { @@ -66,6 +72,23 @@ namespace network::execution_types::pcie { dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); ASSERT_CL(err) +#ifdef XILINX_FPGA + accesskernel.push_back(cl::Kernel(*config.program, + ("accessMemory_0:{accessMemory_0_" + std::to_string(r + 1) + "}").c_str(), &err)); + + err = accesskernel[r].setArg(0, dummyBuffers[r]); + ASSERT_CL(err); + err = accesskernel[r].setArg(1, dummyBuffers[r]); + ASSERT_CL(err); + err = accesskernel[r].setArg(2, static_cast(0)); + ASSERT_CL(err); + err = accesskernel[r].setArg(3, static_cast(0)); + ASSERT_CL(err); + err = accesskernel[r].setArg(4,(1)); + ASSERT_CL(err); + err = accesskernel[r].setArg(5, cl_uint(0)); + ASSERT_CL(err); +#endif dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 7bf728a2..e2b8b830 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -109,8 +109,10 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { switch (executionSettings->programSettings->communicationType) { case hpcc_base::CommunicationType::cpu_only: timing = execution_types::cpu::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; case hpcc_base::CommunicationType::pcie_mpi: timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; - case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; - default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType)); +#ifdef INTEL_FPGA + case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; +#endif + default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType)); 
} timing_results.push_back(timing); } diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index df445649..b6d348d0 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -31,6 +31,33 @@ SOFTWARE. #include "hpcc_benchmark.hpp" #include "parameters.h" +#ifdef XILINX_FPGA +template +struct aligned_allocator { + + // typedefs + typedef T value_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + + pointer allocate(size_t pCount, const_pointer = 0){ + T* mem = 0; + if (posix_memalign(reinterpret_cast(&mem), 1024 , sizeof(T) * pCount) != 0) { + throw std::bad_alloc(); + } + return mem; + } + + void deallocate(pointer pPtr, size_t pCount) { + free(pPtr); + } +}; + +namespace cl { + template using vector = std::vector>; +} +#endif + /** * @brief Contains all classes and methods needed by the Network benchmark * From b96935a9a41a3f75e2ca190e44e7d819b166aa3d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 8 Apr 2022 13:46:26 +0100 Subject: [PATCH 004/318] Fix calculate functions for new templating --- .../host/execution_types/execution_cpu.hpp | 22 +++---------------- .../host/execution_types/execution_pcie.hpp | 3 ++- 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp index 778dc2f1..f70cedf1 100644 --- a/b_eff/src/host/execution_types/execution_cpu.hpp +++ b/b_eff/src/host/execution_types/execution_cpu.hpp @@ -38,13 +38,12 @@ namespace network::execution_types::cpu { Implementation for the single kernel. @copydoc bm_execution::calculate() */ + template std::shared_ptr - calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { int err; - std::vector sendQueues; - std::vector dummyBuffers; std::vector> dummyBufferContents; cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); @@ -57,24 +56,10 @@ namespace network::execution_types::cpu { std::vector calculationTimings; for (uint r =0; r < config.programSettings->numRepetitions; r++) { - sendQueues.clear(); - dummyBuffers.clear(); dummyBufferContents.clear(); // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { - - dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); - ASSERT_CL(err) - dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); - - cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); - ASSERT_CL(err) - - sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data()); - - sendQueues.push_back(sendQueue); - } double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { @@ -102,8 +87,7 @@ namespace network::execution_types::cpu { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! 
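The aligned_allocator added to network_benchmark.hpp exists so host buffers meet XRT's alignment expectations; its template syntax was stripped above along with the other angle brackets. A self-contained version with the parameters spelled out, plus a quick alignment check (posix_memalign is POSIX; the 1024-byte constant mirrors the patch):

#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <new>
#include <vector>

// Same idea as the allocator above, template syntax restored for illustration.
template <typename T>
struct aligned_allocator {
    typedef T value_type;

    T* allocate(std::size_t count) {
        void* mem = nullptr;
        // 1024-byte alignment so the runtime can DMA the buffer without copies.
        if (posix_memalign(&mem, 1024, sizeof(T) * count) != 0) {
            throw std::bad_alloc();
        }
        return static_cast<T*>(mem);
    }

    void deallocate(T* ptr, std::size_t) { free(ptr); }

    bool operator==(const aligned_allocator&) const { return true; }
    bool operator!=(const aligned_allocator&) const { return false; }
};

int main() {
    std::vector<char, aligned_allocator<char>> buffer(1 << 20);
    std::cout << "aligned: "
              << (reinterpret_cast<std::uintptr_t>(buffer.data()) % 1024 == 0)
              << std::endl;
}
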
From b96935a9a41a3f75e2ca190e44e7d819b166aa3d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 13:46:26 +0100
Subject: [PATCH 004/318] Fix calculate functions for new templating

---
 .../host/execution_types/execution_cpu.hpp   | 22 +++----------------
 .../host/execution_types/execution_pcie.hpp  |  3 ++-
 2 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp
index 778dc2f1..f70cedf1 100644
--- a/b_eff/src/host/execution_types/execution_cpu.hpp
+++ b/b_eff/src/host/execution_types/execution_cpu.hpp
@@ -38,13 +38,12 @@ namespace network::execution_types::cpu {
 Implementation for the single kernel.
 @copydoc bm_execution::calculate()
 */
+    template
    std::shared_ptr
-    calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength,
+    calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength,
              cl::vector &validationData) {
 
        int err;
-        std::vector sendQueues;
-        std::vector dummyBuffers;
        std::vector> dummyBufferContents;
 
        cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize));
@@ -57,24 +56,10 @@ namespace network::execution_types::cpu {
        std::vector calculationTimings;
        for (uint r =0; r < config.programSettings->numRepetitions; r++) {
-            sendQueues.clear();
-            dummyBuffers.clear();
            dummyBufferContents.clear();
            // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels
            for (int r = 0; r < config.programSettings->kernelReplications; r++) {
-
-                dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err));
-                ASSERT_CL(err)
                dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255)));
-
-                cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err);
-                ASSERT_CL(err)
-
-                sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data());
-
-                sendQueues.push_back(sendQueue);
-
            }
            double calculationTime = 0.0;
            for (int i = 0; i < config.programSettings->kernelReplications; i++) {
@@ -102,8 +87,7 @@ namespace network::execution_types::cpu {
        // Read validation data from FPGA will be placed sequentially in buffer for all replications
        // The data order should not matter, because every byte should have the same value!
        for (int r = 0; r < config.programSettings->kernelReplications; r++) {
-            err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]);
-            ASSERT_CL(err);
+            std::copy(dummyBufferContents[r].begin(), dummyBufferContents[r].end(),validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r);
        }
        std::shared_ptr result(new network::ExecutionTimings{
                looplength,

diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp
index 9e266cf8..274e1c1d 100644
--- a/b_eff/src/host/execution_types/execution_pcie.hpp
+++ b/b_eff/src/host/execution_types/execution_pcie.hpp
@@ -38,8 +38,9 @@ namespace network::execution_types::pcie {
 Implementation for the single kernel.
 @copydoc bm_execution::calculate()
 */
+    template
    std::shared_ptr
-    calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength,
+    calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength,
              cl::vector &validationData) {
 
    int err;

From 7bd92e5d856b5c0488eff9e3bf9ca98111516af4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 14:05:45 +0100
Subject: [PATCH 005/318] Add ACCL to b_eff host build

---
 b_eff/CMakeLists.txt          | 2 ++
 b_eff/src/host/CMakeLists.txt | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt
index 13d93b1b..4b4fbb41 100755
--- a/b_eff/CMakeLists.txt
+++ b/b_eff/CMakeLists.txt
@@ -24,3 +24,5 @@ include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake)
 unset(DATA_TYPE CACHE)
 
 find_package(MPI REQUIRED)
+include(${extern_accl_SOURCE_DIR}/driver/xrt/CMakeLists.txt)
+

diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt
index 9809208c..b8c44859 100755
--- a/b_eff/src/host/CMakeLists.txt
+++ b/b_eff/src/host/CMakeLists.txt
@@ -24,6 +24,8 @@ if (Vitis_FOUND)
     add_executable(${HOST_EXE_NAME}_xilinx main.cpp)
     target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}")
     target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base)
+    target_link_libraries(${LIB_NAME}_xilinx accl)
+    target_include_directories(${LIB_NAME}_xilinx PRIVATE ${ACCL_INCLUDE_PATH})
     target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx)
     target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA)
     target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")
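Patch 004 makes the calculate() functions templates over the device, context, and program types, and the CPU path now fills the validation buffer with a plain std::copy instead of an OpenCL read: each kernel replication owns one equal slice of validationData. A small illustration of that slicing (sizes and names invented for the example):

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

int main() {
    const int replications = 2; // stand-in for kernelReplications
    std::vector<std::vector<char>> bufferContents(replications,
                                                  std::vector<char>(8, 0));
    bufferContents[0].assign(8, 'a');
    bufferContents[1].assign(8, 'b');

    // validationData receives one equal slice per replication, in order.
    std::vector<char> validationData(replications * 8);
    const std::size_t slice = validationData.size() / replications;
    for (int r = 0; r < replications; r++) {
        std::copy(bufferContents[r].begin(), bufferContents[r].begin() + slice,
                  validationData.begin() + slice * r);
    }
    std::cout << std::string(validationData.begin(), validationData.end())
              << std::endl; // prints aaaaaaaabbbbbbbb
}
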
From bccab2d9a01f9afad023ef0ec7cb8175e7d72086 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 14:16:54 +0100
Subject: [PATCH 006/318] Temporarily remove OCL device setup to work with
 ACCL emulator

---
 b_eff/src/host/network_benchmark.hpp |  3 +++
 shared/include/hpcc_benchmark.hpp    | 11 +++++++----
 shared/setup/fpga_setup.cpp          |  8 ++++++--
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index b6d348d0..232bfd56 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -31,6 +31,8 @@ SOFTWARE.
 #include "hpcc_benchmark.hpp"
 #include "parameters.h"
 
+// TODO: remove this custom allocator since cl2.hpp is available here?
+#if 0
 #ifdef XILINX_FPGA
 template
 struct aligned_allocator {
@@ -57,6 +59,7 @@ namespace cl {
     template using vector = std::vector>;
 }
 #endif
+#endif

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index e85579d2..f4f7c080 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -480,6 +480,8 @@ class HpccFpgaBenchmark {
         std::unique_ptr usedDevice;
 
         if (!programSettings->testOnly) {
+// TODO: This is temporarily excluded to only use the ACCL emulator!
+#if 0
 #ifndef USE_XRT_BINDINGS
             usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform,
                                                       programSettings->defaultDevice);
@@ -490,6 +492,7 @@ class HpccFpgaBenchmark {
 #else
             // TODO: Select XRT device and program here!
 #endif
+#endif
         }
@@ -609,7 +612,7 @@ class HpccFpgaBenchmark {
     *
     * @return ExecutionSettings& The execution settings object
     */
-    ExecutionSettings& getExecutionSettings() {
+    ExecutionSettings& getExecutionSettings() {
         return *executionSettings;
     }
@@ -664,12 +667,12 @@
 * @param printedExecutionSettings The execution settings that have to be printed to the stream
 * @return std::ostream& The output stream after the execution settings are piped in
 */
-template
-std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){
+template
+std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){
     std::string device_name;
     os << std::left;
     if (!printedExecutionSettings.programSettings->testOnly) {
-        printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name);
+//        printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name);
     }
     else {
         device_name = "TEST RUN: Not selected!";

diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp
index dd1ddd28..e923545a 100644
--- a/shared/setup/fpga_setup.cpp
+++ b/shared/setup/fpga_setup.cpp
@@ -135,7 +135,8 @@ Sets up the given FPGA with the kernel in the provided file.
 #ifdef _USE_MPI_
     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 #endif
-
+// TODO: This is temporarily excluded to work with ACCL emulator without emulation bitstream!
+#if 0
     if (world_rank == 0) {
         std::cout << HLINE;
         std::cout << "FPGA Setup:" << usedKernelFile->c_str() << std::endl;
@@ -176,6 +177,9 @@
         std::cout << HLINE;
     }
     return std::unique_ptr(new cl::Program(program));
+#else
+    return std::unique_ptr(nullptr);
+#endif
 }

 /**
@@ -322,4 +326,4 @@ choose a device.
     return std::unique_ptr(new cl::Device(deviceList[chosenDeviceId]));
 }
 
-} // namespace fpga_setup
\ No newline at end of file
+} // namespace fpga_setup

From 16c5b8af003eda0765b238225eded858d50eebb0 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 17:17:50 +0100
Subject: [PATCH 007/318] First minimal version working with Simulator

---
 b_eff/src/host/execution_types/execution.hpp |   1 +
 .../host/execution_types/execution_accl.hpp  | 130 ++++++++++++++++++
 b_eff/src/host/network_benchmark.cpp         |   1 +
 shared/include/communication_types.hpp       |  10 +-
 4 files changed, 140 insertions(+), 2 deletions(-)
 create mode 100644 b_eff/src/host/execution_types/execution_accl.hpp

diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp
index 22a8a12e..c36459a4 100644
--- a/b_eff/src/host/execution_types/execution.hpp
+++ b/b_eff/src/host/execution_types/execution.hpp
@@ -25,3 +25,4 @@ SOFTWARE.
 #ifdef INTEL_FPGA
 #include "execution_types/execution_iec.hpp"
 #endif
+#include "execution_types/execution_accl.hpp"

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
new file mode 100644
index 00000000..67d8d03a
--- /dev/null
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -0,0 +1,130 @@
+/*
+Copyright (c) 2022 Marius Meyer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_HPP
+#define SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_HPP
+
+/* C++ standard library headers */
+#include
+#include
+#include
+
+/* External library headers */
+#include "mpi.h"
+#include "accl.hpp"
+
+/* Project's headers */
+
+namespace network::execution_types::accl {
+
+    /*
+    Implementation for the single kernel.
+    @copydoc bm_execution::calculate()
+    */
+    template
+    std::shared_ptr
+    calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength,
+              cl::vector &validationData) {
+
+        int err;
+        std::vector> dummyBufferContents;
+        std::vector> recvBufferContents;
+        std::vector>> acclSendBuffers;
+        std::vector>> acclRecvBuffers;
+        cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize));
+
+        int current_rank;
+        MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
+
+        int current_size;
+        MPI_Comm_size(MPI_COMM_WORLD, & current_size);
+
+        std::cout << "Setup ACCL..." << std::endl;
+
+        std::vector ranks = {};
+        for (int i = 0; i < current_size; ++i) {
+            ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i,
+                                     1024};
+            ranks.emplace_back(new_rank);
+        }
+        // TODO: Add start port here. Currently hardcoded!
+        ACCL::ACCL accl(ranks, current_rank,
+                        "tcp://localhost:" +
+                        std::to_string(5500 + current_rank));
+        std::cout << "Start sending..." << std::endl;
+        std::vector calculationTimings;
+        for (uint r =0; r < config.programSettings->numRepetitions; r++) {
+            dummyBufferContents.clear();
+            recvBufferContents.clear();
+            acclSendBuffers.clear();
+            acclRecvBuffers.clear();
+            // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels
+            for (int r = 0; r < config.programSettings->kernelReplications; r++) {
+                dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255)));
+                recvBufferContents.emplace_back(size_in_bytes, static_cast(0));
+                acclSendBuffers.push_back(accl.create_buffer(dummyBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclRecvBuffers.push_back(accl.create_buffer(recvBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclSendBuffers.back()->sync_to_device();
+                acclRecvBuffers.back()->sync_to_device();
+            }
+            std::cout << "Buffers prepared" << std::endl;
+            double calculationTime = 0.0;
+            for (int i = 0; i < config.programSettings->kernelReplications; i++) {
+                MPI_Barrier(MPI_COMM_WORLD);
+                auto startCalculation = std::chrono::high_resolution_clock::now();
+                for (int l = 0; l < looplength; l++) {
+                    std::cout << "Send from " << current_rank << " to " << (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size << std::endl;
+                    accl.send(0, *acclSendBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                    accl.recv(0, *acclRecvBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+//                    MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0,
+//                                 dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                }
+                auto endCalculation = std::chrono::high_resolution_clock::now();
+                calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count();
+                #ifndef NDEBUG
+                int current_rank;
+                MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
+                std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl;
+                #endif
+            }
+            calculationTimings.push_back(calculationTime);
+#ifndef NDEBUG
+            int current_rank;
+            MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
+            std::cout << "Rank " << current_rank << ": Done " << r << std::endl;
+#endif
+        }
+        // Read validation data from FPGA will be placed sequentially in buffer for all replications
+        // The data order should not matter, because every byte should have the same value!
+        for (int r = 0; r < config.programSettings->kernelReplications; r++) {
+            std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(),validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r);
+        }
+        std::shared_ptr result(new network::ExecutionTimings{
+                looplength,
+                messageSize,
+                calculationTimings
+        });
+        return result;
+    }
+
+} // namespace bm_execution
+
+#endif

diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp
index e2b8b830..40332b85 100644
--- a/b_eff/src/host/network_benchmark.cpp
+++ b/b_eff/src/host/network_benchmark.cpp
@@ -112,6 +112,7 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) {
 #ifdef INTEL_FPGA
             case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
 #endif
+            case hpcc_base::CommunicationType::accl: timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
             default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType));
         }
         timing_results.push_back(timing);
     }

diff --git a/shared/include/communication_types.hpp b/shared/include/communication_types.hpp
index bb46bb8d..1d74aa6f 100644
--- a/shared/include/communication_types.hpp
+++ b/shared/include/communication_types.hpp
@@ -52,6 +52,11 @@ typedef enum _CommunicationType {
     */
    smi,
 
+    /**
+     * @brief Communication using ACCL
+     */
+    accl,
+
    /**
     * @brief Calculate the benchmark on CPU instead of FPGA
     *
@@ -75,7 +80,8 @@
 static const std::map comm_to_str_map{
    {"IEC", CommunicationType::intel_external_channels},
    {"PCIE", CommunicationType::pcie_mpi},
-    {"SMI", CommunicationType::smi},
+    {"SMI", CommunicationType::smi},
+    {"ACCL", CommunicationType::accl},
    {"CPU", CommunicationType::cpu_only},
    {"UNSUPPORTED", CommunicationType::unsupported},
    {"AUTO", CommunicationType::automatic}
@@ -121,4 +127,4 @@ static CommunicationType retrieveCommunicationType(std::string comm_name, std::s
    }
 }
 
-#endif
\ No newline at end of file
+#endif
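The partner expression used in the send/recv calls above pairs each rank with one of its ring neighbours and flips the pairing with the replication index i, so partners are always mutual and every send has a matching receive. A tiny program that prints the resulting pairs (world size chosen for the example):

#include <iostream>

int main() {
    const int size = 4; // illustrative world size
    for (int i = 0; i < 2; i++) {
        for (int rank = 0; rank < size; rank++) {
            // Same formula as in the benchmark: even/odd ranks alternate
            // between their lower and upper ring neighbour per replication.
            int partner = (rank - 1 + 2 * ((rank + i) % 2) + size) % size;
            std::cout << "i=" << i << ": " << rank << " <-> " << partner << "\n";
        }
    }
    // i=0 pairs (0,3) and (1,2); i=1 pairs (0,1) and (2,3).
}
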
From 137e27b84fd2ee7d35e077305280d02956298302 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 17:41:28 +0100
Subject: [PATCH 008/318] Move ACCL setup to fpga setup

---
 b_eff/src/host/network_benchmark.hpp          |  4 +-
 cmake/general_benchmark_build_setup.cmake     |  4 ++
 shared/CMakeLists.txt                         |  4 ++
 shared/include/hpcc_benchmark.hpp             | 18 +++----
 ...fpga_setup_xrt.hpp => fpga_setup_accl.hpp} |  7 +--
 shared/setup/fpga_setup.cpp                   |  5 --
 shared/setup/fpga_setup_accl.cpp              | 50 +++++++++++++++++++
 7 files changed, 72 insertions(+), 20 deletions(-)
 rename shared/include/setup/{fpga_setup_xrt.hpp => fpga_setup_accl.hpp} (93%)
 create mode 100644 shared/setup/fpga_setup_accl.cpp

diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index 232bfd56..89ff9fe0 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -244,10 +244,10 @@ class NetworkExecutionTimings {
 *
 */
 class NetworkBenchmark :
-#ifndef USE_XRT_BINDINGS
+#ifndef USE_ACCL
 public hpcc_base::HpccFpgaBenchmark {
 #else
-
+public hpcc_base::HpccFpgaBenchmark {
 #endif
 
 protected:

diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake
index 64aa8d0a..66153e5f 100644
--- a/cmake/general_benchmark_build_setup.cmake
+++ b/cmake/general_benchmark_build_setup.cmake
@@ -30,6 +30,7 @@ set(USE_OPENMP ${USE_OPENMP} CACHE BOOL "Use OpenMP in the host code")
 set(USE_MPI ${USE_MPI} CACHE BOOL "Compile the host code with MPI support. This has to be supported by the host code.")
 set(USE_SVM No CACHE BOOL "Use SVM pointers instead of creating buffers on the board and transferring the data there before execution.")
 set(USE_HBM No CACHE BOOL "Use host code specific to HBM FPGAs")
+set(USE_ACCL No CACHE BOOL "Use ACCL for communication")
 set(USE_CUSTOM_KERNEL_TARGETS No CACHE BOOL "Enable build targets for custom kernels")
 set(USE_DEPRECATED_HPP_HEADER ${header_default} CACHE BOOL "Flag that indicates if the old C++ wrapper header should be used (cl.hpp) or the newer version (cl2.hpp or opencl.hpp)")
 set(HPCC_FPGA_CONFIG ${HPCC_FPGA_CONFIG} CACHE FILEPATH "Configuration file that is used to overwrite the default configuration")
@@ -86,6 +87,9 @@ if (USE_MPI)
     include_directories(${MPI_CXX_INCLUDE_PATH})
     link_libraries(${MPI_LIBRARIES})
 endif()
+if (USE_ACCL)
+    add_definitions(-DUSE_ACCL)
+endif()
 
 # Add configuration time to build
 string(TIMESTAMP CONFIG_TIME "%a %b %d %H:%M:%S UTC %Y" UTC)

diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt
index 89a18117..a7e8390b 100644
--- a/shared/CMakeLists.txt
+++ b/shared/CMakeLists.txt
@@ -2,6 +2,10 @@ project(HPCCBaseLibrary VERSION 1.0.1)
 
 add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp)
 
+if (defined USE_ACCL)
+add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp)
+endif()
+
 find_package(OpenCL QUIET)
 
 if (INTELFPGAOPENCL_FOUND)

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index f4f7c080..dd9b022a 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -35,8 +35,8 @@ SOFTWARE.
 #endif
 
 /* Project's headers */
-#ifdef USE_XRT_BINDINGS
-#include "setup/fpga_setup_xrt.hpp"
+#ifdef USE_ACCL
+#include "setup/fpga_setup_accl.hpp"
 #endif
 #include "setup/fpga_setup.hpp"
 #include "cxxopts.hpp"
 #include "parameters.h"
@@ -480,9 +480,7 @@ class HpccFpgaBenchmark {
         std::unique_ptr usedDevice;
 
         if (!programSettings->testOnly) {
-// TODO: This is temporarily excluded to only use the ACCL emulator!
-#if 0
-#ifndef USE_XRT_BINDINGS
+#ifndef USE_ACCL
             usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform,
                                                       programSettings->defaultDevice);
@@ -490,9 +488,9 @@ class HpccFpgaBenchmark {
             program = fpga_setup::fpgaSetup(context.get(), {*usedDevice},
                                             &programSettings->kernelFileName);
 #else
-            // TODO: Select XRT device and program here!
+            program = fpga_setup::fpgaSetupACCL(*usedDevice,
+                                                &programSettings->kernelFileName);
 #endif
-#endif
         }

diff --git a/shared/include/setup/fpga_setup_xrt.hpp b/shared/include/setup/fpga_setup_accl.hpp
similarity index 93%
rename from shared/include/setup/fpga_setup_xrt.hpp
rename to shared/include/setup/fpga_setup_accl.hpp
index 73f5c56f..cfc1abe4 100644
--- a/shared/include/setup/fpga_setup_xrt.hpp
+++ b/shared/include/setup/fpga_setup_accl.hpp
@@ -32,6 +32,7 @@ SOFTWARE.
 
 /* External libraries */
 #include "xrt/xrt_device.h"
+#include "accl.hpp"
 
 
 namespace fpga_setup {
@@ -41,10 +42,10 @@ Sets up the given FPGA with the kernel in the provided file.
 
 @param device The device used for the program
 @param usedKernelFile The path to the kernel file
-@return The program that is used to create the benchmark kernels
+@return The ACCL instance used for communication
*/
-    std::unique_ptr
-    fpgaSetupXRT(xrt::device &device,
+    std::unique_ptr
+    fpgaSetupACCL(xrt::device &device,
                  const std::string *usedKernelFile);

diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp
index e923545a..aba9b8b2 100644
--- a/shared/setup/fpga_setup.cpp
+++ b/shared/setup/fpga_setup.cpp
@@ -135,8 +135,6 @@ Sets up the given FPGA with the kernel in the provided file.
 #ifdef _USE_MPI_
     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
 #endif
-// TODO: This is temporarily excluded to work with ACCL emulator without emulation bitstream!
-#if 0
     if (world_rank == 0) {
         std::cout << HLINE;
         std::cout << "FPGA Setup:" << usedKernelFile->c_str() << std::endl;
@@ -177,9 +175,6 @@
         std::cout << HLINE;
     }
     return std::unique_ptr(new cl::Program(program));
-#else
-    return std::unique_ptr(nullptr);
-#endif
 }

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
new file mode 100644
index 00000000..e0cf3723
--- /dev/null
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -0,0 +1,50 @@
+//
+// Created by Marius Meyer on 04.12.19.
+//
+
+#include "setup/fpga_setup_accl.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* External libraries */
+#include "parameters.h"
+
+#ifdef _USE_MPI_
+#include "mpi.h"
+#endif
+
+namespace fpga_setup {
+
+    std::unique_ptr
+    fpgaSetup(xrt::device &context,
+              const std::string *usedKernelFile) {
+        int current_rank;
+        MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
+
+        int current_size;
+        MPI_Comm_size(MPI_COMM_WORLD, & current_size);
+
+        std::vector ranks = {};
+        for (int i = 0; i < current_size; ++i) {
+            ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i,
+                                     1024};
+            ranks.emplace_back(new_rank);
+        }
+        // TODO: Add start port here. Currently hardcoded!
+        return std::unique_ptr(new ACCL::ACCL(ranks, current_rank,
+                                              "tcp://localhost:" +
+                                              std::to_string(5500 + current_rank)));
+    }
+
+
+    std::unique_ptr
+    selectFPGADevice(int defaultDevice) {
+        return std::unique_ptr(nullptr);
+    }
+
+} // namespace fpga_setup
\ No newline at end of file

From 364db3c1bfde3ffabc0bd78fc521d4b1207b247d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 17:44:08 +0100
Subject: [PATCH 009/318] Use global ACCL instance

---
 .../host/execution_types/execution_accl.hpp | 26 ++++---------------
 1 file changed, 5 insertions(+), 21 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
index 67d8d03a..d4822e3d 100644
--- a/b_eff/src/host/execution_types/execution_accl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -57,19 +57,6 @@ namespace network::execution_types::accl {
         int current_size;
         MPI_Comm_size(MPI_COMM_WORLD, & current_size);
 
-        std::cout << "Setup ACCL..." << std::endl;
-
-        std::vector ranks = {};
-        for (int i = 0; i < current_size; ++i) {
-            ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i,
-                                     1024};
-            ranks.emplace_back(new_rank);
-        }
-        // TODO: Add start port here. Currently hardcoded!
-        ACCL::ACCL accl(ranks, current_rank,
-                        "tcp://localhost:" +
-                        std::to_string(5500 + current_rank));
-        std::cout << "Start sending..." << std::endl;
         std::vector calculationTimings;
         for (uint r =0; r < config.programSettings->numRepetitions; r++) {
@@ -80,8 +67,8 @@ namespace network::execution_types::accl {
             for (int r = 0; r < config.programSettings->kernelReplications; r++) {
                 dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255)));
                 recvBufferContents.emplace_back(size_in_bytes, static_cast(0));
-                acclSendBuffers.push_back(accl.create_buffer(dummyBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
-                acclRecvBuffers.push_back(accl.create_buffer(recvBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclSendBuffers.push_back(config.program->create_buffer(dummyBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclRecvBuffers.push_back(config.program->create_buffer(recvBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
                 acclSendBuffers.back()->sync_to_device();
                 acclRecvBuffers.back()->sync_to_device();
             }
@@ -90,17 +77,10 @@ namespace network::execution_types::accl {
-            std::cout << "Buffers prepared" << std::endl;
 
             double calculationTime = 0.0;
             for (int i = 0; i < config.programSettings->kernelReplications; i++) {
                 MPI_Barrier(MPI_COMM_WORLD);
                 auto startCalculation = std::chrono::high_resolution_clock::now();
                 for (int l = 0; l < looplength; l++) {
-                    std::cout << "Send from " << current_rank << " to " << (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size << std::endl;
-                    accl.send(0, *acclSendBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
-                    accl.recv(0, *acclRecvBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
-//                    MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0,
-//                                 dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+                    config.program->send(0, *acclSendBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                    config.program->recv(0, *acclRecvBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
                 }
                 auto endCalculation = std::chrono::high_resolution_clock::now();
                 calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count();
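In the emulator setup that patch 008 moves into fpga_setup_accl.cpp, every rank is a local process reachable on a localhost port counted up from 5500. A self-contained sketch of that rank-table construction (the struct is a stand-in for ACCL::rank_t; its field names are assumptions inferred from the initializer order used above):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for ACCL::rank_t with the four fields used in the patch above:
// IP address, port, rank/session id, and a buffer size.
struct rank_t {
    std::string ip;
    int port;
    int rank_id;
    uint64_t buffer_size;
};

int main() {
    const int world_size = 4; // normally taken from MPI_Comm_size
    std::vector<rank_t> ranks;
    for (int i = 0; i < world_size; ++i) {
        // One emulator endpoint per rank, ports counted up from 5500.
        ranks.push_back({"127.0.0.1", 5500 + i, i, 1024});
    }
    for (const auto &r : ranks) {
        std::cout << r.ip << ":" << r.port << " (rank " << r.rank_id << ")\n";
    }
}
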
From 62afe38615dfaa7c9badf4fcd4de3706b6350a04 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 8 Apr 2022 18:59:23 +0100
Subject: [PATCH 010/318] Complete move ACCL setup to FPGA setup

---
 b_eff/src/host/execution_types/execution.hpp     |  2 ++
 .../src/host/execution_types/execution_accl.hpp  |  4 ++--
 b_eff/src/host/network_benchmark.cpp             |  4 +++-
 b_eff/src/host/network_benchmark.hpp             |  2 +-
 shared/CMakeLists.txt                            |  9 ++++++---
 shared/include/hpcc_benchmark.hpp                | 16 +++++++++-------
 shared/setup/fpga_setup_accl.cpp                 |  4 ++--
 7 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp
index c36459a4..33d3b0a6 100644
--- a/b_eff/src/host/execution_types/execution.hpp
+++ b/b_eff/src/host/execution_types/execution.hpp
@@ -21,8 +21,10 @@ SOFTWARE.
 */
 #include "execution_types/execution_cpu.hpp"
+#ifndef USE_ACCL
 #include "execution_types/execution_pcie.hpp"
 #ifdef INTEL_FPGA
 #include "execution_types/execution_iec.hpp"
 #endif
+#endif
 #include "execution_types/execution_accl.hpp"

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
index d4822e3d..f1f5736d 100644
--- a/b_eff/src/host/execution_types/execution_accl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -67,8 +67,8 @@ namespace network::execution_types::accl {
                 dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255)));
                 recvBufferContents.emplace_back(size_in_bytes, static_cast(0));
-                acclSendBuffers.push_back(config.program->create_buffer(dummyBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
-                acclRecvBuffers.push_back(config.program->create_buffer(recvBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclSendBuffers.push_back(config.program->create_buffer(dummyBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclRecvBuffers.push_back(config.program->create_buffer(recvBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
                 acclSendBuffers.back()->sync_to_device();
                 acclRecvBuffers.back()->sync_to_device();

diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp
index 40332b85..09872106 100644
--- a/b_eff/src/host/network_benchmark.cpp
+++ b/b_eff/src/host/network_benchmark.cpp
@@ -108,9 +108,11 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) {
         std::shared_ptr timing;
         switch (executionSettings->programSettings->communicationType) {
             case hpcc_base::CommunicationType::cpu_only: timing = execution_types::cpu::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
+#ifndef USE_ACCL
             case hpcc_base::CommunicationType::pcie_mpi: timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
 #ifdef INTEL_FPGA
             case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
+#endif
 #endif
             case hpcc_base::CommunicationType::accl: timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
             default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType));

diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index 89ff9fe0..cfe9a25e 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -247,7 +247,7 @@ class NetworkBenchmark :
 #ifndef USE_ACCL
 public hpcc_base::HpccFpgaBenchmark {
 #else
-public hpcc_base::HpccFpgaBenchmark {
+public hpcc_base::HpccFpgaBenchmark {
 #endif
 
 protected:

diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt
index a7e8390b..d2fba9a2 100644
--- a/shared/CMakeLists.txt
+++ b/shared/CMakeLists.txt
@@ -1,9 +1,12 @@
 project(HPCCBaseLibrary VERSION 1.0.1)
 
-add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp)
 
-if (defined USE_ACCL)
-add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp)
+if (USE_ACCL)
+add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp)
+target_include_directories(hpcc_fpga_base PRIVATE ${ACCL_INCLUDE_PATH})
+target_link_libraries(hpcc_fpga_base accl)
+else()
+add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp)
 endif()
 
 find_package(OpenCL QUIET)

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index dd9b022a..5d451ff4 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -215,8 +215,8 @@ class ExecutionSettings {
     * @param context_ Used OpenCL context
     * @param program_ Used OpenCL program
     */
-    ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_,
-                      std::unique_ptr context_, std::unique_ptr program_):
+    ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_,
+                      std::unique_ptr context_, std::unique_ptr program_):
                                     programSettings(std::move(programSettings_)), device(std::move(device_)),
                                     context(std::move(context_)), program(std::move(program_)) {}
@@ -488,7 +488,7 @@ class HpccFpgaBenchmark {
             program = fpga_setup::fpgaSetup(context.get(), {*usedDevice},
                                             &programSettings->kernelFileName);
 #else
-            program = fpga_setup::fpgaSetupACCL(*usedDevice,
+            program = fpga_setup::fpgaSetupACCL(*usedDevice,
                                                 &programSettings->kernelFileName);
 #endif
         }
@@ -665,13 +665,15 @@
 * @param printedExecutionSettings The execution settings that have to be printed to the stream
 * @return std::ostream& The output stream after the execution settings are piped in
 */
-template
-std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){
+template
+std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){
     std::string device_name;
     os << std::left;
     if (!printedExecutionSettings.programSettings->testOnly) {
-//        printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name);
-    }
+#ifndef USE_ACCL
+        printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name);
+#endif
+    }
     else {
         device_name = "TEST RUN: Not selected!";
     }

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index e0cf3723..14eddc18 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -21,7 +21,7 @@ namespace fpga_setup {
 
     std::unique_ptr
-    fpgaSetup(xrt::device &context,
+    fpgaSetupACCL(xrt::device &context,
               const std::string *usedKernelFile) {
         int current_rank;
         MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
@@ -47,4 +47,4 @@ namespace fpga_setup {
         return std::unique_ptr(nullptr);
     }
 
-} // namespace fpga_setup
\ No newline at end of file
+} // namespace fpga_setup

From 82dd098ce7dc8ddac8c91f02a08458fc918aa1f4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 11 Apr 2022 14:33:01 +0100
Subject: [PATCH 011/318] Fix ACCL bugs in b_eff

---
 b_eff/src/host/execution_types/execution_accl.hpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
index f1f5736d..5db9093c 100644
--- a/b_eff/src/host/execution_types/execution_accl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -63,12 +63,13 @@ namespace network::execution_types::accl {
             recvBufferContents.clear();
             acclSendBuffers.clear();
             acclRecvBuffers.clear();
+            int size_in_values = (size_in_bytes + 3) / 4;
             // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels
             for (int r = 0; r < config.programSettings->kernelReplications; r++) {
                 dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255)));
                 recvBufferContents.emplace_back(size_in_bytes, static_cast(0));
-                acclSendBuffers.push_back(config.program->create_buffer(dummyBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
-                acclRecvBuffers.push_back(config.program->create_buffer(recvBufferContents.back().data(), size_in_bytes + 1 / 2, ACCL::dataType::float16));
+                acclSendBuffers.push_back(config.program->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32));
+                acclRecvBuffers.push_back(config.program->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32));
                 acclSendBuffers.back()->sync_to_device();
                 acclRecvBuffers.back()->sync_to_device();
             }
@@ -78,8 +79,8 @@ namespace network::execution_types::accl {
                 MPI_Barrier(MPI_COMM_WORLD);
                 auto startCalculation = std::chrono::high_resolution_clock::now();
                 for (int l = 0; l < looplength; l++) {
-                    config.program->send(0, *acclSendBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
-                    config.program->recv(0, *acclRecvBuffers[i], size_in_bytes, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                    config.program->send(0, *acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                    config.program->recv(0, *acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
                 }
                 auto endCalculation = std::chrono::high_resolution_clock::now();
                 calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count();
@@ -100,7 +100,7 @@ namespace network::execution_types::accl {
         // Read validation data from FPGA will be placed sequentially in buffer for all replications
         // The data order should not matter, because every byte should have the same value!
         for (int r = 0; r < config.programSettings->kernelReplications; r++) {
-            std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(),validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r);
+            std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r);
         }
         std::shared_ptr result(new network::ExecutionTimings{
                 looplength,
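The buffer-size fix in patch 011 is worth spelling out: in the old expression size_in_bytes + 1 / 2, division binds tighter than addition and 1 / 2 is integer zero, so the expression was just size_in_bytes while the buffers were typed as float16. The patch switches to an explicit round-up to whole 4-byte float32 values:

#include <iostream>

int main() {
    unsigned size_in_bytes = 7;
    // The old expression: '/' binds tighter than '+', and 1 / 2 == 0 in
    // integer arithmetic, so this is just size_in_bytes again.
    unsigned old_size = size_in_bytes + 1 / 2;
    // The fix: round the byte count up to whole 4-byte (float32) values.
    unsigned size_in_values = (size_in_bytes + 3) / 4;
    std::cout << old_size << " " << size_in_values * 4 << std::endl; // 7 8
}
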
From 2e5b0e326ac95cd74f30e6f9a24636365bccdb59 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 12 Apr 2022 15:00:37 +0100
Subject: [PATCH 012/318] Partial restructuring of code for XRT

---
 PTRANS/src/host/data_handlers/diagonal.hpp    |  5 +-
 PTRANS/src/host/data_handlers/handler.hpp     |  3 +-
 PTRANS/src/host/data_handlers/pq.hpp          |  7 +-
 .../host/execution_types/execution_cpu.hpp    |  3 +-
 .../host/execution_types/execution_intel.hpp  |  4 +-
 .../execution_types/execution_intel_pq.hpp    |  4 +-
 .../host/execution_types/execution_pcie.hpp   |  2 +-
 .../execution_types/execution_pcie_pq.hpp     |  4 +-
 PTRANS/src/host/main.cpp                      |  2 +-
 PTRANS/src/host/transpose_benchmark.cpp       |  9 ---
 PTRANS/src/host/transpose_benchmark.hpp       |  9 +--
 b_eff/CMakeLists.txt                          |  2 -
 b_eff/src/host/execution_types/execution.hpp  |  3 +-
 .../host/execution_types/execution_accl.hpp   |  8 +--
 b_eff/src/host/network_benchmark.hpp          | 13 ++--
 cmake/general_benchmark_build_setup.cmake     |  7 +-
 cmake/unitTestTargets.cmake                   |  2 +-
 shared/CMakeLists.txt                         | 19 ++++--
 shared/include/hpcc_benchmark.hpp             | 51 +++++++++-----
 shared/include/setup/fpga_setup_accl.hpp      | 21 +-----
 shared/include/setup/fpga_setup_xrt.hpp       | 66 +++++++++++++++++++
 shared/setup/fpga_setup_accl.cpp              |  8 +--
 shared/setup/fpga_setup_xrt.cpp               | 39 +++++++++++
 23 files changed, 201 insertions(+), 90 deletions(-)
 create mode 100644 shared/include/setup/fpga_setup_xrt.hpp
 create mode 100644 shared/setup/fpga_setup_xrt.cpp

diff --git a/PTRANS/src/host/data_handlers/diagonal.hpp b/PTRANS/src/host/data_handlers/diagonal.hpp
index e1d72f3b..2edae91a 100644
--- a/PTRANS/src/host/data_handlers/diagonal.hpp
+++ b/PTRANS/src/host/data_handlers/diagonal.hpp
@@ -44,7 +44,8 @@ namespace transpose {
 * the missing data. e.g. for N ranks, the pairs will be (0, N/2), (1, N/2 + 1), ...
 *
 */
-class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler {
+template
+class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler {
 
 private:
@@ -69,7 +70,7 @@
     * @return std::unique_ptr The generated data
     */
    std::unique_ptr
-    generateData(hpcc_base::ExecutionSettings& settings) override {
+    generateData(hpcc_base::ExecutionSettings& settings) override {
        MPI_Type_contiguous(settings.programSettings->blockSize * settings.programSettings->blockSize, MPI_FLOAT, &data_block);
        MPI_Type_commit(&data_block);

diff --git a/PTRANS/src/host/data_handlers/handler.hpp b/PTRANS/src/host/data_handlers/handler.hpp
index fe1293fe..b71597bd 100644
--- a/PTRANS/src/host/data_handlers/handler.hpp
+++ b/PTRANS/src/host/data_handlers/handler.hpp
@@ -43,6 +43,7 @@
 * calculate the overall validation error.
 *
 */
+template
 class TransposeDataHandler {
 
 protected:
@@ -68,7 +69,7 @@
     * @return std::unique_ptr The generated data
     */
    virtual std::unique_ptr
-    generateData(hpcc_base::ExecutionSettings& settings) = 0;
+    generateData(hpcc_base::ExecutionSettings& settings) = 0;
 
    /**
     * @brief Exchange the data blocks for verification

diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp
index 01f2261d..388c83d9 100644
--- a/PTRANS/src/host/data_handlers/pq.hpp
+++ b/PTRANS/src/host/data_handlers/pq.hpp
@@ -52,7 +52,8 @@ static T mod(T number, T op) {
    return (result < 0 || result >= op) ? op + result : result;
 }
 
-class DistributedPQTransposeDataHandler : public TransposeDataHandler {
+template
+class DistributedPQTransposeDataHandler : public TransposeDataHandler {
 
 private:
@@ -135,7 +136,7 @@
     * @return std::unique_ptr The generated data
     */
    std::unique_ptr
-    generateData(hpcc_base::ExecutionSettings& settings) override {
+    generateData(hpcc_base::ExecutionSettings& settings) override {
        int width_in_blocks = settings.programSettings->matrixSize / settings.programSettings->blockSize;
 
        global_width = width_in_blocks;
@@ -384,7 +385,7 @@
     * @param mpi_size Size of the communication world
     * @param p Width of the PQ grid the FPGAs are arranged in
     */
-    DistributedPQTransposeDataHandler(int mpi_rank, int mpi_size, int p) : TransposeDataHandler(mpi_rank, mpi_size) {
+    DistributedPQTransposeDataHandler(int mpi_rank, int mpi_size, int p) : TransposeDataHandler(mpi_rank, mpi_size) {
        if (mpi_size % p != 0) {
            throw std::runtime_error("Number of MPI ranks must be multiple of P! P=" + std::to_string(p));
        }

diff --git a/PTRANS/src/host/execution_types/execution_cpu.hpp b/PTRANS/src/host/execution_types/execution_cpu.hpp
index ab74fdc9..130b016e 100644
--- a/PTRANS/src/host/execution_types/execution_cpu.hpp
+++ b/PTRANS/src/host/execution_types/execution_cpu.hpp
@@ -50,8 +50,9 @@ namespace transpose
     * @param data data object that contains all required data for the execution
     * @return std::unique_ptr The measured execution times
     */
+    template
    static std::unique_ptr
-    calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler)
+    calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler)
    {
        int err;

diff --git a/PTRANS/src/host/execution_types/execution_intel.hpp b/PTRANS/src/host/execution_types/execution_intel.hpp
index d95bf578..58f5a73f 100644
--- a/PTRANS/src/host/execution_types/execution_intel.hpp
+++ b/PTRANS/src/host/execution_types/execution_intel.hpp
@@ -43,7 +43,7 @@ namespace intel {
     * @return std::unique_ptr The measured execution times
     */
 static std::unique_ptr
-    calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) {
+    calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) {
    int err;
 
    if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::diagonal) {
@@ -275,4 +275,4 @@
 }  // namespace fpga_execution
 }  // namespace intel
 
-#endif
\ No newline at end of file
+#endif

diff --git a/PTRANS/src/host/execution_types/execution_intel_pq.hpp b/PTRANS/src/host/execution_types/execution_intel_pq.hpp
index 431ff40d..85e596a7 100644
--- a/PTRANS/src/host/execution_types/execution_intel_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_intel_pq.hpp
@@ -44,7 +44,7 @@ namespace intel_pq {
     * @return std::unique_ptr The measured execution times
     */
 static std::unique_ptr
-    calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) {
+    calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) {
    int err;
 
    if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) {
@@ -354,4 +354,4 @@
 }  // namespace fpga_execution
 }  // namespace intel
 
-#endif
\ No newline at end of file
+#endif

diff --git a/PTRANS/src/host/execution_types/execution_pcie.hpp b/PTRANS/src/host/execution_types/execution_pcie.hpp
index 5e29ad2e..a08888de 100644
--- a/PTRANS/src/host/execution_types/execution_pcie.hpp
+++ b/PTRANS/src/host/execution_types/execution_pcie.hpp
@@ -49,7 +49,7 @@ namespace transpose
     * @return std::unique_ptr The measured execution times
     */
    static std::unique_ptr
-    calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler)
+    calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler)

diff --git a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp
index d2cfae7e..db1d9bee 100644
--- a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp
@@ -45,7
namespace pcie_pq { * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { @@ -378,4 +378,4 @@ static std::unique_ptr } // namespace fpga_execution } // namespace intel -#endif \ No newline at end of file +#endif diff --git a/PTRANS/src/host/main.cpp b/PTRANS/src/host/main.cpp index a054f7dd..f65d06ce 100644 --- a/PTRANS/src/host/main.cpp +++ b/PTRANS/src/host/main.cpp @@ -8,7 +8,7 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark - TransposeBenchmark bm(argc, argv); + TransposeBenchmark bm(argc, argv); bool success = bm.executeBenchmark(); if (success) { return 0; diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 755b11a0..e66b3a36 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -173,13 +173,4 @@ transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeD return static_cast(global_max_error) < 100 * std::numeric_limits::epsilon(); } -void -transpose::TransposeBenchmark::setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier) { - switch (dataHandlerIdentifier) { - case transpose::data_handler::DataHandlerType::diagonal: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; - case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; - default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); - } - - } diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 0136c22c..74ada897 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -46,12 +46,9 @@ namespace transpose { * @brief Implementation of the transpose benchmark * */ +template class TransposeBenchmark : -#ifndef USE_XRT_BINDINGS -public hpcc_base::HpccFpgaBenchmark { -#else -// TODO initialize benchmark wth XRT bindings -#endif +public hpcc_base::HpccFpgaBenchmark { protected: /** @@ -62,7 +59,7 @@ public hpcc_base::HpccFpgaBenchmark dataHandler; + std::unique_ptr> dataHandler; public: diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt index 4b4fbb41..13d93b1b 100755 --- a/b_eff/CMakeLists.txt +++ b/b_eff/CMakeLists.txt @@ -24,5 +24,3 @@ include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake) unset(DATA_TYPE CACHE) find_package(MPI REQUIRED) -include(${extern_accl_SOURCE_DIR}/driver/xrt/CMakeLists.txt) - diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp index 33d3b0a6..118f0ebc 100644 --- a/b_eff/src/host/execution_types/execution.hpp +++ b/b_eff/src/host/execution_types/execution.hpp @@ -26,5 +26,6 @@ SOFTWARE. 
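The common thread of the hunks above is one host-code template: the device, context, and program handle types become template parameters of the benchmark base class, the data handlers, and the free calculate() functions, so OpenCL- and XRT-based hosts can share a single implementation. A reduced, self-contained sketch of the pattern with stand-in types (toy names, not the project's real classes); the b_eff hunk that follows gates the ACCL execution header behind the same build switches:

```cpp
#include <iostream>
#include <string>

// Toy stand-ins for cl::Device and xrt::device; illustrative only.
struct OclDevice { static std::string name() { return "OpenCL device"; } };
struct XrtDevice { static std::string name() { return "XRT device"; } };

// The settings type carries the handle type as a template parameter...
template <class TDevice>
struct ExecutionSettings { TDevice device; };

// ...so a single calculate() template serves every host binding.
template <class TDevice>
void calculate(const ExecutionSettings<TDevice> &config) {
    std::cout << "running on " << TDevice::name() << "\n";
}

int main() {
    calculate(ExecutionSettings<OclDevice>{});
    calculate(ExecutionSettings<XrtDevice>{});
    return 0;
}
```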
#ifdef INTEL_FPGA #include "execution_types/execution_iec.hpp" #endif -#endif +#else #include "execution_types/execution_accl.hpp" +#endif diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp index 5db9093c..81673835 100644 --- a/b_eff/src/host/execution_types/execution_accl.hpp +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -68,8 +68,8 @@ namespace network::execution_types::accl { for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.program->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); - acclRecvBuffers.push_back(config.program->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclSendBuffers.push_back(config.accl->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclRecvBuffers.push_back(config.accl->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } @@ -79,8 +79,8 @@ namespace network::execution_types::accl { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); for (int l = 0; l < looplength; l++) { - config.program->send(0, *acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); - config.program->recv(0, *acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); + config.accl->send(0, *acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); + config.accl->recv(0, *acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); } auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index cfe9a25e..efffe1bf 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -244,12 +244,15 @@ class NetworkExecutionTimings { * */ class NetworkBenchmark : -#ifndef USE_ACCL -public hpcc_base::HpccFpgaBenchmark { -#else -public hpcc_base::HpccFpgaBenchmark { +#ifdef USE_OCL_HOST + public hpcc_base::HpccFpgaBenchmark #endif -protected: +#ifdef USE_XRT_HOST + public hpcc_base::HpccFpgaBenchmark + +#endif + { + protected: /** * @brief Additional input parameters of the Network benchmark diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 66153e5f..441b6f41 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -1,7 +1,7 @@ cmake_policy(VERSION 3.13) INCLUDE (CheckTypeSize) -set (CMAKE_CXX_STANDARD 11) +set (CMAKE_CXX_STANDARD 14) # Download build dependencies add_subdirectory(${CMAKE_SOURCE_DIR}/../extern ${CMAKE_BINARY_DIR}/extern) @@ -31,6 +31,7 @@ set(USE_MPI ${USE_MPI} CACHE BOOL "Compile the host code with MPI support. 
This set(USE_SVM No CACHE BOOL "Use SVM pointers instead of creating buffers on the board and transferring the data there before execution.") set(USE_HBM No CACHE BOOL "Use host code specific to HBM FPGAs") set(USE_ACCL No CACHE BOOL "Use ACCL for communication") +set(USE_OCL_HOST Yes CACHE BOOL "Use OpenCL host code implementation") set(USE_CUSTOM_KERNEL_TARGETS No CACHE BOOL "Enable build targets for custom kernels") set(USE_DEPRECATED_HPP_HEADER ${header_default} CACHE BOOL "Flag that indicates if the old C++ wrapper header should be used (cl.hpp) or the newer version (cl2.hpp or opencl.hpp)") set(HPCC_FPGA_CONFIG ${HPCC_FPGA_CONFIG} CACHE FILEPATH "Configuration file that is used to overwrite the default configuration") @@ -91,6 +92,10 @@ if (USE_ACCL) add_definitions(-DUSE_ACCL) endif() +if (USE_OCL_HOST) + add_definitions(-DUSE_OCL_HOST) +endif() + # Add configuration time to build string(TIMESTAMP CONFIG_TIME "%a %b %d %H:%M:%S UTC %Y" UTC) add_definitions(-DCONFIG_TIME="${CONFIG_TIME}") diff --git a/cmake/unitTestTargets.cmake b/cmake/unitTestTargets.cmake index 0f36d3da..776269e7 100644 --- a/cmake/unitTestTargets.cmake +++ b/cmake/unitTestTargets.cmake @@ -25,7 +25,7 @@ if (Vitis_FOUND) target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(${HOST_EXE_NAME}_test_xilinx hpcc_fpga_base_test) if (NOT "${kernel_emulation_targets_xilinx}" STREQUAL "") - add_dependencies(${HOST_EXE_NAME}_test_xilinx "${kernel_emulation_targets_xilinx}") + add_dependencies(${HOST_EXE_NAME}_test_xilinx ${kernel_emulation_targets_xilinx}) endif() target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index d2fba9a2..3f3ada79 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -1,12 +1,21 @@ project(HPCCBaseLibrary VERSION 1.0.1) +set(HPCC_BASE_SOURCES "") if (USE_ACCL) -add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp) -target_include_directories(hpcc_fpga_base PRIVATE ${ACCL_INCLUDE_PATH}) -target_link_libraries(hpcc_fpga_base accl) -else() -add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) + include(${extern_accl_SOURCE_DIR}/driver/xrt/CMakeLists.txt) + list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp) +endif() +if (USE_XRT_HOST) + list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp) +endif() +if (USE_OCL_HOST) + list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) +endif() +add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES}) +if (USE_ACCL) + target_include_directories(hpcc_fpga_base PRIVATE ${ACCL_INCLUDE_PATH}) + target_link_libraries(hpcc_fpga_base accl) endif() find_package(OpenCL QUIET) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 5d451ff4..f135fc30 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -172,7 +172,6 @@ class BaseSettings { }; - /** * @brief Settings class that is containing the program settings together with * additional information about the OpenCL runtime @@ -207,6 +206,14 @@ class ExecutionSettings { */ std::unique_ptr program; +#ifdef USE_ACCL + /** + * @brief Pointer to ACCL instance + * + */ + std::unique_ptr accl; 
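The member above is the guarded-member pattern this restructuring relies on: ACCL state exists inside ExecutionSettings only when USE_ACCL is defined (the #endif directly below closes the guard), so OpenCL-only builds never pull in the ACCL headers. A reduced sketch of the idea with a stand-in type (assumed names, not the real class):

```cpp
#include <memory>

struct AcclStub {}; // stand-in for ACCL::ACCL; illustrative only

template <class TDevice, class TContext, class TProgram>
struct Settings {
    std::unique_ptr<TDevice> device;
    std::unique_ptr<TContext> context;
    std::unique_ptr<TProgram> program;
#ifdef USE_ACCL
    std::unique_ptr<AcclStub> accl; // present only in ACCL builds
#endif
};

int main() {
    Settings<int, int, int> s{}; // compiles with or without USE_ACCL
    return 0;
}
```

The price of the pattern is that every construction site needs a matching #ifdef, which is why the constructor further below grows a conditional parameter.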
+#endif + /** * @brief Construct a new Execution Settings object * @@ -216,9 +223,18 @@ class ExecutionSettings { * @param program_ Used OpenCL program */ ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_, - std::unique_ptr context_, std::unique_ptr program_): + std::unique_ptr context_, std::unique_ptr program_ +#ifdef USE_ACCL + , std::unique_ptr accl_ +#endif + + ): programSettings(std::move(programSettings_)), device(std::move(device_)), - context(std::move(context_)), program(std::move(program_)) {} + context(std::move(context_)), program(std::move(program_)) +#ifdef USE_ACCL + , accl(std::move(accl_)) +#endif + {} /** * @brief Destroy the Execution Settings object. Used to specify the order the contained objects are destroyed @@ -478,23 +494,26 @@ class HpccFpgaBenchmark { std::unique_ptr context; std::unique_ptr program; std::unique_ptr usedDevice; - +#ifdef USE_ACCL + std::unique_ptr accl; +#endif if (!programSettings->testOnly) { -#ifndef USE_ACCL - usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); - - context = std::unique_ptr(new cl::Context(*usedDevice)); - program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, - &programSettings->kernelFileName); - #else - program = fpga_setup::fpgaSetupACCL(*usedDevice, - &programSettings->kernelFileName); - #endif +// usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, +// programSettings->defaultDevice); +#ifdef USE_OCL_HOST +// context = std::unique_ptr(new cl::Context(*usedDevice)); +// program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, +// &programSettings->kernelFileName); +#endif +#ifdef USE_ACCL + xrt::device dev; + xrt::uuid *program; + accl = fpga_setup::fpgaSetupACCL(dev, *program); +#endif } executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice), - std::move(context), std::move(program))); + std::move(context), std::move(program), std::move(accl))); if (mpi_comm_rank == 0) { if (!checkInputParameters()) { std::cerr << "ERROR: Input parameter check failed!" << std::endl; diff --git a/shared/include/setup/fpga_setup_accl.hpp b/shared/include/setup/fpga_setup_accl.hpp index cfc1abe4..7158a81b 100644 --- a/shared/include/setup/fpga_setup_accl.hpp +++ b/shared/include/setup/fpga_setup_accl.hpp @@ -19,8 +19,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef SRC_HOST_FPGA_SETUP_XRT_H_ -#define SRC_HOST_FPGA_SETUP_XRT_H_ +#ifndef SRC_HOST_FPGA_SETUP_ACCL_H_ +#define SRC_HOST_FPGA_SETUP_ACCL_H_ #include #include @@ -46,22 +46,7 @@ Sets up the given FPGA with the kernel in the provided file. */ std::unique_ptr fpgaSetupACCL(xrt::device &device, - const std::string *usedKernelFile); - - -/** -Searches an selects an FPGA device using the CL library functions. -If multiple platforms or devices are given, the user will be prompted to -choose a device. - -@param defaultDevice The index of the device that has to be used. 
If a
- value < 0 is given, the device can be chosen
- interactively
-
-@return the selected device
-*/
- std::unique_ptr<xrt::device>
- selectFPGADeviceXRT(int defaultDevice);
+ xrt::uuid &program);

} // namespace fpga_setup
#endif // SRC_HOST_FPGA_SETUP_H_
diff --git a/shared/include/setup/fpga_setup_xrt.hpp b/shared/include/setup/fpga_setup_xrt.hpp
new file mode 100644
index 00000000..61c74f72
--- /dev/null
+++ b/shared/include/setup/fpga_setup_xrt.hpp
@@ -0,0 +1,66 @@
+/*
+Copyright (c) 2022 Marius Meyer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+#ifndef SRC_HOST_FPGA_SETUP_XRT_H_
+#define SRC_HOST_FPGA_SETUP_XRT_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* External libraries */
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+namespace fpga_setup {
+
+/**
+Sets up the given FPGA with the kernel in the provided file.
+
+@param device The device used for the program
+@param usedKernelFile The path to the kernel file
+@return The UUID of the loaded xclbin
+*/
+ std::unique_ptr<xrt::uuid>
+ fpgaSetup(xrt::device &device,
+ const std::string &usedKernelFile);
+
+
+/**
+Searches and selects an FPGA device using the XRT library functions.
+If multiple platforms or devices are given, the user will be prompted to
+choose a device.
+
+@param defaultDevice The index of the device that has to be used. If a
+ value < 0 is given, the device can be chosen
+ interactively
+
+@return the selected device
+*/
+ std::unique_ptr<xrt::device>
+ selectFPGADevice(int defaultDevice);
+
+} // namespace fpga_setup
+#endif // SRC_HOST_FPGA_SETUP_XRT_H_
diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index 14eddc18..01d012e3 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -22,7 +22,7 @@ namespace fpga_setup {

 std::unique_ptr<ACCL::ACCL>
 fpgaSetupACCL(xrt::device &context,
- const std::string *usedKernelFile) {
+ xrt::uuid &program) {
 int current_rank;
 MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);

@@ -41,10 +41,4 @@ namespace fpga_setup {
 std::to_string(5500 + current_rank)));
 }
-
- std::unique_ptr<xrt::device>
- selectFPGADevice(int defaultDevice) {
- return std::unique_ptr<xrt::device>(nullptr);
- }
-
 } // namespace fpga_setup
diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp
new file mode 100644
index 00000000..1a9135bb
--- /dev/null
+++ b/shared/setup/fpga_setup_xrt.cpp
@@ -0,0 +1,39 @@
+//
+// Created by Marius Meyer on 04.12.19.
+// + +#include "setup/fpga_setup_xrt.hpp" + +#include +#include +#include +#include +#include +#include + +/* External libraries */ +#include "parameters.h" + +#ifdef _USE_MPI_ +#include "mpi.h" +#endif + +namespace fpga_setup { + + std::unique_ptr + fpgaSetup(xrt::device &device, + std::string &kernelFileName) { + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + return std::unique_ptr(new device.load_xclbin(kernelFileName)); + } + + std::unique_ptr + selectFPGADevice(int defaultDevice) { + return std::unique_ptr(new xrt::device(defaultDevice)); + } +} // namespace fpga_setup From e052b1ef295da976d1839b8187daa7ca46074088 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 Apr 2022 15:15:48 +0100 Subject: [PATCH 013/318] Attempt fix generic PTRANS impl --- PTRANS/src/host/CMakeLists.txt | 2 +- PTRANS/src/host/transpose_benchmark.hpp | 136 ++++++++++++++++++++++-- 2 files changed, 126 insertions(+), 12 deletions(-) diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 89b45ff8..647ac6ee 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) -set(HOST_SOURCE transpose_benchmark.cpp transpose_data.cpp) +set(HOST_SOURCE transpose_data.cpp) set(HOST_EXE_NAME Transpose) set(LIB_NAME trans) diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 74ada897..4b27ecc4 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -57,7 +57,18 @@ public hpcc_base::HpccFpgaBenchmark()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) + ("b", "Block size in number of values in one dimension", + cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))) + ("p", "Value of P that equals the width of the PQ grid of FPGAs. Q is determined by the world size.", + cxxopts::value()->default_value(std::to_string(DEFAULT_P_VALUE))) + ("distribute-buffers", "Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. 
This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs.") + ("handler", "Specify the used data handler that distributes the data over devices and memory banks", + cxxopts::value()->default_value(DEFAULT_DIST_TYPE)); + } std::unique_ptr> dataHandler; @@ -69,14 +80,22 @@ public hpcc_base::HpccFpgaBenchmark The input and output data of the benchmark */ std::unique_ptr - generateInputData() override; + generateInputData() override { + return dataHandler->generateData(*executionSettings); + } /** * @brief Set the data handler object by calling the function with the matching template argument * */ void - setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier); + setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier) { + switch (dataHandlerIdentifier) { + case transpose::data_handler::DataHandlerType::diagonal: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; + case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; + default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); + } + } /** * @brief Transpose specific implementation of the kernel execution @@ -85,7 +104,28 @@ public hpcc_base::HpccFpgaBenchmark Measured runtimes of the kernel execution */ std::unique_ptr - executeKernel(TransposeData &data) override; + executeKernel(TransposeData &data) override { + switch (executionSettings->programSettings->communicationType) { + case hpcc_base::CommunicationType::intel_external_channels: + if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + return transpose::fpga_execution::intel::calculate(*executionSettings, data); + } + else { + return transpose::fpga_execution::intel_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); + } break; + case hpcc_base::CommunicationType::pcie_mpi : + if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + return transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler); + } + else { + return transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); + } break; +#ifdef MKL_FOUND + case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break; +#endif + default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType)); + } + } /** * @brief Transpose specific implementation of the execution validation @@ -95,7 +135,28 @@ public hpcc_base::HpccFpgaBenchmarkexchangeData(data); + + dataHandler->reference_transpose(data); + + double max_error = 0.0; + for (size_t i = 0; i < executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize * data.numBlocks; i++) { + max_error = std::max(fabs(data.A[i]), max_error); + } + + double global_max_error = 0; + MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + + if (mpi_comm_rank == 0) { + std::cout << "Maximum error: " << 
global_max_error << " < " << 100 * std::numeric_limits::epsilon() << std::endl; + std::cout << "Mach. Epsilon: " << std::numeric_limits::epsilon() << std::endl; + } + + return static_cast(global_max_error) < 100 * std::numeric_limits::epsilon(); + } /** * @brief Transpose specific implementation of printing the execution results @@ -103,7 +164,56 @@ public hpcc_base::HpccFpgaBenchmark(executionSettings->programSettings->matrixSize) * executionSettings->programSettings->matrixSize; + + // Number of experiment repetitions + uint number_measurements = output.calculationTimings.size(); + std::vector max_measures(number_measurements); + std::vector max_transfers(number_measurements); +#ifdef _USE_MPI_ + // Copy the object variable to a local variable to make it accessible to the lambda function + int mpi_size = mpi_comm_size; + MPI_Reduce(output.calculationTimings.data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(output.transferTimings.data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); +#else + std::copy(output.calculationTimings.begin(), output.calculationTimings.end(), max_measures.begin()); + std::copy(output.transferTimings.begin(), output.transferTimings.end(), max_transfers.begin()); +#endif + + double avgCalculationTime = accumulate(max_measures.begin(), max_measures.end(), 0.0) + / max_measures.size(); + double minCalculationTime = *min_element(max_measures.begin(), max_measures.end()); + + double avgTransferTime = accumulate(max_transfers.begin(), max_transfers.end(), 0.0) + / max_transfers.size(); + double minTransferTime = *min_element(max_transfers.begin(), max_transfers.end()); + + double avgCalcFLOPS = flops / avgCalculationTime; + double maxCalcFLOPS = flops / minCalculationTime; + double avgMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime; + double maxMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime; + double avgTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime; + double maxTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime; + + if (mpi_comm_rank == 0) { + std::cout << " total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s]" << std::endl; + std::cout << "avg: " << (avgTransferTime + avgCalculationTime) + << " " << avgTransferTime + << " " << avgCalculationTime + << " " << avgCalcFLOPS + << " " << avgMemBandwidth + << " " << avgTransferBandwidth + << std::endl; + std::cout << "best: " << (minTransferTime + minCalculationTime) + << " " << minTransferTime + << " " << minCalculationTime + << " " << maxCalcFLOPS + << " " << maxMemBandwidth + << " " << maxTransferBandwidth + << std::endl; + } + } /** * @brief Construct a new Transpose Benchmark object @@ -111,16 +221,20 @@ public hpcc_base::HpccFpgaBenchmarkprogramSettings->dataHandlerIdentifier); + } + } - /** + /** * @brief Construct a new Transpose Benchmark object */ - TransposeBenchmark(); + TransposeBenchmark() : HpccFpgaBenchmark(argc, argv) {} }; -} // namespace stream +} // namespace transpose -#endif // SRC_HOST_STREAM_BENCHMARK_H_ +#endif // SRC_HOST_TRANSPOSE_BENCHMARK_H_ From aa26b97f94b5f7ba2a7f39489e97006c6332c414 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 Apr 2022 15:29:43 +0100 Subject: [PATCH 014/318] Add template to data handler calls --- PTRANS/src/host/transpose_benchmark.hpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.hpp 
b/PTRANS/src/host/transpose_benchmark.hpp index 4b27ecc4..9b515b73 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -31,8 +31,16 @@ SOFTWARE. #include "hpcc_benchmark.hpp" #include "transpose_data.hpp" +#include "execution_types/execution_intel.hpp" +#include "execution_types/execution_intel_pq.hpp" +#include "execution_types/execution_pcie.hpp" +#include "execution_types/execution_pcie_pq.hpp" +#include "execution_types/execution_cpu.hpp" +#include "communication_types.hpp" + #include "data_handlers/data_handler_types.h" -#include "data_handlers/handler.hpp" +#include "data_handlers/diagonal.hpp" +#include "data_handlers/pq.hpp" #include "parameters.h" @@ -91,8 +99,8 @@ public hpcc_base::HpccFpgaBenchmark(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; - case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; + case transpose::data_handler::DataHandlerType::diagonal: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; + case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); } } From 35b218bcda49771e1b5fe025a5b1dc182cddba65 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 Apr 2022 15:36:45 +0100 Subject: [PATCH 015/318] Add explicit this to transpose benchmark --- PTRANS/src/host/transpose_benchmark.hpp | 32 ++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 9b515b73..2138de6c 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -89,7 +89,7 @@ public hpcc_base::HpccFpgaBenchmark generateInputData() override { - return dataHandler->generateData(*executionSettings); + return this->dataHandler->generateData(*(this->executionSettings)); } /** @@ -99,8 +99,8 @@ public hpcc_base::HpccFpgaBenchmark(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; - case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; + case transpose::data_handler::DataHandlerType::diagonal: this->dataHandler = std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; + case transpose::data_handler::DataHandlerType::pq: this->dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); } } @@ -113,25 +113,25 @@ public hpcc_base::HpccFpgaBenchmark executeKernel(TransposeData &data) override { - switch 
(executionSettings->programSettings->communicationType) { + switch (this->executionSettings->programSettings->communicationType) { case hpcc_base::CommunicationType::intel_external_channels: - if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { - return transpose::fpga_execution::intel::calculate(*executionSettings, data); + if (this->executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + return transpose::fpga_execution::intel::calculate(*(this->executionSettings), data); } else { - return transpose::fpga_execution::intel_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); + return transpose::fpga_execution::intel_pq::calculate(*(this->executionSettings), data, reinterpret_cast(*this->dataHandler)); } break; case hpcc_base::CommunicationType::pcie_mpi : if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { - return transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler); + return transpose::fpga_execution::pcie::calculate(*(this->executionSettings), data, *dataHandler); } else { - return transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); + return transpose::fpga_execution::pcie_pq::calculate(*(this->executionSettings), data, reinterpret_cast(*this->dataHandler)); } break; #ifdef MKL_FOUND - case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break; + case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*(this->executionSettings), data, *dataHandler); break; #endif - default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType)); + default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(this->executionSettings->programSettings->communicationType)); } } @@ -146,9 +146,9 @@ public hpcc_base::HpccFpgaBenchmarkexchangeData(data); + this->dataHandler->exchangeData(data); - dataHandler->reference_transpose(data); + this->dataHandler->reference_transpose(data); double max_error = 0.0; for (size_t i = 0; i < executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize * data.numBlocks; i++) { @@ -230,15 +230,15 @@ public hpcc_base::HpccFpgaBenchmarkprogramSettings->dataHandlerIdentifier); + if (this->setupBenchmark(argc, argv)) { + this->setTransposeDataHandler(this->executionSettings->programSettings->dataHandlerIdentifier); } } /** * @brief Construct a new Transpose Benchmark object */ - TransposeBenchmark() : HpccFpgaBenchmark(argc, argv) {} + TransposeBenchmark() : HpccFpgaBenchmark() {} }; From c78306a996fcfa34a48b832e7b21296fcad7e1c1 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 12 Apr 2022 17:01:38 +0100 Subject: [PATCH 016/318] Compilable generic PTRANS --- PTRANS/src/host/CMakeLists.txt | 2 ++ PTRANS/src/host/data_handlers/diagonal.hpp | 32 +++++++++---------- PTRANS/src/host/data_handlers/pq.hpp | 9 +++--- .../execution_types/execution_intel_pq.hpp | 2 +- .../host/execution_types/execution_pcie.hpp | 2 +- .../execution_types/execution_pcie_pq.hpp | 2 +- PTRANS/src/host/transpose_benchmark.hpp | 27 ++++++++-------- 7 files changed, 39 insertions(+), 37 deletions(-) diff --git 
a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 647ac6ee..2404394f 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -27,6 +27,7 @@ if (INTELFPGAOPENCL_FOUND) target_include_directories(${LIB_NAME}_intel PRIVATE "$ENV{MKL_ROOT}/include") endif() target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA) + target_compile_definitions(${HOST_EXE_NAME}_intel PRIVATE -DINTEL_FPGA) target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_intel_host_executable COMMAND $ -h) endif() @@ -40,6 +41,7 @@ if (Vitis_FOUND) target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_xilinx_host_executable COMMAND $ -h) endif() diff --git a/PTRANS/src/host/data_handlers/diagonal.hpp b/PTRANS/src/host/data_handlers/diagonal.hpp index 2edae91a..a2c702c0 100644 --- a/PTRANS/src/host/data_handlers/diagonal.hpp +++ b/PTRANS/src/host/data_handlers/diagonal.hpp @@ -76,37 +76,37 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandlermatrixSize / settings.programSettings->blockSize; - int avg_blocks_per_rank = (width_in_blocks * width_in_blocks) / mpi_comm_size; + int avg_blocks_per_rank = (width_in_blocks * width_in_blocks) / this->mpi_comm_size; int avg_diagonal_blocks = width_in_blocks; if (avg_blocks_per_rank > 0) { avg_diagonal_blocks = (width_in_blocks / avg_blocks_per_rank); } num_diagonal_ranks = std::max(avg_diagonal_blocks, 1); - if (num_diagonal_ranks % 2 != mpi_comm_size % 2) { + if (num_diagonal_ranks % 2 != this->mpi_comm_size % 2) { #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": Fail 1!" << std::endl; + std::cout << "Rank " << this->mpi_comm_rank << ": Fail 1!" << std::endl; #endif // Abort if there is a too high difference in the number of matrix blocks between the MPI ranks throw std::runtime_error("Matrix size and MPI ranks to not allow fair distribution of blocks! Increase or reduce the number of MPI ranks by 1."); } - if ((mpi_comm_size - num_diagonal_ranks) % 2 != 0 || (mpi_comm_size - num_diagonal_ranks) == 0 && width_in_blocks > 1) { + if ((this->mpi_comm_size - num_diagonal_ranks) % 2 != 0 || (this->mpi_comm_size - num_diagonal_ranks) == 0 && width_in_blocks > 1) { #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": Fail 2!" << std::endl; + std::cout << "Rank " << this->mpi_comm_rank << ": Fail 2!" << std::endl; #endif throw std::runtime_error("Not possible to create pairs of MPI ranks for lower and upper half of matrix. Increase number of MPI ranks!."); } - bool this_rank_is_diagonal = mpi_comm_rank >= (mpi_comm_size - num_diagonal_ranks); - int blocks_if_diagonal = width_in_blocks / num_diagonal_ranks + ( (mpi_comm_rank - (mpi_comm_size - num_diagonal_ranks)) < (width_in_blocks % num_diagonal_ranks) ? 1 : 0); + bool this_rank_is_diagonal = this->mpi_comm_rank >= (this->mpi_comm_size - num_diagonal_ranks); + int blocks_if_diagonal = width_in_blocks / num_diagonal_ranks + ( (this->mpi_comm_rank - (this->mpi_comm_size - num_diagonal_ranks)) < (width_in_blocks % num_diagonal_ranks) ? 
1 : 0); int blocks_if_not_diagonal = 0; - if ((mpi_comm_size - num_diagonal_ranks) > 0 ) { - blocks_if_not_diagonal = (width_in_blocks * (width_in_blocks - 1)) / (mpi_comm_size - num_diagonal_ranks) + (mpi_comm_rank < ((width_in_blocks * (width_in_blocks - 1)) % (mpi_comm_size - num_diagonal_ranks)) ? 1 : 0); + if ((this->mpi_comm_size - num_diagonal_ranks) > 0 ) { + blocks_if_not_diagonal = (width_in_blocks * (width_in_blocks - 1)) / (this->mpi_comm_size - num_diagonal_ranks) + (this->mpi_comm_rank < ((width_in_blocks * (width_in_blocks - 1)) % (this->mpi_comm_size - num_diagonal_ranks)) ? 1 : 0); } int blocks_per_rank = (this_rank_is_diagonal) ? blocks_if_diagonal : blocks_if_not_diagonal; - if (mpi_comm_rank == 0) { + if (this->mpi_comm_rank == 0) { std::cout << "Diag. blocks per rank: " << blocks_if_diagonal << std::endl; std::cout << "Blocks per rank: " << blocks_if_not_diagonal << std::endl; std::cout << "Loopback ranks for diagonal blocks: " << num_diagonal_ranks << std::endl; @@ -115,14 +115,14 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandlerblockSize; #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": NumBlocks = " << blocks_per_rank << std::endl; + std::cout << "Rank " << this->mpi_comm_rank << ": NumBlocks = " << blocks_per_rank << std::endl; #endif // Allocate memory for a single device and all its memory banks auto d = std::unique_ptr(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); // Fill the allocated memory with pseudo random values - std::mt19937 gen(mpi_comm_rank); + std::mt19937 gen(this->mpi_comm_rank); std::uniform_real_distribution<> dis(-100.0, 100.0); for (size_t i = 0; i < data_height_per_rank; i++) { for (size_t j = 0; j < settings.programSettings->blockSize; j++) { @@ -148,10 +148,10 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandlermpi_comm_rank < this->mpi_comm_size - num_diagonal_ranks) { - int first_upper_half_rank = (mpi_comm_size - num_diagonal_ranks)/2; - int pair_rank = (mpi_comm_rank >= first_upper_half_rank) ? mpi_comm_rank - first_upper_half_rank : mpi_comm_rank + first_upper_half_rank; + int first_upper_half_rank = (this->mpi_comm_size - num_diagonal_ranks)/2; + int pair_rank = (this->mpi_comm_rank >= first_upper_half_rank) ? this->mpi_comm_rank - first_upper_half_rank : this->mpi_comm_rank + first_upper_half_rank; // To re-calculate the matrix transposition locally on this host, we need to // exchange matrix A for every kernel replication @@ -197,7 +197,7 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler(mpi_rank, mpi_size) { if (mpi_rank >= mpi_size) { throw std::runtime_error("MPI rank must be smaller the MPI world size!"); } diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp index 388c83d9..d065150b 100644 --- a/PTRANS/src/host/data_handlers/pq.hpp +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -26,6 +26,7 @@ SOFTWARE. 
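The wave of `this->` qualifications above is not stylistic. Once the data handlers derive from a base class template that depends on `TDevice`, `TContext`, and `TProgram`, two-phase name lookup no longer finds inherited members such as `mpi_comm_rank` by unqualified name, so every use inside the templates must go through `this->` (the pq.hpp hunk below continues the same mechanical change). A minimal, standalone illustration of the rule:

```cpp
// Why the patch adds "this->": names inherited from a dependent base
// are not considered during the template's first lookup phase.
template <class T>
struct Base {
    int mpi_comm_rank = 0;
};

template <class T>
struct Derived : Base<T> {
    int rank() {
        // return mpi_comm_rank;    // error: not found in dependent base
        return this->mpi_comm_rank; // OK: lookup deferred to instantiation
    }
};

int main() {
    Derived<float> d;
    return d.rank();
}
```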
/* C++ standard library headers */ #include #include +#include /* Project's headers */ #include "handler.hpp" @@ -142,8 +143,8 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandlermpi_comm_rank / pq_width; + pq_col = this->mpi_comm_rank % pq_width; // If the torus width is not a divisor of the matrix size, // distribute remaining blocks to the ranks @@ -167,7 +168,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); // Fill the allocated memory with pseudo random values - std::mt19937 gen(mpi_comm_rank); + std::mt19937 gen(this->mpi_comm_rank); std::uniform_real_distribution<> dis(-100.0, 100.0); for (size_t i = 0; i < blocks_per_rank * settings.programSettings->blockSize; i++) { for (size_t j = 0; j < settings.programSettings->blockSize; j++) { @@ -308,7 +309,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandlermpi_comm_rank << ": blocks (" << sending_size / (data.blockSize * data.blockSize) << "," << receiving_size / (data.blockSize * data.blockSize) << ") send " << send_rank << ", recv " << recv_rank << std::endl << std::flush; #endif MPI_Isend(send_buffers[current_parallel_execution].data(), sending_size, MPI_FLOAT, send_rank, 0, MPI_COMM_WORLD, &mpi_requests[current_parallel_execution]); MPI_Irecv(recv_buffers[current_parallel_execution].data(), receiving_size, MPI_FLOAT, recv_rank, 0, MPI_COMM_WORLD, &mpi_requests[gcd + current_parallel_execution]); diff --git a/PTRANS/src/host/execution_types/execution_intel_pq.hpp b/PTRANS/src/host/execution_types/execution_intel_pq.hpp index 85e596a7..f1f4add4 100644 --- a/PTRANS/src/host/execution_types/execution_intel_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_intel_pq.hpp @@ -44,7 +44,7 @@ namespace intel_pq { * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { diff --git a/PTRANS/src/host/execution_types/execution_pcie.hpp b/PTRANS/src/host/execution_types/execution_pcie.hpp index a08888de..97bd910f 100644 --- a/PTRANS/src/host/execution_types/execution_pcie.hpp +++ b/PTRANS/src/host/execution_types/execution_pcie.hpp @@ -49,7 +49,7 @@ namespace transpose * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) + calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) { int err; diff --git a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp index db1d9bee..c369e9cb 100644 --- a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp @@ -45,7 +45,7 @@ namespace pcie_pq { * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, 
transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 2138de6c..a31bf9d5 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -28,6 +28,7 @@ SOFTWARE. #include /* Project's headers */ +#include "parameters.h" #include "hpcc_benchmark.hpp" #include "transpose_data.hpp" @@ -42,8 +43,6 @@ SOFTWARE. #include "data_handlers/diagonal.hpp" #include "data_handlers/pq.hpp" -#include "parameters.h" - /** * @brief Contains all classes and methods needed by the Transpose benchmark * @@ -99,8 +98,8 @@ public hpcc_base::HpccFpgaBenchmarkdataHandler = std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; - case transpose::data_handler::DataHandlerType::pq: this->dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; + case transpose::data_handler::DataHandlerType::diagonal: this->dataHandler = std::unique_ptr>(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(this->mpi_comm_rank, this->mpi_comm_size)); break; + case transpose::data_handler::DataHandlerType::pq: this->dataHandler = std::unique_ptr>(new transpose::data_handler::DistributedPQTransposeDataHandler(this->mpi_comm_rank, this->mpi_comm_size, this->executionSettings->programSettings->p)); break; default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); } } @@ -119,14 +118,14 @@ public hpcc_base::HpccFpgaBenchmarkexecutionSettings), data); } else { - return transpose::fpga_execution::intel_pq::calculate(*(this->executionSettings), data, reinterpret_cast(*this->dataHandler)); + return transpose::fpga_execution::intel_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); } break; case hpcc_base::CommunicationType::pcie_mpi : - if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + if (this->executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { return transpose::fpga_execution::pcie::calculate(*(this->executionSettings), data, *dataHandler); } else { - return transpose::fpga_execution::pcie_pq::calculate(*(this->executionSettings), data, reinterpret_cast(*this->dataHandler)); + return transpose::fpga_execution::pcie_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); } break; #ifdef MKL_FOUND case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*(this->executionSettings), data, *dataHandler); break; @@ -151,14 +150,14 @@ public hpcc_base::HpccFpgaBenchmarkdataHandler->reference_transpose(data); double max_error = 0.0; - for (size_t i = 0; i < executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize * data.numBlocks; i++) { + for (size_t i = 0; i < this->executionSettings->programSettings->blockSize * this->executionSettings->programSettings->blockSize * 
data.numBlocks; i++) { max_error = std::max(fabs(data.A[i]), max_error); } double global_max_error = 0; MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - if (mpi_comm_rank == 0) { + if (this->mpi_comm_rank == 0) { std::cout << "Maximum error: " << global_max_error << " < " << 100 * std::numeric_limits::epsilon() << std::endl; std::cout << "Mach. Epsilon: " << std::numeric_limits::epsilon() << std::endl; } @@ -173,7 +172,7 @@ public hpcc_base::HpccFpgaBenchmark(executionSettings->programSettings->matrixSize) * executionSettings->programSettings->matrixSize; + double flops = static_cast(this->executionSettings->programSettings->matrixSize) * this->executionSettings->programSettings->matrixSize; // Number of experiment repetitions uint number_measurements = output.calculationTimings.size(); @@ -181,7 +180,7 @@ public hpcc_base::HpccFpgaBenchmark max_transfers(number_measurements); #ifdef _USE_MPI_ // Copy the object variable to a local variable to make it accessible to the lambda function - int mpi_size = mpi_comm_size; + int mpi_size = this->mpi_comm_size; MPI_Reduce(output.calculationTimings.data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); MPI_Reduce(output.transferTimings.data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); #else @@ -204,7 +203,7 @@ public hpcc_base::HpccFpgaBenchmarkmpi_comm_rank == 0) { std::cout << " total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s]" << std::endl; std::cout << "avg: " << (avgTransferTime + avgCalculationTime) << " " << avgTransferTime @@ -229,7 +228,7 @@ public hpcc_base::HpccFpgaBenchmark(argc, argv) { if (this->setupBenchmark(argc, argv)) { this->setTransposeDataHandler(this->executionSettings->programSettings->dataHandlerIdentifier); } @@ -238,7 +237,7 @@ public hpcc_base::HpccFpgaBenchmark() {} }; From fa61522c57e78d94b5c9806ce652a9f0a492733f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 13 Apr 2022 18:19:21 +0100 Subject: [PATCH 017/318] Fix working with OCL bindings --- PTRANS/src/host/main.cpp | 4 ++++ PTRANS/src/host/transpose_benchmark.hpp | 3 ++- shared/CMakeLists.txt | 4 +--- shared/include/hpcc_benchmark.hpp | 25 ++++++++++++++++--------- shared/setup/fpga_setup.cpp | 2 +- shared/setup/fpga_setup_accl.cpp | 22 +++++++++++++++------- shared/setup/fpga_setup_xrt.cpp | 2 +- 7 files changed, 40 insertions(+), 22 deletions(-) diff --git a/PTRANS/src/host/main.cpp b/PTRANS/src/host/main.cpp index f65d06ce..d4db9803 100644 --- a/PTRANS/src/host/main.cpp +++ b/PTRANS/src/host/main.cpp @@ -8,7 +8,11 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark +#ifdef USE_OCL_HOST TransposeBenchmark bm(argc, argv); +#else + TransposeBenchmark bm(argc, argv); +#endif bool success = bm.executeBenchmark(); if (success) { return 0; diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index a31bf9d5..148adc7f 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -31,7 +31,6 @@ SOFTWARE. 
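The main.cpp hunk above now selects the benchmark's handle types at configure time instead of at run time. The concrete template arguments are stripped in this rendering of the patch, so the following is only a toy model of that dispatch, with stand-in device types rather than the real cl::/xrt:: classes:

```cpp
// Toy model of the compile-time dispatch in main(): one templated
// benchmark, two possible handle-type bindings selected via a macro.
struct OclDevice {};
struct XrtDevice {};

template <class TDevice>
struct Benchmark {
    Benchmark(int, char **) {}
    bool executeBenchmark() { return true; }
};

#ifdef USE_OCL_HOST
using SelectedBenchmark = Benchmark<OclDevice>;
#else
using SelectedBenchmark = Benchmark<XrtDevice>;
#endif

int main(int argc, char *argv[]) {
    SelectedBenchmark bm(argc, argv);
    return bm.executeBenchmark() ? 0 : 1;
}
```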
#include "parameters.h" #include "hpcc_benchmark.hpp" #include "transpose_data.hpp" - #include "execution_types/execution_intel.hpp" #include "execution_types/execution_intel_pq.hpp" #include "execution_types/execution_pcie.hpp" @@ -113,6 +112,7 @@ public hpcc_base::HpccFpgaBenchmark executeKernel(TransposeData &data) override { switch (this->executionSettings->programSettings->communicationType) { +#ifdef USE_OCL_HOST case hpcc_base::CommunicationType::intel_external_channels: if (this->executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { return transpose::fpga_execution::intel::calculate(*(this->executionSettings), data); @@ -127,6 +127,7 @@ public hpcc_base::HpccFpgaBenchmarkexecutionSettings), data, reinterpret_cast&>(*this->dataHandler)); } break; +#endif #ifdef MKL_FOUND case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*(this->executionSettings), data, *dataHandler); break; #endif diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 3f3ada79..fdb8ca2f 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -9,9 +9,7 @@ endif() if (USE_XRT_HOST) list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp) endif() -if (USE_OCL_HOST) - list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) -endif() +list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES}) if (USE_ACCL) target_include_directories(hpcc_fpga_base PRIVATE ${ACCL_INCLUDE_PATH}) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index f135fc30..b16994f2 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -498,22 +498,29 @@ class HpccFpgaBenchmark { std::unique_ptr accl; #endif if (!programSettings->testOnly) { -// usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, -// programSettings->defaultDevice); +#ifdef USE_XRT_HOST + usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultDevice); + context = false; + program = fpga_setup::fpgaSetup(usedDevice); +#endif #ifdef USE_OCL_HOST -// context = std::unique_ptr(new cl::Context(*usedDevice)); -// program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, -// &programSettings->kernelFileName); + usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, + programSettings->defaultDevice); + context = std::unique_ptr(new cl::Context(*usedDevice)); + program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, + &programSettings->kernelFileName); #endif #ifdef USE_ACCL - xrt::device dev; - xrt::uuid *program; - accl = fpga_setup::fpgaSetupACCL(dev, *program); + accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); #endif } executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice), - std::move(context), std::move(program), std::move(accl))); + std::move(context), std::move(program) +#ifdef USE_ACCL + , std::move(accl) +#endif + )); if (mpi_comm_rank == 0) { if (!checkInputParameters()) { std::cerr << "ERROR: Input parameter check failed!" 
<< std::endl; diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index aba9b8b2..6d08a26f 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -101,7 +101,7 @@ Converts the received OpenCL error to a string CL_ERR_TO_STR(CL_INVALID_DEVICE_PARTITION_COUNT); default: - return "UNKNOWN ERROR CODE"; + return "UNKNOWN ERROR CODE: " + std::to_string(err); } } diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 01d012e3..4abb8533 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -13,7 +13,7 @@ /* External libraries */ #include "parameters.h" - +#include "xrt.h" #ifdef _USE_MPI_ #include "mpi.h" #endif @@ -21,24 +21,32 @@ namespace fpga_setup { std::unique_ptr - fpgaSetupACCL(xrt::device &context, + fpgaSetupACCL(xrt::device &device, xrt::uuid &program) { int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); int current_size; MPI_Comm_size(MPI_COMM_WORLD, & current_size); - - std::vector ranks = {}; + + std::vector ranks = {}; for (int i = 0; i < current_size; ++i) { - ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, + // TODO: Replace the IP addresses and ports here for execution on real hardware? + ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, 1024}; ranks.emplace_back(new_rank); } - // TODO: Add start port here. Currently hardcoded! +#ifdef ACCL_HARDWARE_SUPPORT + auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}"); + auto hostctl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", + xrt::kernel::cu_access_mode::exclusive); + return std::unique_ptr(new ACCL::ACCL(ranks, rank, device, cclo_ip, hostctrl_ip, 0, {0}, 0); +#else + // TODO: Add start port here. Currently hardcoded!
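+ // Hedged illustration (example values assumed): with current_size = 3 the
+ // loop above yields ranks on 127.0.0.1 with session ports 5500, 5501 and 5502,
+ // so rank 1 connects to its emulator instance below via "tcp://localhost:5501".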
return std::unique_ptr(new ACCL::ACCL(ranks, current_rank, "tcp://localhost:" + std::to_string(5500 + current_rank))); - } +#endif + } } // namespace fpga_setup diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp index 1a9135bb..f04e90aa 100644 --- a/shared/setup/fpga_setup_xrt.cpp +++ b/shared/setup/fpga_setup_xrt.cpp @@ -29,7 +29,7 @@ namespace fpga_setup { int current_size; MPI_Comm_size(MPI_COMM_WORLD, & current_size); - return std::unique_ptr(new device.load_xclbin(kernelFileName)); + return std::make_unique(std::move(device.load_xclbin(kernelFileName))); } std::unique_ptr From 5525b086c0810d85df743a05fe095d3ec1e3f0bc Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 14 Apr 2022 16:57:30 +0100 Subject: [PATCH 018/318] Update PTRANS for XRT only execution --- PTRANS/src/host/data_handlers/diagonal.hpp | 8 +- PTRANS/src/host/data_handlers/handler.hpp | 6 +- PTRANS/src/host/data_handlers/pq.hpp | 12 +- .../host/execution_types/execution_cpu.hpp | 2 +- .../host/execution_types/execution_intel.hpp | 2 +- .../execution_types/execution_intel_pq.hpp | 2 +- .../host/execution_types/execution_pcie.hpp | 2 +- .../execution_types/execution_pcie_pq.hpp | 2 +- .../execution_types/execution_xrt_accl_pq.hpp | 230 +++++++++++++++++ .../execution_types/execution_xrt_pcie_pq.hpp | 238 ++++++++++++++++++ PTRANS/src/host/transpose_benchmark.cpp | 176 ------------- PTRANS/src/host/transpose_benchmark.hpp | 30 ++- PTRANS/src/host/transpose_data.cpp | 44 ---- PTRANS/src/host/transpose_data.hpp | 48 +++- 14 files changed, 554 insertions(+), 248 deletions(-) create mode 100644 PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp create mode 100644 PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp delete mode 100644 PTRANS/src/host/transpose_benchmark.cpp diff --git a/PTRANS/src/host/data_handlers/diagonal.hpp b/PTRANS/src/host/data_handlers/diagonal.hpp index a2c702c0..9f601105 100644 --- a/PTRANS/src/host/data_handlers/diagonal.hpp +++ b/PTRANS/src/host/data_handlers/diagonal.hpp @@ -69,7 +69,7 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler The generated data */ - std::unique_ptr + std::unique_ptr> generateData(hpcc_base::ExecutionSettings& settings) override { MPI_Type_contiguous(settings.programSettings->blockSize * settings.programSettings->blockSize, MPI_FLOAT, &data_block); MPI_Type_commit(&data_block); @@ -119,7 +119,7 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); + auto d = std::unique_ptr>(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); // Fill the allocated memory with pseudo random values std::mt19937 gen(this->mpi_comm_rank); @@ -142,7 +142,7 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler& data) override { #ifndef NDEBUG // std::cout << "Start data exchange " << mpi_comm_rank << std::endl; @@ -185,7 +185,7 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler& data) { size_t block_offset = data.blockSize * data.blockSize; for (size_t b = 0; b < data.numBlocks; b++) { for (size_t i = 0; i < data.blockSize; i++) { diff --git a/PTRANS/src/host/data_handlers/handler.hpp b/PTRANS/src/host/data_handlers/handler.hpp index b71597bd..646fcdbf 100644 --- a/PTRANS/src/host/data_handlers/handler.hpp +++ b/PTRANS/src/host/data_handlers/handler.hpp @@ -68,7 +68,7 @@ class 
TransposeDataHandler { * @param settings The execution settings that contain information about the data size * @return std::unique_ptr The generated data */ - virtual std::unique_ptr + virtual std::unique_ptr> generateData(hpcc_base::ExecutionSettings& settings) = 0; /** @@ -78,10 +78,10 @@ class TransposeDataHandler { * Exchanged data will be stored in the same object. */ virtual void - exchangeData(TransposeData& data) = 0; + exchangeData(TransposeData& data) = 0; virtual void - reference_transpose(TransposeData& data) = 0; + reference_transpose(TransposeData& data) = 0; /** * @brief Construct a new Transpose Data Handler object and initialize the MPI rank and MPI size variables if MPI is used diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp index d065150b..0e28c109 100644 --- a/PTRANS/src/host/data_handlers/pq.hpp +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -136,7 +136,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler The generated data */ - std::unique_ptr + std::unique_ptr> generateData(hpcc_base::ExecutionSettings& settings) override { int width_in_blocks = settings.programSettings->matrixSize / settings.programSettings->blockSize; global_width = width_in_blocks; @@ -165,15 +165,15 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); + auto d = std::unique_ptr>(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); // Fill the allocated memory with pseudo random values std::mt19937 gen(this->mpi_comm_rank); std::uniform_real_distribution<> dis(-100.0, 100.0); for (size_t i = 0; i < blocks_per_rank * settings.programSettings->blockSize; i++) { for (size_t j = 0; j < settings.programSettings->blockSize; j++) { - d->A[i * settings.programSettings->blockSize + j] = dis(gen); - d->B[i * settings.programSettings->blockSize + j] = dis(gen); + d->A[i * settings.programSettings->blockSize + j] = i * settings.programSettings->blockSize + j;//dis(gen); + d->B[i * settings.programSettings->blockSize + j] = 0.0; //dis(gen); d->result[i * settings.programSettings->blockSize + j] = 0.0; } } @@ -188,7 +188,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler& data) override { MPI_Status status; @@ -371,7 +371,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler& data) { for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { data.A[i * height_per_rank * data.blockSize + j] -= (data.result[j * width_per_rank * data.blockSize + i] - data.B[j * width_per_rank * data.blockSize + i]); diff --git a/PTRANS/src/host/execution_types/execution_cpu.hpp b/PTRANS/src/host/execution_types/execution_cpu.hpp index 130b016e..bc148d98 100644 --- a/PTRANS/src/host/execution_types/execution_cpu.hpp +++ b/PTRANS/src/host/execution_types/execution_cpu.hpp @@ -52,7 +52,7 @@ namespace transpose */ template static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) + calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) { int err; diff --git a/PTRANS/src/host/execution_types/execution_intel.hpp b/PTRANS/src/host/execution_types/execution_intel.hpp index 58f5a73f..64c996e0 100644 --- 
a/PTRANS/src/host/execution_types/execution_intel.hpp +++ b/PTRANS/src/host/execution_types/execution_intel.hpp @@ -43,7 +43,7 @@ namespace intel { * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) { + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::diagonal) { diff --git a/PTRANS/src/host/execution_types/execution_intel_pq.hpp b/PTRANS/src/host/execution_types/execution_intel_pq.hpp index f1f4add4..9c8bf557 100644 --- a/PTRANS/src/host/execution_types/execution_intel_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_intel_pq.hpp @@ -44,7 +44,7 @@ namespace intel_pq { * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { diff --git a/PTRANS/src/host/execution_types/execution_pcie.hpp b/PTRANS/src/host/execution_types/execution_pcie.hpp index 97bd910f..2e607a97 100644 --- a/PTRANS/src/host/execution_types/execution_pcie.hpp +++ b/PTRANS/src/host/execution_types/execution_pcie.hpp @@ -49,7 +49,7 @@ namespace transpose * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) + calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) { int err; diff --git a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp index c369e9cb..9d7d0b45 100644 --- a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp @@ -45,7 +45,7 @@ namespace pcie_pq { * @return std::unique_ptr The measured execution times */ static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp new file mode 100644 index 00000000..ce42dd6f --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -0,0 +1,230 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is 
furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_ACCL_PQ_EXECUTION_H_ +#define SRC_HOST_ACCL_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "transpose_benchmark.hpp" +#include "data_handlers/data_handler_types.h" +#include "data_handlers/pq.hpp" + +namespace transpose { +namespace fpga_execution { +namespace accl_pq { + + /** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and PCIe+MPI over the host for communication + * + * @param config The program configuration + * @param data data object that contains all required data for the execution on the FPGA + * @param handler data handler instance that should be used to exchange data between hosts + * @return std::unique_ptr The measured execution times + */ +static std::unique_ptr + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation of this communication method"); +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + throw new std::runtime_error("Using the Write Rect method is not supported in this host implementation of this communication method"); +#endif + + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to process.
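+ // Hedged worked example (counts assumed): with 10 local blocks and 4 kernel
+ // replications, the integer division below gives 2 blocks per replication and
+ // a remainder of 2, so replications r = 0 and r = 1 process 3 blocks each
+ // while r = 2 and r = 3 process 2 blocks each.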
+ size_t blocks_per_replication = (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width * local_matrix_width * data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width * local_matrix_width; + + int memory_bank_info_a = 0; + int memory_bank_info_b = 0; + int memory_bank_info_out = 0; + + // create the kernels + xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); + + + xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); + xrt::bo bufferB(*config.device, data.B + bufferStartList[r] * data.blockSize * data.blockSize, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); + + auto run = transposeKernel(bufferA, bufferB, bufferA_out, static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), + static_cast(blocks_per_replication), static_cast(handler.getWidthforRank()), + static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize))); + + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + + auto startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast> + (endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) + { + bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } + + + // Exchange A data via PCIe and MPI + handler.exchangeData(data); + + for (int r = 0; r < transposeKernelList.size(); r++) + { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) + { + runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), + static_cast((bufferSizeList[r]) / (local_matrix_width * 
data.blockSize * data.blockSize)))); + } + for (int r = 0; r < transposeKernelList.size(); r++) + { + runs[r].wait(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " << std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() + << "s (" << ((config.programSettings->matrixSize * config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * 3) + / std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() * 1.0e-9) << " GB/s)" << std::endl; +#endif + + // Transfer back data for next repetition! + handler.exchangeData(data); + + std::chrono::duration calculationTime = + std::chrono::duration_cast> + (endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * data.blockSize * data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + col]; + } + } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + local_matrix_width * data.blockSize * data.blockSize, tmp_write_buffer.begin() + bufferSizeList[r],&data.result[(bufferStartList[r] + local_matrix_width) * data.blockSize * data.blockSize]); + } + else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(data.result + bufferStartList[r] * data.blockSize * data.blockSize); + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast> + (endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result(new transpose::TransposeExecutionTimings{ + transferTimings, + calculationTimings + }); + + return result; + } + +} // namespace transpose +} // namespace fpga_execution +} // namespace intel + +#endif diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp new file mode 100644 index 00000000..8629af01 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -0,0 +1,238 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_XRT_PCIE_PQ_EXECUTION_H_ +#define SRC_HOST_XRT_PCIE_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "transpose_benchmark.hpp" +#include "data_handlers/data_handler_types.h" +#include "data_handlers/pq.hpp" + +namespace transpose { +namespace fpga_execution { +namespace pcie_pq { + + /** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and PCIe+MPI over the host for communication + * + * @param config The program configuration + * @param data data object that contains all required data for the execution on the FPGA + * @param handler data handler instance that should be used to exchange data between hosts + * @return std::unique_ptr The measured execution times + */ +static std::unique_ptr + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation of this communication method"); +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + throw new std::runtime_error("Using the Write Rect method is not supported in this host implementation of this communication method"); +#endif + + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to process.
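+ // Hedged worked example for the buffer sizing a few lines below (numbers
+ // assumed): with blocks_per_replication = 5, local_matrix_width = 4 and
+ // blockSize = 8, buffer_size rounds the blocks up to full block rows,
+ // (5 + 4 - 1) / 4 * 4 = 8 blocks, i.e. 8 * 8 * 8 = 512 values per replication.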
+ size_t blocks_per_replication = (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width * local_matrix_width * data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width * local_matrix_width; + + int memory_bank_info_a = 0; + int memory_bank_info_b = 0; + int memory_bank_info_out = 0; + + // create the kernels + xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); + + + xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); + xrt::bo bufferB(*config.device, data.B + bufferStartList[r] * data.blockSize * data.blockSize, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); + + auto run = transposeKernel(bufferA, bufferB, bufferA_out, static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), + static_cast(blocks_per_replication), static_cast(handler.getWidthforRank()), + static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize))); + + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + + auto startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast> + (endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) + { + bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } + + + // Exchange A data via PCIe and MPI + handler.exchangeData(data); + + for (int r = 0; r < transposeKernelList.size(); r++) + { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) + { + runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), + static_cast((bufferSizeList[r]) / (local_matrix_width * 
data.blockSize * data.blockSize)))); + } + for (int r = 0; r < transposeKernelList.size(); r++) + { + runs[r].wait(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " << std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() + << "s (" << ((config.programSettings->matrixSize * config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * 3) + / std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() * 1.0e-9) << " GB/s)" << std::endl; +#endif + + // Transfer back data for next repetition! + handler.exchangeData(data); + + std::chrono::duration calculationTime = + std::chrono::duration_cast> + (endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * data.blockSize * data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + col]; + } + } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + local_matrix_width * data.blockSize * data.blockSize, tmp_write_buffer.begin() + bufferSizeList[r],&data.result[(bufferStartList[r] + local_matrix_width) * data.blockSize * data.blockSize]); + } + else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(data.result + bufferStartList[r] * data.blockSize * data.blockSize); + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast> + (endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result(new transpose::TransposeExecutionTimings{ + transferTimings, + calculationTimings + }); + + for (int i=0; i < local_matrix_height; i++) { + for (int j=0; j < local_matrix_width; j++) { + std::cout << data.result[i * local_matrix_width + j] << ","; + } + std::cout << std::endl; + } + std::cout << std::endl; + + return result; + } + +} // namespace transpose +} // namespace fpga_execution +} // namespace intel + +#endif diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp deleted file mode 100644 index e66b3a36..00000000 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// -// Created by Marius Meyer on 04.12.19. 
-// - -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#include "transpose_benchmark.hpp" - -/* C++ standard library headers */ -#include -#include - -/* Project's headers */ -#include "execution_types/execution_intel.hpp" -#include "execution_types/execution_intel_pq.hpp" -#include "execution_types/execution_pcie.hpp" -#include "execution_types/execution_pcie_pq.hpp" -#include "execution_types/execution_cpu.hpp" -#include "communication_types.hpp" - -#include "data_handlers/data_handler_types.h" -#include "data_handlers/diagonal.hpp" -#include "data_handlers/pq.hpp" - -#include "parameters.h" - - -transpose::TransposeBenchmark::TransposeBenchmark(int argc, char* argv[]) : HpccFpgaBenchmark(argc, argv) { - if (setupBenchmark(argc, argv)) { - setTransposeDataHandler(executionSettings->programSettings->dataHandlerIdentifier); - } -} - -void -transpose::TransposeBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { - options.add_options() - ("m", "Matrix size in number of blocks in one dimension", - cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) - ("b", "Block size in number of values in one dimension", - cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))) - ("p", "Value of P that equals the width of the PQ grid of FPGAs. Q is determined by the world size.", - cxxopts::value()->default_value(std::to_string(DEFAULT_P_VALUE))) - ("distribute-buffers", "Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. 
This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs.") - ("handler", "Specify the used data handler that distributes the data over devices and memory banks", - cxxopts::value()->default_value(DEFAULT_DIST_TYPE)); -} - -std::unique_ptr -transpose::TransposeBenchmark::executeKernel(TransposeData &data) { - switch (executionSettings->programSettings->communicationType) { - case hpcc_base::CommunicationType::intel_external_channels: - if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { - return transpose::fpga_execution::intel::calculate(*executionSettings, data); - } - else { - return transpose::fpga_execution::intel_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); - } break; - case hpcc_base::CommunicationType::pcie_mpi : - if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { - return transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler); - } - else { - return transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); - } break; -#ifdef MKL_FOUND - case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break; -#endif - default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType)); - } -} - -void -transpose::TransposeBenchmark::collectAndPrintResults(const transpose::TransposeExecutionTimings &output) { - double flops = static_cast(executionSettings->programSettings->matrixSize) * executionSettings->programSettings->matrixSize; - - // Number of experiment repetitions - uint number_measurements = output.calculationTimings.size(); - std::vector max_measures(number_measurements); - std::vector max_transfers(number_measurements); -#ifdef _USE_MPI_ - // Copy the object variable to a local variable to make it accessible to the lambda function - int mpi_size = mpi_comm_size; - MPI_Reduce(output.calculationTimings.data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - MPI_Reduce(output.transferTimings.data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); -#else - std::copy(output.calculationTimings.begin(), output.calculationTimings.end(), max_measures.begin()); - std::copy(output.transferTimings.begin(), output.transferTimings.end(), max_transfers.begin()); -#endif - - double avgCalculationTime = accumulate(max_measures.begin(), max_measures.end(), 0.0) - / max_measures.size(); - double minCalculationTime = *min_element(max_measures.begin(), max_measures.end()); - - double avgTransferTime = accumulate(max_transfers.begin(), max_transfers.end(), 0.0) - / max_transfers.size(); - double minTransferTime = *min_element(max_transfers.begin(), max_transfers.end()); - - double avgCalcFLOPS = flops / avgCalculationTime; - double maxCalcFLOPS = flops / minCalculationTime; - double avgMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime; - double maxMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime; - double avgTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime; - double maxTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime; - - - - - if (mpi_comm_rank == 0) { - std::cout << " total [s] transfer [s] 
calc [s] calc FLOPS Mem [B/s] PCIe [B/s]" << std::endl; - std::cout << "avg: " << (avgTransferTime + avgCalculationTime) - << " " << avgTransferTime - << " " << avgCalculationTime - << " " << avgCalcFLOPS - << " " << avgMemBandwidth - << " " << avgTransferBandwidth - << std::endl; - std::cout << "best: " << (minTransferTime + minCalculationTime) - << " " << minTransferTime - << " " << minCalculationTime - << " " << maxCalcFLOPS - << " " << maxMemBandwidth - << " " << maxTransferBandwidth - << std::endl; - } -} - -std::unique_ptr -transpose::TransposeBenchmark::generateInputData() { - return dataHandler->generateData(*executionSettings); -} - -bool -transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeData &data) { - - // exchange the data using MPI depending on the chosen distribution scheme - dataHandler->exchangeData(data); - - dataHandler->reference_transpose(data); - - double max_error = 0.0; - for (size_t i = 0; i < executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize * data.numBlocks; i++) { - max_error = std::max(fabs(data.A[i]), max_error); - } - - double global_max_error = 0; - MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - - if (mpi_comm_rank == 0) { - std::cout << "Maximum error: " << global_max_error << " < " << 100 * std::numeric_limits::epsilon() << std::endl; - std::cout << "Mach. Epsilon: " << std::numeric_limits::epsilon() << std::endl; - } - - return static_cast(global_max_error) < 100 * std::numeric_limits::epsilon(); -} - -} diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 148adc7f..d1ab4340 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -31,10 +31,18 @@ SOFTWARE. 
#include "parameters.h" #include "hpcc_benchmark.hpp" #include "transpose_data.hpp" +#ifdef USE_OCL_HOST #include "execution_types/execution_intel.hpp" #include "execution_types/execution_intel_pq.hpp" #include "execution_types/execution_pcie.hpp" #include "execution_types/execution_pcie_pq.hpp" +#endif +#ifdef USE_XRT_HOST +#include "execution_types/execution_xrt_pcie_pq.hpp" +#ifdef USE_ACCL +#include "execution_types/execution_xrt_accl_pq.hpp" +#endif +#endif #include "execution_types/execution_cpu.hpp" #include "communication_types.hpp" @@ -54,7 +62,7 @@ namespace transpose { */ template class TransposeBenchmark : -public hpcc_base::HpccFpgaBenchmark { +public hpcc_base::HpccFpgaBenchmark, TransposeExecutionTimings> { protected: /** @@ -85,7 +93,7 @@ public hpcc_base::HpccFpgaBenchmark The input and output data of the benchmark */ - std::unique_ptr + std::unique_ptr> generateInputData() override { return this->dataHandler->generateData(*(this->executionSettings)); } @@ -110,7 +118,7 @@ public hpcc_base::HpccFpgaBenchmark Measured runtimes of the kernel execution */ std::unique_ptr - executeKernel(TransposeData &data) override { + executeKernel(TransposeData &data) override { switch (this->executionSettings->programSettings->communicationType) { #ifdef USE_OCL_HOST case hpcc_base::CommunicationType::intel_external_channels: @@ -128,6 +136,14 @@ public hpcc_base::HpccFpgaBenchmarkexecutionSettings), data, reinterpret_cast&>(*this->dataHandler)); } break; #endif +#ifdef USE_XRT_HOST + case hpcc_base::CommunicationType::pcie_mpi: + return transpose::fpga_execution::pcie_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; +#ifdef USE_ACCL + case hpcc_base::CommunicationType::accl: + return transpose::fpga_execution::accl_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; +#endif +#endif #ifdef MKL_FOUND case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*(this->executionSettings), data, *dataHandler); break; #endif @@ -143,7 +159,7 @@ public hpcc_base::HpccFpgaBenchmark &data) override { // exchange the data using MPI depending on the chosen distribution scheme this->dataHandler->exchangeData(data); @@ -152,7 +168,7 @@ public hpcc_base::HpccFpgaBenchmarkexecutionSettings->programSettings->blockSize * this->executionSettings->programSettings->blockSize * data.numBlocks; i++) { - max_error = std::max(fabs(data.A[i]), max_error); + max_error = std::max(std::abs(data.A[i]), max_error); } double global_max_error = 0; @@ -229,7 +245,7 @@ public hpcc_base::HpccFpgaBenchmark(argc, argv) { + TransposeBenchmark(int argc, char* argv[]) : hpcc_base::HpccFpgaBenchmark, transpose::TransposeExecutionTimings>(argc, argv) { if (this->setupBenchmark(argc, argv)) { this->setTransposeDataHandler(this->executionSettings->programSettings->dataHandlerIdentifier); } @@ -238,7 +254,7 @@ public hpcc_base::HpccFpgaBenchmark() {} + TransposeBenchmark() : hpcc_base::HpccFpgaBenchmark, transpose::TransposeExecutionTimings>() {} }; diff --git a/PTRANS/src/host/transpose_data.cpp b/PTRANS/src/host/transpose_data.cpp index af794f30..20d6560f 100644 --- a/PTRANS/src/host/transpose_data.cpp +++ b/PTRANS/src/host/transpose_data.cpp @@ -37,47 +37,3 @@ transpose::TransposeProgramSettings::getSettingsMap() { return map; } -transpose::TransposeData::TransposeData(cl::Context context, uint block_size, uint y_size) : context(context), - numBlocks(y_size), blockSize(block_size) { - if (numBlocks * 
blockSize > 0) { -#ifdef USE_SVM - A = reinterpret_cast( - clSVMAlloc(context(), 0 , - block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); - B = reinterpret_cast( - clSVMAlloc(context(), 0 , - block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); - result = reinterpret_cast( - clSVMAlloc(context(), 0 , - block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); - exchange = reinterpret_cast( - clSVMAlloc(context(), 0 , - block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); -#else - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); - posix_memalign(reinterpret_cast(&B), 64, - sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); - posix_memalign(reinterpret_cast(&result), 64, - sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); - posix_memalign(reinterpret_cast(&exchange), 64, - sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); -#endif - } -} - -transpose::TransposeData::~TransposeData() { - if (numBlocks * blockSize > 0) { -#ifdef USE_SVM - clSVMFree(context(), reinterpret_cast(A));}); - clSVMFree(context(), reinterpret_cast(B));}); - clSVMFree(context(), reinterpret_cast(result));}); - clSVMFree(context(), reinterpret_cast(exchange));}); -#else - free(A); - free(B); - free(result); - free(exchange); -#endif - } -} diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index a223353f..c73a9959 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -94,6 +94,7 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { * @brief Data class containing the data the kernel is executed with * */ +template class TransposeData { public: @@ -138,7 +139,7 @@ class TransposeData { * @brief The context that is used to allocate memory in SVM mode * */ - cl::Context context; + TContext context; /** * @brief Construct a new Transpose Data object * @@ -147,13 +148,54 @@ class TransposeData { * @param block_size size of the quadratic blocks that are stored within this object * @param y_size number of blocks that are stored within this object per replication */ - TransposeData(cl::Context context, uint block_size, uint size_y); + TransposeData(TContext context, uint block_size, uint y_size): context(context), + numBlocks(y_size), blockSize(block_size) { + if (numBlocks * blockSize > 0) { +#ifdef USE_SVM + A = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 4096)); + B = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 4096)); + result = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 4096)); + exchange = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 4096)); +#else + posix_memalign(reinterpret_cast(&A), 4096, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); + posix_memalign(reinterpret_cast(&B), 4096, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); + posix_memalign(reinterpret_cast(&result), 4096, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); + posix_memalign(reinterpret_cast(&exchange), 4096, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); +#endif + } + } /** * @brief Destroy the Transpose Data object.
Free the allocated memory * */ - ~TransposeData(); + ~TransposeData() { + if (numBlocks * blockSize > 0) { +#ifdef USE_SVM + clSVMFree(context(), reinterpret_cast(A));}); + clSVMFree(context(), reinterpret_cast(B));}); + clSVMFree(context(), reinterpret_cast(result));}); + clSVMFree(context(), reinterpret_cast(exchange));}); +#else + free(A); + free(B); + free(result); + free(exchange); +#endif + } + } }; From 80a6e5721c108ffcb825dd7d2dc3233f3c945360 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 14 Apr 2022 16:58:38 +0100 Subject: [PATCH 019/318] Support for XRT w/o ACCL in base code --- cmake/general_benchmark_build_setup.cmake | 4 +- extern/CMakeLists.txt | 6 ++- shared/CMakeLists.txt | 4 ++ shared/include/hpcc_benchmark.hpp | 15 ++++-- shared/include/setup/fpga_setup.hpp | 20 ++++--- shared/setup/fpga_setup.cpp | 66 ++++++++++++----------- shared/setup/fpga_setup_accl.cpp | 8 +-- shared/setup/fpga_setup_xrt.cpp | 5 +- 8 files changed, 77 insertions(+), 51 deletions(-) diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 441b6f41..1537b092 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -91,7 +91,9 @@ endif() if (USE_ACCL) add_definitions(-DUSE_ACCL) endif() - +if (USE_XRT_HOST) + add_definitions(-DUSE_XRT_HOST) +endif() if (USE_OCL_HOST) add_definitions(-DUSE_OCL_HOST) endif() diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 77d5e3ac..18f03f37 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -55,13 +55,14 @@ if(NOT extern_cxxopts_POPULATED) EXCLUDE_FROM_ALL) endif() +if (DEFINED USE_ACCL) # ------------------------------------------------------------------------------- # ACCL Library FetchContent_Declare( extern_accl - GIT_REPOSITORY https://github.com/Mellich/ACCL.git - GIT_TAG dev) + GIT_REPOSITORY https://github.com/TristanLaan/ACCL.git + GIT_TAG xrt_hardware_support) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) @@ -69,3 +70,4 @@ if(NOT extern_accl_POPULATED) FetchContent_Populate(extern_accl) set(extern_accl_SOURCE_DIR ${extern_accl_SOURCE_DIR} PARENT_SCOPE) endif() +endif() diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index fdb8ca2f..43749c0a 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -15,6 +15,10 @@ if (USE_ACCL) target_include_directories(hpcc_fpga_base PRIVATE ${ACCL_INCLUDE_PATH}) target_link_libraries(hpcc_fpga_base accl) endif() +if (USE_XRT_HOST) + target_link_directories(hpcc_fpga_base PUBLIC ${XRT_SEARCH_PATH}) + target_link_libraries(hpcc_fpga_base xrt_coreutil xrt_core) +endif() find_package(OpenCL QUIET) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index b16994f2..0bf160f6 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -38,6 +38,9 @@ SOFTWARE. 
#ifdef USE_ACCL #include "setup/fpga_setup_accl.hpp" #endif +#ifdef USE_XRT_HOST +#include "setup/fpga_setup_xrt.hpp" +#endif #include "setup/fpga_setup.hpp" #include "cxxopts.hpp" #include "parameters.h" @@ -500,8 +503,8 @@ class HpccFpgaBenchmark { if (!programSettings->testOnly) { #ifdef USE_XRT_HOST usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultDevice); - context = false; - program = fpga_setup::fpgaSetup(usedDevice); + context = std::unique_ptr(new bool(false)); + program = fpga_setup::fpgaSetup(*usedDevice, programSettings->kernelFileName); #endif #ifdef USE_OCL_HOST usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, @@ -511,7 +514,7 @@ class HpccFpgaBenchmark { &programSettings->kernelFileName); #endif #ifdef USE_ACCL - accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); + //accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); #endif } @@ -696,9 +699,13 @@ std::ostream& operator<<(std::ostream& os, ExecutionSettingstestOnly) { -#ifndef USE_ACCL +#ifdef USE_OCL_HOST printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name); #endif +#ifdef USE_XRT_HOST + device_name = printedExecutionSettings.device->template get_info(); +#endif + } else { device_name = "TEST RUN: Not selected!"; diff --git a/shared/include/setup/fpga_setup.hpp b/shared/include/setup/fpga_setup.hpp index 0799900c..1aa77117 100644 --- a/shared/include/setup/fpga_setup.hpp +++ b/shared/include/setup/fpga_setup.hpp @@ -30,13 +30,14 @@ SOFTWARE. #include #include +#ifdef USE_OCL_HOST /* External libraries */ #ifdef USE_DEPRECATED_HPP_HEADER #include "CL/cl.hpp" #else #include OPENCL_HPP_HEADER #endif - +#endif /** Macro to convert the error integer representation to its string representation @@ -74,6 +75,7 @@ class FpgaSetupException : public std::exception std::string error_message; }; +#ifdef USE_OCL_HOST /** * @brief Exception that is thrown if the ASSERT_CL failed * @@ -134,13 +136,6 @@ Sets up the given FPGA with the kernel in the provided file. fpgaSetup(const cl::Context *context, std::vector deviceList, const std::string *usedKernelFile); -/** -Sets up the C++ environment by configuring std::cout and checking the clock -granularity using bm_helper::checktick() -*/ - void - setupEnvironmentAndClocks(); - /** Searches and selects an FPGA device using the CL library functions. @@ -159,5 +154,14 @@ choose a device. std::unique_ptr selectFPGADevice(int defaultPlatform, int defaultDevice); + +#endif +/** +Sets up the C++ environment by configuring std::cout and checking the clock +granularity using bm_helper::checktick() +*/ + void + setupEnvironmentAndClocks(); + } // namespace fpga_setup #endif // SRC_HOST_FPGA_SETUP_H_ diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index 6d08a26f..53ce4f55 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -28,6 +28,9 @@ FpgaSetupException::what() const noexcept return error_message.c_str(); } + +#ifdef USE_OCL_HOST + OpenClException::OpenClException(std::string error_name) : FpgaSetupException("An OpenCL error occurred: " + error_name) {} @@ -177,37 +180,6 @@ Sets up the given FPGA with the kernel in the provided file.
return std::unique_ptr(new cl::Program(program)); } -/** -Sets up the C++ environment by configuring std::cout and checking the clock -granularity using bm_helper::checktick() -*/ - void - setupEnvironmentAndClocks() { - std::cout << std::setprecision(5) << std::scientific; - - int world_rank = 0; - -#ifdef _USE_MPI_ - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); -#endif - - if (world_rank == 0) { - std::cout << HLINE; - std::cout << "General setup:" << std::endl; - - // Check clock granularity and output result - std::cout << "C++ high resolution clock is used." << std::endl; - std::cout << "The clock precision seems to be " - << static_cast - (std::chrono::high_resolution_clock::period::num) / - std::chrono::high_resolution_clock::period::den * 10e9 - << "ns" << std::endl; - - std::cout << HLINE; - } - } - - /** Searches an selects an FPGA device using the CL library functions. If multiple platforms or devices are given, the user will be prompted to @@ -321,4 +293,36 @@ choose a device. return std::unique_ptr(new cl::Device(deviceList[chosenDeviceId])); } + +#endif +/** +Sets up the C++ environment by configuring std::cout and checking the clock +granularity using bm_helper::checktick() +*/ + void + setupEnvironmentAndClocks() { + std::cout << std::setprecision(5) << std::scientific; + + int world_rank = 0; + +#ifdef _USE_MPI_ + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); +#endif + + if (world_rank == 0) { + std::cout << HLINE; + std::cout << "General setup:" << std::endl; + + // Check clock granularity and output result + std::cout << "C++ high resolution clock is used." << std::endl; + std::cout << "The clock precision seems to be " + << static_cast + (std::chrono::high_resolution_clock::period::num) / + std::chrono::high_resolution_clock::period::den * 10e9 + << "ns" << std::endl; + + std::cout << HLINE; + } + } + } // namespace fpga_setup diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 4abb8533..cbd98ede 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -13,7 +13,8 @@ /* External libraries */ #include "parameters.h" -#include "xrt.h" +#include "experimental/xrt_ip.h" +#include "xrt/xrt_kernel.h" #ifdef _USE_MPI_ #include "mpi.h" #endif @@ -38,9 +39,10 @@ namespace fpga_setup { } #ifdef ACCL_HARDWARE_SUPPORT auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}"); - auto hostctl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", + auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", xrt::kernel::cu_access_mode::exclusive); - return std::unique_ptr(new ACCL::ACCL(ranks, rank, device, cclo_ip, hostctrl_ip, 0, {0}, 0); + std::vector mem(1,0); + return std::unique_ptr(new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, 0)); #else // TODO: Add start port here. Currenty hardcoded! 
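// Hedged usage sketch of the overall XRT/ACCL setup chain, assembled from the
// calls shown in hpcc_benchmark.hpp (the composition is an assumption, not
// verbatim repository code):
//   auto device  = fpga_setup::selectFPGADevice(settings->defaultDevice);
//   auto program = fpga_setup::fpgaSetup(*device, settings->kernelFileName);
//   auto accl    = fpga_setup::fpgaSetupACCL(*device, *program);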
return std::unique_ptr(new ACCL::ACCL(ranks, current_rank, diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp index f04e90aa..0410fd1b 100644 --- a/shared/setup/fpga_setup_xrt.cpp +++ b/shared/setup/fpga_setup_xrt.cpp @@ -14,6 +14,7 @@ /* External libraries */ #include "parameters.h" +#include "xrt.h" #ifdef _USE_MPI_ #include "mpi.h" #endif @@ -22,14 +23,14 @@ namespace fpga_setup { std::unique_ptr fpgaSetup(xrt::device &device, - std::string &kernelFileName) { + const std::string &kernelFileName) { int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); int current_size; MPI_Comm_size(MPI_COMM_WORLD, & current_size); - return std::make_unique(std::move(device.load_xclbin(kernelFileName))); + return std::unique_ptr(new xrt::uuid(device.load_xclbin(kernelFileName))); } std::unique_ptr From 06a7150c9bca3782fcd3ecd305ad28e6b1b6781a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 14 Apr 2022 17:43:30 +0100 Subject: [PATCH 020/318] Fix kernel arguments for XRT execution --- .../execution_types/execution_xrt_pcie_pq.hpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index 8629af01..2223858e 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -108,6 +108,9 @@ static std::unique_ptr xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); xrt::bo bufferB(*config.device, data.B + bufferStartList[r] * data.blockSize * data.blockSize, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); + // TODO For small matrices, the 4KB alignment might fail for buffer B. 
Temporary fix seen in lines below (requires extra copying) + //xrt::bo bufferB(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); + //bufferB.write(data.B + bufferStartList[r] * data.blockSize * data.blockSize); xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); auto run = transposeKernel(bufferA, bufferB, bufferA_out, static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), @@ -159,9 +162,9 @@ static std::unique_ptr auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { - runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), + runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferStartList[r] + bufferOffsetList[r]),static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), - static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); + static_cast(handler.getHeightforRank()))); } for (int r = 0; r < transposeKernelList.size(); r++) { @@ -220,14 +223,6 @@ static std::unique_ptr calculationTimings }); - for (int i=0; i < local_matrix_height; i++) { - for (int j=0; j < local_matrix_width; j++) { - std::cout << data.result[i * local_matrix_width + j] << ","; - } - std::cout << std::endl; - } - std::cout << std::endl; - return result; } From ff7c70e5f882696753b9cd9d225965a30a7edd0f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 Apr 2022 10:16:34 +0100 Subject: [PATCH 021/318] Add transpose ACCL implementation --- PTRANS/src/host/data_handlers/pq.hpp | 8 + .../execution_types/execution_xrt_accl_pq.hpp | 247 +++++++++++++++--- .../execution_types/execution_xrt_pcie_pq.hpp | 4 - shared/include/hpcc_benchmark.hpp | 2 +- 4 files changed, 220 insertions(+), 41 deletions(-) diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp index 0e28c109..7fb08b6c 100644 --- a/PTRANS/src/host/data_handlers/pq.hpp +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -130,6 +130,14 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler /* Project's headers */ +#include "buffer.hpp" +#include "cclo.hpp" +#include "constants.hpp" +#include "fpgabuffer.hpp" #include "transpose_benchmark.hpp" #include "data_handlers/data_handler_types.h" #include "data_handlers/pq.hpp" +#include "transpose_data.hpp" namespace transpose { namespace fpga_execution { namespace accl_pq { + void accl_exchangeData(ACCL::ACCL &accl, transpose::data_handler::DistributedPQTransposeDataHandler &handler, + transpose::TransposeData & data, xrt::bo bufferAXrt, int global_width) { + + int pq_width = handler.getP(); + int pq_height = handler.getQ(); + int width_per_rank = handler.getWidthforRank(); + int height_per_rank = handler.getHeightforRank(); + MPI_Datatype data_block; + MPI_Type_vector(data.blockSize,data.blockSize,(handler.getWidthforRank() - 1)*data.blockSize, MPI_FLOAT, &data_block); + MPI_Type_commit(&data_block); + + int mpi_comm_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); + int pq_row = mpi_comm_rank / pq_width; + int pq_col = mpi_comm_rank % pq_width; + + auto AcclBufferA = ACCL::FPGABuffer(bufferAXrt, data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32, true, data.A); + + if (pq_width == pq_height) { + if (pq_col != 
pq_row) {
+
+ int pair_rank = pq_width * pq_col + pq_row;
+
+ // To re-calculate the matrix transposition locally on this host, we need to
+ // exchange matrix A for every kernel replication
+ // The order of the matrix blocks does not change during the exchange, because they are distributed diagonally
+ // and will be handled in the order below:
+ //
+ // . . 1 3
+ // . . . 2
+ // 1 . . .
+ // 3 2 . .
+ auto AcclBufferA_recv = accl.create_buffer(data.exchange, data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32);
+
+ // Send and receive matrix A using ACCL directly on FPGA
+ auto send = accl.send(0, AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0,true,ACCL::streamFlags::NO_STREAM, true);
+ accl.recv(0, *AcclBufferA_recv, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, true, ACCL::streamFlags::NO_STREAM);
+ send->wait();
+ // Copy received matrix from receiving buffer to A buffer completely on FPGA
+ accl.copy(*AcclBufferA_recv, AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, true, true);
+ }
+ }
+ else {
+ // Taken from "Parallel matrix transpose algorithms on distributed memory concurrent computers" by J. Choi, J. J. Dongarra, D. W. Walker
+ // and translated to C++
+ // This will do a diagonal exchange of matrix blocks.
+
+ // Determine LCM using GCD from standard library using the C++14 call
+ // In C++17 this changes to std::gcd in numeric, also std::lcm is directly available in numeric
+ int gcd = std::__gcd(pq_height, pq_width);
+ int least_common_multiple = pq_height * pq_width / gcd;
+
+ // If the global matrix size is not a multiple of the LCM block size, the numbers of send and received blocks
+ // may be wrongly calculated. Throw exception to prevent this and make aware of this issue!
+ if (global_width % least_common_multiple > 0) {
+ throw std::runtime_error("Implementation does not support matrix sizes that are not multiple of LCM blocks! Results may be wrong!");
+ }
+
+ // MPI requests for non-blocking communication
+ // First half of vector is for Isend, second half for Irecv!
+ std::vector accl_requests(2 * gcd);
+
+ // Begin algorithm from Figure 14 for general case
+ int g = transpose::data_handler::mod(pq_row - pq_col, gcd);
+ int p = transpose::data_handler::mod(pq_col + g, pq_width);
+ int q = transpose::data_handler::mod(pq_row - g, pq_height);
+
+ // Pre-calculate target ranks in LCM block
+ // The vector list variable can be interpreted as 2D matrix. Every entry represents the target rank of the sub-block
+ // Since the LCM block will repeat, we only need to store this small amount of data!
+ std::vector<int> target_list(least_common_multiple/pq_height * least_common_multiple/pq_width);
+ for (int row = 0; row < least_common_multiple/pq_height; row++) {
+ for (int col = 0; col < least_common_multiple/pq_width; col++) {
+ int global_block_col = pq_col + col * pq_width;
+ int global_block_row = pq_row + row * pq_height;
+ int destination_rank = (global_block_col % pq_height) * pq_width + (global_block_row % pq_width);
+ target_list[row * least_common_multiple/pq_width + col] = destination_rank;
+ }
+ }
+
+ // Create some ACCL buffers to send and receive from other FPGAs
+ // They can reside completely on FPGA
+ std::vector<std::unique_ptr<ACCL::Buffer<float>>> send_buffers;
+ std::vector<std::unique_ptr<ACCL::Buffer<float>>> recv_buffers;
+ for (int i = 0; i < gcd; i++) {
+ // TODO Is there a way to initialize buffer only in FPGA memory with ACCL?
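+ // Worked example for the target_list above: on a 2x4 rank grid
+ // (pq_height=2, pq_width=4), gcd=2 and least_common_multiple=4, so the list
+ // holds (4/2)*(4/4) = 2 entries per rank. For rank 1 (pq_row=0, pq_col=1)
+ // the formula yields {4, 6}: its two LCM-pattern sub-blocks belong to
+ // ranks 4 and 6 after the transposition.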
+ send_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32));
+ recv_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32));
+ }
+ int current_parallel_execution = 0;
+ for (int j = 0; j < least_common_multiple/pq_width; j++) {
+ for (int i = 0; i < least_common_multiple/pq_height; i++) {
+ // Determine sender and receiver rank of current rank for current communication step
+ int send_rank = transpose::data_handler::mod(p + i * gcd, pq_width) + transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width;
+ int recv_rank = transpose::data_handler::mod(p - i * gcd, pq_width) + transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width;
+
+ // Also count receiving buffer size because sending and receiving buffer size may differ in certain scenarios!
+ int receiving_size = 0;
+ int sending_size = 0;
+
+ std::vector<int> send_rows;
+ std::vector<int> send_cols;
+ // Look up which blocks are affected by the current rank
+ for (int row = 0; row < least_common_multiple/pq_height; row++) {
+ for (int col = 0; col < least_common_multiple/pq_width; col++) {
+ if (target_list[row * least_common_multiple/pq_width + col] == send_rank) {
+ send_rows.push_back(row);
+ send_cols.push_back(col);
+ sending_size += data.blockSize * data.blockSize;
+ }
+ if (target_list[row * least_common_multiple/pq_width + col] == recv_rank) {
+ receiving_size += data.blockSize * data.blockSize;
+ }
+ }
+ }
+ receiving_size *= (height_per_rank)/(least_common_multiple/pq_height) * ((width_per_rank)/(least_common_multiple/pq_width));
+ sending_size *= (height_per_rank)/(least_common_multiple/pq_height) * ((width_per_rank)/(least_common_multiple/pq_width));
+
+ // Copy the required data for this communication step to the send buffer!
+ for (int t=0; t < send_rows.size(); t++) {
+ for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) {
+ for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) {
+ size_t sending_buffer_offset = lcm_row * data.blockSize * data.blockSize * ((width_per_rank)/(least_common_multiple/pq_width)) + lcm_col * data.blockSize * data.blockSize;
+ size_t matrix_buffer_offset = (send_cols[t] + lcm_col * least_common_multiple/pq_width) * data.blockSize + (send_rows[t] + lcm_row * least_common_multiple/pq_height) * width_per_rank * data.blockSize * data.blockSize;
+ for (int block_row = 0; block_row < data.blockSize; block_row++) {
+ // TODO May be more efficient when done async!
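+ // Gather step: every affected block is copied row by row from matrix A into
+ // the contiguous send buffer, so the payload for one target rank leaves the
+ // FPGA as a single dense message.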
+ accl.copy(*AcclBufferA.slice(matrix_buffer_offset + block_row * width_per_rank * data.blockSize, matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize),*send_buffers[current_parallel_execution]->slice(sending_buffer_offset, sending_buffer_offset + data.blockSize),data.blockSize, true, true); + } + } + } + } + + // Do actual MPI communication +#ifndef NDEBUG + std::cout << "Rank " << mpi_comm_rank << ": blocks (" << sending_size / (data.blockSize * data.blockSize) << "," << receiving_size / (data.blockSize * data.blockSize) << ") send " << send_rank << ", recv " << recv_rank << std::endl << std::flush; +#endif + accl_requests[current_parallel_execution] = (accl.send(0, *send_buffers[current_parallel_execution], sending_size, send_rank, 0, false, ACCL::streamFlags::NO_STREAM, true)); + accl_requests[current_parallel_execution] = (accl.recv(0, *recv_buffers[current_parallel_execution], sending_size, send_rank, 0, false, ACCL::streamFlags::NO_STREAM, true)); + // Increase the counter for parallel executions + current_parallel_execution = (current_parallel_execution + 1) % gcd; + + // Wait for MPI requests if GCD MPI calls are scheduled in parallel + if ((current_parallel_execution) % gcd == 0) { + + + for (auto& req :accl_requests) { + + MPI_Status status; + int index; + + // Wait for all send and recv events to complete + // TODO do the CCLO pointers need to be freed? + accl.nop(false, accl_requests); + // For each message that was received in parallel + if (index >= gcd) { + std::vector recv_rows; + std::vector recv_cols; + // Look up which blocks are affected by the current rank + for (int row = 0; row < least_common_multiple/pq_height; row++) { + for (int col = 0; col < least_common_multiple/pq_width; col++) { + if (target_list[row * least_common_multiple/pq_width + col] == status.MPI_SOURCE) { + recv_rows.push_back(row); + recv_cols.push_back(col); + } + } + } + // Copy received data to matrix A buffer + for (int t=0; t < recv_rows.size(); t++) { + for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { + for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) { + size_t receiving_buffer_offset = lcm_row * data.blockSize * data.blockSize * ((width_per_rank)/(least_common_multiple/pq_width)) + lcm_col * data.blockSize * data.blockSize; + size_t matrix_buffer_offset = (recv_cols[t] + lcm_col * least_common_multiple/pq_width) * data.blockSize + (recv_rows[t] + lcm_row * least_common_multiple/pq_height) * width_per_rank * data.blockSize * data.blockSize; + for (int block_row = 0; block_row < data.blockSize; block_row++) { + // TODO May be more efficient when done async! 
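+ // Scatter step: received rows are written back into matrix A at the block
+ // positions assigned to the sending rank, the inverse of the gather above.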
+ accl.copy(*recv_buffers[current_parallel_execution]->slice(receiving_buffer_offset, receiving_buffer_offset + data.blockSize),*AcclBufferA.slice(matrix_buffer_offset + block_row * width_per_rank * data.blockSize, matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize), data.blockSize, true, true);
+
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+
 /**
 * @brief Transpose and add the matrices using the XRT kernels with a PQ distribution and ACCL for the communication between FPGAs
 *
 * @param config The program configuration
 * @param data data object that contains all required data for the execution on the FPGA
 * @param handler data handler instance that should be used to exchange data between hosts
 * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
 */
static std::unique_ptr<transpose::TransposeExecutionTimings>
 calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) {
 int err;

 if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) {
 throw std::runtime_error("Used data handler not supported by execution handler!");
 }
#ifdef USE_SVM
 throw new std::runtime_error("SVM not supported in the host implementation of this communication method");
#endif
#ifdef USE_BUFFER_WRITE_RECT_FOR_A
 throw new std::runtime_error("Using the Write Rect method is not supported in this host implementation of this communication method");
#endif

 std::vector<size_t> bufferSizeList;
 std::vector<size_t> bufferStartList;
 std::vector<size_t> bufferOffsetList;
 std::vector<xrt::bo> bufferListA;
 std::vector<xrt::bo> bufferListB;
 std::vector<xrt::bo> bufferListA_out;
 std::vector<xrt::kernel> transposeKernelList;
 std::vector<size_t> blocksPerReplication;

 size_t local_matrix_width = handler.getWidthforRank();
 size_t local_matrix_height = handler.getHeightforRank();
 size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE);

 size_t total_offset = 0;
 size_t row_offset = 0;
@@ -97,23 +291,14 @@
 total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width * local_matrix_width;

- int memory_bank_info_a = 0;
- int memory_bank_info_b = 0;
- int memory_bank_info_out = 0;
-
 // create the kernels
 xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str());

 xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0));
 xrt::bo bufferB(*config.device, data.B + bufferStartList[r] * data.blockSize * data.blockSize, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1));
 xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2));

- auto run = transposeKernel(bufferA, bufferB, bufferA_out, static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]),
- static_cast(blocks_per_replication), static_cast(handler.getWidthforRank()),
- static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)));
-
 bufferListA.push_back(bufferA);
 bufferListB.push_back(bufferB);
 bufferListA_out.push_back(bufferA_out);
@@ -141,33 +326,26 @@
 auto startCalculation = std::chrono::high_resolution_clock::now();
+
+ // Exchange A data via ACCL
+ if (bufferListA.size() > 1) {
+ std::cerr << "WARNING: Only the matrix A of the first kernel replication will be exchanged via ACCL!"
<< std::endl; + } + accl_exchangeData(*config.accl, handler, data, bufferListA[0], config.programSettings->matrixSize / data.blockSize); + + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), + static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); } - - - // Exchange A data via PCIe and MPI - handler.exchangeData(data); - - for (int r = 0; r < transposeKernelList.size(); r++) - { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); - } - - std::vector runs; - auto startKernelCalculation = std::chrono::high_resolution_clock::now(); - for (int r = 0; r < transposeKernelList.size(); r++) - { - runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), - static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), - static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); - } - for (int r = 0; r < transposeKernelList.size(); r++) - { - runs[r].wait(); - } - auto endCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) + { + runs[r].wait(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); #ifndef NDEBUG int mpi_rank; MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); @@ -177,9 +355,6 @@ static std::unique_ptr / std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() * 1.0e-9) << " GB/s)" << std::endl; #endif - // Transfer back data for next repetition! 
- handler.exchangeData(data); - std::chrono::duration calculationTime = std::chrono::duration_cast> (endCalculation - startCalculation); diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index 2223858e..fd3618c9 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -113,10 +113,6 @@ static std::unique_ptr //bufferB.write(data.B + bufferStartList[r] * data.blockSize * data.blockSize); xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); - auto run = transposeKernel(bufferA, bufferB, bufferA_out, static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), - static_cast(blocks_per_replication), static_cast(handler.getWidthforRank()), - static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize))); - bufferListA.push_back(bufferA); bufferListB.push_back(bufferB); bufferListA_out.push_back(bufferA_out); diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 0bf160f6..c3ec4b4c 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -514,7 +514,7 @@ class HpccFpgaBenchmark { &programSettings->kernelFileName); #endif #ifdef USE_ACCL - //accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); + accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); #endif } From 9b3d8769799b441a7522c5853fd1a2439a394632 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 Apr 2022 10:43:09 +0100 Subject: [PATCH 022/318] Change ACCL dependency to dev branch --- extern/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 18f03f37..341f73cd 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -61,8 +61,8 @@ if (DEFINED USE_ACCL) FetchContent_Declare( extern_accl - GIT_REPOSITORY https://github.com/TristanLaan/ACCL.git - GIT_TAG xrt_hardware_support) + GIT_REPOSITORY https://github.com/Xilinx/ACCL.git + GIT_TAG dev) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) From 9fe0d145318316b7581f74182ff9809e1549bf99 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 Apr 2022 10:43:44 +0100 Subject: [PATCH 023/318] Only initialize ACCL when ACCL implementation is used --- shared/include/hpcc_benchmark.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index c3ec4b4c..ab4d092d 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -514,7 +514,12 @@ class HpccFpgaBenchmark { &programSettings->kernelFileName); #endif #ifdef USE_ACCL - accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); + if (programSettings->communicationType == CommunicationType::accl) { + accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); + } + else { + accl = std::unique_ptr(nullptr); + } #endif } From f5c291aa02dc24a7d6393b73c2ce32221f0a98fb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 20 Apr 2022 13:34:13 +0100 Subject: [PATCH 024/318] Add YCM config to gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 17305538..e6b8e632 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ # Python virtual environments .venv +#YCM config +.ycm_extra_conf.py + # CMake build directories should be created in the following 
folder
*._*
build/*
From 72ed4cdbae552a795bcf56fe9d3fa9ea1a9b748a Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 20 Apr 2022 18:15:03 +0100
Subject: [PATCH 025/318] Update extern deps to XRT simulation branch
---
 extern/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt
index 341f73cd..7845280d 100644
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -61,8 +61,8 @@ if (DEFINED USE_ACCL)
 FetchContent_Declare(
 extern_accl

- GIT_REPOSITORY https://github.com/Xilinx/ACCL.git
- GIT_TAG dev)
+ GIT_REPOSITORY https://github.com/TristanLaan/ACCL.git
+ GIT_TAG simbuffer_bo_constructor)

 FetchContent_GetProperties(extern_accl)
 if(NOT extern_accl_POPULATED)
From 44ff497697f9be0e107c0efcd02ae03766d91f9d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 20 Apr 2022 18:15:35 +0100
Subject: [PATCH 026/318] Adapt constructor to new signature
---
 shared/setup/fpga_setup_accl.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index cbd98ede..d521264e 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -46,8 +46,7 @@ namespace fpga_setup {
 #else
 // TODO: Add start port here. Currently hardcoded!
 return std::unique_ptr<ACCL::ACCL>(new ACCL::ACCL(ranks, current_rank,
- "tcp://localhost:" +
- std::to_string(5500 + current_rank)));
+ 5500));
 #endif
 }
From 1764ff39648457f867db460f52bfdc92a0b31d9e Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 20 Apr 2022 18:16:20 +0100
Subject: [PATCH 027/318] Add debug logging and fixes of ACCL execution
---
 .../execution_types/execution_xrt_accl_pq.hpp | 52 ++++++++++++++-----
 1 file changed, 38 insertions(+), 14 deletions(-)
diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
index 832a7d37..10fb36e1 100644
--- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
@@ -48,17 +48,14 @@ namespace accl_pq {
 int pq_height = handler.getQ();
 int width_per_rank = handler.getWidthforRank();
 int height_per_rank = handler.getHeightforRank();
- MPI_Datatype data_block;
- MPI_Type_vector(data.blockSize,data.blockSize,(handler.getWidthforRank() - 1)*data.blockSize, MPI_FLOAT, &data_block);
- MPI_Type_commit(&data_block);

 int mpi_comm_rank;
 MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank);
 int pq_row = mpi_comm_rank / pq_width;
 int pq_col = mpi_comm_rank % pq_width;

- auto AcclBufferA = ACCL::FPGABuffer(bufferAXrt, data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32, true, data.A);
+ auto AcclBufferA = accl.create_buffer(bufferAXrt, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32);

 if (pq_width == pq_height) {
 if (pq_col != pq_row) {
@@ -73,14 +70,14 @@
 // . . . 2
 // 1 . . .
 // 3 2 . .
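 // In the diagram, equal numbers mark ranks that exchange blocks with each
 // other: rank (row r, col c) is paired with rank (row c, col r), i.e.
 // pair_rank = pq_width * pq_col + pq_row. In this 4x4 example, rank 2
 // (row 0, col 2) swaps its blocks with rank 8 (row 2, col 0).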
- auto AcclBufferA_recv = accl.create_buffer(data.exchange, data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32); + auto AcclBufferA_recv = accl.create_buffer(data.exchange, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); // Send and receive matrix A using ACCL directly on FPGA - auto send = accl.send(0, AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0,true,ACCL::streamFlags::NO_STREAM, true); + auto send = accl.send(0, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0,true,ACCL::streamFlags::NO_STREAM, true); accl.recv(0, *AcclBufferA_recv, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, true, ACCL::streamFlags::NO_STREAM); send->wait(); // Copy received matrix from receiving buffer to A buffer completely on FPGA - accl.copy(*AcclBufferA_recv, AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, true, true); + accl.copy(*AcclBufferA_recv, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, true, true); } } else { @@ -127,8 +124,8 @@ namespace accl_pq { std::vector> recv_buffers; for (int i = 0; i < gcd; i++) { // TODO Is there a way to initialize buffer only in FPGA memory with ACCL? - send_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32)); - recv_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks * sizeof(HOST_DATA_TYPE), ACCL::dataType::float32)); + send_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); + recv_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); } int current_parallel_execution = 0; for (int j = 0; j < least_common_multiple/pq_width; j++) { @@ -159,6 +156,9 @@ namespace accl_pq { receiving_size *= (height_per_rank)/(least_common_multiple/pq_height) * ((width_per_rank)/(least_common_multiple/pq_width)); sending_size *= (height_per_rank)/(least_common_multiple/pq_height) * ((width_per_rank)/(least_common_multiple/pq_width)); +#ifndef NDEBUG + std::cout << "Copy data to send buffers" << std::endl; +#endif // Copy the required date for this communication step to the send buffer! for (int t=0; t < send_rows.size(); t++) { for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { @@ -167,7 +167,17 @@ namespace accl_pq { size_t matrix_buffer_offset = (send_cols[t] + lcm_col * least_common_multiple/pq_width) * data.blockSize + (send_rows[t] + lcm_row * least_common_multiple/pq_height) * width_per_rank * data.blockSize * data.blockSize; for (int block_row = 0; block_row < data.blockSize; block_row++) { // TODO May be more efficient when done async! 
- accl.copy(*AcclBufferA.slice(matrix_buffer_offset + block_row * width_per_rank * data.blockSize, matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize),*send_buffers[current_parallel_execution]->slice(sending_buffer_offset, sending_buffer_offset + data.blockSize),data.blockSize, true, true); + std::cout << "A(" << matrix_buffer_offset + block_row * width_per_rank * data.blockSize + << "," << matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize + << ") send(" << sending_buffer_offset + << "," << sending_buffer_offset + data.blockSize << ")" << std::endl; + accl.copy(*AcclBufferA->slice( + matrix_buffer_offset + block_row * width_per_rank * data.blockSize, + matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize), + *send_buffers[current_parallel_execution]->slice( + sending_buffer_offset, + sending_buffer_offset + data.blockSize), + data.blockSize, true, true); } } } @@ -215,7 +225,7 @@ namespace accl_pq { size_t matrix_buffer_offset = (recv_cols[t] + lcm_col * least_common_multiple/pq_width) * data.blockSize + (recv_rows[t] + lcm_row * least_common_multiple/pq_height) * width_per_rank * data.blockSize * data.blockSize; for (int block_row = 0; block_row < data.blockSize; block_row++) { // TODO May be more efficient when done async! - accl.copy(*recv_buffers[current_parallel_execution]->slice(receiving_buffer_offset, receiving_buffer_offset + data.blockSize),*AcclBufferA.slice(matrix_buffer_offset + block_row * width_per_rank * data.blockSize, matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize), data.blockSize, true, true); + accl.copy(*recv_buffers[current_parallel_execution]->slice(receiving_buffer_offset, receiving_buffer_offset + data.blockSize),*AcclBufferA->slice(matrix_buffer_offset + block_row * width_per_rank * data.blockSize, matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize), data.blockSize, true, true); } } @@ -268,6 +278,9 @@ static std::unique_ptr size_t total_offset = 0; size_t row_offset = 0; +#ifndef NDEBUG + std::cout << "Start kernel creation" << std::endl; +#endif // Setup the kernels depending on the number of kernel replications for (int r = 0; r < config.programSettings->kernelReplications; r++) { @@ -307,9 +320,12 @@ static std::unique_ptr std::vector transferTimings; std::vector calculationTimings; - + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { +#ifndef NDEBUG + std::cout << "Start data transfer" << std::endl; +#endif auto startTransfer = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { @@ -331,8 +347,13 @@ static std::unique_ptr if (bufferListA.size() > 1) { std::cerr << "WARNING: Only the matrix A of the first kernel replication will be exchanged via ACCL!" 
<< std::endl; } +#ifndef NDEBUG + std::cout << "Start data exchange with ACCL" << std::endl; +#endif accl_exchangeData(*config.accl, handler, data, bufferListA[0], config.programSettings->matrixSize / data.blockSize); - +#ifndef NDEBUG + std::cout << "End data exchange with ACCL" << std::endl; +#endif std::vector runs; auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) @@ -341,6 +362,9 @@ static std::unique_ptr static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); } +#ifndef NDEBUG + std::cout << "Wait for kernels to complete" << std::endl; +#endif for (int r = 0; r < transposeKernelList.size(); r++) { runs[r].wait(); From 536eb3f8264a495aa5e2400f6c0fbb49d08942ff Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 25 Apr 2022 15:13:43 +0100 Subject: [PATCH 028/318] Refactoring and cleanup of ACCL host code --- .../execution_types/execution_xrt_accl_pq.hpp | 758 ++++++++++-------- .../execution_types/execution_xrt_pcie_pq.hpp | 365 +++++---- 2 files changed, 620 insertions(+), 503 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp index 10fb36e1..dab92c96 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -23,407 +23,495 @@ SOFTWARE. #define SRC_HOST_ACCL_PQ_EXECUTION_H_ /* C++ standard library headers */ +#include #include #include -#include /* Project's headers */ #include "buffer.hpp" #include "cclo.hpp" #include "constants.hpp" -#include "fpgabuffer.hpp" -#include "transpose_benchmark.hpp" #include "data_handlers/data_handler_types.h" #include "data_handlers/pq.hpp" +#include "fpgabuffer.hpp" #include "transpose_data.hpp" namespace transpose { namespace fpga_execution { namespace accl_pq { - void accl_exchangeData(ACCL::ACCL &accl, transpose::data_handler::DistributedPQTransposeDataHandler &handler, - transpose::TransposeData & data, xrt::bo bufferAXrt, int global_width) { - - int pq_width = handler.getP(); - int pq_height = handler.getQ(); - int width_per_rank = handler.getWidthforRank(); - int height_per_rank = handler.getHeightforRank(); - - int mpi_comm_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); - int pq_row = mpi_comm_rank / pq_width; - int pq_col = mpi_comm_rank % pq_width; - - auto AcclBufferA = accl.create_buffer(bufferAXrt, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); - - if (pq_width == pq_height) { - if (pq_col != pq_row) { - - int pair_rank = pq_width * pq_col + pq_row; - - // To re-calculate the matrix transposition locally on this host, we need to - // exchange matrix A for every kernel replication - // The order of the matrix blocks does not change during the exchange, because they are distributed diagonally - // and will be handled in the order below: - // - // . . 1 3 - // . . . 2 - // 1 . . . - // 3 2 . . 
- auto AcclBufferA_recv = accl.create_buffer(data.exchange, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); - - // Send and receive matrix A using ACCL directly on FPGA - auto send = accl.send(0, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0,true,ACCL::streamFlags::NO_STREAM, true); - accl.recv(0, *AcclBufferA_recv, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, true, ACCL::streamFlags::NO_STREAM); - send->wait(); - // Copy received matrix from receiving buffer to A buffer completely on FPGA - accl.copy(*AcclBufferA_recv, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, true, true); - } - } - else { - // Taken from "Parallel matrix transpose algorithms on distributed memory concurrent computers" by J. Choi, J. J. Dongarra, D. W. Walker - // and translated to C++ - // This will do a diagonal exchange of matrix blocks. - - // Determine LCM using GCD from standard library using the C++14 call - // In C++17 this changes to std::gcd in numeric, also std::lcm is directly available in numeric - int gcd = std::__gcd(pq_height, pq_width); - int least_common_multiple = pq_height * pq_width / gcd; - - // If the global matrix size is not a multiple of the LCM block size, the numbers of send and received blocks - // may be wrongly calculated. Throw exception to prevent this and make aware of this issue! - if (global_width % least_common_multiple > 0) { - throw std::runtime_error("Implementation does not support matrix sizes that are not multiple of LCM blocks! Results may be wrong!"); - } +void accl_exchangeData( + ACCL::ACCL &accl, + transpose::data_handler::DistributedPQTransposeDataHandler + &handler, + transpose::TransposeData &data, xrt::bo &bufferAXrt, int global_width) { + + int pq_width = handler.getP(); + int pq_height = handler.getQ(); + int width_per_rank = handler.getWidthforRank(); + int height_per_rank = handler.getHeightforRank(); + + int mpi_comm_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); + int pq_row = mpi_comm_rank / pq_width; + int pq_col = mpi_comm_rank % pq_width; + + auto AcclBufferA = accl.create_buffer( + bufferAXrt, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); + if (pq_width == pq_height) { + if (pq_col != pq_row) { + + int pair_rank = pq_width * pq_col + pq_row; + + // To re-calculate the matrix transposition locally on this host, we need to + // exchange matrix A for every kernel replication + // The order of the matrix blocks does not change during the exchange, because they are + // distributed diagonally and will be handled in the order below: + // + // . . 1 3 + // . . . 2 + // 1 . . . + // 3 2 . . + // auto AcclBufferA_recv = accl.create_buffer( + // data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); + // AcclBufferA_recv->sync_to_device(); + // Send and receive matrix A using ACCL directly on FPGA + accl.send(0, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, + true, ACCL::streamFlags::NO_STREAM); + accl.recv(0, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, + true, ACCL::streamFlags::NO_STREAM); + // Copy received matrix from receiving buffer to A buffer completely on FPGA + // accl.copy(*AcclBufferA_recv, *AcclBufferA, data.blockSize * data.blockSize * + // data.numBlocks, + // true, true); + } + } else { + // Taken from "Parallel matrix transpose algorithms on distributed memory concurrent computers" + // by J. Choi, J. J. Dongarra, D. W. 
Walker, translated to C++. This will do a diagonal
+ // exchange of matrix blocks.
+
+ // Determine LCM using GCD from standard library using the C++14 call
+ // In C++17 this changes to std::gcd in numeric, also std::lcm is directly available in numeric
+ int gcd = std::__gcd(pq_height, pq_width);
+ int least_common_multiple = pq_height * pq_width / gcd;
+
+ // If the global matrix size is not a multiple of the LCM block size, the numbers of send and
+ // received blocks may be wrongly calculated. Throw exception to prevent this and make aware of
+ // this issue!
+ if (global_width % least_common_multiple > 0) {
+ throw std::runtime_error("Implementation does not support matrix sizes that are not multiple "
+ "of LCM blocks! Results may be wrong!");
+ }
+
+ // MPI requests for non-blocking communication
+ // First half of vector is for Isend, second half for Irecv!
+ std::vector accl_requests(2 * gcd);
+
+ // Begin algorithm from Figure 14 for general case
+ int g = transpose::data_handler::mod(pq_row - pq_col, gcd);
+ int p = transpose::data_handler::mod(pq_col + g, pq_width);
+ int q = transpose::data_handler::mod(pq_row - g, pq_height);
+
+ // Pre-calculate target ranks in LCM block
+ // The vector list variable can be interpreted as a 2D matrix. Every entry represents the target
+ // rank of the sub-block. Since the LCM block will repeat, we only need to store this small
+ // amount of data!
+ std::vector<int> target_list(least_common_multiple / pq_height * least_common_multiple /
+ pq_width);
+ for (int row = 0; row < least_common_multiple / pq_height; row++) {
+ for (int col = 0; col < least_common_multiple / pq_width; col++) {
+ int global_block_col = pq_col + col * pq_width;
+ int global_block_row = pq_row + row * pq_height;
+ int destination_rank =
+ (global_block_col % pq_height) * pq_width + (global_block_row % pq_width);
+ target_list[row * least_common_multiple / pq_width + col] = destination_rank;
+ }
+ }
+
+ // Create some ACCL buffers to send and receive from other FPGAs
+ // They can reside completely on FPGA
+ std::vector<std::unique_ptr<ACCL::Buffer<float>>> send_buffers;
+ std::vector<std::unique_ptr<ACCL::Buffer<float>>> recv_buffers;
+ for (int i = 0; i < gcd; i++) {
+ // TODO Is there a way to initialize buffer only in FPGA memory with ACCL?
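+ // One buffer pair per parallel transfer: gcd transfers are in flight at a
+ // time while the loops below schedule (lcm/pq_width) * (lcm/pq_height)
+ // communication steps. E.g., on a 2x4 grid (gcd=2, lcm=4) two buffer pairs
+ // serve 1*2 = 2 steps.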
- send_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); - recv_buffers.push_back(accl.create_buffer(data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); + // Create some ACCL buffers to send and receive from other FPGAs + // They can reside completely on FPGA + std::vector> send_buffers; + std::vector> recv_buffers; + for (int i = 0; i < gcd; i++) { + // TODO Is there a way to initialize buffer only in FPGA memory with ACCL? + send_buffers.push_back(accl.create_buffer( + data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); + recv_buffers.push_back(accl.create_buffer( + data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); + send_buffers.back()->sync_to_device(); + recv_buffers.back()->sync_to_device(); + } + int current_parallel_execution = 0; + for (int j = 0; j < least_common_multiple / pq_width; j++) { + for (int i = 0; i < least_common_multiple / pq_height; i++) { + // Determine sender and receiver rank of current rank for current communication step + int send_rank = transpose::data_handler::mod(p + i * gcd, pq_width) + + transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width; + int recv_rank = transpose::data_handler::mod(p - i * gcd, pq_width) + + transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width; + + // Also count receiving buffer size because sending and receiving buffer size may differ in + // certain scenarios! + int receiving_size = 0; + int sending_size = 0; + + std::vector send_rows; + std::vector send_cols; + // Look up which blocks are affected by the current rank + for (int row = 0; row < least_common_multiple / pq_height; row++) { + for (int col = 0; col < least_common_multiple / pq_width; col++) { + if (target_list[row * least_common_multiple / pq_width + col] == send_rank) { + send_rows.push_back(row); + send_cols.push_back(col); + sending_size += data.blockSize * data.blockSize; } - int current_parallel_execution = 0; - for (int j = 0; j < least_common_multiple/pq_width; j++) { - for (int i = 0; i < least_common_multiple/pq_height; i++) { - // Determine sender and receiver rank of current rank for current communication step - int send_rank = transpose::data_handler::mod(p + i * gcd, pq_width) + transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width; - int recv_rank = transpose::data_handler::mod(p - i * gcd, pq_width) + transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width; - - // Also count receiving buffer size because sending and receiving buffer size may differ in certain scenarios! 
- int receiving_size = 0; - int sending_size = 0; - - std::vector send_rows; - std::vector send_cols; - // Look up which blocks are affected by the current rank - for (int row = 0; row < least_common_multiple/pq_height; row++) { - for (int col = 0; col < least_common_multiple/pq_width; col++) { - if (target_list[row * least_common_multiple/pq_width + col] == send_rank) { - send_rows.push_back(row); - send_cols.push_back(col); - sending_size += data.blockSize * data.blockSize; - } - if (target_list[row * least_common_multiple/pq_width + col] == recv_rank) { - receiving_size += data.blockSize * data.blockSize; - } - } - } - receiving_size *= (height_per_rank)/(least_common_multiple/pq_height) * ((width_per_rank)/(least_common_multiple/pq_width)); - sending_size *= (height_per_rank)/(least_common_multiple/pq_height) * ((width_per_rank)/(least_common_multiple/pq_width)); + if (target_list[row * least_common_multiple / pq_width + col] == recv_rank) { + receiving_size += data.blockSize * data.blockSize; + } + } + } + receiving_size *= (height_per_rank) / (least_common_multiple / pq_height) * + ((width_per_rank) / (least_common_multiple / pq_width)); + sending_size *= (height_per_rank) / (least_common_multiple / pq_height) * + ((width_per_rank) / (least_common_multiple / pq_width)); #ifndef NDEBUG - std::cout << "Copy data to send buffers" << std::endl; + std::cout << "Copy data to send buffers" << std::endl; #endif - // Copy the required date for this communication step to the send buffer! - for (int t=0; t < send_rows.size(); t++) { - for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { - for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) { - size_t sending_buffer_offset = lcm_row * data.blockSize * data.blockSize * ((width_per_rank)/(least_common_multiple/pq_width)) + lcm_col * data.blockSize * data.blockSize; - size_t matrix_buffer_offset = (send_cols[t] + lcm_col * least_common_multiple/pq_width) * data.blockSize + (send_rows[t] + lcm_row * least_common_multiple/pq_height) * width_per_rank * data.blockSize * data.blockSize; - for (int block_row = 0; block_row < data.blockSize; block_row++) { - // TODO May be more efficient when done async! - std::cout << "A(" << matrix_buffer_offset + block_row * width_per_rank * data.blockSize - << "," << matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize - << ") send(" << sending_buffer_offset - << "," << sending_buffer_offset + data.blockSize << ")" << std::endl; - accl.copy(*AcclBufferA->slice( - matrix_buffer_offset + block_row * width_per_rank * data.blockSize, - matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize), - *send_buffers[current_parallel_execution]->slice( - sending_buffer_offset, - sending_buffer_offset + data.blockSize), - data.blockSize, true, true); - } - } - } - } + // Copy the required date for this communication step to the send buffer! 
+ for (int t = 0; t < send_rows.size(); t++) { + for (int lcm_row = 0; lcm_row < (height_per_rank) / (least_common_multiple / pq_height); + lcm_row++) { + for (int lcm_col = 0; lcm_col < (width_per_rank) / (least_common_multiple / pq_width); + lcm_col++) { + size_t sending_buffer_offset = + lcm_row * data.blockSize * data.blockSize * + ((width_per_rank) / (least_common_multiple / pq_width)) + + lcm_col * data.blockSize * data.blockSize; + size_t matrix_buffer_offset = + (send_cols[t] + lcm_col * least_common_multiple / pq_width) * data.blockSize + + (send_rows[t] + lcm_row * least_common_multiple / pq_height) * width_per_rank * + data.blockSize * data.blockSize; + for (int block_row = 0; block_row < data.blockSize; block_row++) { + // TODO May be more efficient when done async! + std::cout << "A(" + << matrix_buffer_offset + block_row * width_per_rank * data.blockSize + << "," + << matrix_buffer_offset + block_row * width_per_rank * data.blockSize + + data.blockSize + << ") send(" << sending_buffer_offset << "," + << sending_buffer_offset + data.blockSize << ")" << std::endl; + accl.copy(*AcclBufferA->slice( + matrix_buffer_offset + block_row * width_per_rank * data.blockSize, + matrix_buffer_offset + block_row * width_per_rank * data.blockSize + + data.blockSize), + *send_buffers[current_parallel_execution]->slice( + sending_buffer_offset, sending_buffer_offset + data.blockSize), + data.blockSize, true, true); + std::cout << "Copy done!" << std::endl; + } + } + } + } - // Do actual MPI communication + // Do actual MPI communication +#ifndef NDEBUG + std::cout << "Rank " << mpi_comm_rank << ": blocks (" + << sending_size / (data.blockSize * data.blockSize) << "," + << receiving_size / (data.blockSize * data.blockSize) << ") send " << send_rank + << ", recv " << recv_rank << std::endl + << std::flush; +#endif + accl_requests[current_parallel_execution] = + (accl.send(0, *send_buffers[current_parallel_execution], sending_size, send_rank, 0, + true, ACCL::streamFlags::NO_STREAM, true)); + accl_requests[current_parallel_execution + gcd] = + (accl.recv(0, *recv_buffers[current_parallel_execution], sending_size, send_rank, 0, + true, ACCL::streamFlags::NO_STREAM, true)); + // Increase the counter for parallel executions + current_parallel_execution = (current_parallel_execution + 1) % gcd; + + // Wait for MPI requests if GCD MPI calls are scheduled in parallel + if ((current_parallel_execution) % gcd == 0) { + + for (auto &req : accl_requests) { + + MPI_Status status; + int index; #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": blocks (" << sending_size / (data.blockSize * data.blockSize) << "," << receiving_size / (data.blockSize * data.blockSize) << ") send " << send_rank << ", recv " << recv_rank << std::endl << std::flush; + std::cout << "Wait for all requests to complete" << std::endl; #endif - accl_requests[current_parallel_execution] = (accl.send(0, *send_buffers[current_parallel_execution], sending_size, send_rank, 0, false, ACCL::streamFlags::NO_STREAM, true)); - accl_requests[current_parallel_execution] = (accl.recv(0, *recv_buffers[current_parallel_execution], sending_size, send_rank, 0, false, ACCL::streamFlags::NO_STREAM, true)); - // Increase the counter for parallel executions - current_parallel_execution = (current_parallel_execution + 1) % gcd; - - // Wait for MPI requests if GCD MPI calls are scheduled in parallel - if ((current_parallel_execution) % gcd == 0) { - - - for (auto& req :accl_requests) { - - MPI_Status status; - int index; - - // Wait for all send and 
recv events to complete - // TODO do the CCLO pointers need to be freed? - accl.nop(false, accl_requests); - // For each message that was received in parallel - if (index >= gcd) { - std::vector recv_rows; - std::vector recv_cols; - // Look up which blocks are affected by the current rank - for (int row = 0; row < least_common_multiple/pq_height; row++) { - for (int col = 0; col < least_common_multiple/pq_width; col++) { - if (target_list[row * least_common_multiple/pq_width + col] == status.MPI_SOURCE) { - recv_rows.push_back(row); - recv_cols.push_back(col); - } - } - } - // Copy received data to matrix A buffer - for (int t=0; t < recv_rows.size(); t++) { - for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { - for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) { - size_t receiving_buffer_offset = lcm_row * data.blockSize * data.blockSize * ((width_per_rank)/(least_common_multiple/pq_width)) + lcm_col * data.blockSize * data.blockSize; - size_t matrix_buffer_offset = (recv_cols[t] + lcm_col * least_common_multiple/pq_width) * data.blockSize + (recv_rows[t] + lcm_row * least_common_multiple/pq_height) * width_per_rank * data.blockSize * data.blockSize; - for (int block_row = 0; block_row < data.blockSize; block_row++) { - // TODO May be more efficient when done async! - accl.copy(*recv_buffers[current_parallel_execution]->slice(receiving_buffer_offset, receiving_buffer_offset + data.blockSize),*AcclBufferA->slice(matrix_buffer_offset + block_row * width_per_rank * data.blockSize, matrix_buffer_offset + block_row * width_per_rank * data.blockSize + data.blockSize), data.blockSize, true, true); - - } - } - } - } - } - } - } + // Wait for all send and recv events to complete + // TODO do the CCLO pointers need to be freed? + accl.nop(false, accl_requests); + // For each message that was received in parallel + if (index >= gcd) { + std::vector recv_rows; + std::vector recv_cols; + // Look up which blocks are affected by the current rank + for (int row = 0; row < least_common_multiple / pq_height; row++) { + for (int col = 0; col < least_common_multiple / pq_width; col++) { + if (target_list[row * least_common_multiple / pq_width + col] == + status.MPI_SOURCE) { + recv_rows.push_back(row); + recv_cols.push_back(col); + } } + } + // Copy received data to matrix A buffer + for (int t = 0; t < recv_rows.size(); t++) { + for (int lcm_row = 0; + lcm_row < (height_per_rank) / (least_common_multiple / pq_height); lcm_row++) { + for (int lcm_col = 0; + lcm_col < (width_per_rank) / (least_common_multiple / pq_width); lcm_col++) { + size_t receiving_buffer_offset = + lcm_row * data.blockSize * data.blockSize * + ((width_per_rank) / (least_common_multiple / pq_width)) + + lcm_col * data.blockSize * data.blockSize; + size_t matrix_buffer_offset = + (recv_cols[t] + lcm_col * least_common_multiple / pq_width) * + data.blockSize + + (recv_rows[t] + lcm_row * least_common_multiple / pq_height) * + width_per_rank * data.blockSize * data.blockSize; + for (int block_row = 0; block_row < data.blockSize; block_row++) { + // TODO May be more efficient when done async! 
+ accl.copy(
+ *recv_buffers[current_parallel_execution]->slice(
+ receiving_buffer_offset, receiving_buffer_offset + data.blockSize),
+ *AcclBufferA->slice(
+ matrix_buffer_offset + block_row * width_per_rank * data.blockSize,
+ matrix_buffer_offset + block_row * width_per_rank * data.blockSize +
+ data.blockSize),
+ data.blockSize, true, true);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+/**
+ * @brief Transpose and add the matrices using the XRT kernels with a PQ distribution and
+ * ACCL for the communication between FPGAs
+ *
+ * @param config The program configuration
+ * @param data data object that contains all required data for the execution on the FPGA
+ * @param handler data handler instance that should be used to exchange data between hosts
+ * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
+ */
+static std::unique_ptr<transpose::TransposeExecutionTimings>
+calculate(const hpcc_base::ExecutionSettings &config,
+ transpose::TransposeData &data,
+ transpose::data_handler::DistributedPQTransposeDataHandler
+ &handler) {
+ int err;
+
+ if (config.programSettings->dataHandlerIdentifier !=
+ transpose::data_handler::DataHandlerType::pq) {
+ throw std::runtime_error("Used data handler not supported by execution handler!");
+ }
+#ifdef USE_SVM
+ throw new std::runtime_error(
+ "SVM not supported in the host implementation of this communication method");
+#endif
+#ifdef USE_BUFFER_WRITE_RECT_FOR_A
+ throw new std::runtime_error("Using the Write Rect method is not supported in this host "
+ "implementation of this communication method");
+#endif
+ std::vector<size_t> bufferSizeList;
+ std::vector<size_t> bufferStartList;
+ std::vector<size_t> bufferOffsetList;
+ std::vector<xrt::bo> bufferListA;
+ std::vector<xrt::bo> bufferListB;
+ std::vector<xrt::bo> bufferListA_out;
+ std::vector<xrt::kernel> transposeKernelList;
+ std::vector<size_t> blocksPerReplication;
+
+ size_t local_matrix_width = handler.getWidthforRank();
+ size_t local_matrix_height = handler.getHeightforRank();
+ size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE);
+
+ size_t total_offset = 0;
+ size_t row_offset = 0;
#ifndef NDEBUG
+ std::cout << "Start kernel creation" << std::endl;
#endif
+ // Setup the
kernels depending on the number of kernel replications
+ for (int r = 0; r < config.programSettings->kernelReplications; r++) {
+
+ // Calculate how many blocks the current kernel replication will need to process.
+ size_t blocks_per_replication =
+ (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications);
+ size_t blocks_remainder =
+ (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications;
+ if (blocks_remainder > r) {
+ // Catch the case that the number of blocks is not divisible by the number of kernel
+ // replications
+ blocks_per_replication += 1;
+ }
+ if (blocks_per_replication < 1) {
+ continue;
+ }
+ blocksPerReplication.push_back(blocks_per_replication);
+ size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width *
+ local_matrix_width * data.blockSize * data.blockSize;
+ bufferSizeList.push_back(buffer_size);
+ bufferStartList.push_back(total_offset);
+ bufferOffsetList.push_back(row_offset);

+ row_offset = (row_offset + blocks_per_replication) % local_matrix_width;

+ total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width *
+ local_matrix_width;

+ // create the kernels
+ xrt::kernel transposeKernel(*config.device, *config.program,
+ ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str());

+ xrt::bo bufferA(*config.device, data.A,
+ data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE),
+ transposeKernel.group_id(0));
+ xrt::bo bufferB(*config.device, &data.B[bufferStartList[r] * data.blockSize * data.blockSize],
+ buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1));
+ xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE),
transposeKernel.group_id(2)); - bufferListA.push_back(bufferA); - bufferListB.push_back(bufferB); - bufferListA_out.push_back(bufferA_out); - transposeKernelList.push_back(transposeKernel); - } + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; - std::vector transferTimings; - std::vector calculationTimings; - - for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { #ifndef NDEBUG - std::cout << "Start data transfer" << std::endl; + std::cout << "Start data transfer" << std::endl; #endif - auto startTransfer = std::chrono::high_resolution_clock::now(); + auto startTransfer = std::chrono::high_resolution_clock::now(); - for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); - bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); - } - auto endTransfer = std::chrono::high_resolution_clock::now(); - - std::chrono::duration transferTime = - std::chrono::duration_cast> - (endTransfer - startTransfer); + for (int r = 0; r < transposeKernelList.size(); r++) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); - MPI_Barrier(MPI_COMM_WORLD); + std::chrono::duration transferTime = + std::chrono::duration_cast>(endTransfer - startTransfer); - auto startCalculation = std::chrono::high_resolution_clock::now(); + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); - // Exchange A data via ACCL - if (bufferListA.size() > 1) { - std::cerr << "WARNING: Only the matrix A of the first kernel replication will be exchanged via ACCL!" << std::endl; - } + // Exchange A data via ACCL + if (bufferListA.size() > 1) { + std::cerr << "WARNING: Only the matrix A of the first kernel replication will be exchanged " + "via ACCL!" 
+ << std::endl; + } #ifndef NDEBUG - std::cout << "Start data exchange with ACCL" << std::endl; + std::cout << "Start data exchange with ACCL" << std::endl; #endif - accl_exchangeData(*config.accl, handler, data, bufferListA[0], config.programSettings->matrixSize / data.blockSize); + accl_exchangeData(*config.accl, handler, data, bufferListA[0], + config.programSettings->matrixSize / data.blockSize); #ifndef NDEBUG - std::cout << "End data exchange with ACCL" << std::endl; + std::cout << "End data exchange with ACCL" << std::endl; #endif - std::vector runs; - auto startKernelCalculation = std::chrono::high_resolution_clock::now(); - for (int r = 0; r < transposeKernelList.size(); r++) - { - runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]),static_cast(bufferOffsetList[r]), - static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), - static_cast((bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); - } + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) { + runs.push_back(transposeKernelList[r]( + bufferListA[r], bufferListB[r], bufferListA_out[r], + static_cast(bufferOffsetList[r]), static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast((bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + } #ifndef NDEBUG - std::cout << "Wait for kernels to complete" << std::endl; + std::cout << "Wait for kernels to complete" << std::endl; #endif - for (int r = 0; r < transposeKernelList.size(); r++) - { - runs[r].wait(); - } - auto endCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) { + runs[r].wait(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); #ifndef NDEBUG - int mpi_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - std::cout << "Rank " << mpi_rank << ": " << "Done i=" << repetition << std::endl; - std::cout << "Kernel execution time: " << std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() - << "s (" << ((config.programSettings->matrixSize * config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * 3) - / std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() * 1.0e-9) << " GB/s)" << std::endl; + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " + << std::chrono::duration_cast>(endCalculation - + startKernelCalculation) + .count() + << "s (" + << ((config.programSettings->matrixSize * config.programSettings->matrixSize * + sizeof(HOST_DATA_TYPE) * 3) / + std::chrono::duration_cast>(endCalculation - + startKernelCalculation) + .count() * + 1.0e-9) + << " GB/s)" << std::endl; #endif - std::chrono::duration calculationTime = - std::chrono::duration_cast> - (endCalculation - startCalculation); - calculationTimings.push_back(calculationTime.count()); - - std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * data.blockSize * data.blockSize); - - startTransfer = std::chrono::high_resolution_clock::now(); - - for (int r = 0; r < transposeKernelList.size(); r++) { - // Copy possibly incomplete first block row - if (bufferOffsetList[r] != 0) { - bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); - 
bufferListA_out[r].read(tmp_write_buffer.data()); - for (int row = 0; row < data.blockSize; row++) { - for (int col = bufferOffsetList[r] * data.blockSize; col < local_matrix_width * data.blockSize; col++) { - data.result[bufferStartList[r] * data.blockSize * data.blockSize + row * local_matrix_width * data.blockSize + col] = - tmp_write_buffer[row * local_matrix_width * data.blockSize + col]; - } - } - // Copy remaining buffer - std::copy(tmp_write_buffer.begin() + local_matrix_width * data.blockSize * data.blockSize, tmp_write_buffer.begin() + bufferSizeList[r],&data.result[(bufferStartList[r] + local_matrix_width) * data.blockSize * data.blockSize]); - } - else { - bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); - bufferListA_out[r].read(data.result + bufferStartList[r] * data.blockSize * data.blockSize); - } - } - endTransfer = std::chrono::high_resolution_clock::now(); - transferTime += - std::chrono::duration_cast> - (endTransfer - startTransfer); - transferTimings.push_back(transferTime.count()); + std::chrono::duration calculationTime = + std::chrono::duration_cast>(endCalculation - + startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * + data.blockSize * data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; + col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + col]; + } } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + local_matrix_width * data.blockSize * data.blockSize, + tmp_write_buffer.begin() + bufferSizeList[r], + &data.result[(bufferStartList[r] + local_matrix_width) * data.blockSize * + data.blockSize]); + } else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(data.result, bufferSizeList[r] * sizeof(HOST_DATA_TYPE), + bufferStartList[r] * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE)); + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast>(endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } - std::unique_ptr result(new transpose::TransposeExecutionTimings{ - transferTimings, - calculationTimings - }); + std::unique_ptr result( + new transpose::TransposeExecutionTimings{transferTimings, calculationTimings}); - return result; - } + return result; +} -} // namespace transpose -} // namespace fpga_execution -} // namespace intel +} // namespace accl_pq +} // namespace fpga_execution +} // namespace transpose #endif diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index fd3618c9..85481b6f 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -23,207 +23,236 @@ SOFTWARE. 
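Both this PCIe+MPI variant and the ACCL variant above rely on the same XRT buffer idiom: an xrt::bo created from a host pointer stays bound to that (4KB-aligned) host allocation on the memory bank of the given kernel argument, and sync() moves the data in the indicated direction. A minimal sketch of the idiom with hypothetical names, not part of the patch:

    // Wrap an existing, aligned host allocation on the bank of kernel argument 0:
    xrt::bo buffer(*config.device, host_ptr, size_in_bytes, kernel.group_id(0));
    buffer.sync(XCL_BO_SYNC_BO_TO_DEVICE);   // host -> device before the kernel runs
    buffer.sync(XCL_BO_SYNC_BO_FROM_DEVICE); // device -> host after completion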
#define SRC_HOST_XRT_PCIE_PQ_EXECUTION_H_ /* C++ standard library headers */ +#include #include #include -#include /* Project's headers */ -#include "transpose_benchmark.hpp" #include "data_handlers/data_handler_types.h" #include "data_handlers/pq.hpp" +#include "transpose_benchmark.hpp" namespace transpose { namespace fpga_execution { namespace pcie_pq { - /** - * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and PCIe+MPI over the host for communication - * +/** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and + * PCIe+MPI over the host for communication + * * @param config The progrma configuration * @param data data object that contains all required data for the execution on the FPGA * @param handler data handler instance that should be used to exchange data between hosts - * @return std::unique_ptr The measured execution times + * @return std::unique_ptr The measured execution times */ -static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { - int err; - - if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { - throw std::runtime_error("Used data handler not supported by execution handler!"); - } +static std::unique_ptr +calculate(const hpcc_base::ExecutionSettings &config, + transpose::TransposeData &data, + transpose::data_handler::DistributedPQTransposeDataHandler + &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != + transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error("Used data handler not supported by execution handler!"); + } #ifdef USE_SVM - throw new std::runtime_error("SVM not supported in the host implementation of this communication method"); + throw new std::runtime_error( + "SVM not supported in the host implementation of this communication method"); #endif #ifdef USE_BUFFER_WRITE_RECT_FOR_A - throw new std::runtime_error("Using the Write Rect method is not supported in this host implementation of this communication method"); + throw new std::runtime_error("Using the Write Rect method is not supported in this host " + "implementation of this communication method"); #endif + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to process. 
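+    // Note on this distribution: every replication processes at least
+    // floor(total_blocks / kernelReplications) blocks and the first
+    // (total_blocks % kernelReplications) replications take one extra block,
+    // e.g. 9 blocks on 2 replications: r = 0 gets 5 blocks, r = 1 gets 4.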
+ size_t blocks_per_replication = + (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications); + size_t blocks_remainder = + (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the number of kernel + // replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width * + local_matrix_width * data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width * + local_matrix_width; + + int memory_bank_info_a = 0; + int memory_bank_info_b = 0; + int memory_bank_info_out = 0; + + // create the kernels + xrt::kernel transposeKernel(*config.device, *config.program, + ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); + + xrt::bo bufferA(*config.device, data.A, + data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), + transposeKernel.group_id(0)); + xrt::bo bufferB(*config.device,&data.B[bufferStartList[r] * data.blockSize * data.blockSize], + buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); + // TODO For small matrices, the 4KB alignment might fail for buffer B. Temporary fix seen in + // lines below (requires extra copying) + // xrt::bo bufferB(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + // transposeKernel.group_id(1)); bufferB.write(data.B + bufferStartList[r] * data.blockSize * + // data.blockSize); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + transposeKernel.group_id(2)); + + bufferListA.push_back(bufferA); + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + + auto startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); - std::vector bufferSizeList; - std::vector bufferStartList; - std::vector bufferOffsetList; - std::vector bufferListA; - std::vector bufferListB; - std::vector bufferListA_out; - std::vector transposeKernelList; - std::vector blocksPerReplication; - - size_t local_matrix_width = handler.getWidthforRank(); - size_t local_matrix_height = handler.getHeightforRank(); - size_t local_matrix_width_bytes = local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); - - size_t total_offset = 0; - size_t row_offset = 0; - // Setup the kernels depending on the number of kernel replications - for (int r = 0; r < config.programSettings->kernelReplications; r++) { - - // Calculate how many blocks the current kernel replication will need to process. 
- size_t blocks_per_replication = (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications); - size_t blocks_remainder = (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications; - if (blocks_remainder > r) { - // Catch the case, that the number of blocks is not divisible by the number of kernel replications - blocks_per_replication += 1; - } - if (blocks_per_replication < 1) { - continue; - } - blocksPerReplication.push_back(blocks_per_replication); - size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width * local_matrix_width * data.blockSize * data.blockSize; - bufferSizeList.push_back(buffer_size); - bufferStartList.push_back(total_offset); - bufferOffsetList.push_back(row_offset); - - row_offset = (row_offset + blocks_per_replication) % local_matrix_width; - - total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width * local_matrix_width; - - int memory_bank_info_a = 0; - int memory_bank_info_b = 0; - int memory_bank_info_out = 0; - - // create the kernels - xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); - - - xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * - sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); - xrt::bo bufferB(*config.device, data.B + bufferStartList[r] * data.blockSize * data.blockSize, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); - // TODO For small matrices, the 4KB alignment might fail for buffer B. Temporary fix seen in lines below (requires extra copying) - //xrt::bo bufferB(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); - //bufferB.write(data.B + bufferStartList[r] * data.blockSize * data.blockSize); - xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); - - bufferListA.push_back(bufferA); - bufferListB.push_back(bufferB); - bufferListA_out.push_back(bufferA_out); - transposeKernelList.push_back(transposeKernel); - } - - std::vector transferTimings; - std::vector calculationTimings; - - for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { - - auto startTransfer = std::chrono::high_resolution_clock::now(); - - for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); - bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); - } - auto endTransfer = std::chrono::high_resolution_clock::now(); - - std::chrono::duration transferTime = - std::chrono::duration_cast> - (endTransfer - startTransfer); - - MPI_Barrier(MPI_COMM_WORLD); + std::chrono::duration transferTime = + std::chrono::duration_cast>(endTransfer - startTransfer); - auto startCalculation = std::chrono::high_resolution_clock::now(); + MPI_Barrier(MPI_COMM_WORLD); - for (int r = 0; r < transposeKernelList.size(); r++) - { - bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); - } + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) { + bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } - // Exchange A data via PCIe and MPI - handler.exchangeData(data); + // Exchange A data via PCIe and MPI + handler.exchangeData(data); - for (int r = 0; r < transposeKernelList.size(); r++) - { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); - } + std::copy(data.A, data.A + data.numBlocks * data.blockSize * 
data.blockSize, data.exchange); + for (int r = 0; r < transposeKernelList.size(); r++) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } - std::vector runs; - auto startKernelCalculation = std::chrono::high_resolution_clock::now(); - for (int r = 0; r < transposeKernelList.size(); r++) - { - runs.push_back(transposeKernelList[r](bufferListA[r], bufferListB[r], bufferListA_out[r], static_cast(bufferStartList[r] + bufferOffsetList[r]),static_cast(bufferOffsetList[r]), - static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), - static_cast(handler.getHeightforRank()))); - } - for (int r = 0; r < transposeKernelList.size(); r++) - { - runs[r].wait(); - } - auto endCalculation = std::chrono::high_resolution_clock::now(); + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) { + runs.push_back(transposeKernelList[r]( + bufferListA[r], bufferListB[r], bufferListA_out[r], + static_cast(bufferStartList[r] + bufferOffsetList[r]), + static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast(handler.getHeightforRank()))); + } + for (int r = 0; r < transposeKernelList.size(); r++) { + runs[r].wait(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); #ifndef NDEBUG - int mpi_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - std::cout << "Rank " << mpi_rank << ": " << "Done i=" << repetition << std::endl; - std::cout << "Kernel execution time: " << std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() - << "s (" << ((config.programSettings->matrixSize * config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * 3) - / std::chrono::duration_cast>(endCalculation - startKernelCalculation).count() * 1.0e-9) << " GB/s)" << std::endl; + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " + << std::chrono::duration_cast>(endCalculation - + startKernelCalculation) + .count() + << "s (" + << ((config.programSettings->matrixSize * config.programSettings->matrixSize * + sizeof(HOST_DATA_TYPE) * 3) / + std::chrono::duration_cast>(endCalculation - + startKernelCalculation) + .count() * + 1.0e-9) + << " GB/s)" << std::endl; #endif - // Transfer back data for next repetition! 
- handler.exchangeData(data); - - std::chrono::duration calculationTime = - std::chrono::duration_cast> - (endCalculation - startCalculation); - calculationTimings.push_back(calculationTime.count()); - - std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * data.blockSize * data.blockSize); - - startTransfer = std::chrono::high_resolution_clock::now(); - - for (int r = 0; r < transposeKernelList.size(); r++) { - // Copy possibly incomplete first block row - if (bufferOffsetList[r] != 0) { - bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); - bufferListA_out[r].read(tmp_write_buffer.data()); - for (int row = 0; row < data.blockSize; row++) { - for (int col = bufferOffsetList[r] * data.blockSize; col < local_matrix_width * data.blockSize; col++) { - data.result[bufferStartList[r] * data.blockSize * data.blockSize + row * local_matrix_width * data.blockSize + col] = - tmp_write_buffer[row * local_matrix_width * data.blockSize + col]; - } - } - // Copy remaining buffer - std::copy(tmp_write_buffer.begin() + local_matrix_width * data.blockSize * data.blockSize, tmp_write_buffer.begin() + bufferSizeList[r],&data.result[(bufferStartList[r] + local_matrix_width) * data.blockSize * data.blockSize]); - } - else { - bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); - bufferListA_out[r].read(data.result + bufferStartList[r] * data.blockSize * data.blockSize); - } - } - endTransfer = std::chrono::high_resolution_clock::now(); - transferTime += - std::chrono::duration_cast> - (endTransfer - startTransfer); - transferTimings.push_back(transferTime.count()); + // Transfer back data for next repetition! + handler.exchangeData(data); + + std::chrono::duration calculationTime = + std::chrono::duration_cast>(endCalculation - + startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * + data.blockSize * data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; + col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + col]; + } } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + local_matrix_width * data.blockSize * data.blockSize, + tmp_write_buffer.begin() + bufferSizeList[r], + &data.result[(bufferStartList[r] + local_matrix_width) * data.blockSize * + data.blockSize]); + } else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(data.result, bufferSizeList[r] * sizeof(HOST_DATA_TYPE), + bufferStartList[r] * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE)); + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += + std::chrono::duration_cast>(endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } - std::unique_ptr result(new transpose::TransposeExecutionTimings{ - transferTimings, - calculationTimings - }); + std::unique_ptr result( + new transpose::TransposeExecutionTimings{transferTimings, calculationTimings}); - return result; - } 
+ return result; +} -} // namespace transpose -} // namespace fpga_execution -} // namespace intel +} // namespace pcie_pq +} // namespace fpga_execution +} // namespace transpose #endif From b4800f29ebbf6e43d6195b7c2703a1e36eb3c2d7 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 25 Apr 2022 15:16:52 +0100 Subject: [PATCH 029/318] Fix faulty derived data type --- PTRANS/src/host/data_handlers/pq.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp index 7fb08b6c..afa11575 100644 --- a/PTRANS/src/host/data_handlers/pq.hpp +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -180,8 +180,8 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler dis(-100.0, 100.0); for (size_t i = 0; i < blocks_per_rank * settings.programSettings->blockSize; i++) { for (size_t j = 0; j < settings.programSettings->blockSize; j++) { - d->A[i * settings.programSettings->blockSize + j] = i * settings.programSettings->blockSize + j;//dis(gen); - d->B[i * settings.programSettings->blockSize + j] = 0.0; //dis(gen); + d->A[i * settings.programSettings->blockSize + j] = dis(gen); + d->B[i * settings.programSettings->blockSize + j] = dis(gen); d->result[i * settings.programSettings->blockSize + j] = 0.0; } } @@ -216,11 +216,11 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler 0) { int next_chunk = (remaining_data_size > std::numeric_limits::max()) ? std::numeric_limits::max(): remaining_data_size; - MPI_Sendrecv(&data.A[offset], next_chunk, data_block, pair_rank, 0, &data.exchange[offset], next_chunk, data_block, pair_rank, 0, MPI_COMM_WORLD, &status); + MPI_Sendrecv(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, &data.exchange[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD, &status); remaining_data_size -= next_chunk; offset += static_cast(next_chunk) * static_cast(data.blockSize * data.blockSize); @@ -379,7 +379,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler& data) { + reference_transpose(TransposeData& data) override { for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { data.A[i * height_per_rank * data.blockSize + j] -= (data.result[j * width_per_rank * data.blockSize + i] - data.B[j * width_per_rank * data.blockSize + i]); From c90daab4cca01b097f8761f830365e3dd13aa34a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 25 Apr 2022 15:34:58 +0100 Subject: [PATCH 030/318] Add emulation switch for ACCL to benchmarks --- shared/include/hpcc_benchmark.hpp | 15 +++++- shared/include/setup/fpga_setup_accl.hpp | 23 ++++---- shared/setup/fpga_setup_accl.cpp | 67 ++++++++++++------------ 3 files changed, 58 insertions(+), 47 deletions(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index ab4d092d..bece837c 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -126,6 +126,11 @@ class BaseSettings { */ CommunicationType communicationType; + /** + * @brief Use ACCL emulation constructor instead of hardware execution + */ + bool useAcclEmulation; + /** * @brief Construct a new Base Settings object * @@ -146,6 +151,11 @@ class BaseSettings { #else kernelReplications(results.count("r") > 0 ? 
results["r"].as() : 1), #endif +#ifdef USE_ACCL + useAcclEmulation(static_cast(results.count("accl-emulation"))), +#else + useAcclEmulation(false), +#endif #ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())), #else @@ -393,6 +403,9 @@ class HpccFpgaBenchmark { cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) #ifdef INTEL_FPGA ("i", "Use memory Interleaving") +#endif +#ifdef USE_ACCL + ("accl-emulation", "Use the accl emulation instead of hardware execution") #endif ("skip-validation", "Skip the validation of the output data. This will speed up execution and helps when working with special data types.") ("device", "Index of the device that has to be used. If not given you "\ @@ -515,7 +528,7 @@ class HpccFpgaBenchmark { #endif #ifdef USE_ACCL if (programSettings->communicationType == CommunicationType::accl) { - accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program); + accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program, programSettings->useAcclEmulation); } else { accl = std::unique_ptr(nullptr); diff --git a/shared/include/setup/fpga_setup_accl.hpp b/shared/include/setup/fpga_setup_accl.hpp index 7158a81b..dcf2a530 100644 --- a/shared/include/setup/fpga_setup_accl.hpp +++ b/shared/include/setup/fpga_setup_accl.hpp @@ -22,18 +22,17 @@ SOFTWARE. #ifndef SRC_HOST_FPGA_SETUP_ACCL_H_ #define SRC_HOST_FPGA_SETUP_ACCL_H_ -#include -#include -#include -#include #include #include +#include +#include #include +#include +#include /* External libraries */ -#include "xrt/xrt_device.h" #include "accl.hpp" - +#include "xrt/xrt_device.h" namespace fpga_setup { @@ -41,12 +40,12 @@ namespace fpga_setup { Sets up the given FPGA with the kernel in the provided file. @param device The device used for the program -@param usedKernelFile The path to the kernel file +@param program The program used to find the ACCL kernels for hardware execution +@param useAcclEmulation Construct an ACCL emulation instance instead of hardware execution @return The ACCL instance used for communication */ - std::unique_ptr - fpgaSetupACCL(xrt::device &device, - xrt::uuid &program); +std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &program, + bool useAcclEmulation); -} // namespace fpga_setup -#endif // SRC_HOST_FPGA_SETUP_H_ +} // namespace fpga_setup +#endif // SRC_HOST_FPGA_SETUP_H_ diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index d521264e..b4753430 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -4,16 +4,16 @@ #include "setup/fpga_setup_accl.hpp" -#include -#include -#include -#include #include #include +#include +#include +#include +#include /* External libraries */ -#include "parameters.h" #include "experimental/xrt_ip.h" +#include "parameters.h" #include "xrt/xrt_kernel.h" #ifdef _USE_MPI_ #include "mpi.h" @@ -21,33 +21,32 @@ namespace fpga_setup { - std::unique_ptr - fpgaSetupACCL(xrt::device &device, - xrt::uuid &program) { - int current_rank; - MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); - - int current_size; - MPI_Comm_size(MPI_COMM_WORLD, & current_size); - - std::vector ranks = {}; - for (int i = 0; i < current_size; ++i) { - // TODO: Replace the ip addresses and ports here for execution of real hardware? 
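The TODO above, which is kept verbatim in the reformatted function below, marks the open point for multi-node hardware runs: all ranks still use loopback addresses. One conceivable way to build the rank table, sketched under the assumption that every rank has already resolved its own FPGA address into a fixed-width string (this is not what the patch implements), is to distribute the addresses over MPI before constructing the list:

    // Hypothetical replacement for the loopback table (sketch only):
    char my_ip[16] = "192.168.0.1";  // assumed: resolved individually per rank
    std::vector<char> all_ips(16 * current_size);
    MPI_Allgather(my_ip, 16, MPI_CHAR, all_ips.data(), 16, MPI_CHAR, MPI_COMM_WORLD);
    for (int i = 0; i < current_size; ++i) {
      // same fields as the loopback entries: ip, port, and the two values used above
      ranks.emplace_back(ACCL::rank_t{std::string(&all_ips[16 * i]), 5500 + i, i, 1024});
    }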
- ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, - 1024}; - ranks.emplace_back(new_rank); - } -#ifdef ACCL_HARDWARE_SUPPORT - auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}"); - auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", - xrt::kernel::cu_access_mode::exclusive); - std::vector mem(1,0); - return std::unique_ptr(new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, 0)); -#else - // TODO: Add start port here. Currenty hardcoded! - return std::unique_ptr(new ACCL::ACCL(ranks, current_rank, - 5500)); -#endif - } +std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &program, + bool useAcclEmulation) { + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, ¤t_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, ¤t_size); + + std::vector ranks = {}; + for (int i = 0; i < current_size; ++i) { + // TODO: Replace the ip addresses and ports here for execution of real hardware? + ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, 1024}; + ranks.emplace_back(new_rank); + } + if (!useAcclEmulation) { + auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}"); + auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", + xrt::kernel::cu_access_mode::exclusive); + std::vector mem(1, 0); + return std::unique_ptr( + new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, 0)); + } else { + // TODO: Add start port here. Currenty hardcoded! + return std::unique_ptr( + new ACCL::ACCL(ranks, current_rank, 5500, device, ACCL::networkProtocol::TCP, 16, 1024)); + } +} -} // namespace fpga_setup +} // namespace fpga_setup From 0f96d28df1a147ae0592ca3c1701b0d625d11fdf Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 25 Apr 2022 15:37:29 +0100 Subject: [PATCH 031/318] Add ACCL buffers implementation for PTRANS --- PTRANS/src/device/transpose_PQ_ACCL_buffers.cl | 1 + 1 file changed, 1 insertion(+) create mode 120000 PTRANS/src/device/transpose_PQ_ACCL_buffers.cl diff --git a/PTRANS/src/device/transpose_PQ_ACCL_buffers.cl b/PTRANS/src/device/transpose_PQ_ACCL_buffers.cl new file mode 120000 index 00000000..64e94f20 --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_ACCL_buffers.cl @@ -0,0 +1 @@ +transpose_PQ_PCIE.cl \ No newline at end of file From 869dbf23549af4f44d2bf604134aebdab879ca47 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 25 Apr 2022 17:10:28 +0100 Subject: [PATCH 032/318] Include ACCL build to HPCC builds --- PTRANS/src/device/CMakeLists.txt | 2 +- cmake/accl.cmake | 72 ++++++++++++++++++++++++++++++++ cmake/kernelTargets.cmake | 16 ++++++- extern/CMakeLists.txt | 4 +- 4 files changed, 90 insertions(+), 4 deletions(-) create mode 100644 cmake/accl.cmake diff --git a/PTRANS/src/device/CMakeLists.txt b/PTRANS/src/device/CMakeLists.txt index 7542a861..21176719 100644 --- a/PTRANS/src/device/CMakeLists.txt +++ b/PTRANS/src/device/CMakeLists.txt @@ -11,7 +11,7 @@ if (INTELFPGAOPENCL_FOUND) endif() if (VITIS_FOUND) - generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE) + generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_ACCL_buffers) add_test(NAME test_emulation_PQ_PCIE_xilinx COMMAND Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_xilinx -f 
transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) diff --git a/cmake/accl.cmake b/cmake/accl.cmake new file mode 100644 index 00000000..6e7ccb38 --- /dev/null +++ b/cmake/accl.cmake @@ -0,0 +1,72 @@ + +set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL") +set(ACCL_UDP_ETH_IF 0 CACHE STRING "Ethernet interface used. On ETHZ: 0 = switch, 1 = direct") +set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform") + +set(ACCL_CCLO_KERNEL_DIR ${extern_accl_SOURCE_DIR}/kernels/cclo/) +set(ACCL_CCLO_KERNEL_XO cclo_offload.xo) + +set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware) +set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/) +set(ACCL_UDP_MAC_XO ${ACCL_VNX_DIR}/Ethernet/_x.${FPGA_BOARD_NAME}/cmac_${ACCL_UDP_ETH_IF}.xo) +set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo) + +add_custom_command( + OUTPUT ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} + COMMAND make STACK_TYPE=${ACCL_STACK_TYPE} PLATFORM=${FPGA_BOARD_NAME} + WORKING_DIRECTORY ${ACCL_CCLO_KERNEL_DIR}) + +add_custom_command( + OUTPUT ${ACCL_UDP_MAC_XO} + COMMAND make -C ${ACCL_VNX_DIR}/Ethernet DEVICE=${FPGA_BOARD_NAME} INTERFACE=${ACCL_UDP_ETH_IF} all + WORKING_DIRECTORY ${ACCL_HARDWARE_DIR}) + +add_custom_command( + OUTPUT ${ACCL_UDP_NET_XO} + COMMAND make -C ${ACCL_VNX_DIR}/NetLayers DEVICE=${FPGA_BOARD_NAME} all + WORKING_DIRECTORY ${ACCL_HARDWARE_DIR}) + + +set(ACCL_PLUGINS_DIR ${extern_accl_SOURCE_DIR}/kernels/plugins) +set(ACCL_PLUGINS_HOSTCTRL ${ACCL_PLUGINS_DIR}/hostctrl/hostctrl.xo) +set(ACCL_PLUGINS_SUM ${ACCL_PLUGINS_DIR}/reduce_sum/reduce_sum.xo) +set(ACCL_PLUGINS_COMPRESSION ${ACCL_PLUGINS_DIR}/hp_compression/hp_compression.xo) +set(ACCL_PLUGINS_LOOPBACK ${ACCL_PLUGINS_DIR}/loopback/loopback.xo) + +set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} + ${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO} PARENT_SCOPE) + +add_custom_target( + accl_udp_stack + DEPENDS ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO}) + +add_custom_target( + accl_cclo + DEPENDS ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO}) + +add_custom_command( + OUTPUT ${ACCL_PLUGINS_HOSTCTRL} + COMMAND vitis_hls build_hostctrl.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/hostctrl ) +add_custom_command( + OUTPUT ${ACCL_PLUGINS_SUM} + COMMAND vitis_hls build.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/reduce_sum ) +add_custom_command( + OUTPUT ${ACCL_PLUGINS_COMPRESSION} + COMMAND vitis_hls build.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/hp_compression ) +add_custom_command( + OUTPUT ${ACCL_PLUGINS_LOOPBACK} + COMMAND vitis_hls build_loopback.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/loopback ) + +add_custom_target( + accl_plugins + DEPENDS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} + ${ACCL_PLUGINS_COMPRESSION}) + +add_custom_target( + accl_udp) +add_dependencies(accl_udp accl_udp_stack accl_cclo accl_plugins) + diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 1d7e667f..35c128a9 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -9,6 +9,10 @@ else() set(VPP_FLAGS "-O3") endif() +if (USE_ACCL) + include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake) +endif() + ## # This function will create build targets for the kernels for emulationand 
synthesis for xilinx. ## @@ -21,6 +25,10 @@ function(generate_kernel_targets_xilinx) else() set(base_file_part "src/device/${kernel_file_name}") endif() + string(REGEX MATCH ".*_ACCL.*" is_accl_kernel ${kernel_file_name}) + if (is_accl_kernel AND NOT USE_ACCL) + continue() + endif() set(base_file "${CMAKE_SOURCE_DIR}/${base_file_part}.cl") if (KERNEL_REPLICATION_ENABLED) set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_replicated_xilinx.cl") @@ -40,6 +48,9 @@ function(generate_kernel_targets_xilinx) set(gen_xilinx_link_settings ${XILINX_LINK_SETTINGS_FILE}) set(xilinx_link_settings ${CMAKE_BINARY_DIR}/settings/settings.link.xilinx.${kernel_file_name}.ini) endif() + if (USE_ACCL AND is_accl_kernel) + list(APPEND additional_xos ${ACCL_UDP_XOS}) + endif() set(xilinx_report_folder "${EXECUTABLE_OUTPUT_PATH}/xilinx_reports") set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA) list(APPEND local_CLFLAGS --report_dir=${xilinx_report_folder} --log_dir=${xilinx_report_folder}/logs) @@ -95,7 +106,7 @@ function(generate_kernel_targets_xilinx) DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} ) add_custom_command(OUTPUT ${bitstream_f} - COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_f} ${bitstream_compile} + COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_f} ${bitstream_compile} ${additional_xos} MAIN_DEPENDENCY ${bitstream_compile} DEPENDS ${xilinx_link_settings} ) @@ -110,6 +121,9 @@ function(generate_kernel_targets_xilinx) DEPENDS ${bitstream_compile} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) + if(USE_ACCL AND is_accl_kernel) + add_dependencies(${kernel_file_name}_xilinx accl_udp) + endif() list(APPEND kernel_emulation_targets_xilinx ${kernel_file_name}_emulate_xilinx) set(kernel_emulation_targets_xilinx ${kernel_emulation_targets_xilinx} CACHE INTERNAL "Kernel emulation targets used to define dependencies for the tests for Xilinx devices") endforeach () diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 7845280d..341f73cd 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -61,8 +61,8 @@ if (DEFINED USE_ACCL) FetchContent_Declare( extern_accl - GIT_REPOSITORY https://github.com/TristanLaan/ACCL.git - GIT_TAG simbuffer_bo_constructor) + GIT_REPOSITORY https://github.com/Xilinx/ACCL.git + GIT_TAG dev) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) From 9f33a199ebeedd1c6075f1c84424364f3e0a02a2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 25 Apr 2022 18:51:24 +0100 Subject: [PATCH 033/318] First version for ACCL+PTRANS synth --- .../Xilinx_U280_DDR_ACCL_buffers.cmake | 26 +++++++ ...k.xilinx.transpose_pq_accl_buffers.ddr.ini | 76 +++++++++++++++++++ cmake/accl.cmake | 8 +- cmake/kernelTargets.cmake | 6 +- 4 files changed, 112 insertions(+), 4 deletions(-) create mode 100644 PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake new file mode 100644 index 00000000..527f7612 --- /dev/null +++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake @@ -0,0 +1,26 @@ +# This file contains the default 
configuration for the Xilinx Alveo U280 board
+# for use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes)
+set(USE_XRT_HOST Yes)
+set(USE_OCL_HOST No)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE)
+
+# PTRANS specific options
+# Default matrix and block sizes for the transpose kernels
+set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE)
+set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE)
+set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini
new file mode 100644
index 00000000..4809e31c
--- /dev/null
+++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini
@@ -0,0 +1,76 @@
+# /*******************************************************************************
+# Copyright (C) 2021 Xilinx, Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_sum:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+nk=transpose0:2:transpose0.transpose1
+
+# Kernel floorplanning
+slr=compression_0_0:SLR0
+slr=compression_0_1:SLR0
+slr=compression_0_2:SLR0
+slr=lb_user_krnl:SLR0
+slr=arith_0:SLR0
+slr=ccl_offload_0:SLR0
+slr=hostctrl_0:SLR0
+slr=networklayer_0:SLR0
+slr=cmac_0:SLR0
+slr=transpose0:SLR1
+slr=transpose1:SLR2
+
+sp=ccl_offload_0.m_axi_0:DDR[0:1]
+sp=ccl_offload_0.m_axi_1:DDR[0:1]
+sp=transpose0.m_axi_gmem:DDR[0]
+sp=transpose1.m_axi_gmem:DDR[1]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
+
diff --git a/cmake/accl.cmake b/cmake/accl.cmake
index 6e7ccb38..b8f74167 100644
--- a/cmake/accl.cmake
+++ b/cmake/accl.cmake
@@ -4,13 +4,17 @@ set(ACCL_UDP_ETH_IF 0 CACHE STRING "Ethernet interface used.
On ETHZ: 0 = switch set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform") set(ACCL_CCLO_KERNEL_DIR ${extern_accl_SOURCE_DIR}/kernels/cclo/) -set(ACCL_CCLO_KERNEL_XO cclo_offload.xo) +set(ACCL_CCLO_KERNEL_XO ccl_offload.xo) set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware) set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/) +set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core) set(ACCL_UDP_MAC_XO ${ACCL_VNX_DIR}/Ethernet/_x.${FPGA_BOARD_NAME}/cmac_${ACCL_UDP_ETH_IF}.xo) set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo) +set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HMB) +list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_VNX_DIR}/Ethernet/post_sys_link.tcl) +list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_HLS_IP_FOLDER}) add_custom_command( OUTPUT ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} COMMAND make STACK_TYPE=${ACCL_STACK_TYPE} PLATFORM=${FPGA_BOARD_NAME} @@ -34,7 +38,7 @@ set(ACCL_PLUGINS_COMPRESSION ${ACCL_PLUGINS_DIR}/hp_compression/hp_compression.x set(ACCL_PLUGINS_LOOPBACK ${ACCL_PLUGINS_DIR}/loopback/loopback.xo) set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} - ${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO} PARENT_SCOPE) + ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL") add_custom_target( accl_udp_stack diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 35c128a9..fc84248c 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -10,7 +10,7 @@ else() endif() if (USE_ACCL) - include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake) + include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake) endif() ## @@ -54,7 +54,9 @@ function(generate_kernel_targets_xilinx) set(xilinx_report_folder "${EXECUTABLE_OUTPUT_PATH}/xilinx_reports") set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA) list(APPEND local_CLFLAGS --report_dir=${xilinx_report_folder} --log_dir=${xilinx_report_folder}/logs) - + if (is_accl_kernel) + list(APPEND local_CLFLAGS ${ACCL_LINK_CONFIG}) + endif() string(REGEX MATCH "^.+\.tcl" is_tcl_script ${XILINX_COMPILE_SETTINGS_FILE}) if (is_tcl_script) set(CLFLAGS --hls.pre_tcl ${XILINX_COMPILE_SETTINGS_FILE}) From 979d275b6aa9f12c9295da0c5a2cfb201cab08bc Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 26 Apr 2022 16:54:22 +0100 Subject: [PATCH 034/318] Update configurations for first synthesis --- .../Xilinx_U280_DDR_ACCL_buffers.cmake | 7 +- .../Xilinx_U280_DDR_ACCL_buffers_ddr.cmake | 25 +++++++ .../Xilinx_U280_DDR_ACCL_buffers_hbm.cmake | 25 +++++++ .../settings.compile.xilinx.accl_buffers.ini | 0 .../settings.link.xilinx.accl_buffers.ddr.ini | 71 +++++++++++++++++++ .../settings.link.xilinx.accl_buffers.hbm.ini | 71 +++++++++++++++++++ b_eff/src/device/CMakeLists.txt | 24 ++++--- b_eff/src/device/communication_ACCL.cl | 27 +++++++ 8 files changed, 238 insertions(+), 12 deletions(-) create mode 100644 b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_ddr.cmake create mode 100644 b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_hbm.cmake create mode 100644 b_eff/settings/settings.compile.xilinx.accl_buffers.ini create mode 100644 b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini create mode 100644 b_eff/settings/settings.link.xilinx.accl_buffers.hbm.ini create mode 100644 
b_eff/src/device/communication_ACCL.cl
diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake
index 527f7612..21c8ec77 100644
--- a/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake
+++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake
@@ -9,12 +9,13 @@
 set(USE_MPI Yes CACHE BOOL "" FORCE)
 set(USE_SVM No CACHE BOOL "" FORCE)
 set(USE_HBM No CACHE BOOL "" FORCE)
-set(USE_ACCL Yes)
-set(USE_XRT_HOST Yes)
-set(USE_OCL_HOST No)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
 set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
 set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini CACHE FILEPATH "" FORCE)
 set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
 
 # PTRANS specific options
 # Default matrix and block sizes for the transpose kernels
diff --git a/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_ddr.cmake b/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_ddr.cmake
new file mode 100644
index 00000000..523c8761
--- /dev/null
+++ b/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_ddr.cmake
@@ -0,0 +1,25 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.ddr.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# b_eff specific options
+# These values will not be considered by the ACCL communication kernels
+set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
diff --git a/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_hbm.cmake b/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_hbm.cmake
new file mode 100644
index 00000000..f097ebd9
--- /dev/null
+++ b/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_hbm.cmake
@@ -0,0 +1,25 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.hbm.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# b_eff specific options
+# These values will not be considered by the ACCL communication kernels
+set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
diff --git a/b_eff/settings/settings.compile.xilinx.accl_buffers.ini b/b_eff/settings/settings.compile.xilinx.accl_buffers.ini
new file mode 100644
index 00000000..e69de29b
diff --git a/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini b/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini
new file mode 100644
index 00000000..64c67abc
--- /dev/null
+++ b/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini
@@ -0,0 +1,71 @@
+# /*******************************************************************************
+# Copyright (C) 2021 Xilinx, Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_sum:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+
+# Kernel floorplanning
+slr=compression_0_0:SLR0
+slr=compression_0_1:SLR0
+slr=compression_0_2:SLR0
+slr=lb_user_krnl:SLR0
+slr=arith_0:SLR0
+slr=ccl_offload_0:SLR0
+slr=hostctrl_0:SLR0
+slr=networklayer_0:SLR0
+slr=cmac_0:SLR0
+
+sp=ccl_offload_0.m_axi_0:DDR[0:1]
+sp=ccl_offload_0.m_axi_1:DDR[0:1]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
+
diff --git a/b_eff/settings/settings.link.xilinx.accl_buffers.hbm.ini b/b_eff/settings/settings.link.xilinx.accl_buffers.hbm.ini
new file mode 100644
index 00000000..e6352198
--- /dev/null
+++ b/b_eff/settings/settings.link.xilinx.accl_buffers.hbm.ini
@@ -0,0 +1,71 @@
+# /*******************************************************************************
+# Copyright (C) 2021 Xilinx, Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 + +sp=ccl_offload_0.m_axi_0:HBM[0:5] +sp=ccl_offload_0.m_axi_1:HBM[0:5] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index 8316a884..e5939572 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -3,12 +3,18 @@ set(KERNEL_REPLICATION_ENABLED Yes CACHE INTERNAL "Enables kernel replication in set(NUM_REPLICATIONS 2) include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) -generate_kernel_targets_intel(communication_bw520n_IEC) -add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +if (INTELFPGAOPENCL_FOUND) + 
generate_kernel_targets_intel(communication_bw520n_IEC) + add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +endif() + +if (VITIS_FOUND) + generate_kernel_targets_xilinx(communication_ACCL) +endif() diff --git a/b_eff/src/device/communication_ACCL.cl b/b_eff/src/device/communication_ACCL.cl new file mode 100644 index 00000000..80c12a86 --- /dev/null +++ b/b_eff/src/device/communication_ACCL.cl @@ -0,0 +1,27 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +__kernel +void dummy(__global void *nothing) { + // Do nothing. 
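+    // Note (added for clarity; an assumption based on the surrounding build
+    // scripts, not an author comment): the placeholder gives v++ a kernel
+    // source to compile into an .xo for this target, while the functional
+    // kernels are linked in from the pre-built ACCL object files collected
+    // in cmake/accl.cmake.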
+ // Will be exluded during linking process and will not be in final bitstream +} \ No newline at end of file From aaa8583bbc7d16b728d1dabc553c585146a6d0c0 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 26 Apr 2022 16:54:42 +0100 Subject: [PATCH 035/318] Change Kernel ordering on SLRs --- ...k.xilinx.transpose_pq_accl_buffers.ddr.ini | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini index 4809e31c..1cb8cc27 100644 --- a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini @@ -26,17 +26,17 @@ nk=loopback:1:lb_user_krnl nk=transpose0:2:transpose0.transpose1 # Kernels Foorplaning -slr=compression_0_0:SLR0 -slr=compression_0_1:SLR0 -slr=compression_0_2:SLR0 -slr=lb_user_krnl:SLR0 -slr=arith_0:SLR0 -slr=ccl_offload_0:SLR0 -slr=hostctrl_0:SLR0 -slr=networklayer_0:SLR0 -slr=cmac_0:SLR0 -slr=transpose0:SLR1 -slr=transpose1:SLR2 +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 +slr=transpose0:SLR0 +slr=transpose1:SLR1 sp=ccl_offload_0.m_axi_0:DDR[0:1] sp=ccl_offload_0.m_axi_1:DDR[0:1] From 0e9dc1752835e05bbde33f018df8504b70e6ac44 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 26 Apr 2022 18:08:35 +0100 Subject: [PATCH 036/318] Fix placement of kernels in b_eff DDR --- .../settings.link.xilinx.accl_buffers.ddr.ini | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini b/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini index 64c67abc..2ee98436 100644 --- a/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini +++ b/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini @@ -25,15 +25,15 @@ nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 nk=loopback:1:lb_user_krnl # Kernels Foorplaning -slr=compression_0_0:SLR0 -slr=compression_0_1:SLR0 -slr=compression_0_2:SLR0 -slr=lb_user_krnl:SLR0 -slr=arith_0:SLR0 -slr=ccl_offload_0:SLR0 -slr=hostctrl_0:SLR0 -slr=networklayer_0:SLR0 -slr=cmac_0:SLR0 +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 sp=ccl_offload_0.m_axi_0:DDR[0:1] sp=ccl_offload_0.m_axi_1:DDR[0:1] From ddefbac38169e01d2d478809e96843b2c926be07 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 26 Apr 2022 18:09:17 +0100 Subject: [PATCH 037/318] Update ACCL cmake scripts for TCP --- cmake/accl.cmake | 98 +++++++++++++++++++++++++++++++-------- cmake/kernelTargets.cmake | 2 +- 2 files changed, 79 insertions(+), 21 deletions(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index b8f74167..ca7b0fc2 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -1,24 +1,21 @@ +# General definitions set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL") set(ACCL_UDP_ETH_IF 0 CACHE STRING "Ethernet interface used. 
On ETHZ: 0 = switch, 1 = direct") set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform") - -set(ACCL_CCLO_KERNEL_DIR ${extern_accl_SOURCE_DIR}/kernels/cclo/) -set(ACCL_CCLO_KERNEL_XO ccl_offload.xo) - set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware) + +# UDP related definitions set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/) set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core) set(ACCL_UDP_MAC_XO ${ACCL_VNX_DIR}/Ethernet/_x.${FPGA_BOARD_NAME}/cmac_${ACCL_UDP_ETH_IF}.xo) set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo) - set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HMB) -list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_VNX_DIR}/Ethernet/post_sys_link.tcl) -list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_HLS_IP_FOLDER}) -add_custom_command( - OUTPUT ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} - COMMAND make STACK_TYPE=${ACCL_STACK_TYPE} PLATFORM=${FPGA_BOARD_NAME} - WORKING_DIRECTORY ${ACCL_CCLO_KERNEL_DIR}) +if (ACCL_STACK_TYPE STREQUAL "UDP") + list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_VNX_DIR}/Ethernet/post_sys_link.tcl) + list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_HLS_IP_FOLDER}) + set(ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE}) +endif() add_custom_command( OUTPUT ${ACCL_UDP_MAC_XO} @@ -30,24 +27,67 @@ add_custom_command( COMMAND make -C ${ACCL_VNX_DIR}/NetLayers DEVICE=${FPGA_BOARD_NAME} all WORKING_DIRECTORY ${ACCL_HARDWARE_DIR}) +add_custom_target( + accl_udp_stack + DEPENDS ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO}) -set(ACCL_PLUGINS_DIR ${extern_accl_SOURCE_DIR}/kernels/plugins) -set(ACCL_PLUGINS_HOSTCTRL ${ACCL_PLUGINS_DIR}/hostctrl/hostctrl.xo) -set(ACCL_PLUGINS_SUM ${ACCL_PLUGINS_DIR}/reduce_sum/reduce_sum.xo) -set(ACCL_PLUGINS_COMPRESSION ${ACCL_PLUGINS_DIR}/hp_compression/hp_compression.xo) -set(ACCL_PLUGINS_LOOPBACK ${ACCL_PLUGINS_DIR}/loopback/loopback.xo) +# TCP related definitions +set(ACCL_TCP_BASE_DIR ${extern_accl_SOURCE_DIR}/Vitis_with_100Gbps_TCP-IP) +set(ACCL_TCP_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/network_krnl.xo) +set(ACCL_TCP_CMAC_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/cmac_krnl.xo) +if (ACCL_STACK_TYPE STREQUAL "TCP") + list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_TCP_BASE_DIR}/scripts/post_sys_link.tcl) + list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo) + set(ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE} EN_FANIN=1) +endif() -set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} - ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL") +# TODO: This is very sppecific to the Xilinx build system, because +# different Vivado version is required to build these ips +add_custom_command( + OUTPUT ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo + COMMAND mkdir build && cd build && cmake .. 
-DFDEV_NAME=u280 + -DVIVADO_HLS_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 + -DVIVADO_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 + -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=1 -DTCP_STACK_WINDOW_SCALING_EN=0 + WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR}) + +add_custom_command( + OUTPUT ${ACCL_TCP_CMAC_XO} + COMMAND make cmac_krnl DEVICE=${FPGA_BOARD_NAME} XSA=${FPGA_BOARD_NAME} TEMP_DIR=_x.hw.${FPGA_BOARD_NAME}/ + WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR} + DEPENDS ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo) + +add_custom_command( + OUTPUT ${ACCL_TCP_XO} + COMMAND make network_krnl DEVICE=${FPGA_BOARD_NAME} XSA=${FPGA_BOARD_NAME} TEMP_DIR=_x.hw.${FPGA_BOARD_NAME}/ + WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR} + DEPENDS ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo) add_custom_target( - accl_udp_stack - DEPENDS ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO}) + accl_tcp_stack + DEPENDS ${ACCL_TCP_XO} ${ACCL_TCP_CMAC_XO}) + + +# Build CCLO +set(ACCL_CCLO_KERNEL_DIR ${extern_accl_SOURCE_DIR}/kernels/cclo/) +set(ACCL_CCLO_KERNEL_XO ccl_offload.xo) + +add_custom_command( + OUTPUT ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} + COMMAND make ${ACCL_CCLO_BUILD_ARGS} PLATFORM=${FPGA_BOARD_NAME} + WORKING_DIRECTORY ${ACCL_CCLO_KERNEL_DIR}) add_custom_target( accl_cclo DEPENDS ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO}) +# Build the ACCL Plugins +set(ACCL_PLUGINS_DIR ${extern_accl_SOURCE_DIR}/kernels/plugins) +set(ACCL_PLUGINS_HOSTCTRL ${ACCL_PLUGINS_DIR}/hostctrl/hostctrl.xo) +set(ACCL_PLUGINS_SUM ${ACCL_PLUGINS_DIR}/reduce_sum/reduce_sum.xo) +set(ACCL_PLUGINS_COMPRESSION ${ACCL_PLUGINS_DIR}/hp_compression/hp_compression.xo) +set(ACCL_PLUGINS_LOOPBACK ${ACCL_PLUGINS_DIR}/loopback/loopback.xo) + add_custom_command( OUTPUT ${ACCL_PLUGINS_HOSTCTRL} COMMAND vitis_hls build_hostctrl.tcl -tclargs ip ${ACCL_DEVICE_NAME} @@ -70,7 +110,25 @@ add_custom_target( DEPENDS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} ${ACCL_PLUGINS_COMPRESSION}) +set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} + ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL with UDP") + +set(ACCL_TCP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} + ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_TCP_CMAC_XO} ${ACCL_TCP_XO} CACHE INTERNAL "Object files required for ACCL with TCP") + +if (ACCL_STACK_TYPE STREQUAL "UDP") + set(ACCL_XOS ${ACCL_UDP_XOS} CACHE INTERNAL "Object files required for ACCL") +else() + set(ACCL_XOS ${ACCL_TCP_XOS} CACHE INTERNAL "Object files required for ACCL") +endif() + add_custom_target( accl_udp) add_dependencies(accl_udp accl_udp_stack accl_cclo accl_plugins) +add_custom_target( + accl_tcp) +add_dependencies(accl_tcp accl_tcp_stack accl_cclo accl_plugins) + + + diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index fc84248c..7f5a4775 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -49,7 +49,7 @@ function(generate_kernel_targets_xilinx) set(xilinx_link_settings ${CMAKE_BINARY_DIR}/settings/settings.link.xilinx.${kernel_file_name}.ini) endif() if (USE_ACCL AND is_accl_kernel) - list(APPEND additional_xos ${ACCL_UDP_XOS}) + list(APPEND additional_xos ${ACCL_XOS}) endif() set(xilinx_report_folder "${EXECUTABLE_OUTPUT_PATH}/xilinx_reports") set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA) From 
4745d8f9ab55d9619deb506cbc77a6947a517bdd Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 26 Apr 2022 18:24:34 +0100
Subject: [PATCH 038/318] Add ACCL TCP configs for PTRANS

---
 .../Xilinx_U280_DDR_ACCL_TCP_buffers.cmake    | 28 ++++++
 ...linx.transpose_pq_accl_tcp_buffers.ddr.ini | 86 +++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100644 PTRANS/configs/Xilinx_U280_DDR_ACCL_TCP_buffers.cmake
 create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini

diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_TCP_buffers.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_TCP_buffers.cmake
new file mode 100644
index 00000000..e8e77751
--- /dev/null
+++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_TCP_buffers.cmake
@@ -0,0 +1,28 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+#     cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(ACCL_STACK_TYPE "TCP" CACHE STRING "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# PTRANS specific options
+set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE)
+set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE)
+set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini
new file mode 100644
index 00000000..a1492b0a
--- /dev/null
+++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini
@@ -0,0 +1,86 @@
+# /*******************************************************************************
+#  Copyright (C) 2021 Xilinx, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=network_krnl:1:network_krnl_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_krnl:1:cmac_krnl_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl +nk=transpose0:2:transpose0.transpose1 + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=lb_user_krnl:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=network_krnl_0:SLR1 +slr=cmac_krnl_0:SLR2 +slr=transpose0:SLR0 +slr=transpose1:SLR1 + +sp=network_krnl_0.m00_axi:DDR[0] +sp=network_krnl_0.m01_axi:DDR[0] +sp=ccl_offload_0.m_axi_0:DDR[0:1] +sp=ccl_offload_0.m_axi_1:DDR[0:1] +sp=transpose0.m_axi_gmem:DDR[0] +sp=transpose1.m_axi_gmem:DDR[1] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to TCP Network Kernel +stream_connect=network_krnl_0.m_axis_tcp_port_status:ccl_offload_0.s_axis_eth_port_status:512 +stream_connect=network_krnl_0.m_axis_tcp_open_status:ccl_offload_0.s_axis_eth_open_status:512 +stream_connect=network_krnl_0.m_axis_tcp_notification:ccl_offload_0.s_axis_eth_notification:512 +stream_connect=network_krnl_0.m_axis_tcp_rx_meta:ccl_offload_0.s_axis_eth_rx_meta:512 +stream_connect=network_krnl_0.m_axis_tcp_rx_data:ccl_offload_0.s_axis_eth_rx_data:512 +stream_connect=network_krnl_0.m_axis_tcp_tx_status:ccl_offload_0.s_axis_eth_tx_status:512 +stream_connect=ccl_offload_0.m_axis_eth_listen_port:network_krnl_0.s_axis_tcp_listen_port:512 +stream_connect=ccl_offload_0.m_axis_eth_open_connection:network_krnl_0.s_axis_tcp_open_connection:512 +stream_connect=ccl_offload_0.m_axis_eth_read_pkg:network_krnl_0.s_axis_tcp_read_pkg:512 +stream_connect=ccl_offload_0.m_axis_eth_tx_meta:network_krnl_0.s_axis_tcp_tx_meta:512 +stream_connect=ccl_offload_0.m_axis_eth_tx_data:network_krnl_0.s_axis_tcp_tx_data:512 + +# Connect Network Kernel to CMAC Kernel +stream_connect=cmac_krnl_0.axis_net_rx:network_krnl_0.axis_net_rx +stream_connect=network_krnl_0.axis_net_tx:cmac_krnl_0.axis_net_tx + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl From f089ec171ff2b522b8cb6629031ac9d17e154f95 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 27 Apr 2022 10:11:41 +0100 Subject: [PATCH 039/318] Create unified device target for ACCL --- cmake/accl.cmake | 10 +++++++--- cmake/kernelTargets.cmake | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index ca7b0fc2..d3989e47 100644 --- a/cmake/accl.cmake 
+++ b/cmake/accl.cmake @@ -32,7 +32,7 @@ add_custom_target( DEPENDS ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO}) # TCP related definitions -set(ACCL_TCP_BASE_DIR ${extern_accl_SOURCE_DIR}/Vitis_with_100Gbps_TCP-IP) +set(ACCL_TCP_BASE_DIR ${ACCL_HARDWARE_DIR}/Vitis_with_100Gbps_TCP-IP) set(ACCL_TCP_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/network_krnl.xo) set(ACCL_TCP_CMAC_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/cmac_krnl.xo) if (ACCL_STACK_TYPE STREQUAL "TCP") @@ -130,5 +130,9 @@ add_custom_target( accl_tcp) add_dependencies(accl_tcp accl_tcp_stack accl_cclo accl_plugins) - - +add_custom_target(accl_device) +if (ACCL_STACK_TYPE STREQUAL "UDP") + add_dependencies(accl_device accl_udp) +else() + add_dependencies(accl_device accl_tcp) +endif() diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 7f5a4775..22680a6c 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -123,8 +123,8 @@ function(generate_kernel_targets_xilinx) DEPENDS ${bitstream_compile} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) - if(USE_ACCL AND is_accl_kernel) - add_dependencies(${kernel_file_name}_xilinx accl_udp) + if(USE_ACCL AND is_accl_kernel) + add_dependencies(${kernel_file_name}_xilinx accl_device) endif() list(APPEND kernel_emulation_targets_xilinx ${kernel_file_name}_emulate_xilinx) set(kernel_emulation_targets_xilinx ${kernel_emulation_targets_xilinx} CACHE INTERNAL "Kernel emulation targets used to define dependencies for the tests for Xilinx devices") From 4da920372e28bf65fb85dd675321d7cd411ef122 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 27 Apr 2022 14:52:45 +0100 Subject: [PATCH 040/318] Also call make installip --- cmake/accl.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index d3989e47..88bc2b64 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -48,7 +48,8 @@ add_custom_command( COMMAND mkdir build && cd build && cmake .. 
-DFDEV_NAME=u280 -DVIVADO_HLS_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 -DVIVADO_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 - -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=1 -DTCP_STACK_WINDOW_SCALING_EN=0 + -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=1 -DTCP_STACK_WINDOW_SCALING_EN=0 && + make installip WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR}) add_custom_command( From b45bd26cd33a0bba0d8513e835ddfcb2a0444670 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 27 Apr 2022 14:53:09 +0100 Subject: [PATCH 041/318] Fix b_eff host code build scripts --- b_eff/src/host/CMakeLists.txt | 2 ++ b_eff/src/host/network_benchmark.hpp | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index b8c44859..d0be57ba 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -14,6 +14,7 @@ if (INTELFPGAOPENCL_FOUND) target_link_libraries(${LIB_NAME}_intel hpcc_fpga_base) target_link_libraries(${HOST_EXE_NAME}_intel ${LIB_NAME}_intel) target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA -D_USE_MPI_) + target_compile_definitions(${HOST_EXE_NAME}_intel PRIVATE -DINTEL_FPGA) target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_intel_host_executable COMMAND $ -h) endif() @@ -28,6 +29,7 @@ if (Vitis_FOUND) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${ACCL_INCLUDE_PATH}) target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_xilinx_host_executable COMMAND $ -h) endif() diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index efffe1bf..8e9e2fc1 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -31,8 +31,6 @@ SOFTWARE. #include "hpcc_benchmark.hpp" #include "parameters.h" -//TODO: remove this custom allocator since cl2.hpp is available here? 
-#if 0 #ifdef XILINX_FPGA template struct aligned_allocator { @@ -59,7 +57,6 @@ namespace cl { template using vector = std::vector>; } #endif -#endif /** * @brief Contains all classes and methods needed by the Network benchmark From 64ca17851299b7a0d782a13a8d8f0b7773bcec3e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 4 May 2022 11:45:37 +0100 Subject: [PATCH 042/318] Reformatting and fix data read back --- .../execution_types/execution_xrt_pcie_pq.hpp | 130 ++++++++++-------- 1 file changed, 72 insertions(+), 58 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index 85481b6f..d59ba2e0 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -37,32 +37,37 @@ namespace fpga_execution { namespace pcie_pq { /** - * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and - * PCIe+MPI over the host for communication + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ + * distribution and PCIe+MPI over the host for communication * * @param config The progrma configuration - * @param data data object that contains all required data for the execution on the FPGA - * @param handler data handler instance that should be used to exchange data between hosts - * @return std::unique_ptr The measured execution times + * @param data data object that contains all required data for the execution on + * the FPGA + * @param handler data handler instance that should be used to exchange data + * between hosts + * @return std::unique_ptr The measured + * execution times */ -static std::unique_ptr -calculate(const hpcc_base::ExecutionSettings &config, +static std::unique_ptr calculate( + const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, - transpose::data_handler::DistributedPQTransposeDataHandler - &handler) { + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, bool, xrt::uuid> &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { - throw std::runtime_error("Used data handler not supported by execution handler!"); + throw std::runtime_error( + "Used data handler not supported by execution handler!"); } #ifdef USE_SVM - throw new std::runtime_error( - "SVM not supported in the host implementation of this communication method"); + throw new std::runtime_error("SVM not supported in the host implementation " + "of this communication method"); #endif #ifdef USE_BUFFER_WRITE_RECT_FOR_A - throw new std::runtime_error("Using the Write Rect method is not supported in this host " + throw new std::runtime_error( + "Using the Write Rect method is not supported in this host " "implementation of this communication method"); #endif @@ -77,56 +82,59 @@ calculate(const hpcc_base::ExecutionSettingskernelReplications; r++) { - // Calculate how many blocks the current kernel replication will need to process. + // Calculate how many blocks the current kernel replication will need to + // process. 
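+    // Worked example of the distribution below (illustration, not from the
+    // original source): with 7 blocks and 2 kernel replications the base
+    // share is 7 / 2 = 3 with remainder 7 % 2 = 1, so replication r = 0
+    // processes 3 + 1 = 4 blocks and replication r = 1 processes 3 blocks.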
size_t blocks_per_replication = - (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications); - size_t blocks_remainder = - (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications; + (local_matrix_height * local_matrix_width / + config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % + config.programSettings->kernelReplications; if (blocks_remainder > r) { - // Catch the case, that the number of blocks is not divisible by the number of kernel - // replications + // Catch the case, that the number of blocks is not divisible by the + // number of kernel replications blocks_per_replication += 1; } if (blocks_per_replication < 1) { continue; } blocksPerReplication.push_back(blocks_per_replication); - size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width * - local_matrix_width * data.blockSize * data.blockSize; + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / + local_matrix_width * local_matrix_width * + data.blockSize * data.blockSize; bufferSizeList.push_back(buffer_size); bufferStartList.push_back(total_offset); bufferOffsetList.push_back(row_offset); row_offset = (row_offset + blocks_per_replication) % local_matrix_width; - total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width * - local_matrix_width; - - int memory_bank_info_a = 0; - int memory_bank_info_b = 0; - int memory_bank_info_out = 0; + total_offset += (bufferOffsetList.back() + blocks_per_replication) / + local_matrix_width * local_matrix_width; // create the kernels xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); xrt::bo bufferA(*config.device, data.A, - data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), + data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); - xrt::bo bufferB(*config.device,&data.B[bufferStartList[r] * data.blockSize * data.blockSize], + xrt::bo bufferB( + *config.device, + &data.B[bufferStartList[r] * data.blockSize * data.blockSize], buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); - // TODO For small matrices, the 4KB alignment might fail for buffer B. Temporary fix seen in - // lines below (requires extra copying) - // xrt::bo bufferB(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), - // transposeKernel.group_id(1)); bufferB.write(data.B + bufferStartList[r] * data.blockSize * - // data.blockSize); + // TODO For small matrices, the 4KB alignment might fail for buffer B. 
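+    // Background (added note; an assumption based on general XRT behavior):
+    // wrapping an existing host pointer in an xrt::bo requires 4096-byte
+    // alignment, and the offset &data.B[bufferStartList[r] * data.blockSize *
+    // data.blockSize] only stays aligned when the preceding replication
+    // buffers are multiples of 4 KiB, which small matrices do not guarantee.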
+ // Temporary fix seen in lines below (requires extra copying) xrt::bo + // bufferB(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + // transposeKernel.group_id(1)); bufferB.write(data.B + bufferStartList[r] * + // data.blockSize * data.blockSize); xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); @@ -150,7 +158,8 @@ calculate(const hpcc_base::ExecutionSettings transferTime = - std::chrono::duration_cast>(endTransfer - startTransfer); + std::chrono::duration_cast>( + endTransfer - startTransfer); MPI_Barrier(MPI_COMM_WORLD); @@ -163,7 +172,8 @@ calculate(const hpcc_base::ExecutionSettings>(endCalculation - - startKernelCalculation) + << std::chrono::duration_cast>( + endCalculation - startKernelCalculation) .count() << "s (" - << ((config.programSettings->matrixSize * config.programSettings->matrixSize * - sizeof(HOST_DATA_TYPE) * 3) / - std::chrono::duration_cast>(endCalculation - - startKernelCalculation) + << ((config.programSettings->matrixSize * + config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * + 3) / + std::chrono::duration_cast>( + endCalculation - startKernelCalculation) .count() * 1.0e-9) << " GB/s)" << std::endl; @@ -205,12 +216,13 @@ calculate(const hpcc_base::ExecutionSettings calculationTime = - std::chrono::duration_cast>(endCalculation - - startCalculation); + std::chrono::duration_cast>( + endCalculation - startCalculation); calculationTimings.push_back(calculationTime.count()); - std::vector tmp_write_buffer(local_matrix_height * local_matrix_width * - data.blockSize * data.blockSize); + std::vector tmp_write_buffer( + local_matrix_height * local_matrix_width * data.blockSize * + data.blockSize); startTransfer = std::chrono::high_resolution_clock::now(); @@ -224,29 +236,31 @@ calculate(const hpcc_base::ExecutionSettings>(endTransfer - startTransfer); + transferTime += std::chrono::duration_cast>( + endTransfer - startTransfer); transferTimings.push_back(transferTime.count()); } std::unique_ptr result( - new transpose::TransposeExecutionTimings{transferTimings, calculationTimings}); + new transpose::TransposeExecutionTimings{transferTimings, + calculationTimings}); return result; } From e07261ee2eca88970fe9f7bb52fd9778358a6875 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 5 May 2022 10:12:31 +0100 Subject: [PATCH 043/318] Fix offset calculation of MPI transpose --- PTRANS/src/host/data_handlers/pq.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp index afa11575..87e7d15f 100644 --- a/PTRANS/src/host/data_handlers/pq.hpp +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -223,7 +223,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler(next_chunk) * static_cast(data.blockSize * data.blockSize); + offset += static_cast(next_chunk); } // Exchange window pointers From cc928b203863b2973b9ea94e1667fdcd20ea427d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 5 May 2022 10:12:54 +0100 Subject: [PATCH 044/318] Add number of wrong entries to output --- PTRANS/src/host/transpose_benchmark.hpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index d1ab4340..392789c8 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -167,16 +167,23 @@ public hpcc_base::HpccFpgaBenchmarkdataHandler->reference_transpose(data); double max_error = 
0.0;
+        int error_count = 0;
         for (size_t i = 0; i < this->executionSettings->programSettings->blockSize *
                                    this->executionSettings->programSettings->blockSize *
                                    data.numBlocks;
              i++) {
             max_error = std::max(std::abs(data.A[i]), max_error);
+            if (std::abs(data.A[i]) - 100 * std::numeric_limits<HOST_DATA_TYPE>::epsilon() > 0.0) {
+                error_count++;
+            }
         }

         double global_max_error = 0;
+        int global_error_count = 0;
         MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+        MPI_Reduce(&error_count, &global_error_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

         if (this->mpi_comm_rank == 0) {
-            std::cout << "Maximum error: " << global_max_error << " < " << 100 * std::numeric_limits<HOST_DATA_TYPE>::epsilon() << std::endl;
-            std::cout << "Mach. Epsilon: " << std::numeric_limits<HOST_DATA_TYPE>::epsilon() << std::endl;
+            std::cout << "Erroneous entries: " << global_error_count << std::endl;
+            std::cout << "Maximum error: " << global_max_error << " < " << 100 * std::numeric_limits<HOST_DATA_TYPE>::epsilon() << std::endl;
+            std::cout << "Mach. Epsilon: " << std::numeric_limits<HOST_DATA_TYPE>::epsilon() << std::endl;
         }

         return static_cast<double>(global_max_error) < 100 * std::numeric_limits<HOST_DATA_TYPE>::epsilon();

From 97414a49f20a6e1d600bda35c01da7e4f7791ea1 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 5 May 2022 16:28:01 +0100
Subject: [PATCH 045/318] Add support for C++ kernel code

---
 cmake/kernelTargets.cmake | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake
index 22680a6c..84ce896f 100644
--- a/cmake/kernelTargets.cmake
+++ b/cmake/kernelTargets.cmake
@@ -13,6 +13,8 @@ if (USE_ACCL)
 include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake)
 endif()
 
+set(file_endings "cpp" "cl")
+
 ##
 # This function will create build targets for the kernels for emulation and synthesis for xilinx.
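 # Usage sketch (added for illustration; the call below appears in the
 # benchmark CMake files): generate_kernel_targets_xilinx(communication_ACCL)
 # picks up communication_ACCL.cpp when it exists and falls back to
 # communication_ACCL.cl, following the file_endings list defined above.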
## @@ -29,11 +31,19 @@ function(generate_kernel_targets_xilinx) if (is_accl_kernel AND NOT USE_ACCL) continue() endif() - set(base_file "${CMAKE_SOURCE_DIR}/${base_file_part}.cl") + set(file_exists No) + foreach (ending ${file_endings}) + set(search_file_name "${CMAKE_SOURCE_DIR}/${base_file_part}.${ending}") + if (NOT file_exists AND EXISTS ${search_file_name}) + set(file_exists Yes) + set(selected_file_ending ${ending}) + set(base_file "${search_file_name}") + endif() + endforeach() if (KERNEL_REPLICATION_ENABLED) - set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_replicated_xilinx.cl") + set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_replicated_xilinx.${selected_file_ending}") else() - set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_copied_xilinx.cl") + set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_copied_xilinx.${selected_file_ending}") endif() set(bitstream_compile xilinx_tmp_compile/${kernel_file_name}.xo) set(bitstream_compile_emulate xilinx_tmp_compile/${kernel_file_name}_emulate.xo) @@ -55,7 +65,7 @@ function(generate_kernel_targets_xilinx) set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA) list(APPEND local_CLFLAGS --report_dir=${xilinx_report_folder} --log_dir=${xilinx_report_folder}/logs) if (is_accl_kernel) - list(APPEND local_CLFLAGS ${ACCL_LINK_CONFIG}) + list(APPEND local_harware_only_flags ${ACCL_LINK_CONFIG}) endif() string(REGEX MATCH "^.+\.tcl" is_tcl_script ${XILINX_COMPILE_SETTINGS_FILE}) if (is_tcl_script) @@ -108,7 +118,7 @@ function(generate_kernel_targets_xilinx) DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} ) add_custom_command(OUTPUT ${bitstream_f} - COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_f} ${bitstream_compile} ${additional_xos} + COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} ${local_harware_only_flags} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_f} ${bitstream_compile} ${additional_xos} MAIN_DEPENDENCY ${bitstream_compile} DEPENDS ${xilinx_link_settings} ) From fa9a2051f1588bc18c6a040186a1789daaab60ab Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 5 May 2022 16:29:03 +0100 Subject: [PATCH 046/318] Add C++ baseline for PTRANS --- .../src/device/transpose_PQ_ACCL_buffers.cl | 1 - .../src/device/transpose_PQ_ACCL_buffers.cpp | 1 + PTRANS/src/device/transpose_PQ_PCIE.cpp | 158 ++++++++++++++++++ 3 files changed, 159 insertions(+), 1 deletion(-) delete mode 120000 PTRANS/src/device/transpose_PQ_ACCL_buffers.cl create mode 120000 PTRANS/src/device/transpose_PQ_ACCL_buffers.cpp create mode 100644 PTRANS/src/device/transpose_PQ_PCIE.cpp diff --git a/PTRANS/src/device/transpose_PQ_ACCL_buffers.cl b/PTRANS/src/device/transpose_PQ_ACCL_buffers.cl deleted file mode 120000 index 64e94f20..00000000 --- a/PTRANS/src/device/transpose_PQ_ACCL_buffers.cl +++ /dev/null @@ -1 +0,0 @@ -transpose_PQ_PCIE.cl \ No newline at end of file diff --git a/PTRANS/src/device/transpose_PQ_ACCL_buffers.cpp b/PTRANS/src/device/transpose_PQ_ACCL_buffers.cpp new file mode 120000 index 00000000..58aeb801 --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_ACCL_buffers.cpp @@ -0,0 +1 @@ +transpose_PQ_PCIE.cpp \ No newline at end of file diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cpp b/PTRANS/src/device/transpose_PQ_PCIE.cpp new file mode 100644 index 
00000000..456c6919
--- /dev/null
+++ b/PTRANS/src/device/transpose_PQ_PCIE.cpp
@@ -0,0 +1,158 @@
+/******************************************************************************
+ *  Author: Arjun Ramaswami
+ *
+ *  Edited by Marius Meyer:
+ *  - Adapt to used kernel signature
+ *  - Change to row-column loop structure
+ *****************************************************************************/
+#include "parameters.h"
+
+const unsigned int block_size = BLOCK_SIZE;
+const unsigned int channel_width = CHANNEL_WIDTH;
+
+
+
+extern "C" {
+
+// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
+
+/**
+ * Read blocks of matrix A, transpose them in local memory, add the matching
+ * blocks of matrix B and write the result back to global memory.
+ *
+ * Will do the following:
+ *
+ * A_out = trans(A) + B
+ *
+ * @param A Buffer for matrix A
+ * @param B Buffer for matrix B
+ * @param A_out Buffer for result matrix
+ * @param offset_a Offset in blocks that is used to read the current block of A. Since A is read column-wise
+            on the block level, the whole matrix A might be written to global memory and the relevant columns
+            need to be picked using this offset.
+ * @param offset_b Offset in blocks that is used to read the current block of B and to write back the result
+ * @param number_of_blocks The number of blocks that will be processed starting from the block offset
+ * @param width_in_blocks The width of matrix A in blocks
+ * @param height_in_blocks The height of matrix A in blocks
+ */
+void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
+            const DEVICE_DATA_TYPE *B,
+            DEVICE_DATA_TYPE *A_out,
+            const unsigned int offset_a,
+            const unsigned int offset_b,
+            const unsigned int number_of_blocks,
+            const unsigned int width_in_blocks,
+            const unsigned int height_in_blocks) {
+
+    // local memory double buffer for a matrix block
+    DEVICE_DATA_TYPE a_block[block_size * block_size / channel_width][channel_width];
+#pragma HLS ARRAY_PARTITION variable = a_block complete dim = 2
+    // local memory double buffer for a matrix block
+    DEVICE_DATA_TYPE a_plus_b_block[block_size * block_size / channel_width][channel_width];
+#pragma HLS ARRAY_PARTITION variable = a_plus_b_block complete dim = 2
+
+    // transpose the matrix block-wise from global memory
+block_loop:
+    for (unsigned int block = 0; block < number_of_blocks; block++) {
+read_A:
+        for (unsigned int row = 0; row < block_size; row++) {
+read_A_line:
+            for (unsigned int col = 0; col < block_size / channel_width; col++) {
+                #pragma HLS unroll region
+                unsigned long block_row_a = (block + offset_a) / width_in_blocks;
+                unsigned long block_col_a = (block + offset_a) % width_in_blocks;
+                unsigned long ls_address_trans = block_col_a * block_size * block_size * height_in_blocks +
+                                                 block_row_a * block_size +
+                                                 row * block_size * height_in_blocks;
+
+                // read in block of A from global memory and store it in a memory efficient manner for transpose
+                DEVICE_DATA_TYPE rotate_in[channel_width];
+#pragma HLS ARRAY_PARTITION variable = rotate_in complete dim = 0
+
+                // Blocks of A will be stored columnwise in global memory
+                for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                    rotate_in[unroll_count] = A[ls_address_trans + col * channel_width + unroll_count];
+                }
+
+                unsigned int chunk = row * (block_size / channel_width) + col;
+
+                unsigned rot = (row) & (channel_width - 1);
+
+                // rotate temporary buffer to store data into local buffer
+                for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                    // every block of (N / channel_width), rotates the index by 1
+                    // store in double buffer
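+                    // Added explanation (our reading of the indexing math,
+                    // not an author comment): rotating each row by `rot`
+                    // stores the block diagonally across the channel_width
+                    // memory partitions, so the column-wise read in read_B
+                    // can fetch channel_width elements per cycle without
+                    // bank conflicts.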
a_block[chunk][unroll_count] = rotate_in[(unroll_count + channel_width - rot) + & (channel_width - 1)]; + } + } + } + + // Read transposed A from local memory and add B +read_B: + for (unsigned int row = 0; row < block_size; row++) { +read_B_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { +#pragma HLS unroll region + unsigned long block_row = (block + offset_b) / width_in_blocks; + unsigned long block_col = (block + offset_b) % width_in_blocks; + unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks + + block_col * block_size + + row * block_size * width_in_blocks; + unsigned int chunk = row * (block_size / channel_width) + col; + + DEVICE_DATA_TYPE data_chunk[channel_width]; +#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 + DEVICE_DATA_TYPE rotate_out[channel_width]; +#pragma HLS ARRAY_PARTITION variable = rotate_out complete dim = 0 + + unsigned int base = col * block_size; + unsigned int offset = row / channel_width; + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + unsigned rot = ((channel_width + unroll_count - row) * (block_size / channel_width)) & + (BLOCK_SIZE - 1); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = a_block[row_rotate][unroll_count]; + } + + unsigned rot_out = row & (channel_width - 1); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) & (channel_width - 1)]; + } + + // load tranposed A from global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] += B[ls_address_row + col * channel_width + unroll_count]; + } + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + a_plus_b_block[chunk][unroll_count] = data_chunk[unroll_count]; + } + } + } + // Write back result +write_result: + for (unsigned int row = 0; row < block_size; row++) { +write_result_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { +#pragma HLS unroll region + unsigned long block_row = (block + offset_b) / width_in_blocks; + unsigned long block_col = (block + offset_b) % width_in_blocks; + unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks + + block_col * block_size + + row * block_size * width_in_blocks; + unsigned int chunk = row * (block_size / channel_width) + col; + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + A_out[ls_address_row + col * channel_width + unroll_count] = a_plus_b_block[chunk][unroll_count]; + } + } + } + } +} + +// PY_CODE_GEN block_end + +} From ffc5648cdf17964548bed3db2c24c71f6881be14 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 5 May 2022 16:29:39 +0100 Subject: [PATCH 047/318] Add copying for other kernel replications --- .../execution_types/execution_xrt_accl_pq.hpp | 353 +++++++++++------- 1 file changed, 212 insertions(+), 141 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp index dab92c96..8400fb76 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -42,9 +42,10 @@ namespace accl_pq { void accl_exchangeData( ACCL::ACCL &accl, - transpose::data_handler::DistributedPQTransposeDataHandler - &handler, - 
transpose::TransposeData &data, xrt::bo &bufferAXrt, int global_width) { + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, bool, xrt::uuid> &handler, + transpose::TransposeData &data, std::vector &bufferAXrt, + int global_width) { int pq_width = handler.getP(); int pq_height = handler.getQ(); @@ -56,16 +57,21 @@ void accl_exchangeData( int pq_row = mpi_comm_rank / pq_width; int pq_col = mpi_comm_rank % pq_width; - auto AcclBufferA = accl.create_buffer( - bufferAXrt, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); + std::vector> acclBuffersA; + for (auto &bo : bufferAXrt) { + acclBuffersA.push_back(accl.create_buffer( + bo, data.blockSize * data.blockSize * data.numBlocks, + ACCL::dataType::float32)); + } + if (pq_width == pq_height) { if (pq_col != pq_row) { int pair_rank = pq_width * pq_col + pq_row; - // To re-calculate the matrix transposition locally on this host, we need to - // exchange matrix A for every kernel replication - // The order of the matrix blocks does not change during the exchange, because they are + // To re-calculate the matrix transposition locally on this host, we need + // to exchange matrix A for every kernel replication The order of the + // matrix blocks does not change during the exchange, because they are // distributed diagonally and will be handled in the order below: // // . . 1 3 @@ -73,34 +79,35 @@ void accl_exchangeData( // 1 . . . // 3 2 . . // auto AcclBufferA_recv = accl.create_buffer( - // data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); + // data.blockSize * data.blockSize * data.numBlocks, + // ACCL::dataType::float32); // AcclBufferA_recv->sync_to_device(); // Send and receive matrix A using ACCL directly on FPGA - accl.send(0, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, + accl.send(0, *acclBuffersA[0], + data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, true, ACCL::streamFlags::NO_STREAM); - accl.recv(0, *AcclBufferA, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, + accl.recv(0, *acclBuffersA[0], + data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0, true, ACCL::streamFlags::NO_STREAM); - // Copy received matrix from receiving buffer to A buffer completely on FPGA - // accl.copy(*AcclBufferA_recv, *AcclBufferA, data.blockSize * data.blockSize * - // data.numBlocks, - // true, true); } } else { - // Taken from "Parallel matrix transpose algorithms on distributed memory concurrent computers" - // by J. Choi, J. J. Dongarra, D. W. Walker and translated to C++ This will do a diagonal - // exchange of matrix blocks. + // Taken from "Parallel matrix transpose algorithms on distributed memory + // concurrent computers" by J. Choi, J. J. Dongarra, D. W. Walker and + // translated to C++ This will do a diagonal exchange of matrix blocks. // Determine LCM using GCD from standard library using the C++14 call - // In C++17 this changes to std::gcd in numeric, also std::lcm is directly available in numeric + // In C++17 this changes to std::gcd in numeric, also std::lcm is directly + // available in numeric int gcd = std::__gcd(pq_height, pq_width); int least_common_multiple = pq_height * pq_width / gcd; - // If the global matrix size is not a multiple of the LCM block size, the numbers of send and - // received blocks may be wrongly calculated. Throw exception to prevent this and make aware of - // this issue! 
+  // If the global matrix size is not a multiple of the LCM block size, the
+  // numbers of send and received blocks may be wrongly calculated. Throw
+  // exception to prevent this and make aware of this issue!
   if (global_width % least_common_multiple > 0) {
-    throw std::runtime_error("Implementation does not support matrix sizes that are not multiple "
-                             "of LCM blocks! Results may be wrong!");
+    throw std::runtime_error(
+        "Implementation does not support matrix sizes that are not multiple "
+        "of LCM blocks! Results may be wrong!");
   }

   // MPI requests for non-blocking communication
@@ -113,18 +120,19 @@ void accl_exchangeData(
   int q = transpose::data_handler::mod(pq_row - g, pq_height);

   // Pre-calculate target ranks in LCM block
-  // The vector list variable can be interpreted as 2D matrix. Every entry represents the target
-  // rank of the sub-block Since the LCM block will repeat, we only need to store this small
-  // amount of data!
-  std::vector<int> target_list(least_common_multiple / pq_height * least_common_multiple /
-                               pq_width);
+  // The vector list variable can be interpreted as 2D matrix. Every entry
+  // represents the target rank of the sub-block. Since the LCM block will
+  // repeat, we only need to store this small amount of data!
+  std::vector<int> target_list(least_common_multiple / pq_height *
+                               least_common_multiple / pq_width);
   for (int row = 0; row < least_common_multiple / pq_height; row++) {
     for (int col = 0; col < least_common_multiple / pq_width; col++) {
       int global_block_col = pq_col + col * pq_width;
       int global_block_row = pq_row + row * pq_height;
-      int destination_rank =
-          (global_block_col % pq_height) * pq_width + (global_block_row % pq_width);
-      target_list[row * least_common_multiple / pq_width + col] = destination_rank;
+      int destination_rank = (global_block_col % pq_height) * pq_width +
+                             (global_block_row % pq_width);
+      target_list[row * least_common_multiple / pq_width + col] =
+          destination_rank;
     }
   }

@@ -135,23 +143,28 @@ void accl_exchangeData(
   for (int i = 0; i < gcd; i++) {
     // TODO Is there a way to initialize buffer only in FPGA memory with ACCL?
     send_buffers.push_back(accl.create_buffer<float>(
-        data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32));
+        data.blockSize * data.blockSize * data.numBlocks,
+        ACCL::dataType::float32));
     recv_buffers.push_back(accl.create_buffer<float>(
-        data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32));
+        data.blockSize * data.blockSize * data.numBlocks,
+        ACCL::dataType::float32));
     send_buffers.back()->sync_to_device();
     recv_buffers.back()->sync_to_device();
   }
   int current_parallel_execution = 0;
   for (int j = 0; j < least_common_multiple / pq_width; j++) {
     for (int i = 0; i < least_common_multiple / pq_height; i++) {
-      // Determine sender and receiver rank of current rank for current communication step
-      int send_rank = transpose::data_handler::mod(p + i * gcd, pq_width) +
-                      transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width;
-      int recv_rank = transpose::data_handler::mod(p - i * gcd, pq_width) +
-                      transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width;
-
-      // Also count receiving buffer size because sending and receiving buffer size may differ in
-      // certain scenarios!
+      // Determine sender and receiver rank of current rank for current
+      // communication step
+      int send_rank =
+          transpose::data_handler::mod(p + i * gcd, pq_width) +
+          transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width;
+      int recv_rank =
+          transpose::data_handler::mod(p - i * gcd, pq_width) +
+          transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width;
+
+      // Also count receiving buffer size because sending and receiving buffer
+      // size may differ in certain scenarios!
       int receiving_size = 0;
       int sending_size = 0;

@@ -160,53 +173,69 @@ void accl_exchangeData(
       // Look up which blocks are affected by the current rank
       for (int row = 0; row < least_common_multiple / pq_height; row++) {
         for (int col = 0; col < least_common_multiple / pq_width; col++) {
-          if (target_list[row * least_common_multiple / pq_width + col] == send_rank) {
+          if (target_list[row * least_common_multiple / pq_width + col] ==
+              send_rank) {
             send_rows.push_back(row);
             send_cols.push_back(col);
             sending_size += data.blockSize * data.blockSize;
           }
-          if (target_list[row * least_common_multiple / pq_width + col] == recv_rank) {
+          if (target_list[row * least_common_multiple / pq_width + col] ==
+              recv_rank) {
             receiving_size += data.blockSize * data.blockSize;
           }
         }
       }
-      receiving_size *= (height_per_rank) / (least_common_multiple / pq_height) *
-                        ((width_per_rank) / (least_common_multiple / pq_width));
-      sending_size *= (height_per_rank) / (least_common_multiple / pq_height) *
+      receiving_size *=
+          (height_per_rank) / (least_common_multiple / pq_height) *
+          ((width_per_rank) / (least_common_multiple / pq_width));
+      sending_size *= (height_per_rank) /
+                      (least_common_multiple / pq_height) *
                       ((width_per_rank) / (least_common_multiple / pq_width));

 #ifndef NDEBUG
       std::cout << "Copy data to send buffers" << std::endl;
 #endif

-      // Copy the required date for this communication step to the send buffer!
+      // Copy the required data for this communication step to the send
+      // buffer!
       for (int t = 0; t < send_rows.size(); t++) {
-        for (int lcm_row = 0; lcm_row < (height_per_rank) / (least_common_multiple / pq_height);
+        for (int lcm_row = 0;
+             lcm_row <
+             (height_per_rank) / (least_common_multiple / pq_height);
              lcm_row++) {
-          for (int lcm_col = 0; lcm_col < (width_per_rank) / (least_common_multiple / pq_width);
+          for (int lcm_col = 0;
+               lcm_col <
+               (width_per_rank) / (least_common_multiple / pq_width);
                lcm_col++) {
             size_t sending_buffer_offset =
                 lcm_row * data.blockSize * data.blockSize *
                     ((width_per_rank) / (least_common_multiple / pq_width)) +
                 lcm_col * data.blockSize * data.blockSize;
             size_t matrix_buffer_offset =
-                (send_cols[t] + lcm_col * least_common_multiple / pq_width) * data.blockSize +
-                (send_rows[t] + lcm_row * least_common_multiple / pq_height) * width_per_rank *
-                    data.blockSize * data.blockSize;
+                (send_cols[t] + lcm_col * least_common_multiple / pq_width) *
+                    data.blockSize +
+                (send_rows[t] + lcm_row * least_common_multiple / pq_height) *
+                    width_per_rank * data.blockSize * data.blockSize;
             for (int block_row = 0; block_row < data.blockSize; block_row++) {
               // TODO May be more efficient when done async!
               std::cout << "A("
-                        << matrix_buffer_offset + block_row * width_per_rank * data.blockSize
+                        << matrix_buffer_offset +
+                               block_row * width_per_rank * data.blockSize
                         << ","
-                        << matrix_buffer_offset + block_row * width_per_rank * data.blockSize +
+                        << matrix_buffer_offset +
+                               block_row * width_per_rank * data.blockSize +
                                data.blockSize
                         << ") send(" << sending_buffer_offset << ","
-                        << sending_buffer_offset + data.blockSize << ")" << std::endl;
-              accl.copy(*AcclBufferA->slice(
-                            matrix_buffer_offset + block_row * width_per_rank * data.blockSize,
-                            matrix_buffer_offset + block_row * width_per_rank * data.blockSize +
+                        << sending_buffer_offset + data.blockSize << ")"
+                        << std::endl;
+              accl.copy(*acclBuffersA[0]->slice(
+                            matrix_buffer_offset +
+                                block_row * width_per_rank * data.blockSize,
+                            matrix_buffer_offset +
+                                block_row * width_per_rank * data.blockSize +
                                 data.blockSize),
                         *send_buffers[current_parallel_execution]->slice(
-                            sending_buffer_offset, sending_buffer_offset + data.blockSize),
+                            sending_buffer_offset,
+                            sending_buffer_offset + data.blockSize),
                         data.blockSize, true, true);
               std::cout << "Copy done!" << std::endl;
             }

@@ -218,16 +247,17 @@ void accl_exchangeData(
 #ifndef NDEBUG
       std::cout << "Rank " << mpi_comm_rank << ": blocks ("
                 << sending_size / (data.blockSize * data.blockSize) << ","
-                << receiving_size / (data.blockSize * data.blockSize) << ") send " << send_rank
-                << ", recv " << recv_rank << std::endl
+                << receiving_size / (data.blockSize * data.blockSize)
+                << ") send " << send_rank << ", recv " << recv_rank
+                << std::endl
                 << std::flush;
 #endif
-      accl_requests[current_parallel_execution] =
-          (accl.send(0, *send_buffers[current_parallel_execution], sending_size, send_rank, 0,
-                     true, ACCL::streamFlags::NO_STREAM, true));
-      accl_requests[current_parallel_execution + gcd] =
-          (accl.recv(0, *recv_buffers[current_parallel_execution], sending_size, send_rank, 0,
-                     true, ACCL::streamFlags::NO_STREAM, true));
+      accl_requests[current_parallel_execution] = (accl.send(
+          0, *send_buffers[current_parallel_execution], sending_size,
+          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, true));
+      accl_requests[current_parallel_execution + gcd] = (accl.recv(
+          0, *recv_buffers[current_parallel_execution], sending_size,
+          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, true));

       // Increase the counter for parallel executions
       current_parallel_execution = (current_parallel_execution + 1) % gcd;

@@ -249,10 +279,12 @@ void accl_exchangeData(
           std::vector<int> recv_rows;
           std::vector<int> recv_cols;
           // Look up which blocks are affected by the current rank
-          for (int row = 0; row < least_common_multiple / pq_height; row++) {
-            for (int col = 0; col < least_common_multiple / pq_width; col++) {
-              if (target_list[row * least_common_multiple / pq_width + col] ==
-                  status.MPI_SOURCE) {
+          for (int row = 0; row < least_common_multiple / pq_height;
+               row++) {
+            for (int col = 0; col < least_common_multiple / pq_width;
+                 col++) {
+              if (target_list[row * least_common_multiple / pq_width +
+                              col] == status.MPI_SOURCE) {
                 recv_rows.push_back(row);
                 recv_cols.push_back(col);
               }

@@ -261,26 +293,37 @@ void accl_exchangeData(
           // Copy received data to matrix A buffer
           for (int t = 0; t < recv_rows.size(); t++) {
             for (int lcm_row = 0;
-                 lcm_row < (height_per_rank) / (least_common_multiple / pq_height); lcm_row++) {
+                 lcm_row <
+                 (height_per_rank) / (least_common_multiple / pq_height);
+                 lcm_row++) {
               for (int lcm_col = 0;
-                   lcm_col < (width_per_rank) / (least_common_multiple / pq_width); lcm_col++) {
+                   lcm_col <
+                   (width_per_rank) / (least_common_multiple / pq_width);
+                   lcm_col++) {
                 size_t receiving_buffer_offset =
                     lcm_row * data.blockSize * data.blockSize *
-                        ((width_per_rank) / (least_common_multiple / pq_width)) +
+                        ((width_per_rank) /
+                         (least_common_multiple / pq_width)) +
                     lcm_col * data.blockSize * data.blockSize;
                 size_t matrix_buffer_offset =
-                    (recv_cols[t] + lcm_col * least_common_multiple / pq_width) *
+                    (recv_cols[t] +
+                     lcm_col * least_common_multiple / pq_width) *
                         data.blockSize +
-                    (recv_rows[t] + lcm_row * least_common_multiple / pq_height) *
+                    (recv_rows[t] +
+                     lcm_row * least_common_multiple / pq_height) *
                         width_per_rank * data.blockSize * data.blockSize;
-                for (int block_row = 0; block_row < data.blockSize; block_row++) {
+                for (int block_row = 0; block_row < data.blockSize;
+                     block_row++) {
                   // TODO May be more efficient when done async!
                   accl.copy(
                       *recv_buffers[current_parallel_execution]->slice(
-                          receiving_buffer_offset, receiving_buffer_offset + data.blockSize),
-                      *AcclBufferA->slice(
-                          matrix_buffer_offset + block_row * width_per_rank * data.blockSize,
-                          matrix_buffer_offset + block_row * width_per_rank * data.blockSize +
+                          receiving_buffer_offset,
+                          receiving_buffer_offset + data.blockSize),
+                      *acclBuffersA[0]->slice(
+                          matrix_buffer_offset +
+                              block_row * width_per_rank * data.blockSize,
+                          matrix_buffer_offset +
+                              block_row * width_per_rank * data.blockSize +
                               data.blockSize),
                       data.blockSize, true, true);
                 }

@@ -293,36 +336,47 @@ void accl_exchangeData(
         }
       }
     }
+  // Copy received matrix A to the buffers of other kernel replications that
+  // may be placed on different memory banks
+  for (int b = 1; b < acclBuffersA.size(); b++) {
+    accl.copy(*acclBuffersA[0], *acclBuffersA[b],
+              data.blockSize * data.blockSize * data.numBlocks, true, true);
+  }
 }

 /**
- * @brief Transpose and add the matrices using the OpenCL kernel using a PQ distribution and
- *        PCIe+MPI over the host for communication
+ * @brief Transpose and add the matrices using the OpenCL kernel using a PQ
+ * distribution and PCIe+MPI over the host for communication
  *
  * @param config The program configuration
- * @param data data object that contains all required data for the execution on the FPGA
- * @param handler data handler instance that should be used to exchange data between hosts
- * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
+ * @param data data object that contains all required data for the execution on
+ * the FPGA
+ * @param handler data handler instance that should be used to exchange data
+ * between hosts
+ * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured
+ * execution times
  */
-static std::unique_ptr<transpose::TransposeExecutionTimings>
-calculate(const hpcc_base::ExecutionSettings &config,
-          transpose::TransposeData &data,
-          transpose::data_handler::DistributedPQTransposeDataHandler
-              &handler) {
+static std::unique_ptr<transpose::TransposeExecutionTimings> calculate(
+    const hpcc_base::ExecutionSettings &config,
+    transpose::TransposeData &data,
+    transpose::data_handler::DistributedPQTransposeDataHandler<
+        xrt::device, bool, xrt::uuid> &handler) {
   int err;

   if (config.programSettings->dataHandlerIdentifier !=
       transpose::data_handler::DataHandlerType::pq) {
-    throw std::runtime_error("Used data handler not supported by execution handler!");
+    throw std::runtime_error(
+        "Used data handler not supported by execution handler!");
   }
 #ifdef USE_SVM
-  throw new std::runtime_error(
-      "SVM not supported in the host implementation of this communication method");
+  throw new std::runtime_error("SVM not supported in the host implementation "
+                               "of this communication method");
 #endif
 #ifdef USE_BUFFER_WRITE_RECT_FOR_A
-  throw new std::runtime_error("Using the Write Rect method is not supported in this host "
-                               "implementation of this communication method");
+  throw new std::runtime_error(
+      "Using the Write Rect method is not supported in this host "
+      "implementation of this communication method");
 #endif

   std::vector<size_t> bufferSizeList;

@@ -336,7 +390,8 @@ calculate(const hpcc_base::ExecutionSettings
   for (int r = 0; r < config.programSettings->kernelReplications; r++) {
-    // Calculate how many blocks the current kernel replication will need to process.
+    // Calculate how many blocks the current kernel replication will need to
+    // process.
     size_t blocks_per_replication =
-        (local_matrix_height * local_matrix_width / config.programSettings->kernelReplications);
-    size_t blocks_remainder =
-        (local_matrix_height * local_matrix_width) % config.programSettings->kernelReplications;
+        (local_matrix_height * local_matrix_width /
+         config.programSettings->kernelReplications);
+    size_t blocks_remainder = (local_matrix_height * local_matrix_width) %
+                              config.programSettings->kernelReplications;
     if (blocks_remainder > r) {
-      // Catch the case, that the number of blocks is not divisible by the number of kernel
-      // replications
+      // Catch the case, that the number of blocks is not divisible by the
+      // number of kernel replications
       blocks_per_replication += 1;
     }
     if (blocks_per_replication < 1) {
       continue;
     }
     blocksPerReplication.push_back(blocks_per_replication);
-    size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / local_matrix_width *
-                         local_matrix_width * data.blockSize * data.blockSize;
+    size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) /
+                         local_matrix_width * local_matrix_width *
+                         data.blockSize * data.blockSize;
     bufferSizeList.push_back(buffer_size);
     bufferStartList.push_back(total_offset);
     bufferOffsetList.push_back(row_offset);

     row_offset = (row_offset + blocks_per_replication) % local_matrix_width;

-    total_offset += (bufferOffsetList.back() + blocks_per_replication) / local_matrix_width *
-                    local_matrix_width;
+    total_offset += (bufferOffsetList.back() + blocks_per_replication) /
+                    local_matrix_width * local_matrix_width;

     // create the kernels
-    xrt::kernel transposeKernel(*config.device, *config.program,
-                                ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str());
+    xrt::kernel transposeKernel(
+        *config.device, *config.program,
+        ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str());

     xrt::bo bufferA(*config.device, data.A,
-                    data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE),
+                    data.numBlocks * data.blockSize * data.blockSize *
+                        sizeof(HOST_DATA_TYPE),
                     transposeKernel.group_id(0));
-    xrt::bo bufferB(*config.device, &data.B[bufferStartList[r] * data.blockSize * data.blockSize],
-                    buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1));
+    xrt::bo bufferB(
+        *config.device,
+        &data.B[bufferStartList[r] * data.blockSize * data.blockSize],
+        buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1));
     xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE),
                         transposeKernel.group_id(2));

@@ -392,7 +454,8 @@ calculate(const hpcc_base::ExecutionSettings
   std::vector<double> transferTimings;
   std::vector<double> calculationTimings;

-  for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) {
+  for (int repetition = 0; repetition < config.programSettings->numRepetitions;
+       repetition++) {

 #ifndef NDEBUG
     std::cout << "Start data transfer" << std::endl;

@@ -406,7 +469,8 @@ calculate(const hpcc_base::ExecutionSettings
     std::chrono::duration<double> transferTime =
-        std::chrono::duration_cast<std::chrono::duration<double>>(endTransfer - startTransfer);
+        std::chrono::duration_cast<std::chrono::duration<double>>(
+            endTransfer - startTransfer);

     MPI_Barrier(MPI_COMM_WORLD);

@@ -414,14 +478,15 @@ calculate(const hpcc_base::ExecutionSettings
     if (config.programSettings->kernelReplications > 1) {
-      std::cerr << "WARNING: Only the matrix A of the first kernel replication will be exchanged "
+      std::cerr << "WARNING: Only the matrix A of the first kernel replication "
+                   "will be exchanged "
                    "via ACCL!"
                 << std::endl;
     }
 #ifndef NDEBUG
     std::cout << "Start data exchange with ACCL" << std::endl;
 #endif
-    accl_exchangeData(*config.accl, handler, data, bufferListA[0],
+    accl_exchangeData(*config.accl, handler, data, bufferListA,
                       config.programSettings->matrixSize / data.blockSize);
 #ifndef NDEBUG
     std::cout << "End data exchange with ACCL" << std::endl;
 #endif

@@ -431,11 +496,13 @@ calculate(const hpcc_base::ExecutionSettings
-          static_cast(bufferOffsetList[r]), static_cast(bufferOffsetList[r]),
+          static_cast(bufferOffsetList[r]),
+          static_cast(bufferOffsetList[r]),
           static_cast(blocksPerReplication[r]),
           static_cast(handler.getWidthforRank()),
-          static_cast((bufferSizeList[r]) /
-                      (local_matrix_width * data.blockSize * data.blockSize))));
+          static_cast(
+              (bufferSizeList[r]) /
+              (local_matrix_width * data.blockSize * data.blockSize))));
     }
 #ifndef NDEBUG
     std::cout << "Wait for kernels to complete" << std::endl;
 #endif

@@ -450,26 +517,28 @@ calculate(const hpcc_base::ExecutionSettings
-              << std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation -
-                                                                           startKernelCalculation)
+              << std::chrono::duration_cast<std::chrono::duration<double>>(
+                     endCalculation - startKernelCalculation)
                      .count()
               << "s ("
-              << ((config.programSettings->matrixSize * config.programSettings->matrixSize *
-                   sizeof(HOST_DATA_TYPE) * 3) /
-                  std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation -
-                                                                            startKernelCalculation)
+              << ((config.programSettings->matrixSize *
+                   config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) *
+                   3) /
+                  std::chrono::duration_cast<std::chrono::duration<double>>(
+                      endCalculation - startKernelCalculation)
                       .count() *
                   1.0e-9)
               << " GB/s)" << std::endl;
 #endif

     std::chrono::duration<double> calculationTime =
-        std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation -
-                                                                  startCalculation);
+        std::chrono::duration_cast<std::chrono::duration<double>>(
+            endCalculation - startCalculation);
     calculationTimings.push_back(calculationTime.count());

-    std::vector<HOST_DATA_TYPE> tmp_write_buffer(local_matrix_height * local_matrix_width *
-                                                 data.blockSize * data.blockSize);
+    std::vector<HOST_DATA_TYPE> tmp_write_buffer(
+        local_matrix_height * local_matrix_width * data.blockSize *
+        data.blockSize);

     startTransfer = std::chrono::high_resolution_clock::now();

@@ -483,29 +552,31 @@ calculate(const hpcc_base::ExecutionSettings
-    transferTime += std::chrono::duration_cast<std::chrono::duration<double>>(endTransfer - startTransfer);
+    transferTime += std::chrono::duration_cast<std::chrono::duration<double>>(
+        endTransfer - startTransfer);
     transferTimings.push_back(transferTime.count());
   }

   std::unique_ptr<transpose::TransposeExecutionTimings> result(
-      new transpose::TransposeExecutionTimings{transferTimings, calculationTimings});
+      new transpose::TransposeExecutionTimings{transferTimings,
+                                               calculationTimings});
   return result;
 }
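For reference, the pairing scheme used by accl_exchangeData above can be summarized as follows: with gcd = gcd(P, Q) the send and receive partners of a rank at grid position (p, q) are staggered by multiples of gcd in each communication step, so up to gcd transfers can be in flight at once. A minimal standalone sketch of the index arithmetic (everything except the mod() wrapping, which mirrors transpose::data_handler::mod, is hypothetical glue code):

    // Sketch: enumerate communication partners of rank (p, q) in a P x Q grid.
    #include <cstdio>
    #include <numeric> // std::gcd (C++17)

    // wraps negative values into [0, n), like transpose::data_handler::mod
    static int mod(int x, int n) { return ((x % n) + n) % n; }

    void print_partners(int p, int q, int P, int Q) {
      int gcd = std::gcd(P, Q);
      int lcm = (P * Q) / gcd;
      for (int j = 0; j < lcm / P; j++) {
        for (int i = 0; i < lcm / Q; i++) {
          int send_rank = mod(p + i * gcd, P) + mod(q - j * gcd, Q) * P;
          int recv_rank = mod(p - i * gcd, P) + mod(q + j * gcd, Q) * P;
          std::printf("step (%d,%d): send to %d, recv from %d\n", j, i,
                      send_rank, recv_rank);
        }
      }
    }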
From cdc69a11f657f8d1863942c81afc0962e1d327f4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 5 May 2022 16:45:23 +0100
Subject: [PATCH 048/318] Add FORCE_FILE_ENDING flag for convenience

---
 cmake/kernelTargets.cmake | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake
index 84ce896f..66fecfb4 100644
--- a/cmake/kernelTargets.cmake
+++ b/cmake/kernelTargets.cmake
@@ -13,7 +13,7 @@ if (USE_ACCL)
     include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake)
 endif()
 
-set(file_endings "cpp" "cl")
+set(file_endings "cl" "cpp" )
 
 ##
 # This function will create build targets for the kernels for emulation and synthesis for xilinx.
@@ -32,6 +32,9 @@ function(generate_kernel_targets_xilinx)
             continue()
         endif()
         set(file_exists No)
+        if (DEFINED FORCE_FILE_ENDING)
+            set(file_endings ${FORCE_FILE_ENDING})
+        endif()
         foreach (ending ${file_endings})
            set(search_file_name "${CMAKE_SOURCE_DIR}/${base_file_part}.${ending}")
             if (NOT file_exists AND EXISTS ${search_file_name})
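The new flag restricts the kernel source lookup to a single file ending at configure time, for example with a hypothetical out-of-source build invocation such as `cmake -DFORCE_FILE_ENDING=cpp ..`. Without the flag, the endings in file_endings are tried in the listed order, so after the reordering above .cl sources still take precedence over .cpp.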
From cf8f7926614d1d6a70545605a837cc5ad9834d21 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 6 May 2022 13:35:55 +0100
Subject: [PATCH 049/318] Remove need to be power of 2 for PTRANS block size

---
 PTRANS/src/device/transpose_PQ_PCIE.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cpp b/PTRANS/src/device/transpose_PQ_PCIE.cpp
index 456c6919..521d8e1a 100644
--- a/PTRANS/src/device/transpose_PQ_PCIE.cpp
+++ b/PTRANS/src/device/transpose_PQ_PCIE.cpp
@@ -46,9 +46,11 @@ void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
     // local memory double buffer for a matrix block
     DEVICE_DATA_TYPE a_block[block_size * block_size / channel_width][channel_width];
 #pragma HLS ARRAY_PARTITION variable = a_block complete dim = 2
+#pragma HLS BIND_STORAGE variable = a_block type = RAM_1P impl = URAM
     // local memory double buffer for a matrix block
     DEVICE_DATA_TYPE a_plus_b_block[block_size * block_size / channel_width][channel_width];
 #pragma HLS ARRAY_PARTITION variable = a_plus_b_block complete dim = 2
+#pragma HLS BIND_STORAGE variable = a_plus_b_block type = RAM_1P impl = URAM
 
     // transpose the matrix block-wise from global memory
 block_loop:
@@ -75,14 +77,14 @@ void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
 
                 unsigned int chunk = row * (block_size / channel_width) + col;
 
-                unsigned rot = (row) & (channel_width - 1);
+                unsigned rot = (row) % (channel_width);
 
                 // rotate temporary buffer to store data into local buffer
                 for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
                     // every block of (N / channel_width), rotates the index by 1
                     // store in double buffer
                     a_block[chunk][unroll_count] = rotate_in[(unroll_count + channel_width - rot)
-                                                                & (channel_width - 1)];
+                                                                % (channel_width)];
                 }
             }
         }
@@ -109,17 +111,17 @@ void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
                 unsigned int offset = row / channel_width;
                 for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
-                    unsigned rot = ((channel_width + unroll_count - row) * (block_size / channel_width)) &
-                                    (BLOCK_SIZE - 1);
+                    unsigned rot = ((channel_width + unroll_count - row) * (block_size / channel_width)) %
+                                    (block_size);
                     unsigned row_rotate = base + offset + rot;
                     rotate_out[unroll_count] = a_block[row_rotate][unroll_count];
                 }
 
-                unsigned rot_out = row & (channel_width - 1);
+                unsigned rot_out = row % (channel_width);
 
                 // rotate temporary buffer to store data into local buffer
                 for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
-                    data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) & (channel_width - 1)];
+                    data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % (channel_width)];
                 }
 
                 // load transposed A from global memory
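Background for the change above: the bitmask form x & (n - 1) only equals x % n when n is a power of two, which is what previously restricted the block size and channel width. A quick sanity check, assuming a non-power-of-two width of 6:

    unsigned x = 7, n = 6;
    unsigned masked = x & (n - 1); // 7 & 5 = 5: wrong rotation index
    unsigned modulo = x % n;       // 7 % 6 = 1: correct rotation index

The modulo form is correct for any n, and when n is a compile-time power of two the HLS compiler can typically strength-reduce it back to the cheap masked form, so nothing should be lost for the previously supported configurations.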
From 4a55592a296e617c23d7f6f3db19cfb29987c050 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 6 May 2022 13:36:10 +0100
Subject: [PATCH 050/318] Add .cpp as possible ending for custom kernels

---
 cmake/customKernelTargets.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/customKernelTargets.cmake b/cmake/customKernelTargets.cmake
index 82ac811f..4657ba53 100644
--- a/cmake/customKernelTargets.cmake
+++ b/cmake/customKernelTargets.cmake
@@ -9,7 +9,7 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake)
 
 file(GLOB custom_kernel_files
     RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
-    "*.cl"
+    "*.cl" "*.cpp"
 )
 
 set(custom_kernel_targets "")

From c07a29df6c919d56161d320a51e498dd67f751e3 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 6 May 2022 16:08:11 +0100
Subject: [PATCH 051/318] Remove compile settings from linking call

---
 cmake/kernelTargets.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake
index 66fecfb4..4fa28b81 100644
--- a/cmake/kernelTargets.cmake
+++ b/cmake/kernelTargets.cmake
@@ -121,7 +121,7 @@ function(generate_kernel_targets_xilinx)
                 DEPENDS ${XILINX_COMPILE_SETTINGS_FILE}
                 )
         add_custom_command(OUTPUT ${bitstream_f}
-                COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} ${local_harware_only_flags} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_f} ${bitstream_compile} ${additional_xos}
+                COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} ${local_harware_only_flags} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} -o ${bitstream_f} ${additional_xos} ${bitstream_compile}
                 MAIN_DEPENDENCY ${bitstream_compile}
                 DEPENDS ${xilinx_link_settings}
                 )

From ddb88c796c8b1e4b5ea0d3f2eefcbddddf5a1dce Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 12 May 2022 11:23:24 +0100
Subject: [PATCH 052/318] use pipeline pragma instead of unroll region

---
 PTRANS/src/device/transpose_PQ_PCIE.cpp | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cpp b/PTRANS/src/device/transpose_PQ_PCIE.cpp
index 521d8e1a..be7e6828 100644
--- a/PTRANS/src/device/transpose_PQ_PCIE.cpp
+++ b/PTRANS/src/device/transpose_PQ_PCIE.cpp
@@ -43,23 +43,24 @@ void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
                 const unsigned int width_in_blocks,
                 const unsigned int height_in_blocks) {
 
-    // local memory double buffer for a matrix block
-    DEVICE_DATA_TYPE a_block[block_size * block_size / channel_width][channel_width];
-#pragma HLS ARRAY_PARTITION variable = a_block complete dim = 2
-#pragma HLS BIND_STORAGE variable = a_block type = RAM_1P impl = URAM
-    // local memory double buffer for a matrix block
-    DEVICE_DATA_TYPE a_plus_b_block[block_size * block_size / channel_width][channel_width];
-#pragma HLS ARRAY_PARTITION variable = a_plus_b_block complete dim = 2
-#pragma HLS BIND_STORAGE variable = a_plus_b_block type = RAM_1P impl = URAM
-
     // transpose the matrix block-wise from global memory
 block_loop:
     for (unsigned int block = 0; block < number_of_blocks; block++) {
+
+        // local memory double buffer for a matrix block
+        DEVICE_DATA_TYPE a_block[block_size * block_size / channel_width][channel_width];
+#pragma HLS ARRAY_PARTITION variable = a_block complete dim = 2
+// #pragma HLS BIND_STORAGE variable = a_block type = RAM_1P impl = URAM
+        // local memory double buffer for a matrix block
+        DEVICE_DATA_TYPE a_plus_b_block[block_size * block_size / channel_width][channel_width];
+#pragma HLS ARRAY_PARTITION variable = a_plus_b_block complete dim = 2
+// #pragma HLS BIND_STORAGE variable = a_plus_b_block type = RAM_1P impl = URAM
+
 read_A:
         for (unsigned int row = 0; row < block_size; row++) {
 read_A_line:
             for (unsigned int col = 0; col < block_size / channel_width; col++) {
-                #pragma HLS unroll region
+#pragma HLS PIPELINE
                 unsigned long block_row_a = (block + offset_a) / width_in_blocks;
                 unsigned long block_col_a = (block + offset_a) % width_in_blocks;
                 unsigned long ls_address_trans = block_col_a * block_size * block_size * height_in_blocks +
@@ -94,7 +95,7 @@ void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
         for (unsigned int row = 0; row < block_size; row++) {
 read_B_line:
             for (unsigned int col = 0; col < block_size / channel_width; col++) {
-#pragma HLS unroll region
+#pragma HLS PIPELINE
                 unsigned long block_row = (block + offset_b) / width_in_blocks;
                 unsigned long block_col = (block + offset_b) % width_in_blocks;
                 unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks +
@@ -139,7 +140,7 @@ void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
         for (unsigned int row = 0; row < block_size; row++) {
 write_result_line:
             for (unsigned int col = 0; col < block_size / channel_width; col++) {
-#pragma HLS unroll region
+#pragma HLS PIPELINE
                 unsigned long block_row = (block + offset_b) / width_in_blocks;
                 unsigned long block_col = (block + offset_b) % width_in_blocks;
                 unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks +
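A note on the pragma swap above: under Vitis HLS semantics, PIPELINE on the column loop requests one iteration per clock cycle (II=1 by default) and automatically unrolls any loop nested beneath it, which together with the fully partitioned second array dimension still yields channel_width parallel accesses per cycle. A minimal sketch of that pattern, not the benchmark kernel itself, with a hypothetical fixed geometry:

    #define CHANNEL_WIDTH 16
    // copies 64 chunks of CHANNEL_WIDTH values into a partitioned buffer
    void copy_block(const float *in, float out[64][CHANNEL_WIDTH]) {
    #pragma HLS ARRAY_PARTITION variable = out complete dim = 2
      for (int row = 0; row < 64; row++) {
    #pragma HLS PIPELINE II = 1
        for (int w = 0; w < CHANNEL_WIDTH; w++) {
          // inner loop is fully unrolled automatically under PIPELINE;
          // the partitioned dim 2 keeps the writes conflict-free
          out[row][w] = in[row * CHANNEL_WIDTH + w];
        }
      }
    }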
From 06ce1110ee821789052c591c943a6e9509daa17d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 12 May 2022 11:24:29 +0100
Subject: [PATCH 053/318] Only sync data if required for baseline version

---
 PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp
index d59ba2e0..b5788fed 100644
--- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp
@@ -162,9 +162,11 @@ static std::unique_ptr calculate(
             endTransfer - startTransfer);
 
     MPI_Barrier(MPI_COMM_WORLD);
-
+    int mpi_size;
+    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
     auto startCalculation = std::chrono::high_resolution_clock::now();
 
+    if (mpi_size > 1) {
     for (int r = 0; r < transposeKernelList.size(); r++) {
       bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE);
     }
@@ -177,6 +179,7 @@ static std::unique_ptr calculate(
     for (int r = 0; r < transposeKernelList.size(); r++) {
       bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE);
     }
+    }
 
     std::vector<xrt::run> runs;
     auto startKernelCalculation = std::chrono::high_resolution_clock::now();

From 5bff3fc311c950ccd5ab42d317f563c2ce549db9 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 19 May 2022 14:03:32 +0100
Subject: [PATCH 054/318] Add variable for CCLO build parameters

---
 cmake/accl.cmake | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cmake/accl.cmake b/cmake/accl.cmake
index 88bc2b64..8b9823d6 100644
--- a/cmake/accl.cmake
+++ b/cmake/accl.cmake
@@ -4,7 +4,8 @@ set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL")
 set(ACCL_UDP_ETH_IF 0 CACHE STRING "Ethernet interface used. On ETHZ: 0 = switch, 1 = direct")
 set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform")
 set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware)
-
+set(ACCL_CCLO_ADDITIONAL_BUILD_ARGS "" CACHE STRING "Add additional build arguments that will be passed to the CCLO makefile")
+set(ACCL_CCLO_BUILD_ARGS ${ACCL_CCLO_ADDITIONAL_BUILD_ARGS})
 # UDP related definitions
 set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/)
 set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core)
@@ -14,7 +15,7 @@ set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HMB)
 if (ACCL_STACK_TYPE STREQUAL "UDP")
     list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_VNX_DIR}/Ethernet/post_sys_link.tcl)
     list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_HLS_IP_FOLDER})
-    set(ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE})
+    list(APPEND ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE})
 endif()
 
 add_custom_command(
@@ -38,7 +39,7 @@ set(ACCL_TCP_CMAC_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/cmac_krnl.xo)
 if (ACCL_STACK_TYPE STREQUAL "TCP")
     list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_TCP_BASE_DIR}/scripts/post_sys_link.tcl)
     list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo)
-    set(ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE} EN_FANIN=1)
+    list(APPEND ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE} EN_FANIN=1)
 endif()
 
 # TODO: This is very specific to the Xilinx build system, because
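The new cache variable lets users forward extra arguments to the CCLO makefile without editing accl.cmake, for example via a hypothetical invocation like `cmake -DACCL_CCLO_ADDITIONAL_BUILD_ARGS="SOME_VAR=1" ..` (the makefile variable name is a placeholder, not a documented CCLO option). Because ACCL_CCLO_BUILD_ARGS is now extended with list(APPEND ...) instead of being overwritten by set(...), the stack-type arguments and the user-supplied ones no longer clobber each other.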
From 3c552f8150bd9c205a9fe3649149e357e37b1068 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 19 May 2022 14:03:48 +0100
Subject: [PATCH 055/318] Fix compile flag handling

---
 cmake/kernelTargets.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake
index 4fa28b81..b7a237a3 100644
--- a/cmake/kernelTargets.cmake
+++ b/cmake/kernelTargets.cmake
@@ -65,7 +65,7 @@ function(generate_kernel_targets_xilinx)
             list(APPEND additional_xos ${ACCL_XOS})
         endif()
         set(xilinx_report_folder "${EXECUTABLE_OUTPUT_PATH}/xilinx_reports")
-        set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA)
+        set(local_CLFLAGS -DXILINX_FPGA)
         list(APPEND local_CLFLAGS --report_dir=${xilinx_report_folder} --log_dir=${xilinx_report_folder}/logs)
         if (is_accl_kernel)
             list(APPEND local_harware_only_flags ${ACCL_LINK_CONFIG})
@@ -76,6 +76,7 @@ function(generate_kernel_targets_xilinx)
         else()
             set(CLFLAGS --config ${XILINX_COMPILE_SETTINGS_FILE})
         endif()
+        list(APPEND local_CLFLAGS ${CLFLAGS})
 
         # build emulation config for device
         add_custom_command(OUTPUT ${EXECUTABLE_OUTPUT_PATH}/emconfig.json

From 8eccea40bd5e2d8db7a6fc588f6cf9db8d07baf0 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 20 May 2022 10:14:12 +0100
Subject: [PATCH 056/318] Update ACCL calls to new dev signature

---
 PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
index 8400fb76..67a3a1e0 100644
--- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
@@ -254,10 +254,10 @@ void accl_exchangeData(
 #endif
       accl_requests[current_parallel_execution] = (accl.send(
           0, *send_buffers[current_parallel_execution], sending_size,
-          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, true));
+          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true));
       accl_requests[current_parallel_execution + gcd] = (accl.recv(
           0, *recv_buffers[current_parallel_execution], sending_size,
-          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, true));
+          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true));
 
       // Increase the counter for parallel executions
       current_parallel_execution = (current_parallel_execution + 1) % gcd;
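For readers following the ACCL API change: the dev branch inserts a compression data type argument in front of the trailing asynchronous flag, which is why both call sites gain ACCL::dataType::none. A hedged sketch of the two shapes, with parameter meanings guessed from the call sites above rather than taken from the authoritative ACCL headers:

    // old:  accl.send(comm, buffer, count, dst, tag, from_fpga,
    //                 stream_flags, run_async)
    // new:  accl.send(comm, buffer, count, dst, tag, from_fpga,
    //                 stream_flags, compress_dtype, run_async)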
From 36c89013864d665b7ec66c5206391a0373363d8b Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 18 May 2022 17:55:35 +0100
Subject: [PATCH 057/318] Work on matrix transpose host code

---
 PTRANS/CMakeLists.txt                          |  5 ++
 PTRANS/src/common/parameters.h.in              |  2 +
 .../execution_types/execution_xrt_accl_pq.hpp  | 52 ++++++++++++++-----
 shared/setup/fpga_setup_accl.cpp               |  4 +-
 4 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/PTRANS/CMakeLists.txt b/PTRANS/CMakeLists.txt
index 71e64026..ef4c4a47 100755
--- a/PTRANS/CMakeLists.txt
+++ b/PTRANS/CMakeLists.txt
@@ -18,6 +18,11 @@ set(HOST_EMULATION_REORDER No CACHE BOOL "Reorder the scheduling of FPGA kernels
 
 mark_as_advanced(READ_KERNEL_NAME WRITE_KERNEL_NAME USE_BUFFER_WRITE_RECT_FOR_A XILINX_UNROLL_INNER_LOOPS)
 
+if (USE_ACCL)
+    math(EXPR calculate_accl_buffer_size "${BLOCK_SIZE} * ${BLOCK_SIZE} * 8")
+    set(ACCL_BUFFER_SIZE ${calculate_accl_buffer_size} CACHE STRING "Size of ACCL buffers in bytes")
+endif()
+
 set(USE_MPI Yes)
 set(USE_OPENMP Yes)
 set(USE_DEPRECATED_HPP_HEADER No)

diff --git a/PTRANS/src/common/parameters.h.in b/PTRANS/src/common/parameters.h.in
index 68b50dd7..e42792ff 100644
--- a/PTRANS/src/common/parameters.h.in
+++ b/PTRANS/src/common/parameters.h.in
@@ -16,6 +16,8 @@
 #define NUM_REPLICATIONS @NUM_REPLICATIONS@
 #cmakedefine HOST_EMULATION_REORDER
 
+#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@
+
 /**
 * Kernel Parameters
 */

diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
index 67a3a1e0..8d3edac5 100644
--- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
@@ -62,6 +62,7 @@ void accl_exchangeData(
     acclBuffersA.push_back(accl.create_buffer<float>(
         bo, data.blockSize * data.blockSize * data.numBlocks,
         ACCL::dataType::float32));
+    acclBuffersA.back()->sync_from_device();
   }
 
   if (pq_width == pq_height) {
@@ -78,17 +79,38 @@ void accl_exchangeData(
     //  .  .  .  2
     //  1  .  .  .
     //  3  2  .  .
-
-    // auto AcclBufferA_recv = accl.create_buffer<float>(
-    //     data.blockSize * data.blockSize * data.numBlocks,
-    //     ACCL::dataType::float32);
-    // AcclBufferA_recv->sync_to_device();
+    auto acclBufferA_recv = accl.create_buffer<float>(
+        data.blockSize * data.blockSize * data.numBlocks,
+        ACCL::dataType::float32);
+    acclBufferA_recv->sync_to_device();
     // Send and receive matrix A using ACCL directly on FPGA
-    accl.send(0, *acclBuffersA[0],
-              data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0,
-              true, ACCL::streamFlags::NO_STREAM);
-    accl.recv(0, *acclBuffersA[0],
-              data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0,
-              true, ACCL::streamFlags::NO_STREAM);
+    if (mpi_comm_rank < pair_rank) {
+      for (int block_num = 0; block_num < data.numBlocks; block_num++) {
+        accl.send(0,
+                  *acclBuffersA[0]->slice(
+                      data.blockSize * data.blockSize * block_num,
+                      data.blockSize * data.blockSize * (block_num + 1)),
+                  data.blockSize * data.blockSize, pair_rank, 0, true,
+                  ACCL::streamFlags::NO_STREAM);
+      }
+      accl.recv(0, *acclBufferA_recv,
+                data.blockSize * data.blockSize * data.numBlocks, pair_rank,
+                1, true, ACCL::streamFlags::NO_STREAM);
+    } else {
+      accl.recv(0, *acclBufferA_recv,
+                data.blockSize * data.blockSize * data.numBlocks, pair_rank,
+                0, true, ACCL::streamFlags::NO_STREAM);
+      for (int block_num = 0; block_num < data.numBlocks; block_num++) {
+        accl.send(0,
+                  *acclBuffersA[0]->slice(
+                      data.blockSize * data.blockSize * block_num,
+                      data.blockSize * data.blockSize * (block_num + 1)),
+                  data.blockSize * data.blockSize, pair_rank, 1, true,
+                  ACCL::streamFlags::NO_STREAM);
+      }
+    }
+    accl.copy(*acclBufferA_recv, *acclBuffersA[0],
+              data.blockSize * data.blockSize * data.numBlocks, true, true);
   }
 } else {
   // Taken from "Parallel matrix transpose algorithms on distributed memory
@@ -254,10 +276,12 @@ void accl_exchangeData(
 #endif
       accl_requests[current_parallel_execution] = (accl.send(
           0, *send_buffers[current_parallel_execution], sending_size,
-          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true));
+          send_rank, 0, true, ACCL::streamFlags::NO_STREAM,
+          ACCL::dataType::none, true));
       accl_requests[current_parallel_execution + gcd] = (accl.recv(
           0, *recv_buffers[current_parallel_execution], sending_size,
-          send_rank, 0, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true));
+          send_rank, 0, true, ACCL::streamFlags::NO_STREAM,
+          ACCL::dataType::none, true));
 
       // Increase the counter for parallel executions
       current_parallel_execution = (current_parallel_execution + 1) % gcd;
@@ -569,6 +593,10 @@ static std::unique_ptr calculate(
       }
     }
     endTransfer = std::chrono::high_resolution_clock::now();
+
+    accl_exchangeData(*config.accl, handler, data, bufferListA,
+                      config.programSettings->matrixSize / data.blockSize);
+
     transferTime += std::chrono::duration_cast<std::chrono::duration<double>>(
         endTransfer - startTransfer);
     transferTimings.push_back(transferTime.count());

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index b4753430..5ce08a41 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -32,7 +32,7 @@ std::unique_ptr<ACCL::ACCL> fpgaSetupACCL(xrt::device &device, xrt::uuid &progra
   std::vector<ACCL::rank_t> ranks = {};
   for (int i = 0; i < current_size; ++i) {
     // TODO: Replace the ip addresses and ports here for execution of real hardware?
-    ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, 1024};
+    ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, ACCL_BUFFER_SIZE};
     ranks.emplace_back(new_rank);
   }
   if (!useAcclEmulation) {
@@ -45,7 +45,7 @@
   } else {
     // TODO: Add start port here. Currently hardcoded!
     return std::unique_ptr<ACCL::ACCL>(
-        new ACCL::ACCL(ranks, current_rank, 5500, device, ACCL::networkProtocol::TCP, 16, 1024));
+        new ACCL::ACCL(ranks, current_rank, 5500, device, ACCL::networkProtocol::TCP, 16, ACCL_BUFFER_SIZE));
   }
 }

From 32f6586569c4a5ee872f2862317a557216e632d4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 18 May 2022 17:55:51 +0100
Subject: [PATCH 058/318] Set ACCL buffer size with cmake

---
 cmake/accl.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/accl.cmake b/cmake/accl.cmake
index 8b9823d6..2875657d 100644
--- a/cmake/accl.cmake
+++ b/cmake/accl.cmake
@@ -3,6 +3,7 @@
 set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL")
 set(ACCL_UDP_ETH_IF 0 CACHE STRING "Ethernet interface used. On ETHZ: 0 = switch, 1 = direct")
 set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform")
+set(ACCL_BUFFER_SIZE 8192 CACHE STRING "Size of ACCL buffers in bytes")
 set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware)
 set(ACCL_CCLO_ADDITIONAL_BUILD_ARGS "" CACHE STRING "Add additional build arguments that will be passed to the CCLO makefile")
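A quick cross-check of the two buffer-size defaults introduced above: the PTRANS build derives its ACCL buffer size from the block size as BLOCK_SIZE * BLOCK_SIZE * 8 bytes, so a hypothetical BLOCK_SIZE of 512 gives 512 * 512 * 8 = 2097152 bytes (2 MiB) per buffer, while builds that do not override the variable fall back to the generic 8192-byte default in accl.cmake. The factor 8 presumably leaves headroom for 8-byte elements even though the buffers in the PTRANS host code are created as float32.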
From e59049bf71b0359b30d8dedcf8076671c34ced2c Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 20 Apr 2022 15:10:20 +0100
Subject: [PATCH 059/318] Add support for new base impl for LINPACK

---
 LINPACK/src/host/CMakeLists.txt               |   4 +-
 .../host/execution_types/execution_iec.hpp    |   7 +-
 .../host/execution_types/execution_pcie.hpp   |   7 +-
 LINPACK/src/host/linpack_benchmark.cpp        | 713 ------------------
 LINPACK/src/host/linpack_benchmark.hpp        | 689 +++++++++++------
 LINPACK/src/host/linpack_data.cpp             | 259 +++++++
 LINPACK/src/host/linpack_data.hpp             | 274 +++++++
 LINPACK/src/host/main.cpp                     |   2 +-
 8 files changed, 997 insertions(+), 958 deletions(-)
 delete mode 100644 LINPACK/src/host/linpack_benchmark.cpp
 create mode 100644 LINPACK/src/host/linpack_data.cpp
 create mode 100644 LINPACK/src/host/linpack_data.hpp

diff --git a/LINPACK/src/host/CMakeLists.txt b/LINPACK/src/host/CMakeLists.txt
index d8feb95d..5422f31f 100755
--- a/LINPACK/src/host/CMakeLists.txt
+++ b/LINPACK/src/host/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase)
 
-set(HOST_SOURCE linpack_benchmark.cpp gmres.c blas.c)
+set(HOST_SOURCE linpack_data.cpp gmres.c blas.c)
 
 set(HOST_EXE_NAME Linpack)
 set(LIB_NAME lp)
@@ -17,6 +17,7 @@ if (INTELFPGAOPENCL_FOUND)
         target_compile_definitions(${LIB_NAME}_intel PRIVATE -DCL_VERSION_2_0)
     endif()
     target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA)
+    target_compile_definitions(${HOST_EXE_NAME}_intel PRIVATE -DINTEL_FPGA)
     target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}")
     add_test(NAME test_intel_host_executable COMMAND $<TARGET_FILE:${HOST_EXE_NAME}_intel> -h)
 endif()
@@ -30,6 +31,7 @@ if (Vitis_FOUND)
     target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base)
     target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx)
     target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA)
+    target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA)
     target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")
     add_test(NAME test_xilinx_host_executable COMMAND $<TARGET_FILE:${HOST_EXE_NAME}_xilinx> -h)
 endif()

diff --git a/LINPACK/src/host/execution_types/execution_iec.hpp b/LINPACK/src/host/execution_types/execution_iec.hpp
index b98bcc31..3c232f41 100644
--- a/LINPACK/src/host/execution_types/execution_iec.hpp
+++ b/LINPACK/src/host/execution_types/execution_iec.hpp
@@ -35,7 +35,7 @@ SOFTWARE.
 #endif
 
 #include "parameters.h"
-#include "linpack_benchmark.hpp"
+#include "linpack_data.hpp"
 
 namespace linpack {
 namespace execution {
@@ -44,8 +44,9 @@ namespace iec {
 /*
 Prepare kernels and execute benchmark for a bitstream that makes use of intel external channels
 */
+template<class TDevice, class TContext, class TProgram>
 std::unique_ptr<linpack::LinpackExecutionTimings>
-calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&config,
+calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings, TDevice, TContext, TProgram>&config,
           linpack::LinpackData& data) {
 
     int err;
@@ -735,4 +736,4 @@ calculate(const hpcc_base::ExecutionSettings&co
 } // namespace execution
 } // namespace linpack
 
-#endif
\ No newline at end of file
+#endif

diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp
index 51b9c546..5462f025 100644
--- a/LINPACK/src/host/execution_types/execution_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_pcie.hpp
@@ -39,7 +39,7 @@ SOFTWARE.
 #endif
 
 #include "parameters.h"
-#include "linpack_benchmark.hpp"
+#include "linpack_data.hpp"
 
 namespace linpack {
 namespace execution {
@@ -50,8 +50,9 @@ namespace pcie {
 /*
 Implementation for the single kernel.
  @copydoc bm_execution::calculate()
*/
+template<class TDevice, class TContext, class TProgram>
 std::unique_ptr<linpack::LinpackExecutionTimings>
-calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&config,
+calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings, TDevice, TContext, TProgram>&config,
           linpack::LinpackData& data) {
 
     cl_int err;
@@ -729,4 +730,4 @@ calculate(const hpcc_base::ExecutionSettings&co
 } // namespace execution
 } // namespace linpack
 
-#endif
\ No newline at end of file
+#endif
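With the device, context, and program types lifted into template parameters, the same host-side solver path can be compiled against both the OpenCL and the XRT back ends; the concrete types are deduced from the ExecutionSettings argument, so call sites stay unchanged. A hedged usage sketch (the exact type lists are assumptions based on the signatures above, not taken from the final headers):

    // OpenCL back end deduces TDevice=cl::Device, TContext=cl::Context,
    // TProgram=cl::Program from the settings object:
    auto timings = linpack::execution::pcie::calculate(*executionSettings, data);
    // an XRT build would instead deduce e.g. TDevice=xrt::device and
    // TProgram=xrt::uuid from its own ExecutionSettings instantiation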
diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp
deleted file mode 100644
index d60be9d1..00000000
--- a/LINPACK/src/host/linpack_benchmark.cpp
+++ /dev/null
@@ -1,713 +0,0 @@
-//
-// Created by Marius Meyer on 04.12.19.
-//
-
-/*
-Copyright (c) 2019 Marius Meyer
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-*/
-
-#include "linpack_benchmark.hpp"
-
-/* C++ standard library headers */
-#include <memory>
-#include <random>
-
-/* Project's headers */
-#include "communication_types.hpp"
-#include "execution_types/execution_types.hpp"
-#include "parameters.h"
-
-linpack::LinpackProgramSettings::LinpackProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results),
-    matrixSize(results["m"].as<uint>() * (1 << (results["b"].as<uint>()))), blockSize(1 << (results["b"].as<uint>())),
-    isEmulationKernel(results.count("emulation") > 0), isDiagonallyDominant(results.count("uniform") == 0),
-    torus_width(results["p"].as<uint>()) {
-    int mpi_comm_rank;
-    int mpi_comm_size;
-    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size);
-    // calculate the row and column of the MPI rank in the torus
-    if (mpi_comm_size % torus_width != 0) {
-        throw std::runtime_error("MPI size not dividable by P=" + std::to_string(torus_width) + "!");
-    }
-    torus_height = mpi_comm_size / torus_width;
-    torus_row = (mpi_comm_rank / torus_width);
-    torus_col = (mpi_comm_rank % torus_width);
-}
-
-std::map<std::string, std::string>
-linpack::LinpackProgramSettings::getSettingsMap() {
-    auto map = hpcc_base::BaseSettings::getSettingsMap();
-    map["Matrix Size"] = std::to_string(matrixSize);
-    map["Block Size"] = std::to_string(blockSize);
-    map["Emulate"] = (isEmulationKernel) ? "Yes" : "No";
-    map["Data Type"] = STR(HOST_DATA_TYPE);
-    map["FPGA Torus"] = "P=" + std::to_string(torus_width) + ", Q=" + std::to_string(torus_height);
-    return map;
-}
-
-linpack::LinpackData::LinpackData(cl::Context context, size_t width, size_t height) : norma(0.0), context(context),
-    matrix_width(width), matrix_height(height) {
-#ifdef USE_SVM
-    A = reinterpret_cast<HOST_DATA_TYPE*>(
-                        clSVMAlloc(context(), 0 ,
-                        size * size * sizeof(HOST_DATA_TYPE), 1024));
-    b = reinterpret_cast<HOST_DATA_TYPE*>(
-                        clSVMAlloc(context(), 0 ,
-                        size * sizeof(HOST_DATA_TYPE), 1024));
-    ipvt = reinterpret_cast<cl_int*>(
-                        clSVMAlloc(context(), 0 ,
-                        size * sizeof(cl_int), 1024));
-#else
-    posix_memalign(reinterpret_cast<void**>(&A), 4096, width * height * sizeof(HOST_DATA_TYPE));
-    posix_memalign(reinterpret_cast<void**>(&b), 4096, width * sizeof(HOST_DATA_TYPE));
-    posix_memalign(reinterpret_cast<void**>(&ipvt), 4096, height * sizeof(cl_int));
-#endif
-    }
-
-linpack::LinpackData::~LinpackData() {
-#ifdef USE_SVM
-    clSVMFree(context(), reinterpret_cast<void*>(A));
-    clSVMFree(context(), reinterpret_cast<void*>(b));
-    clSVMFree(context(), reinterpret_cast<void*>(ipvt));
-#else
-    free(A);
-    free(b);
-    free(ipvt);
-#endif
-}
-
-linpack::LinpackBenchmark::LinpackBenchmark(int argc, char* argv[]) : HpccFpgaBenchmark(argc, argv) {
-    setupBenchmark(argc, argv);
-}
-
-void
-linpack::LinpackBenchmark::addAdditionalParseOptions(cxxopts::Options &options) {
-    options.add_options()
-        ("m", "Global matrix size in number of blocks in one dimension. Local matrix sizes will be determined by PQ grid.",
-            cxxopts::value<uint>()->default_value(std::to_string(DEFAULT_MATRIX_SIZE)))
-        ("b", "Log2 of the block size in number of values in one dimension",
-            cxxopts::value<uint>()->default_value(std::to_string(LOCAL_MEM_BLOCK_LOG)))
-        ("p", "Width of the FPGA grid. The heigth (Q) will be calculated from mpi_size / P.",
-            cxxopts::value<uint>()->default_value(std::to_string(DEFAULT_P_VALUE)))
-        ("uniform", "Generate a uniform matrix instead of a diagonally dominant. This has to be supported by the FPGA kernel!")
-        ("emulation", "Use kernel arguments for emulation. This may be necessary to simulate persistent local memory on the FPGA");
-}
-
-std::unique_ptr<linpack::LinpackExecutionTimings>
-linpack::LinpackBenchmark::executeKernel(LinpackData &data) {
-    std::unique_ptr<linpack::LinpackExecutionTimings> timings;
-    switch (executionSettings->programSettings->communicationType) {
-        case hpcc_base::CommunicationType::pcie_mpi : timings = execution::pcie::calculate(*executionSettings, data); break;
-        case hpcc_base::CommunicationType::intel_external_channels: timings = execution::iec::calculate(*executionSettings, data); break;
-        default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType));
-    }
-#ifdef DISTRIBUTED_VALIDATION
-    distributed_gesl_nopvt_ref(data);
-#endif
-    return timings;
-}
-
-void
-linpack::LinpackBenchmark::collectAndPrintResults(const linpack::LinpackExecutionTimings &output) {
-    // Calculate performance for kernel execution plus data transfer
-    double tmean = 0;
-    double tlumean = 0;
-    double tslmean = 0;
-    double tmin = std::numeric_limits<double>::max();
-    double lu_min = std::numeric_limits<double>::max();
-    double sl_min = std::numeric_limits<double>::max();
-
-#ifndef NDEBUG
-    std::cout << "Rank " << mpi_comm_rank << ": Result collection started" << std::endl;
-#endif
-
-    std::vector<double> global_lu_times(output.gefaTimings.size());
-    MPI_Reduce(output.gefaTimings.data(), global_lu_times.data(), output.gefaTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-    std::vector<double> global_sl_times(output.geslTimings.size());
-    MPI_Reduce(output.geslTimings.data(), global_sl_times.data(), output.geslTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-#ifndef NDEBUG
-    std::cout << "Rank " << mpi_comm_rank << ": Result collection done" << std::endl;
-#endif
-
-
-    if (mpi_comm_rank > 0) {
-        // Only the master rank needs to calculate and print result
-        return;
-    }
-
-    double total_matrix_size = static_cast<double>(executionSettings->programSettings->matrixSize);
-    double gflops_lu = ((2.0e0*total_matrix_size * total_matrix_size * total_matrix_size)/ 3.0) / 1.0e9;
-    double gflops_sl = (2.0*(total_matrix_size * total_matrix_size))/1.0e9;
-    for (int i =0; i < global_lu_times.size(); i++) {
-        double currentTime = global_lu_times[i] + global_sl_times[i];
-        tmean += currentTime;
-        tlumean += global_lu_times[i];
-        tslmean += global_sl_times[i];
-        if (currentTime < tmin) {
-            tmin = currentTime;
-        }
-        if (global_lu_times[i] < lu_min) {
-            lu_min = global_lu_times[i];
-        }
-        if (global_sl_times[i] < sl_min) {
-            sl_min = global_sl_times[i];
-        }
-    }
-    tmean = tmean / global_lu_times.size();
-    tlumean = tlumean / global_lu_times.size();
-    tslmean = tslmean / global_sl_times.size();
-
-    std::cout << std::setw(ENTRY_SPACE)
-              << "Method" << std::setw(ENTRY_SPACE)
-              << "best" << std::setw(ENTRY_SPACE) << "mean"
-              << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl;
-
-    std::cout << std::setw(ENTRY_SPACE) << "total" << std::setw(ENTRY_SPACE)
-              << tmin << std::setw(ENTRY_SPACE) << tmean
-              << std::setw(ENTRY_SPACE) << ((gflops_lu + gflops_sl) / tmin)
-              << std::endl;
-
-    std::cout << std::setw(ENTRY_SPACE) << "GEFA" << std::setw(ENTRY_SPACE)
-              << lu_min << std::setw(ENTRY_SPACE) << tlumean
-              << std::setw(ENTRY_SPACE) << ((gflops_lu) / lu_min)
-              << std::endl;
-
-    std::cout << std::setw(ENTRY_SPACE) << "GESL" << std::setw(ENTRY_SPACE)
-              << sl_min << std::setw(ENTRY_SPACE) << tslmean
-              << std::setw(ENTRY_SPACE) << (gflops_sl / sl_min)
-              << std::endl;
-}
-
-std::unique_ptr<linpack::LinpackData>
-linpack::LinpackBenchmark::generateInputData() {
-    int local_matrix_width = executionSettings->programSettings->matrixSize / executionSettings->programSettings->torus_width;
-    int local_matrix_height = executionSettings->programSettings->matrixSize / executionSettings->programSettings->torus_height;
-
-    if ((executionSettings->programSettings->matrixSize / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width > 0 ||
-        (executionSettings->programSettings->matrixSize / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_height > 0) {
-        throw std::runtime_error("Global matrix size must be multiple of LCM of PQ grid!");
-    }
-
-    auto d = std::unique_ptr<linpack::LinpackData>(new linpack::LinpackData(*executionSettings->context ,local_matrix_width, local_matrix_height));
-    std::mt19937 gen(this->mpi_comm_rank);
-    std::uniform_real_distribution<> dis(0.0, 1.0);
-    d->norma = 0.0;
-    d->normb = 0.0;
-
-
-    /*
-    Generate a matrix by using pseudo random number in the range (0,1)
-    */
-    for (int j = 0; j < local_matrix_height; j++) {
-        // fill a single column of the matrix
-        for (int i = 0; i < local_matrix_width; i++) {
-            HOST_DATA_TYPE temp = dis(gen);
-            d->A[local_matrix_width*j+i] = temp;
-            d->norma = (temp > d->norma) ? temp : d->norma;
-        }
-    }
-
-
-    // If the matrix should be diagonally dominant, we need to exchange the sum of the rows with
-    // the ranks that share blocks in the same column
-    if (executionSettings->programSettings->isDiagonallyDominant) {
-        // create a communicator to exchange the rows
-        MPI_Comm row_communicator;
-        MPI_Comm_split(MPI_COMM_WORLD, executionSettings->programSettings->torus_row, 0,&row_communicator);
-
-        // Caclulate the sum for every row and insert in into the matrix
-        for (int local_matrix_row = 0; local_matrix_row < local_matrix_height; local_matrix_row++) {
-            int blockSize = executionSettings->programSettings->blockSize;
-            int global_matrix_row = executionSettings->programSettings->torus_row * blockSize + (local_matrix_row / blockSize) * blockSize * executionSettings->programSettings->torus_height + (local_matrix_row % blockSize);
-            int local_matrix_col = (global_matrix_row - executionSettings->programSettings->torus_col * blockSize) / (blockSize * executionSettings->programSettings->torus_width) * blockSize + (global_matrix_row % blockSize);
-            int diagonal_rank = (global_matrix_row / blockSize) % executionSettings->programSettings->torus_width;
-            bool diagonal_on_this_rank = diagonal_rank == executionSettings->programSettings->torus_col;
-            // set the diagonal elements of the matrix to 0
-            if (diagonal_on_this_rank) {
-                d->A[local_matrix_width*local_matrix_row + local_matrix_col] = 0.0;
-            }
-            HOST_DATA_TYPE local_row_sum = 0.0;
-            for (int i = 0; i < local_matrix_width; i++) {
-                local_row_sum += d->A[local_matrix_width*local_matrix_row + i];
-            }
-            HOST_DATA_TYPE row_sum = 0.0;
-            MPI_Reduce(&local_row_sum, &row_sum, 1, MPI_DATA_TYPE, MPI_SUM, diagonal_rank, row_communicator);
-            // insert row sum into matrix if it contains the diagonal block
-            if (diagonal_on_this_rank) {
-                // update norm of local matrix
-                d->norma = (row_sum > d->norma) ? row_sum : d->norma;
-                d->A[local_matrix_width*local_matrix_row + local_matrix_col] = row_sum;
-            }
-        }
-    }
-
-    // initialize other vectors
-    for (int i = 0; i < local_matrix_width; i++) {
-        d->b[i] = 0.0;
-    }
-    for (int i = 0; i < local_matrix_height; i++) {
-        d->ipvt[i] = i;
-    }
-
-    MPI_Comm col_communicator;
-    MPI_Comm_split(MPI_COMM_WORLD, executionSettings->programSettings->torus_col, 0,&col_communicator);
-
-    // Generate vector b by accumulating the columns of the matrix.
-    // This will lead to a result vector x with ones on every position
-    // Every rank will have a valid part of the final b vector stored
-    for (int j = 0; j < local_matrix_width; j++) {
-        HOST_DATA_TYPE local_col_sum = 0.0;
-        for (int i = 0; i < local_matrix_height; i++) {
-            local_col_sum += d->A[local_matrix_width*i+j];
-        }
-        MPI_Allreduce(&local_col_sum, &(d->b[j]), 1, MPI_DATA_TYPE, MPI_SUM, col_communicator);
-        d->normb = (d->b[j] > d->normb) ? d->b[j] : d->normb;
-    }
-    return d;
-}
-
-bool
-linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &data) {
-    uint n= executionSettings->programSettings->matrixSize;
-    uint matrix_width = data.matrix_width;
-    uint matrix_height = data.matrix_height;
-    double residn;
-    double resid = 0.0;
-    double normx = 0.0;
-#ifndef DISTRIBUTED_VALIDATION
-    if (mpi_comm_rank > 0) {
-        for (int j = 0; j < matrix_height; j++) {
-            for (int i = 0; i < matrix_width; i+= executionSettings->programSettings->blockSize) {
-                MPI_Send(&data.A[matrix_width * j + i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
-            }
-        }
-        if (executionSettings->programSettings->torus_row == 0) {
-            for (int i = 0; i < matrix_width; i+= executionSettings->programSettings->blockSize) {
-                MPI_Send(&data.b[i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
-            }
-        }
-        residn = 0;
-    }
-    else {
-        MPI_Status status;
-        size_t current_offset = 0;
-        std::vector<HOST_DATA_TYPE> total_b_original(n);
-        std::vector<HOST_DATA_TYPE> total_b(n);
-        std::vector<HOST_DATA_TYPE> total_a(n*n);
-        for (int j = 0; j < n; j++) {
-            for (int i = 0; i < n; i+= executionSettings->programSettings->blockSize) {
-                int recvcol= (i / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width;
-                int recvrow= (j / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_height;
-                int recvrank = executionSettings->programSettings->torus_width * recvrow + recvcol;
-                if (recvrank > 0) {
-                    MPI_Recv(&total_a[j * n + i],executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 0, MPI_COMM_WORLD, &status);
-                }
-                else {
-                    for (int k=0; k < executionSettings->programSettings->blockSize; k++) {
-                        total_a[j * n + i + k] = data.A[current_offset + k];
-                    }
-                    current_offset += executionSettings->programSettings->blockSize;
-                }
-            }
-        }
-        current_offset = 0;
-        for (int i = 0; i < n; i+= executionSettings->programSettings->blockSize) {
-            int recvcol= (i / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width;
-            if (recvcol > 0) {
-                MPI_Recv(&total_b[i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvcol, 0, MPI_COMM_WORLD, &status);
-            }
-            else {
-                for (int k=0; k < executionSettings->programSettings->blockSize; k++) {
-                    total_b[i + k] = data.b[current_offset + k];
-                }
-                current_offset += executionSettings->programSettings->blockSize;
-            }
-        }
-
-        std::copy(total_b.begin(), total_b.end(), total_b_original.begin());
-        gesl_ref_nopvt(total_a.data(), total_b.data(), n, n);
-
-        for (int i = 0; i < n; i++) {
-            resid = (resid > std::abs(total_b[i] - 1)) ? resid : std::abs(total_b[i] - 1);
-            normx = (normx > std::abs(total_b_original[i])) ? normx : std::abs(total_b_original[i]);
-        }
-    }
-#else
-    double local_resid = 0;
-    double local_normx = data.normb;
-    #pragma omp parallel for reduction(max:local_resid)
-    for (int i = 0; i < data.matrix_width; i++) {
-        local_resid = (local_resid > std::abs(data.b[i] - 1)) ? local_resid : std::abs(data.b[i] - 1);
-    }
-#ifndef NDEBUG
-    std::cout << "Rank " << mpi_comm_rank << ": resid=" << local_resid << ", normx=" << local_normx << std::endl;
-#endif
-
-    MPI_Reduce(&local_resid, &resid, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-    MPI_Reduce(&local_normx, &normx, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-#endif
-
-
-    HOST_DATA_TYPE eps = std::numeric_limits<HOST_DATA_TYPE>::epsilon();
-    residn = resid / (static_cast<double>(n)*normx*eps);
-
-    #ifndef NDEBUG
-    if (residn > 1 && mpi_comm_size == 1) {
-        auto ref_result = generateInputData();
-        // For each column right of current diagonal element
-        for (int j = 0; j < n; j++) {
-            // For each element below it
-            for (int i = 0; i < n; i++) {
-                std::cout << ref_result->A[n * j + i] << ", ";
-            }
-            std::cout << std::endl;
-        }
-        std::cout << std::endl;
-        // For each column right of current diagonal element
-        for (int j = 0; j < n; j++) {
-            // For each element below it
-            for (int i = 0; i < n; i++) {
-                std::cout << data.A[n * j + i] << ", ";
-            }
-            std::cout << std::endl;
-        }
-        std::cout << std::endl;
-        if (executionSettings->programSettings->isDiagonallyDominant) {
-            linpack::gefa_ref_nopvt(ref_result->A, n, n);
-            linpack::gesl_ref_nopvt(ref_result->A, ref_result->b, n, n);
-        }
-        else {
-            linpack::gefa_ref(ref_result->A, n, n, ref_result->ipvt);
-            linpack::gesl_ref(ref_result->A, ref_result->b, ref_result->ipvt, n, n);
-        }
-        // For each column right of current diagonal element
-        for (int j = 0; j < n; j++) {
-            // For each element below it
-            for (int i = 0; i < n; i++) {
-                std::cout << std::abs(ref_result->A[n * j + i] - data.A[n * j + i]) << ", ";
-            }
-            std::cout << std::endl;
-        }
-        std::cout << std::endl;
-    }
-    #endif
-
-    if (mpi_comm_rank == 0) {
-        //std::cout << resid << ", " << norma << ", " << normx << std::endl;
-        std::cout << " norm. resid        resid       "\
-                     "machep " << std::endl;
-        std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE)
-                  << resid << std::setw(ENTRY_SPACE) << eps << std::endl;
-        return residn < 1;
-    }
-    else {
-        return true;
-    }
-}
-
-void
-linpack::LinpackBenchmark::distributed_gesl_nopvt_ref(linpack::LinpackData& data) {
-    uint global_matrix_size = executionSettings->programSettings->matrixSize;
-    uint matrix_width = data.matrix_width;
-    uint matrix_height = data.matrix_height;
-    uint block_size = executionSettings->programSettings->blockSize;
-    // create a communicator to exchange the rows
-    MPI_Comm row_communicator;
-    MPI_Comm_split(MPI_COMM_WORLD, executionSettings->programSettings->torus_row, 0,&row_communicator);
-    MPI_Comm col_communicator;
-    MPI_Comm_split(MPI_COMM_WORLD, executionSettings->programSettings->torus_col, 0,&col_communicator);
-    std::vector<HOST_DATA_TYPE> b_tmp(matrix_width);
-
-    for (int k = 0; k < b_tmp.size(); k++) {
-        b_tmp[k] = data.b[k];
-    }
-
-    // solve l*y = b
-    // For each row in matrix
-    for (int k = 0; k < global_matrix_size - 1; k++) {
-        size_t local_k_index_col = k / (block_size * executionSettings->programSettings->torus_width) * block_size;
-        size_t local_k_index_row = k / (block_size * executionSettings->programSettings->torus_height) * block_size;
-        size_t remaining_k_col = k % (block_size * executionSettings->programSettings->torus_width);
-        size_t remaining_k_row = k % (block_size * executionSettings->programSettings->torus_height);
-        size_t start_offset = local_k_index_col;
-        if (remaining_k_col / block_size > executionSettings->programSettings->torus_col){
-            local_k_index_col += block_size;
-            start_offset = local_k_index_col;
-        }
-        else if (remaining_k_col / block_size == executionSettings->programSettings->torus_col) {
-            local_k_index_col += (remaining_k_col % block_size);
-            start_offset = local_k_index_col + 1;
-        }
-        if (remaining_k_row / block_size > executionSettings->programSettings->torus_row){
-            local_k_index_row += block_size;
-        }
-        else if (remaining_k_row / block_size == executionSettings->programSettings->torus_row) {
-            local_k_index_row += (remaining_k_row % block_size);
-        }
-
-        int row_diagonal_rank = (k / block_size) % executionSettings->programSettings->torus_height;
-        int col_diagonal_rank = (k / block_size) % executionSettings->programSettings->torus_width;
-        std::vector<HOST_DATA_TYPE> tmp_scaled_b(matrix_width, 0.0);
-        if (row_diagonal_rank == executionSettings->programSettings->torus_row) {
-            HOST_DATA_TYPE current_k;
-            current_k = (local_k_index_col < matrix_width) ? b_tmp[local_k_index_col] : 0.0;
-            MPI_Bcast(&current_k, 1, MPI_DATA_TYPE, col_diagonal_rank, row_communicator);
-            // For each row below add
-            for (int i = start_offset; i < matrix_width; i++) {
-                // add solved upper row to current row
-                tmp_scaled_b[i] = current_k * data.A[matrix_width * local_k_index_row + i];
-            }
-        }
-        MPI_Bcast(&tmp_scaled_b.data()[start_offset], matrix_width - start_offset, MPI_DATA_TYPE, row_diagonal_rank, col_communicator);
-        for (int i = start_offset; i < matrix_width; i++) {
-            // add solved upper row to current row
-            b_tmp[i] += tmp_scaled_b[i];
-        }
-    }
-
-    // now solve u*x = y
-    for (int k = global_matrix_size - 1; k >= 0; k--) {
-        size_t local_k_index_col = k / (block_size * executionSettings->programSettings->torus_width) * block_size;
-        size_t local_k_index_row = k / (block_size * executionSettings->programSettings->torus_height) * block_size;
-        size_t remaining_k_col = k % (block_size * executionSettings->programSettings->torus_width);
-        size_t remaining_k_row = k % (block_size * executionSettings->programSettings->torus_height);
-        if (remaining_k_col / block_size > executionSettings->programSettings->torus_col){
-            local_k_index_col += block_size;
-        }
-        else if (remaining_k_col / block_size == executionSettings->programSettings->torus_col) {
-            local_k_index_col += remaining_k_col % block_size;
-        }
-        if (remaining_k_row / block_size > executionSettings->programSettings->torus_row){
-            local_k_index_row += block_size;
-        }
-        else if (remaining_k_row / block_size == executionSettings->programSettings->torus_row) {
-            local_k_index_row += remaining_k_row % block_size;
-        }
-
-        HOST_DATA_TYPE scale_element = (local_k_index_col < matrix_width && local_k_index_row < matrix_height) ? b_tmp[local_k_index_col] * data.A[matrix_width * local_k_index_row + local_k_index_col] : 0.0;
-        int row_diagonal_rank = (k / block_size) % executionSettings->programSettings->torus_height;
-        int col_diagonal_rank = (k / block_size) % executionSettings->programSettings->torus_width;
-        MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, row_diagonal_rank, col_communicator);
-        if (col_diagonal_rank == executionSettings->programSettings->torus_col) {
-            b_tmp[local_k_index_col] = -scale_element;
-        }
-        MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, col_diagonal_rank, row_communicator);
-        size_t end_offset = local_k_index_col;
-
-        std::vector<HOST_DATA_TYPE> tmp_scaled_b(matrix_width, 0.0);
-        if (row_diagonal_rank == executionSettings->programSettings->torus_row) {
-            // For each row below add
-            for (int i = 0; i < end_offset; i++) {
-                tmp_scaled_b[i] = scale_element * data.A[matrix_width * local_k_index_row + i];
-            }
-        }
-        MPI_Bcast(tmp_scaled_b.data(), end_offset, MPI_DATA_TYPE, row_diagonal_rank, col_communicator);
-        for (int i = 0; i < end_offset; i++) {
-            // add solved upper row to current row
-            b_tmp[i] += tmp_scaled_b[i];
-        }
-    }
-    for (int k = 0; k < b_tmp.size(); k++) {
-        data.b[k] = b_tmp[k];
-    }
-
-#ifndef NDEBUG
-    MPI_Barrier(MPI_COMM_WORLD);
-    for (int rank = 0; rank < mpi_comm_size; rank++) {
-        if (rank == mpi_comm_rank) {
-            double sum = 0;
-            double max = 0;
-            for (int k = 0; k < matrix_width; k++) {
-                sum += std::abs(data.b[k]);
-                if (std::abs(data.b[k] - 1) > 0.1 || data.b[k] == NAN) {
-                    std::cout << "Rank " << mpi_comm_rank << " Pos: " << k << " Value: " << std::abs(data.b[k]) << std::endl;
-                }
-            }
-            std::cout << "Rank " << mpi_comm_rank << " Dist.Sum: " << sum << " Max: " << max << std::endl;
-        }
-        MPI_Barrier(MPI_COMM_WORLD);
-    }
-#endif
-}
-
-/**
-Standard LU factorization on a block with fixed size
-
-Case 1 of
Zhangs description -*/ -void -linpack::gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt) { - for (int i = 0; i < n; i++) { - ipvt[i] = i; - } - // For each diagnonal element - for (int k = 0; k < n - 1; k++) { - HOST_DATA_TYPE max_val = fabs(a[k * lda + k]); - int pvt_index = k; - for (int i = k + 1; i < n; i++) { - if (max_val < fabs(a[k * lda + i])) { - pvt_index = i; - max_val = fabs(a[k * lda + i]); - } - } - - for (int i = k; i < n; i++) { - HOST_DATA_TYPE tmp_val = a[i * lda + k]; - a[i * lda + k] = a[i * lda + pvt_index]; - a[i * lda + pvt_index] = tmp_val; - } - ipvt[k] = pvt_index; - - // For each element below it - for (int i = k + 1; i < n; i++) { - a[k * lda + i] *= -1.0 / a[k * lda + k]; - } - // For each column right of current diagonal element - for (int j = k + 1; j < n; j++) { - // For each element below it - for (int i = k+1; i < n; i++) { - a[j * lda + i] += a[k * lda + i] * a[j * lda + k]; - } - } - -#ifdef DEBUG - std::cout << "A(k=" << k <<"): " << std::endl; - for (int i= 0; i < n; i++) { - for (int j=0; j < n; j++) { - std::cout << a[i*lda + j] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; -#endif - - } -} - -void -linpack::gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda) { - auto b_tmp = new HOST_DATA_TYPE[n]; - { - for (int k = 0; k < n; k++) { - b_tmp[k] = b[k]; - } - - // solve l*y = b - // For each row in matrix - for (int k = 0; k < n - 1; k++) { - if (ipvt[k] != k) { - HOST_DATA_TYPE tmp = b_tmp[k]; - b_tmp[k] = b_tmp[ipvt[k]]; - b_tmp[ipvt[k]] = tmp; - } - // For each row below add - for (int i = k + 1; i < n; i++) { - // add solved upper row to current row - b_tmp[i] += b_tmp[k] * a[lda * k + i]; - } - } - - // now solve u*x = y - for (int k = n - 1; k >= 0; k--) { - b_tmp[k] = b_tmp[k] / a[lda * k + k]; - for (int i = 0; i < k; i++) { - b_tmp[i] -= b_tmp[k] * a[lda * k + i]; - } - } - for (int k = 0; k < n; k++) { - b[k] = b_tmp[k]; - } - } - delete [] b_tmp; -} - -void linpack::dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m, bool transposed) { - for (int i=0; i < n1; i++) { - for (int j=0; j < n2; j++) { - y[i] = y[i] + x[j] * (transposed ? m[ldm*i + j] :m[ldm*j + i]); - } - } -} - -void -linpack::gefa_ref_nopvt(HOST_DATA_TYPE* a, unsigned n, unsigned lda) { - // For each diagnonal element - for (int k = 0; k < n; k++) { - // Store negatie invers of diagonal elements to get rid of some divisions afterwards! 
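// In effect the diagonal now holds -1/a[k][k]: the column scaling on the next
// line becomes a multiplication and the trailing update needs no division at
// all. E.g. for the 2x2 block [[2, 4], [1, 3]] the diagonal 2 is replaced by
// -0.5, the subdiagonal becomes 1 * -0.5 = -0.5, and the trailing element
// updates to 3 + (-0.5) * 4 = 1, which is exactly U(2,2) of the LU factors.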
- a[k * lda + k] = -1.0 / a[k * lda + k]; - // For each element below it - for (int i = k + 1; i < n; i++) { - a[k * lda + i] *= a[k * lda + k]; - } - // For each column right of current diagonal element - for (int j = k + 1; j < n; j++) { - // For each element below it - for (int i = k+1; i < n; i++) { - a[j * lda + i] += a[k * lda + i] * a[j * lda + k]; - } - } - -#ifdef DEBUG - std::cout << "A(k=" << k << "): " << std::endl; - for (int i= 0; i < n; i++) { - for (int j=0; j < n; j++) { - std::cout << a[i*lda + j] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; -#endif - - } -} - - -void -linpack::gesl_ref_nopvt(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, unsigned n, unsigned lda) { - auto b_tmp = new HOST_DATA_TYPE[n]; - - for (int k = 0; k < n; k++) { - b_tmp[k] = b[k]; - } - - // solve l*y = b - // For each row in matrix - for (int k = 0; k < n - 1; k++) { - // For each row below add - for (int i = k + 1; i < n; i++) { - // add solved upper row to current row - b_tmp[i] += b_tmp[k] * a[lda * k + i]; - } - } - - // now solve u*x = y - for (int k = n - 1; k >= 0; k--) { - HOST_DATA_TYPE scale = b_tmp[k] * a[lda * k + k]; - b_tmp[k] = -scale; - for (int i = 0; i < k; i++) { - b_tmp[i] += scale * a[lda * k + i]; - } - } - for (int k = 0; k < n; k++) { - b[k] = b_tmp[k]; - } - delete [] b_tmp; -} diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index c05b323a..b79fa65a 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -26,10 +26,13 @@ SOFTWARE. /* C++ standard library headers */ #include #include +#include /* Project's headers */ #include "hpcc_benchmark.hpp" +#include "execution_types/execution_types.hpp" #include "parameters.h" +#include "linpack_data.hpp" extern "C" { #include "gmres.h" } @@ -40,177 +43,12 @@ extern "C" { */ namespace linpack { -/** - * @brief The Linpack specific program settings - * - */ -class LinpackProgramSettings : public hpcc_base::BaseSettings { - -public: - /** - * @brief The size of the local matrix in number of blocks in one dimension - * - */ - uint matrixSize; - - /** - * @brief Size of a single block of the matrix in values in one dimension - * - */ - uint blockSize; - - /** - * @brief Indicates if the generated input matrix should be diagonally dominant - * - */ - bool isDiagonallyDominant; - - /** - * @brief True, if the used kernel is an emulation kernel. Different kernel arguments may be used in this case to - * simulate persistent local memory. - * - */ - bool isEmulationKernel; - - /** - * @brief The row position of this MPI rank in the torus - * - */ - int torus_row; - - /** - * @brief The rcolumn position of this MPI rank in the torus - * - */ - int torus_col; - - /** - * @brief Width of the torus in number of ranks - * - */ - int torus_width; - - /** - * @brief Height of the FPGA torus in number of ranks - * - */ - int torus_height; - - /** - * @brief Construct a new Linpack Program Settings object - * - * @param results the result map from parsing the program input parameters - */ - LinpackProgramSettings(cxxopts::ParseResult &results); - - /** - * @brief Get a map of the settings. This map will be used to print the final configuration. - * - * @return a map of program parameters. keys are the name of the parameter. 
- */ - std::map getSettingsMap() override; - -}; - -/** - * @brief Data class containing the data the kernel is exeucted with - * - */ -class LinpackData { - -public: - - /** - * @brief The input matrix representing the left side of the linear equation system - * - */ - HOST_DATA_TYPE *A; - - /** - * @brief The input vector the right side of the linear equation system - * - */ - HOST_DATA_TYPE *b; - - /** - * @brief A vector that can be used to store pivoting information - * - */ - cl_int* ipvt; - - /** - * @brief Width of the local matrix in values - * - */ - size_t matrix_width; - - /** - * @brief Height of the local matrix in values - * - */ - size_t matrix_height; - - /** - * @brief The context that is used to allocate memory in SVM mode - * - */ - cl::Context context; - - /** - * @brief The maximum value of A that will be used for the error calculation - * - */ - HOST_DATA_TYPE norma; - - /** - * @brief The maximum value of A that will be used for the error calculation - * - */ - HOST_DATA_TYPE normb; - - /** - * @brief Construct a new Linpack Data object - * - * @param context The OpenCL context used to allocate memory in SVM mode - * @param width width of the local matrix in values - * @param height height of the local matrix in values - */ - LinpackData(cl::Context context, size_t width, size_t height); - - /** - * @brief Destroy the Linpack Data object. Free the allocated memory - * - */ - ~LinpackData(); - -}; - -/** - * @brief Measured execution timing from the kernel execution - * - */ -class LinpackExecutionTimings { -public: - /** - * @brief A vector containing the timings for all repetitions for the kernel execution for the gefa kernel - * - */ - std::vector gefaTimings; - - /** - * @brief A vector containing the timings for all repetitions for the kernel execution for the gesl kernel - * - */ - std::vector geslTimings; - - -}; - /** * @brief Implementation of the Linpack benchmark * */ -class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark { +template +class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark { protected: @@ -220,7 +58,18 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) + ("b", "Log2 of the block size in number of values in one dimension", + cxxopts::value()->default_value(std::to_string(LOCAL_MEM_BLOCK_LOG))) + ("p", "Width of the FPGA grid. The heigth (Q) will be calculated from mpi_size / P.", + cxxopts::value()->default_value(std::to_string(DEFAULT_P_VALUE))) + ("uniform", "Generate a uniform matrix instead of a diagonally dominant. This has to be supported by the FPGA kernel!") + ("emulation", "Use kernel arguments for emulation. 
This may be necessary to simulate persistent local memory on the FPGA"); +} + /** * @brief Distributed solving of l*y=b and u*x = y @@ -228,7 +77,130 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmarkexecutionSettings->programSettings->matrixSize; + uint matrix_width = data.matrix_width; + uint matrix_height = data.matrix_height; + uint block_size = this->executionSettings->programSettings->blockSize; + // create a communicator to exchange the rows + MPI_Comm row_communicator; + MPI_Comm_split(MPI_COMM_WORLD, this->executionSettings->programSettings->torus_row, 0,&row_communicator); + MPI_Comm col_communicator; + MPI_Comm_split(MPI_COMM_WORLD, this->executionSettings->programSettings->torus_col, 0,&col_communicator); + std::vector b_tmp(matrix_width); + + for (int k = 0; k < b_tmp.size(); k++) { + b_tmp[k] = data.b[k]; + } + + // solve l*y = b + // For each row in matrix + for (int k = 0; k < global_matrix_size - 1; k++) { + size_t local_k_index_col = k / (block_size * this->executionSettings->programSettings->torus_width) * block_size; + size_t local_k_index_row = k / (block_size * this->executionSettings->programSettings->torus_height) * block_size; + size_t remaining_k_col = k % (block_size * this->executionSettings->programSettings->torus_width); + size_t remaining_k_row = k % (block_size * this->executionSettings->programSettings->torus_height); + size_t start_offset = local_k_index_col; + if (remaining_k_col / block_size > this->executionSettings->programSettings->torus_col){ + local_k_index_col += block_size; + start_offset = local_k_index_col; + } + else if (remaining_k_col / block_size == this->executionSettings->programSettings->torus_col) { + local_k_index_col += (remaining_k_col % block_size); + start_offset = local_k_index_col + 1; + } + if (remaining_k_row / block_size > this->executionSettings->programSettings->torus_row){ + local_k_index_row += block_size; + } + else if (remaining_k_row / block_size == this->executionSettings->programSettings->torus_row) { + local_k_index_row += (remaining_k_row % block_size); + } + + int row_diagonal_rank = (k / block_size) % this->executionSettings->programSettings->torus_height; + int col_diagonal_rank = (k / block_size) % this->executionSettings->programSettings->torus_width; + std::vector tmp_scaled_b(matrix_width, 0.0); + if (row_diagonal_rank == this->executionSettings->programSettings->torus_row) { + HOST_DATA_TYPE current_k; + current_k = (local_k_index_col < matrix_width) ? 
b_tmp[local_k_index_col] : 0.0;
+            MPI_Bcast(&current_k, 1, MPI_DATA_TYPE, col_diagonal_rank, row_communicator);
+            // For each row below add
+            for (int i = start_offset; i < matrix_width; i++) {
+                // add solved upper row to current row
+                tmp_scaled_b[i] = current_k * data.A[matrix_width * local_k_index_row + i];
+            }
+        }
+        MPI_Bcast(&tmp_scaled_b.data()[start_offset], matrix_width - start_offset, MPI_DATA_TYPE, row_diagonal_rank, col_communicator);
+        for (int i = start_offset; i < matrix_width; i++) {
+            // add solved upper row to current row
+            b_tmp[i] += tmp_scaled_b[i];
+        }
+    }
+
+    // now solve u*x = y
+    for (int k = global_matrix_size - 1; k >= 0; k--) {
+        size_t local_k_index_col = k / (block_size * this->executionSettings->programSettings->torus_width) * block_size;
+        size_t local_k_index_row = k / (block_size * this->executionSettings->programSettings->torus_height) * block_size;
+        size_t remaining_k_col = k % (block_size * this->executionSettings->programSettings->torus_width);
+        size_t remaining_k_row = k % (block_size * this->executionSettings->programSettings->torus_height);
+        if (remaining_k_col / block_size > this->executionSettings->programSettings->torus_col) {
+            local_k_index_col += block_size;
+        }
+        else if (remaining_k_col / block_size == this->executionSettings->programSettings->torus_col) {
+            local_k_index_col += remaining_k_col % block_size;
+        }
+        if (remaining_k_row / block_size > this->executionSettings->programSettings->torus_row) {
+            local_k_index_row += block_size;
+        }
+        else if (remaining_k_row / block_size == this->executionSettings->programSettings->torus_row) {
+            local_k_index_row += remaining_k_row % block_size;
+        }
+
+        HOST_DATA_TYPE scale_element = (local_k_index_col < matrix_width && local_k_index_row < matrix_height) ? b_tmp[local_k_index_col] * data.A[matrix_width * local_k_index_row + local_k_index_col] : 0.0;
+        int row_diagonal_rank = (k / block_size) % this->executionSettings->programSettings->torus_height;
+        int col_diagonal_rank = (k / block_size) % this->executionSettings->programSettings->torus_width;
+        MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, row_diagonal_rank, col_communicator);
+        if (col_diagonal_rank == this->executionSettings->programSettings->torus_col) {
+            b_tmp[local_k_index_col] = -scale_element;
+        }
+        MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, col_diagonal_rank, row_communicator);
+        size_t end_offset = local_k_index_col;
+
+        std::vector<HOST_DATA_TYPE> tmp_scaled_b(matrix_width, 0.0);
+        if (row_diagonal_rank == this->executionSettings->programSettings->torus_row) {
+            // For each row below add
+            for (int i = 0; i < end_offset; i++) {
+                tmp_scaled_b[i] = scale_element * data.A[matrix_width * local_k_index_row + i];
+            }
+        }
+        MPI_Bcast(tmp_scaled_b.data(), end_offset, MPI_DATA_TYPE, row_diagonal_rank, col_communicator);
+        for (int i = 0; i < end_offset; i++) {
+            // add solved upper row to current row
+            b_tmp[i] += tmp_scaled_b[i];
+        }
+    }
+    for (int k = 0; k < b_tmp.size(); k++) {
+        data.b[k] = b_tmp[k];
+    }
+
+#ifndef NDEBUG
+    MPI_Barrier(MPI_COMM_WORLD);
+    for (int rank = 0; rank < this->mpi_comm_size; rank++) {
+        if (rank == this->mpi_comm_rank) {
+            double sum = 0;
+            double max = 0;
+            for (int k = 0; k < matrix_width; k++) {
+                sum += std::abs(data.b[k]);
+                if (std::abs(data.b[k] - 1) > 0.1 || std::isnan(data.b[k])) {
+                    std::cout << "Rank " << this->mpi_comm_rank << " Pos: " << k << " Value: " << std::abs(data.b[k]) << std::endl;
+                }
+            }
+            std::cout << "Rank " << this->mpi_comm_rank << " Dist.Sum: " << sum << " Max: " << max << std::endl;
+        }
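+        // The barrier below acts as a turnstile: rank r waits until ranks
+        // 0..r-1 have taken their turn, so each rank prints its debug summary
+        // in rank order.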
MPI_Barrier(MPI_COMM_WORLD); + } +#endif +} + public: @@ -238,7 +210,93 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark The input and output data of the benchmark */ std::unique_ptr - generateInputData() override; + generateInputData() override { + int local_matrix_width = this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->torus_width; + int local_matrix_height = this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->torus_height; + + if ((this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_width > 0 || + (this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_height > 0) { + throw std::runtime_error("Global matrix size must be multiple of LCM of PQ grid!"); + } + + auto d = std::unique_ptr(new linpack::LinpackData(*this->executionSettings->context ,local_matrix_width, local_matrix_height)); + std::mt19937 gen(this->mpi_comm_rank); + std::uniform_real_distribution<> dis(0.0, 1.0); + d->norma = 0.0; + d->normb = 0.0; + + + /* + Generate a matrix by using pseudo random number in the range (0,1) + */ + for (int j = 0; j < local_matrix_height; j++) { + // fill a single column of the matrix + for (int i = 0; i < local_matrix_width; i++) { + HOST_DATA_TYPE temp = dis(gen); + d->A[local_matrix_width*j+i] = temp; + d->norma = (temp > d->norma) ? temp : d->norma; + } + } + + + // If the matrix should be diagonally dominant, we need to exchange the sum of the rows with + // the ranks that share blocks in the same column + if (this->executionSettings->programSettings->isDiagonallyDominant) { + // create a communicator to exchange the rows + MPI_Comm row_communicator; + MPI_Comm_split(MPI_COMM_WORLD, this->executionSettings->programSettings->torus_row, 0,&row_communicator); + + // Caclulate the sum for every row and insert in into the matrix + for (int local_matrix_row = 0; local_matrix_row < local_matrix_height; local_matrix_row++) { + int blockSize = this->executionSettings->programSettings->blockSize; + int global_matrix_row = this->executionSettings->programSettings->torus_row * blockSize + (local_matrix_row / blockSize) * blockSize * this->executionSettings->programSettings->torus_height + (local_matrix_row % blockSize); + int local_matrix_col = (global_matrix_row - this->executionSettings->programSettings->torus_col * blockSize) / (blockSize * this->executionSettings->programSettings->torus_width) * blockSize + (global_matrix_row % blockSize); + int diagonal_rank = (global_matrix_row / blockSize) % this->executionSettings->programSettings->torus_width; + bool diagonal_on_this_rank = diagonal_rank == this->executionSettings->programSettings->torus_col; + // set the diagonal elements of the matrix to 0 + if (diagonal_on_this_rank) { + d->A[local_matrix_width*local_matrix_row + local_matrix_col] = 0.0; + } + HOST_DATA_TYPE local_row_sum = 0.0; + for (int i = 0; i < local_matrix_width; i++) { + local_row_sum += d->A[local_matrix_width*local_matrix_row + i]; + } + HOST_DATA_TYPE row_sum = 0.0; + MPI_Reduce(&local_row_sum, &row_sum, 1, MPI_DATA_TYPE, MPI_SUM, diagonal_rank, row_communicator); + // insert row sum into matrix if it contains the diagonal block + if (diagonal_on_this_rank) { + // update norm of local matrix + d->norma = (row_sum > d->norma) ? 
row_sum : d->norma;
+                    d->A[local_matrix_width*local_matrix_row + local_matrix_col] = row_sum;
+                }
+            }
+        }
+
+        // initialize other vectors
+        for (int i = 0; i < local_matrix_width; i++) {
+            d->b[i] = 0.0;
+        }
+        for (int i = 0; i < local_matrix_height; i++) {
+            d->ipvt[i] = i;
+        }
+
+        MPI_Comm col_communicator;
+        MPI_Comm_split(MPI_COMM_WORLD, this->executionSettings->programSettings->torus_col, 0, &col_communicator);
+
+        // Generate vector b by accumulating the columns of the matrix.
+        // This will lead to a result vector x with ones on every position
+        // Every rank will have a valid part of the final b vector stored
+        for (int j = 0; j < local_matrix_width; j++) {
+            HOST_DATA_TYPE local_col_sum = 0.0;
+            for (int i = 0; i < local_matrix_height; i++) {
+                local_col_sum += d->A[local_matrix_width*i+j];
+            }
+            MPI_Allreduce(&local_col_sum, &(d->b[j]), 1, MPI_DATA_TYPE, MPI_SUM, col_communicator);
+            d->normb = (d->b[j] > d->normb) ? d->b[j] : d->normb;
+        }
+        return d;
+    }
+
     /**
      * @brief Linpack specific implementation of the kernel execution
@@ -247,7 +305,19 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark
      * @return Measured runtimes of the kernel execution
      */
     std::unique_ptr<LinpackExecutionTimings>
-    executeKernel(LinpackData &data) override;
+    executeKernel(LinpackData &data) override {
+        std::unique_ptr<LinpackExecutionTimings> timings;
+        switch (this->executionSettings->programSettings->communicationType) {
+            case hpcc_base::CommunicationType::pcie_mpi : timings = execution::pcie::calculate(*this->executionSettings, data); break;
+            case hpcc_base::CommunicationType::intel_external_channels: timings = execution::iec::calculate(*this->executionSettings, data); break;
+            default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(this->executionSettings->programSettings->communicationType));
+        }
+#ifdef DISTRIBUTED_VALIDATION
+        distributed_gesl_nopvt_ref(data);
+#endif
+        return timings;
+    }
+
     /**
      * @brief Linpack specific implementation of the execution validation
@@ -257,7 +327,144 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark
      * @return true If the validation is successful
      * @return false otherwise
      */
     bool
-    validateOutput(linpack::LinpackData &data) override;
+    validateOutput(linpack::LinpackData &data) override {
+    uint n = this->executionSettings->programSettings->matrixSize;
+    uint matrix_width = data.matrix_width;
+    uint matrix_height = data.matrix_height;
+    double residn;
+    double resid = 0.0;
+    double normx = 0.0;
+#ifndef DISTRIBUTED_VALIDATION
+    if (this->mpi_comm_rank > 0) {
+        for (int j = 0; j < matrix_height; j++) {
+            for (int i = 0; i < matrix_width; i += this->executionSettings->programSettings->blockSize) {
+                MPI_Send(&data.A[matrix_width * j + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
+            }
+        }
+        if (this->executionSettings->programSettings->torus_row == 0) {
+            for (int i = 0; i < matrix_width; i += this->executionSettings->programSettings->blockSize) {
+                MPI_Send(&data.b[i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
+            }
+        }
+        residn = 0;
+    }
+    else {
+        MPI_Status status;
+        size_t current_offset = 0;
+        std::vector<HOST_DATA_TYPE> total_b_original(n);
+        std::vector<HOST_DATA_TYPE> total_b(n);
+        std::vector<HOST_DATA_TYPE> total_a(n*n);
+        for (int j = 0; j < n; j++) {
+            for (int i = 0; i < n; i += this->executionSettings->programSettings->blockSize) {
+                int recvcol = (i / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_width;
+                int recvrow = (j / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_height;
+                int recvrank = this->executionSettings->programSettings->torus_width * recvrow + recvcol;
+                if (recvrank > 0) {
+                    MPI_Recv(&total_a[j * n + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 0, MPI_COMM_WORLD, &status);
+                }
+                else {
+                    for (int k = 0; k < this->executionSettings->programSettings->blockSize; k++) {
+                        total_a[j * n + i + k] = data.A[current_offset + k];
+                    }
+                    current_offset += this->executionSettings->programSettings->blockSize;
+                }
+            }
+        }
+        current_offset = 0;
+        for (int i = 0; i < n; i += this->executionSettings->programSettings->blockSize) {
+            int recvcol = (i / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_width;
+            if (recvcol > 0) {
+                MPI_Recv(&total_b[i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvcol, 0, MPI_COMM_WORLD, &status);
+            }
+            else {
+                for (int k = 0; k < this->executionSettings->programSettings->blockSize; k++) {
+                    total_b[i + k] = data.b[current_offset + k];
+                }
+                current_offset += this->executionSettings->programSettings->blockSize;
+            }
+        }
+
+        std::copy(total_b.begin(), total_b.end(), total_b_original.begin());
+        gesl_ref_nopvt(total_a.data(), total_b.data(), n, n);
+
+        for (int i = 0; i < n; i++) {
+            resid = (resid > std::abs(total_b[i] - 1)) ? resid : std::abs(total_b[i] - 1);
+            normx = (normx > std::abs(total_b_original[i])) ? normx : std::abs(total_b_original[i]);
+        }
+    }
+#else
+    double local_resid = 0;
+    double local_normx = data.normb;
+    #pragma omp parallel for reduction(max:local_resid)
+    for (int i = 0; i < data.matrix_width; i++) {
+        local_resid = (local_resid > std::abs(data.b[i] - 1)) ? local_resid : std::abs(data.b[i] - 1);
+    }
+#ifndef NDEBUG
+    std::cout << "Rank " << this->mpi_comm_rank << ": resid=" << local_resid << ", normx=" << local_normx << std::endl;
+#endif
+
+    MPI_Reduce(&local_resid, &resid, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&local_normx, &normx, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+#endif
+
+    HOST_DATA_TYPE eps = std::numeric_limits<HOST_DATA_TYPE>::epsilon();
+    residn = resid / (static_cast<double>(n)*normx*eps);
+
+    #ifndef NDEBUG
+    if (residn > 1 && this->mpi_comm_size == 1) {
+        auto ref_result = generateInputData();
+        // Print the reference input matrix
+        for (int j = 0; j < n; j++) {
+            for (int i = 0; i < n; i++) {
+                std::cout << ref_result->A[n * j + i] << ", ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+        // Print the matrix returned by the FPGA
+        for (int j = 0; j < n; j++) {
+            for (int i = 0; i < n; i++) {
+                std::cout << data.A[n * j + i] << ", ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+        if (this->executionSettings->programSettings->isDiagonallyDominant) {
+            linpack::gefa_ref_nopvt(ref_result->A, n, n);
+            linpack::gesl_ref_nopvt(ref_result->A, ref_result->b, n, n);
+        }
+        else {
+            linpack::gefa_ref(ref_result->A, n, n, ref_result->ipvt);
+            linpack::gesl_ref(ref_result->A, ref_result->b, ref_result->ipvt, n, n);
+        }
+        // Print the element-wise difference to the reference factorization
+        for (int j = 0; j < n; j++) {
+            for (int i = 0; i < n; i++) {
+                std::cout << std::abs(ref_result->A[n * j + i] - data.A[n * j + i]) << ", ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+    }
+    #endif
+
+    if (this->mpi_comm_rank == 0) {
+        //std::cout << resid << ", " << norma << ", " << normx << std::endl;
+        std::cout << " norm.
resid resid "\ + "machep " << std::endl; + std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE) + << resid << std::setw(ENTRY_SPACE) << eps << std::endl; + return residn < 1; + } + else { + return true; + } +} + /** * @brief Linpack specific implementation of printing the execution results @@ -265,7 +472,75 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark::max(); + double lu_min = std::numeric_limits::max(); + double sl_min = std::numeric_limits::max(); + +#ifndef NDEBUG + std::cout << "Rank " << this->mpi_comm_rank << ": Result collection started" << std::endl; +#endif + + std::vector global_lu_times(output.gefaTimings.size()); + MPI_Reduce(output.gefaTimings.data(), global_lu_times.data(), output.gefaTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + std::vector global_sl_times(output.geslTimings.size()); + MPI_Reduce(output.geslTimings.data(), global_sl_times.data(), output.geslTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); +#ifndef NDEBUG + std::cout << "Rank " << this->mpi_comm_rank << ": Result collection done" << std::endl; +#endif + + + if (this->mpi_comm_rank > 0) { + // Only the master rank needs to calculate and print result + return; + } + + double total_matrix_size = static_cast(this->executionSettings->programSettings->matrixSize); + double gflops_lu = ((2.0e0*total_matrix_size * total_matrix_size * total_matrix_size)/ 3.0) / 1.0e9; + double gflops_sl = (2.0*(total_matrix_size * total_matrix_size))/1.0e9; + for (int i =0; i < global_lu_times.size(); i++) { + double currentTime = global_lu_times[i] + global_sl_times[i]; + tmean += currentTime; + tlumean += global_lu_times[i]; + tslmean += global_sl_times[i]; + if (currentTime < tmin) { + tmin = currentTime; + } + if (global_lu_times[i] < lu_min) { + lu_min = global_lu_times[i]; + } + if (global_sl_times[i] < sl_min) { + sl_min = global_sl_times[i]; + } + } + tmean = tmean / global_lu_times.size(); + tlumean = tlumean / global_lu_times.size(); + tslmean = tslmean / global_sl_times.size(); + + std::cout << std::setw(ENTRY_SPACE) + << "Method" << std::setw(ENTRY_SPACE) + << "best" << std::setw(ENTRY_SPACE) << "mean" + << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; + + std::cout << std::setw(ENTRY_SPACE) << "total" << std::setw(ENTRY_SPACE) + << tmin << std::setw(ENTRY_SPACE) << tmean + << std::setw(ENTRY_SPACE) << ((gflops_lu + gflops_sl) / tmin) + << std::endl; + + std::cout << std::setw(ENTRY_SPACE) << "GEFA" << std::setw(ENTRY_SPACE) + << lu_min << std::setw(ENTRY_SPACE) << tlumean + << std::setw(ENTRY_SPACE) << ((gflops_lu) / lu_min) + << std::endl; + + std::cout << std::setw(ENTRY_SPACE) << "GESL" << std::setw(ENTRY_SPACE) + << sl_min << std::setw(ENTRY_SPACE) << tslmean + << std::setw(ENTRY_SPACE) << (gflops_sl / sl_min) + << std::endl; +} /** * @brief Construct a new Linpack Benchmark object @@ -273,7 +548,9 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark(argc, argv) { + this->setupBenchmark(argc, argv); + } /** * @brief Construct a new Linpack Benchmark object @@ -282,69 +559,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark=n -@param ipvt array of pivoting indices - -*/ -void gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt); - -/** -Solve linear equations using its LU decomposition. 
-Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU -where A is a matrix of size n*n - -@param a the matrix a in LU representation calculated by gefa call -@param b vector b of the given equation -@param ipvt vector containing pivoting information -@param n size of matrix A -@param lda row with of the matrix. must be >=n - -*/ -void gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda); - -/** -Gaussian elemination reference implementation without pivoting. -Can be used in exchange with kernel functions for functionality testing - -@param a the matrix with size of n*n -@param n size of matrix A -@param lda row with of the matrix. must be >=n - -*/ -void gefa_ref_nopvt(HOST_DATA_TYPE* a, unsigned n, unsigned lda); - -/** -Solve linear equations using its LU decomposition without pivoting. -Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU -where A is a matrix of size n*n - -@param a the matrix a in LU representation calculated by gefa call -@param b vector b of the given equation -@param n size of matrix A -@param lda row with of the matrix. must be >=n - -*/ -void gesl_ref_nopvt(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, unsigned n, unsigned lda); - -} // namespace stream +} // namespace linpack #endif // SRC_HOST_STREAM_BENCHMARK_H_ diff --git a/LINPACK/src/host/linpack_data.cpp b/LINPACK/src/host/linpack_data.cpp new file mode 100644 index 00000000..951c37c2 --- /dev/null +++ b/LINPACK/src/host/linpack_data.cpp @@ -0,0 +1,259 @@ +// +// Created by Marius Meyer on 04.12.19. +// + +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/
+
+#include "linpack_benchmark.hpp"
+
+/* C++ standard library headers */
+#include
+#include
+
+/* Project's headers */
+#include "communication_types.hpp"
+#include "execution_types/execution_types.hpp"
+#include "parameters.h"
+
+linpack::LinpackProgramSettings::LinpackProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results),
+    matrixSize(results["m"].as<uint>() * (1 << (results["b"].as<uint>()))), blockSize(1 << (results["b"].as<uint>())),
+    isEmulationKernel(results.count("emulation") > 0), isDiagonallyDominant(results.count("uniform") == 0),
+    torus_width(results["p"].as<uint>()) {
+    int mpi_comm_rank;
+    int mpi_comm_size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size);
+    // calculate the row and column of the MPI rank in the torus
+    if (mpi_comm_size % torus_width != 0) {
+        throw std::runtime_error("MPI size not dividable by P=" + std::to_string(torus_width) + "!");
+    }
+    torus_height = mpi_comm_size / torus_width;
+    torus_row = (mpi_comm_rank / torus_width);
+    torus_col = (mpi_comm_rank % torus_width);
+}
+
+std::map<std::string, std::string>
+linpack::LinpackProgramSettings::getSettingsMap() {
+    auto map = hpcc_base::BaseSettings::getSettingsMap();
+    map["Matrix Size"] = std::to_string(matrixSize);
+    map["Block Size"] = std::to_string(blockSize);
+    map["Emulate"] = (isEmulationKernel) ? "Yes" : "No";
+    map["Data Type"] = STR(HOST_DATA_TYPE);
+    map["FPGA Torus"] = "P=" + std::to_string(torus_width) + ", Q=" + std::to_string(torus_height);
+    return map;
+}
+
+linpack::LinpackData::LinpackData(cl::Context context, size_t width, size_t height) : norma(0.0), context(context),
+    matrix_width(width), matrix_height(height) {
+#ifdef USE_SVM
+    A = reinterpret_cast<HOST_DATA_TYPE*>(
+                        clSVMAlloc(context(), 0 ,
+                        width * height * sizeof(HOST_DATA_TYPE), 1024));
+    b = reinterpret_cast<HOST_DATA_TYPE*>(
+                        clSVMAlloc(context(), 0 ,
+                        width * sizeof(HOST_DATA_TYPE), 1024));
+    ipvt = reinterpret_cast<cl_int*>(
+                        clSVMAlloc(context(), 0 ,
+                        height * sizeof(cl_int), 1024));
+#else
+    posix_memalign(reinterpret_cast<void**>(&A), 4096, width * height * sizeof(HOST_DATA_TYPE));
+    posix_memalign(reinterpret_cast<void**>(&b), 4096, width * sizeof(HOST_DATA_TYPE));
+    posix_memalign(reinterpret_cast<void**>(&ipvt), 4096, height * sizeof(cl_int));
+#endif
+}
+
+linpack::LinpackData::~LinpackData() {
+#ifdef USE_SVM
+    clSVMFree(context(), reinterpret_cast<void*>(A));
+    clSVMFree(context(), reinterpret_cast<void*>(b));
+    clSVMFree(context(), reinterpret_cast<void*>(ipvt));
+#else
+    free(A);
+    free(b);
+    free(ipvt);
+#endif
+}
+
+/**
+Standard LU factorization on a block with fixed size
+
+Case 1 of Zhang's description
+*/
+void
+linpack::gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt) {
+    for (int i = 0; i < n; i++) {
+        ipvt[i] = i;
+    }
+    // For each diagonal element
+    for (int k = 0; k < n - 1; k++) {
+        HOST_DATA_TYPE max_val = fabs(a[k * lda + k]);
+        int pvt_index = k;
+        for (int i = k + 1; i < n; i++) {
+            if (max_val < fabs(a[k * lda + i])) {
+                pvt_index = i;
+                max_val = fabs(a[k * lda + i]);
+            }
+        }
+
+        for (int i = k; i < n; i++) {
+            HOST_DATA_TYPE tmp_val = a[i * lda + k];
+            a[i * lda + k] = a[i * lda + pvt_index];
+            a[i * lda + pvt_index] = tmp_val;
+        }
+        ipvt[k] = pvt_index;
+
+        // For each element below it
+        for (int i = k + 1; i < n; i++) {
+            a[k * lda + i] *= -1.0 / a[k * lda + k];
+        }
+        // For each column right of current diagonal element
+        for (int j = k + 1; j < n; j++) {
+            // For each element below it
+            for (int i = k+1; i < n; i++) {
+                a[j * lda + i] += a[k * lda + i] * a[j * lda + k];
+            }
+        }
+
+#ifdef DEBUG
+        std::cout << "A(k=" << k << "): " << std::endl;
+        for (int i= 0; i < n; i++) {
+            for (int j=0; j < n; j++) {
+                std::cout << a[i*lda + j] << ", ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+#endif
+
+    }
+}
+
+void
+linpack::gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda) {
+    auto b_tmp = new HOST_DATA_TYPE[n];
+    {
+        for (int k = 0; k < n; k++) {
+            b_tmp[k] = b[k];
+        }
+
+        // solve l*y = b
+        // For each row in matrix
+        for (int k = 0; k < n - 1; k++) {
+            if (ipvt[k] != k) {
+                HOST_DATA_TYPE tmp = b_tmp[k];
+                b_tmp[k] = b_tmp[ipvt[k]];
+                b_tmp[ipvt[k]] = tmp;
+            }
+            // For each row below add
+            for (int i = k + 1; i < n; i++) {
+                // add solved upper row to current row
+                b_tmp[i] += b_tmp[k] * a[lda * k + i];
+            }
+        }
+
+        // now solve u*x = y
+        for (int k = n - 1; k >= 0; k--) {
+            b_tmp[k] = b_tmp[k] / a[lda * k + k];
+            for (int i = 0; i < k; i++) {
+                b_tmp[i] -= b_tmp[k] * a[lda * k + i];
+            }
+        }
+        for (int k = 0; k < n; k++) {
+            b[k] = b_tmp[k];
+        }
+    }
+    delete [] b_tmp;
+}
+
+void linpack::dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m, bool transposed) {
+    for (int i=0; i < n1; i++) {
+        for (int j=0; j < n2; j++) {
+            y[i] = y[i] + x[j] * (transposed ? m[ldm*i + j] : m[ldm*j + i]);
+        }
+    }
+}
+
+void
+linpack::gefa_ref_nopvt(HOST_DATA_TYPE* a, unsigned n, unsigned lda) {
+    // For each diagonal element
+    for (int k = 0; k < n; k++) {
+        // Store negative inverse of diagonal elements to get rid of some divisions afterwards!
+        a[k * lda + k] = -1.0 / a[k * lda + k];
+        // For each element below it
+        for (int i = k + 1; i < n; i++) {
+            a[k * lda + i] *= a[k * lda + k];
+        }
+        // For each column right of current diagonal element
+        for (int j = k + 1; j < n; j++) {
+            // For each element below it
+            for (int i = k+1; i < n; i++) {
+                a[j * lda + i] += a[k * lda + i] * a[j * lda + k];
+            }
+        }
+
+#ifdef DEBUG
+        std::cout << "A(k=" << k << "): " << std::endl;
+        for (int i= 0; i < n; i++) {
+            for (int j=0; j < n; j++) {
+                std::cout << a[i*lda + j] << ", ";
+            }
+            std::cout << std::endl;
+        }
+        std::cout << std::endl;
+#endif
+
+    }
+}
+
+
+void
+linpack::gesl_ref_nopvt(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, unsigned n, unsigned lda) {
+    auto b_tmp = new HOST_DATA_TYPE[n];
+
+    for (int k = 0; k < n; k++) {
+        b_tmp[k] = b[k];
+    }
+
+    // solve l*y = b
+    // For each row in matrix
+    for (int k = 0; k < n - 1; k++) {
+        // For each row below add
+        for (int i = k + 1; i < n; i++) {
+            // add solved upper row to current row
+            b_tmp[i] += b_tmp[k] * a[lda * k + i];
+        }
+    }
+
+    // now solve u*x = y
+    for (int k = n - 1; k >= 0; k--) {
+        HOST_DATA_TYPE scale = b_tmp[k] * a[lda * k + k];
+        b_tmp[k] = -scale;
+        for (int i = 0; i < k; i++) {
+            b_tmp[i] += scale * a[lda * k + i];
+        }
+    }
+    for (int k = 0; k < n; k++) {
+        b[k] = b_tmp[k];
+    }
+    delete [] b_tmp;
+}
diff --git a/LINPACK/src/host/linpack_data.hpp b/LINPACK/src/host/linpack_data.hpp
new file mode 100644
index 00000000..51324a5c
--- /dev/null
+++ b/LINPACK/src/host/linpack_data.hpp
@@ -0,0 +1,274 @@
+/*
+Copyright (c) 2022 Marius Meyer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above
copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef SRC_HOST_LINPACK_DATA_H_ +#define SRC_HOST_LINPACK_DATA_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "hpcc_benchmark.hpp" +#include "parameters.h" +extern "C" { + #include "gmres.h" +} + +/** + * @brief Contains all classes and methods needed by the LINPACK benchmark + * + */ +namespace linpack { + +/** + * @brief The Linpack specific program settings + * + */ +class LinpackProgramSettings : public hpcc_base::BaseSettings { + +public: + /** + * @brief The size of the local matrix in number of blocks in one dimension + * + */ + uint matrixSize; + + /** + * @brief Size of a single block of the matrix in values in one dimension + * + */ + uint blockSize; + + /** + * @brief Indicates if the generated input matrix should be diagonally dominant + * + */ + bool isDiagonallyDominant; + + /** + * @brief True, if the used kernel is an emulation kernel. Different kernel arguments may be used in this case to + * simulate persistent local memory. + * + */ + bool isEmulationKernel; + + /** + * @brief The row position of this MPI rank in the torus + * + */ + int torus_row; + + /** + * @brief The rcolumn position of this MPI rank in the torus + * + */ + int torus_col; + + /** + * @brief Width of the torus in number of ranks + * + */ + int torus_width; + + /** + * @brief Height of the FPGA torus in number of ranks + * + */ + int torus_height; + + /** + * @brief Construct a new Linpack Program Settings object + * + * @param results the result map from parsing the program input parameters + */ + LinpackProgramSettings(cxxopts::ParseResult &results); + + /** + * @brief Get a map of the settings. This map will be used to print the final configuration. + * + * @return a map of program parameters. keys are the name of the parameter. 
+     */
+    std::map<std::string, std::string> getSettingsMap() override;
+
+};
+
+/**
+ * @brief Data class containing the data the kernel is executed with
+ *
+ */
+class LinpackData {
+
+public:
+
+    /**
+     * @brief The input matrix representing the left side of the linear equation system
+     *
+     */
+    HOST_DATA_TYPE *A;
+
+    /**
+     * @brief The input vector the right side of the linear equation system
+     *
+     */
+    HOST_DATA_TYPE *b;
+
+    /**
+     * @brief A vector that can be used to store pivoting information
+     *
+     */
+    cl_int* ipvt;
+
+    /**
+     * @brief Width of the local matrix in values
+     *
+     */
+    size_t matrix_width;
+
+    /**
+     * @brief Height of the local matrix in values
+     *
+     */
+    size_t matrix_height;
+
+    /**
+     * @brief The context that is used to allocate memory in SVM mode
+     *
+     */
+    cl::Context context;
+
+    /**
+     * @brief The maximum value of A that will be used for the error calculation
+     *
+     */
+    HOST_DATA_TYPE norma;
+
+    /**
+     * @brief The maximum value of b that will be used for the error calculation
+     *
+     */
+    HOST_DATA_TYPE normb;
+
+    /**
+     * @brief Construct a new Linpack Data object
+     *
+     * @param context The OpenCL context used to allocate memory in SVM mode
+     * @param width width of the local matrix in values
+     * @param height height of the local matrix in values
+     */
+    LinpackData(cl::Context context, size_t width, size_t height);
+
+    /**
+     * @brief Destroy the Linpack Data object. Free the allocated memory
+     *
+     */
+    ~LinpackData();
+
+};
+
+/**
+ * @brief Measured execution timing from the kernel execution
+ *
+ */
+class LinpackExecutionTimings {
+public:
+    /**
+     * @brief A vector containing the timings for all repetitions for the kernel execution for the gefa kernel
+     *
+     */
+    std::vector<double> gefaTimings;
+
+    /**
+     * @brief A vector containing the timings for all repetitions for the kernel execution for the gesl kernel
+     *
+     */
+    std::vector<double> geslTimings;
+
+
+};
+
+/**
+ *
+ *
+ * @param n1
+ * @param y
+ * @param n2
+ * @param ldm
+ * @param x
+ * @param m
+ */
+void dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m, bool transposed);
+
+/**
+Gaussian elimination reference implementation with partial pivoting.
+Can be used in exchange with kernel functions for functionality testing
+
+@param a the matrix with size of n*n
+@param n size of matrix A
+@param lda row width of the matrix. must be >=n
+@param ipvt array of pivoting indices
+
+*/
+void gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt);
+
+/**
+Solve linear equations using its LU decomposition.
+Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU
+where A is a matrix of size n*n
+
+@param a the matrix a in LU representation calculated by gefa call
+@param b vector b of the given equation
+@param ipvt vector containing pivoting information
+@param n size of matrix A
+@param lda row width of the matrix. must be >=n
+
+*/
+void gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda);
+
+/**
+Gaussian elimination reference implementation without pivoting.
+Can be used in exchange with kernel functions for functionality testing
+
+@param a the matrix with size of n*n
+@param n size of matrix A
+@param lda row width of the matrix. must be >=n
+
+*/
+void gefa_ref_nopvt(HOST_DATA_TYPE* a, unsigned n, unsigned lda);
+
+/**
+Solve linear equations using its LU decomposition without pivoting.
+Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU +where A is a matrix of size n*n + +@param a the matrix a in LU representation calculated by gefa call +@param b vector b of the given equation +@param n size of matrix A +@param lda row with of the matrix. must be >=n + +*/ +void gesl_ref_nopvt(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, unsigned n, unsigned lda); + + +} +#endif // SRC_HOST_LINPACK_DATA_H__ diff --git a/LINPACK/src/host/main.cpp b/LINPACK/src/host/main.cpp index d05a7319..73dcc570 100644 --- a/LINPACK/src/host/main.cpp +++ b/LINPACK/src/host/main.cpp @@ -12,7 +12,7 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark - LinpackBenchmark bm(argc, argv); + LinpackBenchmark bm(argc, argv); bool success = bm.executeBenchmark(); if (success) { return 0; From b8ecac71abd00e90bffb174d5fbf02a546b7a8c7 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 28 Apr 2022 19:02:16 +0100 Subject: [PATCH 060/318] First compilable version HPL ACCL. No communicators! --- .../execution_accl_buffers.hpp | 466 ++++++++++++++++++ .../host/execution_types/execution_iec.hpp | 3 +- .../host/execution_types/execution_pcie.hpp | 3 +- .../host/execution_types/execution_types.hpp | 8 +- LINPACK/src/host/linpack_benchmark.hpp | 5 + LINPACK/src/host/linpack_data.cpp | 3 +- LINPACK/src/host/main.cpp | 5 + 7 files changed, 486 insertions(+), 7 deletions(-) create mode 100644 LINPACK/src/host/execution_types/execution_accl_buffers.hpp diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp new file mode 100644 index 00000000..4a2a7907 --- /dev/null +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -0,0 +1,466 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef EXECUTION_TYPES_EXECUTION_ACCL_BUFFERS_HPP +#define EXECUTION_TYPES_EXECUTION_ACCL_BUFFERS_HPP + +/* C++ standard library headers */ +#include +#include +#include +#include +#include +#include + +/* External library headers */ +#ifdef _OPENMP +#include "omp.h" +#endif + +#include "linpack_data.hpp" +#include "parameters.h" + +namespace linpack { +namespace execution { +namespace accl_buffers { + +/* + Prepare kernels and execute benchmark + + @copydoc bm_execution::calculate() +*/ +std::unique_ptr calculate( + const hpcc_base::ExecutionSettings &config, + linpack::LinpackData &data) { + + cl_int err; + + int num_omp_threads = 1; +#ifdef _OPENMP + num_omp_threads = omp_get_num_threads(); +#endif + + uint blocks_per_row = data.matrix_width / config.programSettings->blockSize; + uint blocks_per_col = data.matrix_height / config.programSettings->blockSize; + + // TODO: Allow to handle Communicators in ACCL! + // // Communicate with all ranks in the same row of the torus + // // Configure ACCL Communicators + + // // Create Ranks. This must be the same configuration as used for + // // the global communicator! + // std::vector all_accl_ranks = {}; + // for (int i = 0; i < config.programSettings->torus_width * config.programSettings->torus_; ++i) { + // // TODO: Replace the ip addresses and ports here for execution of real hardware? + // ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, 1024}; + // all_accl_ranks.emplace_back(new_rank); + // } + + // std::vector row_ranks; + // std::vector col_ranks; + + // for (int i = 0; i < config.programSettings->torus_width; i++) { + // row_ranks.push_back(all_accl_ranks[i]); + // } + // for (int i = config.programSettings->torus_col; i < all_accl_ranks.size(); + // i += config.programSettings->torus_width) { + // col_ranks.push_back(all_accl_ranks[config.programSettings->torus_row * + // config.programSettings->torus_width + + // i]); + // } + + // // Row communicator should now be index 1 + // config.accl->configure_communicator(row_ranks, + // config.programSettings->torus_col); + // // Column communicator should now be index 2 + // config.accl->configure_communicator(col_ranks, + // config.programSettings->torus_row); + + // TODO: Select the correct memory groups! + // Create Buffers for input and output + // TODO: Need to set a memory group for the buffers here! 
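+  // The trailing 0 in the xrt::bo constructors below is the memory bank /
+  // group index that the TODO above refers to. A minimal sketch of deriving
+  // it from the kernel interface instead of hard-coding it (assuming the
+  // matrix buffer is argument 0 of the "lu" kernel in this bitstream):
+  //
+  //   auto lu = xrt::kernel(*config.device, *config.program, "lu");
+  //   xrt::bo Buffer_a(*config.device, data.A,
+  //                    sizeof(HOST_DATA_TYPE) * data.matrix_height * data.matrix_width,
+  //                    lu.group_id(0));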
+ xrt::bo Buffer_a( + *config.device, data.A, + sizeof(HOST_DATA_TYPE) * data.matrix_height * data.matrix_width, 0); + xrt::bo Buffer_b(*config.device, data.b, + sizeof(HOST_DATA_TYPE) * data.matrix_width, 0); + xrt::bo Buffer_pivot(*config.device, data.ipvt, + sizeof(cl_int) * data.matrix_height, 0); + + /* --- Setup MPI communication and required additional buffers --- */ + + // Buffers only used to store data received over the network layer + // The content will not be modified by the host + auto Buffer_lu1 = config.accl->create_buffer( + (config.programSettings->blockSize) * (config.programSettings->blockSize), + ACCL::dataType::float32, 1); + auto Buffer_lu2 = config.accl->create_buffer( + (config.programSettings->blockSize) * (config.programSettings->blockSize), + ACCL::dataType::float32, 1); + + std::vector>> Buffer_left_list; + std::vector>> Buffer_top_list; + + // Create two sets of communication buffers to allow overlap of communication + // and matrix multiplications + for (int rep = 0; rep < 2; rep++) { + Buffer_left_list.emplace_back(); + Buffer_top_list.emplace_back(); + for (int i = 0; i < blocks_per_row; i++) { + Buffer_top_list.back().push_back(config.accl->create_buffer( + config.programSettings->blockSize * + (config.programSettings->blockSize), + ACCL::dataType::float32, 1)); + } + + for (int i = 0; i < blocks_per_col; i++) { + Buffer_left_list.back().push_back(config.accl->create_buffer( + config.programSettings->blockSize * + (config.programSettings->blockSize), + ACCL::dataType::float32, 1)); + } + } + + /* --- Execute actual benchmark kernels --- */ + + double t; + std::vector gefaExecutionTimes; + std::vector geslExecutionTimes; + std::vector gefaWaitTimes; + for (int i = 0; i < config.programSettings->numRepetitions; i++) { + + Buffer_a.sync(XCL_BO_SYNC_BO_TO_DEVICE); + Buffer_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // Command queues + // A new command queue is created for every iteration of the algorithm to + // reduce the overhead of too large queues + std::vector inner_mms; + std::thread flush_thread; + + std::chrono::time_point t1, t2, twait1, + twait2; + std::chrono::duration currentwaittime = + std::chrono::duration::zero(); + + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << "Start! " << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + t1 = std::chrono::high_resolution_clock::now(); + + int kernel_offset = 0; +#pragma omp parallel + { + +#pragma omp single + uint current_replication = 0; + + // For every row of blocks create kernels and enqueue them + for (int block_row = 0; block_row < config.programSettings->matrixSize / + config.programSettings->blockSize; + block_row++) { + + int local_block_row_remainder = + (block_row % config.programSettings->torus_height); + int local_block_row = + (block_row / config.programSettings->torus_height); + int local_block_col_remainder = + (block_row % config.programSettings->torus_width); + int local_block_col = (block_row / config.programSettings->torus_width); + bool in_same_row_as_lu = + local_block_row_remainder == config.programSettings->torus_row; + bool in_same_col_as_lu = + local_block_col_remainder == config.programSettings->torus_col; + int start_row_index = + local_block_row + + ((local_block_row_remainder >= config.programSettings->torus_row) + ? 1 + : 0); + int start_col_index = + local_block_col + + ((local_block_col_remainder >= config.programSettings->torus_col) + ? 1 + : 0); + int num_left_blocks = + (in_same_col_as_lu) ? 
blocks_per_col - start_row_index : 0; + int num_top_blocks = + (in_same_row_as_lu) ? blocks_per_row - start_col_index : 0; + int num_inner_block_rows = (blocks_per_col - start_row_index); + int num_inner_block_cols = + (num_inner_block_rows > 0) ? (blocks_per_row - start_col_index) : 0; + num_inner_block_rows = + (num_inner_block_cols > 0) ? num_inner_block_rows : 0; + bool is_calulating_lu_block = (in_same_col_as_lu && in_same_row_as_lu); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col + << " Start iteration " << block_row << std::endl; +#endif + + uint total_inner_updates_first_row = num_inner_block_cols; + uint updates_per_replication = + total_inner_updates_first_row / + config.programSettings->kernelReplications; + uint total_inner_updates = + (num_inner_block_cols - 1) * (num_inner_block_rows - 1); + uint total_updates_per_replication = + total_inner_updates / config.programSettings->kernelReplications; + uint current_update = 0; + + std::vector comm_kernel_runs; + +#pragma omp single + { + + if (is_calulating_lu_block) { + // create the LU kernel + auto lu_kernel = xrt::kernel(*config.device, *config.program, "lu"); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " LU " + << local_block_row << "," << local_block_col << std::endl; +#endif + auto lu_run = + lu_kernel(Buffer_a, Buffer_lu1, Buffer_lu2, local_block_col, + local_block_row, blocks_per_row); + lu_run.wait(); + } + + // Exchange LU blocks on all ranks to prevent stalls in MPI broadcast + // All tasks until now need to be executed so we can use the result of + // the LU factorization and communicate it via MPI with the other + // FPGAs + + // Broadcast LU block in column to update all left blocks + config.accl->bcast(2, *Buffer_lu2, + config.programSettings->blockSize * + config.programSettings->blockSize, + local_block_row_remainder, true, true); + // Broadcast LU block in row to update all top blocks + config.accl->bcast(1, *Buffer_lu2, + config.programSettings->blockSize * + config.programSettings->blockSize, + local_block_col_remainder, true, true); + } + + if (num_top_blocks > 0) { + +// Create top kernels +#pragma omp for + for (int tops = start_col_index; tops < blocks_per_row; tops++) { + xrt::kernel k(*config.device, *config.program, "top_update"); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Top " + << local_block_row << "," << tops << std::endl; +#endif + + comm_kernel_runs.push_back( + k(Buffer_a, + Buffer_top_list[block_row % 2][tops - start_col_index], + Buffer_lu1, (tops == start_col_index), tops, local_block_row, + blocks_per_row)); + } + } + if (num_left_blocks > 0) { + +// Create left kernels +#pragma omp for + for (int tops = start_row_index; tops < blocks_per_col; tops++) { + xrt::kernel k(*config.device, *config.program, "left_update"); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Left " << tops + << "," << local_block_col << std::endl; +#endif + comm_kernel_runs.push_back( + k(Buffer_a, + Buffer_left_list[block_row % 2][tops - start_row_index], + Buffer_lu2, (tops == start_row_index), local_block_col, tops, + blocks_per_row)); + } + } + +#pragma omp single + { + // Wait until all top and left blocks are calculated + std::for_each(comm_kernel_runs.begin(), comm_kernel_runs.end(), + 
[](xrt::run &e) { e.wait(); }); + + // Send the left and top blocks to all other ranks so they can be used + // to update all inner blocks + for (int lbi = 0; + lbi < + std::max(static_cast(blocks_per_col - local_block_col), 0); + lbi++) { + config.accl->bcast(1, *Buffer_left_list[block_row % 2][lbi], + config.programSettings->blockSize * + config.programSettings->blockSize, + local_block_col_remainder, true, true); + } + for (int tbi = 0; + tbi < + std::max(static_cast(blocks_per_row - local_block_row), 0); + tbi++) { + config.accl->bcast(2, *Buffer_top_list[block_row % 2][tbi], + config.programSettings->blockSize * + config.programSettings->blockSize, + local_block_row_remainder, true, true); + } + + // update all remaining inner blocks using only global memory + } + + std::vector outer_mms; + + // Wait for previous inner MMs to complete. + // They may need to be reused by the next outer MM calls! + std::for_each(inner_mms.begin(), inner_mms.end(), + [](xrt::run &e) { e.wait(); }); + +#pragma omp for + for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { + + // select the matrix multiplication kernel that should be used for + // this block updated + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols); + int block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows + lbi); + + outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], + Buffer_top_list[block_row % 2][0], block_col, + block_row, blocks_per_row)); + } + +#pragma omp for + for (int tbi = 0; tbi < num_inner_block_cols; tbi++) { + + // select the matrix multiplication kernel that should be used for + // this block updated + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols + tbi); + int block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows); + + outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][0], + Buffer_top_list[block_row % 2][tbi], block_col, + block_row, blocks_per_row)); + } + + // Clear inner MM runs vector for this iteration + // All runs have completed before scheduling the outer MMs + inner_mms.clear(); + +#pragma omp for collapse(2) schedule(static) + for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { + for (int tbi = 1; tbi < num_inner_block_cols; tbi++) { + // select the matrix multiplication kernel that should be used for + // this block updated + + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols + tbi); + int block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows + lbi); + + inner_mms.push_back(k(Buffer_a, + Buffer_left_list[block_row % 2][lbi], + Buffer_top_list[block_row % 2][tbi], + block_col, block_row, blocks_per_row)); + } + } + +#ifndef NDEBUG + MPI_Barrier(MPI_COMM_WORLD); + if (is_calulating_lu_block) + std::cout << "---------------" << std::endl; +#endif + + // Wait for all outer MMs to complete because the results are required + // by the next communication phase + std::for_each(outer_mms.begin(), outer_mms.end(), + [](xrt::run &e) { e.wait(); }); + } + } + +#ifdef NDEBUG + t2 = 
        std::chrono::high_resolution_clock::now();
+    std::cout << "Torus " << config.programSettings->torus_row << ","
+              << config.programSettings->torus_col << "End! " << std::endl;
+#endif
+
+#ifndef NDEBUG
+    std::cout << "Torus " << config.programSettings->torus_row << ","
+              << config.programSettings->torus_col
+              << "Wait time: " << currentwaittime.count() << "s" << std::endl;
+    std::cout << "Torus " << config.programSettings->torus_row << ","
+              << config.programSettings->torus_col << " Exit " << i
+              << std::endl;
+#endif
+
+    std::chrono::duration<double> timespan =
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
+    gefaExecutionTimes.push_back(timespan.count());
+
+    // Execute GESL
+    t1 = std::chrono::high_resolution_clock::now();
+    t2 = std::chrono::high_resolution_clock::now();
+    timespan =
+        std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
+    geslExecutionTimes.push_back(timespan.count());
+  }
+
+  /* --- Read back results from Device --- */
+
+  Buffer_a.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  if (!config.programSettings->isDiagonallyDominant) {
+    Buffer_pivot.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  }
+
+  std::unique_ptr<linpack::LinpackExecutionTimings> results(
+      new linpack::LinpackExecutionTimings{gefaExecutionTimes,
+                                           geslExecutionTimes});
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  return results;
+}
+
+} // namespace accl_buffers
+} // namespace execution
+} // namespace linpack
+
+#endif
diff --git a/LINPACK/src/host/execution_types/execution_iec.hpp b/LINPACK/src/host/execution_types/execution_iec.hpp
index 3c232f41..b07ed6a6 100644
--- a/LINPACK/src/host/execution_types/execution_iec.hpp
+++ b/LINPACK/src/host/execution_types/execution_iec.hpp
@@ -44,9 +44,8 @@ namespace iec {
 /*
 Prepare kernels and execute benchmark for a bitstream that makes use of intel external channels
 */
-template <class TDevice, class TContext, class TProgram>
 std::unique_ptr<linpack::LinpackExecutionTimings>
-calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings, TDevice, TContext, TProgram>&config,
+calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&config,
           linpack::LinpackData& data) {
 
     int err;
diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp
index 5462f025..5ef4ad27 100644
--- a/LINPACK/src/host/execution_types/execution_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_pcie.hpp
@@ -50,9 +50,8 @@ namespace pcie {
 
 @copydoc bm_execution::calculate()
 */
-template <class TDevice, class TContext, class TProgram>
 std::unique_ptr<linpack::LinpackExecutionTimings>
-calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings, TDevice, TContext, TProgram>&config,
+calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>&config,
           linpack::LinpackData& data) {
 
     cl_int err;
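Note on the timing scheme used by the calculate() implementations above: all ranks align on an MPI barrier, the factorization kernels run, and the elapsed wall time is appended to a per-repetition vector. A condensed, self-contained sketch of that pattern follows; run_factorization() is only a stand-in for the kernel scheduling above, not a function from this repository.

    #include <chrono>
    #include <vector>
    #include <mpi.h>

    std::vector<double> gefaExecutionTimes;

    void run_factorization() { /* stand-in for the enqueued kernel runs */ }

    void time_one_repetition() {
      MPI_Barrier(MPI_COMM_WORLD); // align all ranks before taking t1
      auto t1 = std::chrono::high_resolution_clock::now();
      run_factorization();
      auto t2 = std::chrono::high_resolution_clock::now();
      std::chrono::duration<double> timespan =
          std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1);
      gefaExecutionTimes.push_back(timespan.count());
    }

Note also that the GESL phase above is only stubbed so far: t1 and t2 are taken back to back, so the recorded solve time is effectively zero.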
diff --git a/LINPACK/src/host/execution_types/execution_types.hpp b/LINPACK/src/host/execution_types/execution_types.hpp
index 975dd4cf..1990336d 100644
--- a/LINPACK/src/host/execution_types/execution_types.hpp
+++ b/LINPACK/src/host/execution_types/execution_types.hpp
@@ -22,7 +22,13 @@ SOFTWARE.
 #ifndef EXECUTION_TYPES_HPP
 #define EXECUTION_TYPES_HPP
 
+#ifdef USE_OCL_HOST
 #include "execution_types/execution_pcie.hpp"
 #include "execution_types/execution_iec.hpp"
-
+#endif
+#ifdef USE_XRT_HOST
+#ifdef USE_ACCL
+#include "execution_types/execution_accl_buffers.hpp"
+#endif
+#endif
 #endif
\ No newline at end of file
diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp
index b79fa65a..eed54a44 100644
--- a/LINPACK/src/host/linpack_benchmark.hpp
+++ b/LINPACK/src/host/linpack_benchmark.hpp
@@ -308,8 +308,13 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark
         std::unique_ptr<linpack::LinpackExecutionTimings> timings;
         switch (this->executionSettings->programSettings->communicationType) {
+#ifdef USE_OCL_HOST
             case hpcc_base::CommunicationType::pcie_mpi : timings = execution::pcie::calculate(*this->executionSettings, data); break;
             case hpcc_base::CommunicationType::intel_external_channels: timings = execution::iec::calculate(*this->executionSettings, data); break;
+#endif
+#ifdef USE_XRT_HOST
+            case hpcc_base::CommunicationType::accl : timings = execution::accl_buffers::calculate(*this->executionSettings, data); break;
+#endif
             default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(this->executionSettings->programSettings->communicationType));
         }
 #ifdef DISTRIBUTED_VALIDATION
diff --git a/LINPACK/src/host/linpack_data.cpp b/LINPACK/src/host/linpack_data.cpp
index 951c37c2..2c724796 100644
--- a/LINPACK/src/host/linpack_data.cpp
+++ b/LINPACK/src/host/linpack_data.cpp
@@ -24,7 +24,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
 
-#include "linpack_benchmark.hpp"
+#include "linpack_data.hpp"
 
 /* C++ standard library headers */
 #include
@@ -32,7 +32,6 @@ SOFTWARE.
 
 /* Project's headers */
 #include "communication_types.hpp"
-#include "execution_types/execution_types.hpp"
 #include "parameters.h"
 
 linpack::LinpackProgramSettings::LinpackProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results),
diff --git a/LINPACK/src/host/main.cpp b/LINPACK/src/host/main.cpp
index 73dcc570..cfd89914 100644
--- a/LINPACK/src/host/main.cpp
+++ b/LINPACK/src/host/main.cpp
@@ -12,7 +12,12 @@ The program entry point
 int main(int argc, char *argv[]) {
     // Setup benchmark
+#ifdef USE_OCL_HOST
     LinpackBenchmark bm(argc, argv);
+#endif
+#ifdef USE_XRT_HOST
+    LinpackBenchmark bm(argc, argv);
+#endif
     bool success = bm.executeBenchmark();
     if (success) {
         return 0;

From c902b8243dbe4e7e55e0a13c6287f27ba8b3d42b Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 28 Apr 2022 19:02:27 +0100
Subject: [PATCH 061/318] Add ACCL config for HPL

---
 ...linx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake

diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake
new file mode 100644
index 00000000..941a1d78
--- /dev/null
+++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake
@@ -0,0 +1,28 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# LINPACK specific options
+set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
+set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
+set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
+
+set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE)
+set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini" CACHE STRING "Link settings file" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
+set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE)

From 264ac09db8a768e054fdfc6166195d37c2fbbf2d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 29 Apr 2022 15:24:20 +0100
Subject: [PATCH 062/318] Update for new communicator interface

---
 .../execution_accl_buffers.hpp                | 96 +++++++++----------
 1 file changed, 46 insertions(+), 50 deletions(-)

diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp
index 4a2a7907..6424a99f 100644
--- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp
+++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp
@@ -48,8 +48,8 @@ namespace accl_buffers {
 
 @copydoc bm_execution::calculate()
 */
 std::unique_ptr<linpack::LinpackExecutionTimings> calculate(
-    const hpcc_base::ExecutionSettings &config,
+    const hpcc_base::ExecutionSettings &config,
     linpack::LinpackData &data) {
 
   cl_int err;
@@ -62,38 +62,32 @@ std::unique_ptr<linpack::LinpackExecutionTimings> calculate(
   uint blocks_per_row = data.matrix_width / config.programSettings->blockSize;
   uint blocks_per_col = data.matrix_height / config.programSettings->blockSize;
 
-  // TODO: Allow to handle Communicators in ACCL!
-  // // Communicate with all ranks in the same row of the torus
-  // // Configure ACCL Communicators
-
-  // // Create Ranks. This must be the same configuration as used for
-  // // the global communicator!
-  // std::vector<ACCL::rank_t> all_accl_ranks = {};
-  // for (int i = 0; i < config.programSettings->torus_width * config.programSettings->torus_height; ++i) {
-  //   // TODO: Replace the ip addresses and ports here for execution of real hardware?
- // ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, 1024}; - // all_accl_ranks.emplace_back(new_rank); - // } - - // std::vector row_ranks; - // std::vector col_ranks; - - // for (int i = 0; i < config.programSettings->torus_width; i++) { - // row_ranks.push_back(all_accl_ranks[i]); - // } - // for (int i = config.programSettings->torus_col; i < all_accl_ranks.size(); - // i += config.programSettings->torus_width) { - // col_ranks.push_back(all_accl_ranks[config.programSettings->torus_row * - // config.programSettings->torus_width + - // i]); - // } - - // // Row communicator should now be index 1 - // config.accl->configure_communicator(row_ranks, - // config.programSettings->torus_col); - // // Column communicator should now be index 2 - // config.accl->configure_communicator(col_ranks, - // config.programSettings->torus_row); + // Communicate with all ranks in the same row of the torus + // Configure ACCL Communicators + + // Get group of global communicator + std::vector all_accl_ranks = + config.accl->get_comm_group(ACCL::GLOBAL_COMM); + + std::vector row_ranks; + std::vector col_ranks; + + // Create sub-groups for rows and columns + for (int i = 0; i < config.programSettings->torus_width; i++) { + row_ranks.push_back(all_accl_ranks[i]); + } + for (int i = config.programSettings->torus_col; i < all_accl_ranks.size(); + i += config.programSettings->torus_width) { + col_ranks.push_back(all_accl_ranks[config.programSettings->torus_row * + config.programSettings->torus_width + + i]); + } + + // Create communicators from sub-groups + ACCL::CommunicatorId row_comm = config.accl->configure_communicator( + row_ranks, config.programSettings->torus_col); + ACCL::CommunicatorId col_comm = config.accl->configure_communicator( + col_ranks, config.programSettings->torus_row); // TODO: Select the correct memory groups! 
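The rank grouping above splits the global ACCL communicator into one row group and one column group per process, which is the same 2D torus decomposition the plain MPI host code later in this series builds with MPI_Comm_split. A minimal MPI sketch of that split, with torus_row/torus_col standing for the coordinates kept in the program settings (an analogy only, not the ACCL API):

    #include <mpi.h>

    // Ranks sharing a color end up in the same communicator; the key orders
    // them inside it, so each row communicator is sorted by column and each
    // column communicator by row.
    void build_torus_comms(int torus_row, int torus_col,
                           MPI_Comm *row_comm, MPI_Comm *col_comm) {
      MPI_Comm_split(MPI_COMM_WORLD, torus_row, torus_col, row_comm);
      MPI_Comm_split(MPI_COMM_WORLD, torus_col, torus_row, col_comm);
    }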
// Create Buffers for input and output @@ -126,17 +120,19 @@ std::unique_ptr calculate( Buffer_left_list.emplace_back(); Buffer_top_list.emplace_back(); for (int i = 0; i < blocks_per_row; i++) { - Buffer_top_list.back().push_back(config.accl->create_buffer( - config.programSettings->blockSize * - (config.programSettings->blockSize), - ACCL::dataType::float32, 1)); + Buffer_top_list.back().push_back( + config.accl->create_buffer( + config.programSettings->blockSize * + (config.programSettings->blockSize), + ACCL::dataType::float32, 1)); } for (int i = 0; i < blocks_per_col; i++) { - Buffer_left_list.back().push_back(config.accl->create_buffer( - config.programSettings->blockSize * - (config.programSettings->blockSize), - ACCL::dataType::float32, 1)); + Buffer_left_list.back().push_back( + config.accl->create_buffer( + config.programSettings->blockSize * + (config.programSettings->blockSize), + ACCL::dataType::float32, 1)); } } @@ -253,15 +249,15 @@ std::unique_ptr calculate( // FPGAs // Broadcast LU block in column to update all left blocks - config.accl->bcast(2, *Buffer_lu2, + config.accl->bcast(*Buffer_lu2, config.programSettings->blockSize * config.programSettings->blockSize, - local_block_row_remainder, true, true); + local_block_row_remainder, col_comm, true, true); // Broadcast LU block in row to update all top blocks - config.accl->bcast(1, *Buffer_lu2, + config.accl->bcast(*Buffer_lu2, config.programSettings->blockSize * config.programSettings->blockSize, - local_block_col_remainder, true, true); + local_block_col_remainder, row_comm, true, true); } if (num_top_blocks > 0) { @@ -314,19 +310,19 @@ std::unique_ptr calculate( lbi < std::max(static_cast(blocks_per_col - local_block_col), 0); lbi++) { - config.accl->bcast(1, *Buffer_left_list[block_row % 2][lbi], + config.accl->bcast(*Buffer_left_list[block_row % 2][lbi], config.programSettings->blockSize * config.programSettings->blockSize, - local_block_col_remainder, true, true); + local_block_col_remainder, row_comm, true, true); } for (int tbi = 0; tbi < std::max(static_cast(blocks_per_row - local_block_row), 0); tbi++) { - config.accl->bcast(2, *Buffer_top_list[block_row % 2][tbi], + config.accl->bcast(*Buffer_top_list[block_row % 2][tbi], config.programSettings->blockSize * config.programSettings->blockSize, - local_block_row_remainder, true, true); + local_block_row_remainder, col_comm, true, true); } // update all remaining inner blocks using only global memory From 94b5deb811e473ba125d638d6211645681e1f956 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 29 Apr 2022 16:18:44 +0100 Subject: [PATCH 063/318] Fix creation of row communicator --- LINPACK/src/host/execution_types/execution_accl_buffers.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp index 6424a99f..e1c2c5c0 100644 --- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -73,7 +73,11 @@ std::unique_ptr calculate( std::vector col_ranks; // Create sub-groups for rows and columns - for (int i = 0; i < config.programSettings->torus_width; i++) { + for (int i = config.programSettings->torus_row * + config.programSettings->torus_width; + i < config.programSettings->torus_row * + (config.programSettings->torus_width + 1); + i++) { row_ranks.push_back(all_accl_ranks[i]); } for (int i = config.programSettings->torus_col; i < all_accl_ranks.size(); 
From 03baabe594b17b011ee0e984d43384baa9bd8dce Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 6 May 2022 18:19:09 +0100 Subject: [PATCH 064/318] Change communicator call to new version --- LINPACK/src/host/execution_types/execution_accl_buffers.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp index e1c2c5c0..fd58d75c 100644 --- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -88,9 +88,9 @@ std::unique_ptr calculate( } // Create communicators from sub-groups - ACCL::CommunicatorId row_comm = config.accl->configure_communicator( + ACCL::CommunicatorId row_comm = config.accl->create_communicator( row_ranks, config.programSettings->torus_col); - ACCL::CommunicatorId col_comm = config.accl->configure_communicator( + ACCL::CommunicatorId col_comm = config.accl->create_communicator( col_ranks, config.programSettings->torus_row); // TODO: Select the correct memory groups! From 3fa0e842dc83ab64214f54c933d02108e90dec57 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 13 May 2022 17:34:18 +0100 Subject: [PATCH 065/318] Add XRT PCIE host code --- .../host/execution_types/execution_types.hpp | 1 + .../execution_types/execution_xrt_pcie.hpp | 493 ++++++++++++++++++ LINPACK/src/host/linpack_benchmark.hpp | 1 + 3 files changed, 495 insertions(+) create mode 100644 LINPACK/src/host/execution_types/execution_xrt_pcie.hpp diff --git a/LINPACK/src/host/execution_types/execution_types.hpp b/LINPACK/src/host/execution_types/execution_types.hpp index 1990336d..294115ea 100644 --- a/LINPACK/src/host/execution_types/execution_types.hpp +++ b/LINPACK/src/host/execution_types/execution_types.hpp @@ -29,6 +29,7 @@ SOFTWARE. #ifdef USE_XRT_HOST #ifdef USE_ACCL #include "execution_types/execution_accl_buffers.hpp" +#include "execution_types/execution_xrt_pcie.hpp" #endif #endif #endif \ No newline at end of file diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp new file mode 100644 index 00000000..58cd0acc --- /dev/null +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -0,0 +1,493 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef EXECUTION_TYPES_EXECUTION_XRT_PCIE_HPP +#define EXECUTION_TYPES_EXECUTION_XRT_PCIE_HPP + +/* C++ standard library headers */ +#include +#include +#include +#include +#include +#include + +/* External library headers */ +#ifdef _OPENMP +#include "omp.h" +#endif + +#include "linpack_data.hpp" +#include "parameters.h" + +namespace linpack { +namespace execution { +namespace xrt_pcie { + +/* + Prepare kernels and execute benchmark + + @copydoc bm_execution::calculate() +*/ +std::unique_ptr calculate( + const hpcc_base::ExecutionSettings &config, + linpack::LinpackData &data) { + + cl_int err; + + int num_omp_threads = 1; +#ifdef _OPENMP + num_omp_threads = omp_get_num_threads(); +#endif + + uint blocks_per_row = data.matrix_width / config.programSettings->blockSize; + uint blocks_per_col = data.matrix_height / config.programSettings->blockSize; + + // Communicate with all ranks in the same row of the torus + MPI_Comm row_communicator; + MPI_Comm col_communicator; + + MPI_Comm_split(MPI_COMM_WORLD, config.programSettings->torus_row, 0, + &row_communicator); + MPI_Comm_split(MPI_COMM_WORLD, config.programSettings->torus_col, 0, + &col_communicator); + + // TODO: Select the correct memory groups! + // Create Buffers for input and output + // TODO: Need to set a memory group for the buffers here! + + auto lu_tmp_kernel = xrt::kernel(*config.device, *config.program, "lu"); + xrt::bo Buffer_a( + *config.device, data.A, + sizeof(HOST_DATA_TYPE) * data.matrix_height * data.matrix_width, lu_tmp_kernel.group_id(0)); + xrt::bo Buffer_b(*config.device, data.b, + sizeof(HOST_DATA_TYPE) * data.matrix_width, lu_tmp_kernel.group_id(0)); + xrt::bo Buffer_pivot(*config.device, data.ipvt, + sizeof(cl_int) * data.matrix_height, lu_tmp_kernel.group_id(0)); + + /* --- Setup MPI communication and required additional buffers --- */ + HOST_DATA_TYPE *lu_block, *lu_trans_block; + posix_memalign(reinterpret_cast(&lu_block), 4096, + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize)); + posix_memalign(reinterpret_cast(&lu_trans_block), 4096, + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize)); + + // Buffers only used to store data received over the network layer + // The content will not be modified by the host + xrt::bo Buffer_lu1(*config.device, lu_trans_block, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(1)); + xrt::bo Buffer_lu2(*config.device, lu_block, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(2)); + + std::vector> Buffer_left_list(2); + std::vector> Buffer_top_list(2); + std::vector> left_blocks; + std::vector> top_blocks; + + for (int double_buffer = 0; double_buffer < 2; double_buffer++) { + top_blocks.emplace_back(blocks_per_row); + left_blocks.emplace_back(blocks_per_col); + for (int i = 0; i < blocks_per_row; i++) { + posix_memalign( + reinterpret_cast(&(top_blocks[double_buffer][i])), 4096, + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize)); + Buffer_top_list[double_buffer].emplace_back( + *config.device, top_blocks[double_buffer][i], + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(0)); + } + + for (int i = 0; i < blocks_per_col; i++) { + posix_memalign( + 
reinterpret_cast(&(left_blocks[double_buffer][i])), 4096, + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize)); + Buffer_left_list[double_buffer].emplace_back( + *config.device, left_blocks[double_buffer][i], + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(2)); + } + } + + /* --- Execute actual benchmark kernels --- */ + + double t; + std::vector gefaExecutionTimes; + std::vector geslExecutionTimes; + std::vector gefaWaitTimes; + for (int i = 0; i < config.programSettings->numRepetitions; i++) { + + Buffer_a.sync(XCL_BO_SYNC_BO_TO_DEVICE); + Buffer_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // Command queues + // A new command queue is created for every iteration of the algorithm to + // reduce the overhead of too large queues + std::vector inner_mms; + std::thread flush_thread; + + std::chrono::time_point t1, t2, twait1, + twait2; + std::chrono::duration currentwaittime = + std::chrono::duration::zero(); + + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << "Start! " << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + t1 = std::chrono::high_resolution_clock::now(); + + int kernel_offset = 0; +#pragma omp parallel + { + +#pragma omp single + uint current_replication = 0; + + // For every row of blocks create kernels and enqueue them + for (int block_row = 0; block_row < config.programSettings->matrixSize / + config.programSettings->blockSize; + block_row++) { + + int local_block_row_remainder = + (block_row % config.programSettings->torus_height); + int local_block_row = + (block_row / config.programSettings->torus_height); + int local_block_col_remainder = + (block_row % config.programSettings->torus_width); + int local_block_col = (block_row / config.programSettings->torus_width); + bool in_same_row_as_lu = + local_block_row_remainder == config.programSettings->torus_row; + bool in_same_col_as_lu = + local_block_col_remainder == config.programSettings->torus_col; + int start_row_index = + local_block_row + + ((local_block_row_remainder >= config.programSettings->torus_row) + ? 1 + : 0); + int start_col_index = + local_block_col + + ((local_block_col_remainder >= config.programSettings->torus_col) + ? 1 + : 0); + int num_left_blocks = + (in_same_col_as_lu) ? blocks_per_col - start_row_index : 0; + int num_top_blocks = + (in_same_row_as_lu) ? blocks_per_row - start_col_index : 0; + int num_inner_block_rows = (blocks_per_col - start_row_index); + int num_inner_block_cols = + (num_inner_block_rows > 0) ? (blocks_per_row - start_col_index) : 0; + num_inner_block_rows = + (num_inner_block_cols > 0) ? 
num_inner_block_rows : 0; + bool is_calulating_lu_block = (in_same_col_as_lu && in_same_row_as_lu); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col + << " Start iteration " << block_row << std::endl; +#endif + + uint total_inner_updates_first_row = num_inner_block_cols; + uint updates_per_replication = + total_inner_updates_first_row / + config.programSettings->kernelReplications; + uint total_inner_updates = + (num_inner_block_cols - 1) * (num_inner_block_rows - 1); + uint total_updates_per_replication = + total_inner_updates / config.programSettings->kernelReplications; + uint current_update = 0; + + std::vector comm_kernel_runs; + +#pragma omp single + { + + if (is_calulating_lu_block) { + // create the LU kernel + auto lu_kernel = xrt::kernel(*config.device, *config.program, "lu"); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " LU " + << local_block_row << "," << local_block_col << std::endl; +#endif + auto lu_run = + lu_kernel(Buffer_a, Buffer_lu1, Buffer_lu2, local_block_col, + local_block_row, blocks_per_row); + ert_cmd_state state = lu_run.wait(); + if (state != ERT_CMD_STATE_COMPLETED) { + std::cerr << "Execution Lu failed: " << state << std::endl; + } + Buffer_lu1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + Buffer_lu2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } + + // Broadcast LU block in column to update all left blocks + MPI_Bcast(lu_block, + config.programSettings->blockSize * + config.programSettings->blockSize, + MPI_DATA_TYPE, local_block_row_remainder, col_communicator); + // Broadcast LU block in row to update all top blocks + MPI_Bcast(lu_trans_block, + config.programSettings->blockSize * + config.programSettings->blockSize, + MPI_DATA_TYPE, local_block_col_remainder, row_communicator); + } + + if (num_top_blocks > 0) { + + Buffer_lu1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + +// Create top kernels +#pragma omp for + for (int tops = start_col_index; tops < blocks_per_row; tops++) { + xrt::kernel k(*config.device, *config.program, "top_update"); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Top " + << local_block_row << "," << tops << std::endl; +#endif + + comm_kernel_runs.push_back( + k(Buffer_a, + Buffer_top_list[block_row % 2][tops - start_col_index], + Buffer_lu1, (tops == start_col_index), tops, local_block_row, + blocks_per_row)); + } + } + if (num_left_blocks > 0) { + + Buffer_lu2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + +// Create left kernels +#pragma omp for + for (int tops = start_row_index; tops < blocks_per_col; tops++) { + xrt::kernel k(*config.device, *config.program, "left_update"); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Left " << tops + << "," << local_block_col << std::endl; +#endif + comm_kernel_runs.push_back( + k(Buffer_a, + Buffer_left_list[block_row % 2][tops - start_row_index], + Buffer_lu2, (tops == start_row_index), local_block_col, tops, + blocks_per_row)); + } + } + +#pragma omp single + { + // Wait until all top and left blocks are calculated + std::for_each(comm_kernel_runs.begin(), comm_kernel_runs.end(), + [](xrt::run &e) { e.wait(); }); + + // Send the left and top blocks to all other ranks so they can be used + // to update all inner blocks + for (int lbi = 0; + lbi < + std::max(static_cast(blocks_per_col - local_block_col), 0); + lbi++) { 
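          // Editorial note on the statements below: with plain PCIe plus MPI,
          // every broadcast block takes a device -> host -> network -> host ->
          // device round trip, since there is no direct path between FPGA
          // memories. The bo is synced from the device, the host copy is
          // broadcast with MPI_Bcast, and the received data is synced back
          // before the dependent update kernels are scheduled.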
+ Buffer_left_list[block_row % 2][lbi].sync( + XCL_BO_SYNC_BO_FROM_DEVICE); + MPI_Bcast(left_blocks[block_row % 2][lbi], + config.programSettings->blockSize * + config.programSettings->blockSize, + MPI_DATA_TYPE, local_block_col_remainder, + row_communicator); + Buffer_left_list[block_row % 2][lbi].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + for (int tbi = 0; + tbi < + std::max(static_cast(blocks_per_row - local_block_row), 0); + tbi++) { + Buffer_top_list[block_row % 2][tbi].sync( + XCL_BO_SYNC_BO_FROM_DEVICE); + MPI_Bcast(top_blocks[block_row % 2][tbi], + config.programSettings->blockSize * + config.programSettings->blockSize, + MPI_DATA_TYPE, local_block_row_remainder, + col_communicator); + Buffer_top_list[block_row % 2][tbi].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + + // update all remaining inner blocks using only global memory + } + + std::vector outer_mms; + + // Wait for previous inner MMs to complete. + // They may need to be reused by the next outer MM calls! + std::for_each(inner_mms.begin(), inner_mms.end(), + [](xrt::run &e) { e.wait(); }); + +#pragma omp for + for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { + + // select the matrix multiplication kernel that should be used for + // this block updated + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols); + int block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows + lbi); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM row " + << block_row << "," << block_col << std::endl; +#endif + + outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], + Buffer_top_list[block_row % 2][0], block_col, + block_row, blocks_per_row)); + } + +#pragma omp for + for (int tbi = 0; tbi < num_inner_block_cols; tbi++) { + + // select the matrix multiplication kernel that should be used for + // this block updated + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols + tbi); + int block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM col " + << block_row << "," << block_col << std::endl; +#endif + + outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][0], + Buffer_top_list[block_row % 2][tbi], block_col, + block_row, blocks_per_row)); + } + + // Clear inner MM runs vector for this iteration + // All runs have completed before scheduling the outer MMs + inner_mms.clear(); + +#pragma omp for collapse(2) schedule(static) + for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { + for (int tbi = 1; tbi < num_inner_block_cols; tbi++) { + // select the matrix multiplication kernel that should be used for + // this block updated + + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols + tbi); + int block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows + lbi); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << 
config.programSettings->torus_col << " MM " + << block_row << "," << block_col << std::endl; +#endif + + inner_mms.push_back(k(Buffer_a, + Buffer_left_list[block_row % 2][lbi], + Buffer_top_list[block_row % 2][tbi], + block_col, block_row, blocks_per_row)); + } + } + +#ifndef NDEBUG + MPI_Barrier(MPI_COMM_WORLD); + if (is_calulating_lu_block) + std::cout << "---------------" << std::endl; +#endif + + // Wait for all outer MMs to complete because the results are required + // by the next communication phase + std::for_each(outer_mms.begin(), outer_mms.end(), + [](xrt::run &e) { e.wait(); }); + } + } + + t2 = std::chrono::high_resolution_clock::now(); + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << "End! " << std::endl; + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col + << "Wait time: " << currentwaittime.count() << "s" << std::endl; + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Exit " << i + << std::endl; +#endif + + std::chrono::duration timespan = + std::chrono::duration_cast>(t2 - t1); + gefaExecutionTimes.push_back(timespan.count()); + + // Execute GESL + t1 = std::chrono::high_resolution_clock::now(); + t2 = std::chrono::high_resolution_clock::now(); + timespan = + std::chrono::duration_cast>(t2 - t1); + geslExecutionTimes.push_back(timespan.count()); + } + + /* --- Read back results from Device --- */ + + Buffer_a.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + if (!config.programSettings->isDiagonallyDominant) { + Buffer_pivot.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } + + std::unique_ptr results( + new linpack::LinpackExecutionTimings{gefaExecutionTimes, + geslExecutionTimes}); + + MPI_Barrier(MPI_COMM_WORLD); + + return results; +} + +} // namespace xrt_pcie +} // namespace execution +} // namespace linpack + +#endif diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index eed54a44..c6656ffd 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -313,6 +313,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmarkexecutionSettings, data); break; #endif #ifdef USE_XRT_HOST + case hpcc_base::CommunicationType::pcie_mpi : timings = execution::xrt_pcie::calculate(*this->executionSettings, data); break; case hpcc_base::CommunicationType::accl : timings = execution::accl_buffers::calculate(*this->executionSettings, data); break; #endif default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(this->executionSettings->programSettings->communicationType)); From f578a06ac3757eac5ebdbd3289a52336bec6f0bb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 17 May 2022 12:09:37 +0100 Subject: [PATCH 066/318] Fix xrt scheduling --- .../execution_types/execution_xrt_pcie.hpp | 80 ++++++++----------- 1 file changed, 32 insertions(+), 48 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp index 58cd0acc..7c239aae 100644 --- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -85,22 +85,15 @@ std::unique_ptr calculate( sizeof(cl_int) * data.matrix_height, lu_tmp_kernel.group_id(0)); /* --- Setup MPI communication and required additional buffers --- */ - HOST_DATA_TYPE *lu_block, *lu_trans_block; - 
posix_memalign(reinterpret_cast(&lu_block), 4096, - sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * - (config.programSettings->blockSize)); - posix_memalign(reinterpret_cast(&lu_trans_block), 4096, - sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * - (config.programSettings->blockSize)); // Buffers only used to store data received over the network layer // The content will not be modified by the host - xrt::bo Buffer_lu1(*config.device, lu_trans_block, + xrt::bo Buffer_lu1(*config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), lu_tmp_kernel.group_id(1)); - xrt::bo Buffer_lu2(*config.device, lu_block, + xrt::bo Buffer_lu2(*config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), @@ -108,31 +101,19 @@ std::unique_ptr calculate( std::vector> Buffer_left_list(2); std::vector> Buffer_top_list(2); - std::vector> left_blocks; - std::vector> top_blocks; for (int double_buffer = 0; double_buffer < 2; double_buffer++) { - top_blocks.emplace_back(blocks_per_row); - left_blocks.emplace_back(blocks_per_col); for (int i = 0; i < blocks_per_row; i++) { - posix_memalign( - reinterpret_cast(&(top_blocks[double_buffer][i])), 4096, - sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * - (config.programSettings->blockSize)); Buffer_top_list[double_buffer].emplace_back( - *config.device, top_blocks[double_buffer][i], + *config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), lu_tmp_kernel.group_id(0)); } for (int i = 0; i < blocks_per_col; i++) { - posix_memalign( - reinterpret_cast(&(left_blocks[double_buffer][i])), 4096, - sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * - (config.programSettings->blockSize)); Buffer_left_list[double_buffer].emplace_back( - *config.device, left_blocks[double_buffer][i], + *config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), lu_tmp_kernel.group_id(2)); @@ -151,8 +132,8 @@ std::unique_ptr calculate( Buffer_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); // Command queues - // A new command queue is created for every iteration of the algorithm to - // reduce the overhead of too large queues + // A new command queue is created for every iteration of the + // algorithm to reduce the overhead of too large queues std::vector inner_mms; std::thread flush_thread; @@ -252,12 +233,12 @@ std::unique_ptr calculate( } // Broadcast LU block in column to update all left blocks - MPI_Bcast(lu_block, + MPI_Bcast(Buffer_lu2.map(), config.programSettings->blockSize * config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, col_communicator); // Broadcast LU block in row to update all top blocks - MPI_Bcast(lu_trans_block, + MPI_Bcast(Buffer_lu1.map(), config.programSettings->blockSize * config.programSettings->blockSize, MPI_DATA_TYPE, local_block_col_remainder, row_communicator); @@ -319,7 +300,7 @@ std::unique_ptr calculate( lbi++) { Buffer_left_list[block_row % 2][lbi].sync( XCL_BO_SYNC_BO_FROM_DEVICE); - MPI_Bcast(left_blocks[block_row % 2][lbi], + MPI_Bcast(Buffer_left_list[block_row % 2][lbi].map(), config.programSettings->blockSize * config.programSettings->blockSize, MPI_DATA_TYPE, local_block_col_remainder, @@ -332,7 +313,7 @@ std::unique_ptr calculate( tbi++) { Buffer_top_list[block_row % 2][tbi].sync( XCL_BO_SYNC_BO_FROM_DEVICE); - MPI_Bcast(top_blocks[block_row % 2][tbi], + 
MPI_Bcast(Buffer_top_list[block_row % 2][tbi].map(), config.programSettings->blockSize * config.programSettings->blockSize, MPI_DATA_TYPE, local_block_row_remainder, @@ -355,24 +336,25 @@ std::unique_ptr calculate( // select the matrix multiplication kernel that should be used for // this block updated - xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + xrt::kernel k(*config.device, *config.program, + "inner_update_mm0"); - int block_col = static_cast( + int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols); - int block_row = static_cast( + int current_block_row = static_cast( (data.matrix_height / config.programSettings->blockSize) - num_inner_block_rows + lbi); #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," - << config.programSettings->torus_col << " MM row " - << block_row << "," << block_col << std::endl; + << config.programSettings->torus_col << " MM col " + << current_block_row << "," << current_block_col << std::endl; #endif outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], - Buffer_top_list[block_row % 2][0], block_col, - block_row, blocks_per_row)); + Buffer_top_list[block_row % 2][0], current_block_col, + current_block_row, blocks_per_row)); } #pragma omp for @@ -380,24 +362,25 @@ std::unique_ptr calculate( // select the matrix multiplication kernel that should be used for // this block updated - xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + xrt::kernel k(*config.device, *config.program, + "inner_update_mm0"); - int block_col = static_cast( + int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols + tbi); - int block_row = static_cast( + int current_block_row = static_cast( (data.matrix_height / config.programSettings->blockSize) - num_inner_block_rows); #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," - << config.programSettings->torus_col << " MM col " - << block_row << "," << block_col << std::endl; + << config.programSettings->torus_col << " MM row " + << current_block_row << "," << current_block_col << std::endl; #endif outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][0], - Buffer_top_list[block_row % 2][tbi], block_col, - block_row, blocks_per_row)); + Buffer_top_list[block_row % 2][tbi], current_block_col, + current_block_row, blocks_per_row)); } // Clear inner MM runs vector for this iteration @@ -410,25 +393,26 @@ std::unique_ptr calculate( // select the matrix multiplication kernel that should be used for // this block updated - xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + xrt::kernel k(*config.device, *config.program, + "inner_update_mm0"); - int block_col = static_cast( + int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols + tbi); - int block_row = static_cast( + int current_block_row = static_cast( (data.matrix_height / config.programSettings->blockSize) - num_inner_block_rows + lbi); #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " MM " - << block_row << "," << block_col << std::endl; + << current_block_row << "," << current_block_col << std::endl; #endif inner_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], Buffer_top_list[block_row % 2][tbi], - block_col, block_row, blocks_per_row)); + current_block_col, current_block_row, 
blocks_per_row)); } } From ecdb80d26446bc0f0d68f600f80df9c8db37379c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 17 May 2022 15:32:16 +0100 Subject: [PATCH 067/318] Refactoring XRT host code --- .../execution_types/execution_xrt_pcie.hpp | 49 ++++++++++--------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp index 7c239aae..b054a132 100644 --- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -76,13 +76,16 @@ std::unique_ptr calculate( // TODO: Need to set a memory group for the buffers here! auto lu_tmp_kernel = xrt::kernel(*config.device, *config.program, "lu"); - xrt::bo Buffer_a( - *config.device, data.A, - sizeof(HOST_DATA_TYPE) * data.matrix_height * data.matrix_width, lu_tmp_kernel.group_id(0)); + xrt::bo Buffer_a(*config.device, data.A, + sizeof(HOST_DATA_TYPE) * data.matrix_height * + data.matrix_width, + lu_tmp_kernel.group_id(0)); xrt::bo Buffer_b(*config.device, data.b, - sizeof(HOST_DATA_TYPE) * data.matrix_width, lu_tmp_kernel.group_id(0)); + sizeof(HOST_DATA_TYPE) * data.matrix_width, + lu_tmp_kernel.group_id(0)); xrt::bo Buffer_pivot(*config.device, data.ipvt, - sizeof(cl_int) * data.matrix_height, lu_tmp_kernel.group_id(0)); + sizeof(cl_int) * data.matrix_height, + lu_tmp_kernel.group_id(0)); /* --- Setup MPI communication and required additional buffers --- */ @@ -336,8 +339,7 @@ std::unique_ptr calculate( // select the matrix multiplication kernel that should be used for // this block updated - xrt::kernel k(*config.device, *config.program, - "inner_update_mm0"); + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - @@ -349,12 +351,14 @@ std::unique_ptr calculate( #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " MM col " - << current_block_row << "," << current_block_col << std::endl; + << current_block_row << "," << current_block_col + << std::endl; #endif outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], - Buffer_top_list[block_row % 2][0], current_block_col, - current_block_row, blocks_per_row)); + Buffer_top_list[block_row % 2][0], + current_block_col, current_block_row, + blocks_per_row)); } #pragma omp for @@ -362,8 +366,7 @@ std::unique_ptr calculate( // select the matrix multiplication kernel that should be used for // this block updated - xrt::kernel k(*config.device, *config.program, - "inner_update_mm0"); + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - @@ -375,12 +378,14 @@ std::unique_ptr calculate( #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " MM row " - << current_block_row << "," << current_block_col << std::endl; + << current_block_row << "," << current_block_col + << std::endl; #endif outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][0], - Buffer_top_list[block_row % 2][tbi], current_block_col, - current_block_row, blocks_per_row)); + Buffer_top_list[block_row % 2][tbi], + current_block_col, current_block_row, + blocks_per_row)); } // Clear inner MM runs vector for this iteration @@ -393,8 +398,7 @@ std::unique_ptr calculate( // select the 
matrix multiplication kernel that should be used for // this block updated - xrt::kernel k(*config.device, *config.program, - "inner_update_mm0"); + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - @@ -406,13 +410,14 @@ std::unique_ptr calculate( #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " MM " - << current_block_row << "," << current_block_col << std::endl; + << current_block_row << "," << current_block_col + << std::endl; #endif - inner_mms.push_back(k(Buffer_a, - Buffer_left_list[block_row % 2][lbi], - Buffer_top_list[block_row % 2][tbi], - current_block_col, current_block_row, blocks_per_row)); + inner_mms.push_back( + k(Buffer_a, Buffer_left_list[block_row % 2][lbi], + Buffer_top_list[block_row % 2][tbi], current_block_col, + current_block_row, blocks_per_row)); } } From 5ce867f7ae50565df5c04e8d263bef528bd73a93 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 17 May 2022 15:32:39 +0100 Subject: [PATCH 068/318] Fix single FPGA ACCL host code --- .../execution_accl_buffers.hpp | 151 ++++++++++++------ 1 file changed, 105 insertions(+), 46 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp index fd58d75c..2f3922ae 100644 --- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -82,9 +82,7 @@ std::unique_ptr calculate( } for (int i = config.programSettings->torus_col; i < all_accl_ranks.size(); i += config.programSettings->torus_width) { - col_ranks.push_back(all_accl_ranks[config.programSettings->torus_row * - config.programSettings->torus_width + - i]); + col_ranks.push_back(all_accl_ranks[i]); } // Create communicators from sub-groups @@ -93,27 +91,50 @@ std::unique_ptr calculate( ACCL::CommunicatorId col_comm = config.accl->create_communicator( col_ranks, config.programSettings->torus_row); - // TODO: Select the correct memory groups! - // Create Buffers for input and output - // TODO: Need to set a memory group for the buffers here! - xrt::bo Buffer_a( - *config.device, data.A, - sizeof(HOST_DATA_TYPE) * data.matrix_height * data.matrix_width, 0); + // Create global memory buffers + auto lu_tmp_kernel = xrt::kernel(*config.device, *config.program, "lu"); + xrt::bo Buffer_a(*config.device, data.A, + sizeof(HOST_DATA_TYPE) * data.matrix_height * + data.matrix_width, + lu_tmp_kernel.group_id(0)); xrt::bo Buffer_b(*config.device, data.b, - sizeof(HOST_DATA_TYPE) * data.matrix_width, 0); + sizeof(HOST_DATA_TYPE) * data.matrix_width, + lu_tmp_kernel.group_id(0)); xrt::bo Buffer_pivot(*config.device, data.ipvt, - sizeof(cl_int) * data.matrix_height, 0); + sizeof(cl_int) * data.matrix_height, + lu_tmp_kernel.group_id(0)); + + // TODO: To make this code work with the ACCL simulator, we need to create + // buffers using bos. This vector is used to store these bos during execution. + // They will be accessed via the ACCL buffers are not required in the code + // itself. Fixing the simulator code of ACCL to always create a bo would fix + // this issue. 
+ std::vector tmp_bos; /* --- Setup MPI communication and required additional buffers --- */ // Buffers only used to store data received over the network layer // The content will not be modified by the host + tmp_bos.emplace_back(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(1)); auto Buffer_lu1 = config.accl->create_buffer( + tmp_bos.back(), (config.programSettings->blockSize) * (config.programSettings->blockSize), - ACCL::dataType::float32, 1); + ACCL::dataType::float32); + tmp_bos.emplace_back(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(2)); auto Buffer_lu2 = config.accl->create_buffer( + tmp_bos.back(), (config.programSettings->blockSize) * (config.programSettings->blockSize), - ACCL::dataType::float32, 1); + ACCL::dataType::float32); + Buffer_lu1->sync_to_device(); + Buffer_lu2->sync_to_device(); std::vector>> Buffer_left_list; std::vector>> Buffer_top_list; @@ -124,19 +145,33 @@ std::unique_ptr calculate( Buffer_left_list.emplace_back(); Buffer_top_list.emplace_back(); for (int i = 0; i < blocks_per_row; i++) { + tmp_bos.emplace_back(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(0)); Buffer_top_list.back().push_back( config.accl->create_buffer( - config.programSettings->blockSize * + tmp_bos.back(), + (config.programSettings->blockSize) * (config.programSettings->blockSize), - ACCL::dataType::float32, 1)); + ACCL::dataType::float32)); + Buffer_top_list.back().back()->sync_to_device(); } for (int i = 0; i < blocks_per_col; i++) { + tmp_bos.emplace_back(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(2)); Buffer_left_list.back().push_back( config.accl->create_buffer( - config.programSettings->blockSize * + tmp_bos.back(), + (config.programSettings->blockSize) * (config.programSettings->blockSize), - ACCL::dataType::float32, 1)); + ACCL::dataType::float32)); + Buffer_left_list.back().back()->sync_to_device(); } } @@ -242,9 +277,12 @@ std::unique_ptr calculate( << local_block_row << "," << local_block_col << std::endl; #endif auto lu_run = - lu_kernel(Buffer_a, Buffer_lu1, Buffer_lu2, local_block_col, - local_block_row, blocks_per_row); - lu_run.wait(); + lu_kernel(Buffer_a, *Buffer_lu1->bo(), *Buffer_lu2->bo(), + local_block_col, local_block_row, blocks_per_row); + ert_cmd_state state = lu_run.wait(); + if (state != ERT_CMD_STATE_COMPLETED) { + std::cerr << "Execution Lu failed: " << state << std::endl; + } } // Exchange LU blocks on all ranks to prevent stalls in MPI broadcast @@ -278,9 +316,9 @@ std::unique_ptr calculate( comm_kernel_runs.push_back( k(Buffer_a, - Buffer_top_list[block_row % 2][tops - start_col_index], - Buffer_lu1, (tops == start_col_index), tops, local_block_row, - blocks_per_row)); + *Buffer_top_list[block_row % 2][tops - start_col_index]->bo(), + *Buffer_lu1->bo(), (tops == start_col_index), tops, + local_block_row, blocks_per_row)); } } if (num_left_blocks > 0) { @@ -294,11 +332,11 @@ std::unique_ptr calculate( << config.programSettings->torus_col << " Left " << tops << "," << local_block_col << std::endl; #endif - comm_kernel_runs.push_back( - k(Buffer_a, - Buffer_left_list[block_row % 2][tops - start_row_index], - Buffer_lu2, (tops == start_row_index), 
local_block_col, tops, - blocks_per_row)); + comm_kernel_runs.push_back(k( + Buffer_a, + *Buffer_left_list[block_row % 2][tops - start_row_index]->bo(), + *Buffer_lu2->bo(), (tops == start_row_index), local_block_col, + tops, blocks_per_row)); } } @@ -346,16 +384,24 @@ std::unique_ptr calculate( // this block updated xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); - int block_col = static_cast( + int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols); - int block_row = static_cast( + int current_block_row = static_cast( (data.matrix_height / config.programSettings->blockSize) - num_inner_block_rows + lbi); - outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], - Buffer_top_list[block_row % 2][0], block_col, - block_row, blocks_per_row)); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM col " + << current_block_row << "," << current_block_col + << std::endl; +#endif + + outer_mms.push_back( + k(Buffer_a, *Buffer_left_list[block_row % 2][lbi]->bo(), + *Buffer_top_list[block_row % 2][0]->bo(), current_block_col, + current_block_row, blocks_per_row)); } #pragma omp for @@ -365,16 +411,24 @@ std::unique_ptr calculate( // this block updated xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); - int block_col = static_cast( + int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols + tbi); - int block_row = static_cast( + int current_block_row = static_cast( (data.matrix_height / config.programSettings->blockSize) - num_inner_block_rows); - outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][0], - Buffer_top_list[block_row % 2][tbi], block_col, - block_row, blocks_per_row)); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM row " + << current_block_row << "," << current_block_col + << std::endl; +#endif + + outer_mms.push_back( + k(Buffer_a, *Buffer_left_list[block_row % 2][0]->bo(), + *Buffer_top_list[block_row % 2][tbi]->bo(), current_block_col, + current_block_row, blocks_per_row)); } // Clear inner MM runs vector for this iteration @@ -389,17 +443,24 @@ std::unique_ptr calculate( xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); - int block_col = static_cast( + int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols + tbi); - int block_row = static_cast( + int current_block_row = static_cast( (data.matrix_height / config.programSettings->blockSize) - num_inner_block_rows + lbi); - inner_mms.push_back(k(Buffer_a, - Buffer_left_list[block_row % 2][lbi], - Buffer_top_list[block_row % 2][tbi], - block_col, block_row, blocks_per_row)); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM " + << current_block_row << "," << current_block_col + << std::endl; +#endif + + inner_mms.push_back( + k(Buffer_a, *Buffer_left_list[block_row % 2][lbi]->bo(), + *Buffer_top_list[block_row % 2][tbi]->bo(), current_block_col, + current_block_row, blocks_per_row)); } } @@ -416,11 +477,9 @@ std::unique_ptr calculate( } } -#ifdef NDEBUG t2 = std::chrono::high_resolution_clock::now(); std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << "End! 
" << std::endl; -#endif #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," From c883f3532f8b9eb9786a40f552025279c56b1907 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 18 May 2022 11:34:19 +0100 Subject: [PATCH 069/318] Refactor XRT host codes --- .../execution_accl_buffers.hpp | 33 ++++++++++--------- .../execution_types/execution_xrt_pcie.hpp | 23 +++++++------ 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp index 2f3922ae..29645a51 100644 --- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -73,10 +73,10 @@ std::unique_ptr calculate( std::vector col_ranks; // Create sub-groups for rows and columns - for (int i = config.programSettings->torus_row * - config.programSettings->torus_width; - i < config.programSettings->torus_row * - (config.programSettings->torus_width + 1); + for (int i = config.programSettings->torus_width * + config.programSettings->torus_row; + i < config.programSettings->torus_width * + (config.programSettings->torus_row + 1); i++) { row_ranks.push_back(all_accl_ranks[i]); } @@ -296,12 +296,11 @@ std::unique_ptr calculate( config.programSettings->blockSize, local_block_row_remainder, col_comm, true, true); // Broadcast LU block in row to update all top blocks - config.accl->bcast(*Buffer_lu2, + config.accl->bcast(*Buffer_lu1, config.programSettings->blockSize * config.programSettings->blockSize, local_block_col_remainder, row_comm, true, true); } - if (num_top_blocks > 0) { // Create top kernels @@ -343,8 +342,9 @@ std::unique_ptr calculate( #pragma omp single { // Wait until all top and left blocks are calculated - std::for_each(comm_kernel_runs.begin(), comm_kernel_runs.end(), - [](xrt::run &e) { e.wait(); }); + for (auto &run : comm_kernel_runs) { + run.wait(); + } // Send the left and top blocks to all other ranks so they can be used // to update all inner blocks @@ -366,7 +366,6 @@ std::unique_ptr calculate( config.programSettings->blockSize, local_block_row_remainder, col_comm, true, true); } - // update all remaining inner blocks using only global memory } @@ -374,8 +373,9 @@ std::unique_ptr calculate( // Wait for previous inner MMs to complete. // They may need to be reused by the next outer MM calls! 
- std::for_each(inner_mms.begin(), inner_mms.end(), - [](xrt::run &e) { e.wait(); }); + for (auto &run : inner_mms) { + run.wait(); + } #pragma omp for for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { @@ -464,16 +464,17 @@ std::unique_ptr calculate( } } + // Wait for all outer MMs to complete because the results are required + // by the next communication phase + for (auto &run : outer_mms) { + run.wait(); + } + #ifndef NDEBUG MPI_Barrier(MPI_COMM_WORLD); if (is_calulating_lu_block) std::cout << "---------------" << std::endl; #endif - - // Wait for all outer MMs to complete because the results are required - // by the next communication phase - std::for_each(outer_mms.begin(), outer_mms.end(), - [](xrt::run &e) { e.wait(); }); } } diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp index b054a132..33330ea4 100644 --- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -292,9 +292,10 @@ std::unique_ptr calculate( #pragma omp single { // Wait until all top and left blocks are calculated - std::for_each(comm_kernel_runs.begin(), comm_kernel_runs.end(), - [](xrt::run &e) { e.wait(); }); - + for (auto &run : comm_kernel_runs) { + run.wait(); + } + // Send the left and top blocks to all other ranks so they can be used // to update all inner blocks for (int lbi = 0; @@ -331,8 +332,9 @@ std::unique_ptr calculate( // Wait for previous inner MMs to complete. // They may need to be reused by the next outer MM calls! - std::for_each(inner_mms.begin(), inner_mms.end(), - [](xrt::run &e) { e.wait(); }); + for (auto &run : inner_mms) { + run.wait(); + } #pragma omp for for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { @@ -421,16 +423,17 @@ std::unique_ptr calculate( } } + // Wait for all outer MMs to complete because the results are required + // by the next communication phase + for (auto &run : outer_mms) { + run.wait(); + } + #ifndef NDEBUG MPI_Barrier(MPI_COMM_WORLD); if (is_calulating_lu_block) std::cout << "---------------" << std::endl; #endif - - // Wait for all outer MMs to complete because the results are required - // by the next communication phase - std::for_each(outer_mms.begin(), outer_mms.end(), - [](xrt::run &e) { e.wait(); }); } } From 239d0a2c89290f00830b26e7bfb8290086403140 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 19 May 2022 14:16:38 +0100 Subject: [PATCH 070/318] Initialize kernels only once and reuse --- .../execution_types/execution_xrt_pcie.hpp | 53 +++++++------------ 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp index 33330ea4..6de18915 100644 --- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -71,21 +71,21 @@ std::unique_ptr calculate( MPI_Comm_split(MPI_COMM_WORLD, config.programSettings->torus_col, 0, &col_communicator); - // TODO: Select the correct memory groups! - // Create Buffers for input and output - // TODO: Need to set a memory group for the buffers here! 
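
The range-based wait loops above and the kernel objects hoisted below follow two patterns worth making explicit. First, every phase of these hosts ends with the same wait-and-check idiom on a batch of xrt::run handles; a minimal sketch, assuming run.wait() returns an ert_cmd_state as in the lu call earlier (wait_all is a hypothetical helper, not part of these patches):

    // Hypothetical helper: wait on a batch of XRT kernel runs and report
    // any command that did not complete successfully.
    // Assumes <vector>, <iostream> and the XRT native C++ API (xrt::run).
    inline void wait_all(std::vector<xrt::run> &runs, const char *phase) {
      for (auto &run : runs) {
        ert_cmd_state state = run.wait();
        if (state != ERT_CMD_STATE_COMPLETED) {
          std::cerr << "Execution " << phase << " failed: " << state << std::endl;
        }
      }
      runs.clear();
    }

Second, creating each xrt::kernel once also resolves the memory-group TODOs above: xrt::kernel::group_id(arg) returns the memory bank group wired to the given kernel argument, so each buffer can be allocated in the bank the kernel actually accesses. A sketch of the allocation pattern used below (names illustrative):

    // Allocate a block-sized buffer in the bank connected to argument 1 of "lu".
    xrt::kernel lu(device, program, "lu");
    xrt::bo lu_buffer(device,
                      sizeof(HOST_DATA_TYPE) * block_size * block_size,
                      lu.group_id(1));
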
+ xrt::kernel kernel_mm(*config.device, *config.program, "inner_update_mm0"); + xrt::kernel kernel_lu(*config.device, *config.program, "lu"); + xrt::kernel kernel_top(*config.device, *config.program, "top_update"); + xrt::kernel kernel_left(*config.device, *config.program, "left_update"); - auto lu_tmp_kernel = xrt::kernel(*config.device, *config.program, "lu"); xrt::bo Buffer_a(*config.device, data.A, sizeof(HOST_DATA_TYPE) * data.matrix_height * data.matrix_width, - lu_tmp_kernel.group_id(0)); + kernel_lu.group_id(0)); xrt::bo Buffer_b(*config.device, data.b, sizeof(HOST_DATA_TYPE) * data.matrix_width, - lu_tmp_kernel.group_id(0)); + kernel_lu.group_id(0)); xrt::bo Buffer_pivot(*config.device, data.ipvt, sizeof(cl_int) * data.matrix_height, - lu_tmp_kernel.group_id(0)); + kernel_lu.group_id(0)); /* --- Setup MPI communication and required additional buffers --- */ @@ -95,12 +95,12 @@ std::unique_ptr calculate( sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), - lu_tmp_kernel.group_id(1)); + kernel_lu.group_id(1)); xrt::bo Buffer_lu2(*config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), - lu_tmp_kernel.group_id(2)); + kernel_lu.group_id(2)); std::vector> Buffer_left_list(2); std::vector> Buffer_top_list(2); @@ -111,7 +111,7 @@ std::unique_ptr calculate( *config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), - lu_tmp_kernel.group_id(0)); + kernel_lu.group_id(0)); } for (int i = 0; i < blocks_per_col; i++) { @@ -119,7 +119,7 @@ std::unique_ptr calculate( *config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), - lu_tmp_kernel.group_id(2)); + kernel_lu.group_id(2)); } } @@ -216,8 +216,6 @@ std::unique_ptr calculate( { if (is_calulating_lu_block) { - // create the LU kernel - auto lu_kernel = xrt::kernel(*config.device, *config.program, "lu"); #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," @@ -225,7 +223,7 @@ std::unique_ptr calculate( << local_block_row << "," << local_block_col << std::endl; #endif auto lu_run = - lu_kernel(Buffer_a, Buffer_lu1, Buffer_lu2, local_block_col, + kernel_lu(Buffer_a, Buffer_lu1, Buffer_lu2, local_block_col, local_block_row, blocks_per_row); ert_cmd_state state = lu_run.wait(); if (state != ERT_CMD_STATE_COMPLETED) { @@ -254,7 +252,6 @@ std::unique_ptr calculate( // Create top kernels #pragma omp for for (int tops = start_col_index; tops < blocks_per_row; tops++) { - xrt::kernel k(*config.device, *config.program, "top_update"); #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Top " @@ -262,7 +259,7 @@ std::unique_ptr calculate( #endif comm_kernel_runs.push_back( - k(Buffer_a, + kernel_top(Buffer_a, Buffer_top_list[block_row % 2][tops - start_col_index], Buffer_lu1, (tops == start_col_index), tops, local_block_row, blocks_per_row)); @@ -275,14 +272,13 @@ std::unique_ptr calculate( // Create left kernels #pragma omp for for (int tops = start_row_index; tops < blocks_per_col; tops++) { - xrt::kernel k(*config.device, *config.program, "left_update"); #ifndef NDEBUG std::cout << "Torus " << config.programSettings->torus_row << "," << config.programSettings->torus_col << " Left " << tops << "," << local_block_col << std::endl; #endif comm_kernel_runs.push_back( - k(Buffer_a, + kernel_left(Buffer_a, Buffer_left_list[block_row % 2][tops 
- start_row_index], Buffer_lu2, (tops == start_row_index), local_block_col, tops, blocks_per_row)); @@ -339,10 +335,6 @@ std::unique_ptr calculate( #pragma omp for for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { - // select the matrix multiplication kernel that should be used for - // this block updated - xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); - int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols); @@ -357,7 +349,7 @@ std::unique_ptr calculate( << std::endl; #endif - outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][lbi], + outer_mms.push_back(kernel_mm(Buffer_a, Buffer_left_list[block_row % 2][lbi], Buffer_top_list[block_row % 2][0], current_block_col, current_block_row, blocks_per_row)); @@ -366,10 +358,6 @@ std::unique_ptr calculate( #pragma omp for for (int tbi = 0; tbi < num_inner_block_cols; tbi++) { - // select the matrix multiplication kernel that should be used for - // this block updated - xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); - int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - num_inner_block_cols + tbi); @@ -384,7 +372,7 @@ std::unique_ptr calculate( << std::endl; #endif - outer_mms.push_back(k(Buffer_a, Buffer_left_list[block_row % 2][0], + outer_mms.push_back(kernel_mm(Buffer_a, Buffer_left_list[block_row % 2][0], Buffer_top_list[block_row % 2][tbi], current_block_col, current_block_row, blocks_per_row)); @@ -397,10 +385,6 @@ std::unique_ptr calculate( #pragma omp for collapse(2) schedule(static) for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { for (int tbi = 1; tbi < num_inner_block_cols; tbi++) { - // select the matrix multiplication kernel that should be used for - // this block updated - - xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); int current_block_col = static_cast( (data.matrix_width / config.programSettings->blockSize) - @@ -417,7 +401,7 @@ std::unique_ptr calculate( #endif inner_mms.push_back( - k(Buffer_a, Buffer_left_list[block_row % 2][lbi], + kernel_mm(Buffer_a, Buffer_left_list[block_row % 2][lbi], Buffer_top_list[block_row % 2][tbi], current_block_col, current_block_row, blocks_per_row)); } @@ -428,6 +412,9 @@ std::unique_ptr calculate( for (auto &run : outer_mms) { run.wait(); } + for (auto &run : inner_mms) { + run.wait(); + } #ifndef NDEBUG MPI_Barrier(MPI_COMM_WORLD); From 7aaa838bbb6d6b5a6dbdc61aea9ce854e224b2dc Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 19 May 2022 14:21:28 +0100 Subject: [PATCH 071/318] Print diff for failing non-dist validation --- LINPACK/src/host/linpack_benchmark.hpp | 27 +++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index c6656ffd..dd33c0f0 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -341,13 +341,15 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark 0) { + auto base_data = this->generateInputData(); + if (this->mpi_comm_rank > 0) { for (int j = 0; j < matrix_height; j++) { for (int i = 0; i < matrix_width; i+= this->executionSettings->programSettings->blockSize) { MPI_Send(&data.A[matrix_width * j + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD); + MPI_Send(&base_data->A[matrix_width * j + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 1, 
MPI_COMM_WORLD);
            }
        }
-        if (executionSettings->programSettings->torus_row == 0) {
+        if (this->executionSettings->programSettings->torus_row == 0) {
             for (int i = 0; i < matrix_width; i+= this->executionSettings->programSettings->blockSize) {
                 MPI_Send(&data.b[i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD);
             }
@@ -360,17 +362,20 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark
         std::vector<HOST_DATA_TYPE> total_b_original(n);
         std::vector<HOST_DATA_TYPE> total_b(n);
         std::vector<HOST_DATA_TYPE> total_a(n*n);
+        std::vector<HOST_DATA_TYPE> total_a_old(n*n);
         for (int j = 0; j < n; j++) {
             for (int i = 0; i < n; i+= this->executionSettings->programSettings->blockSize) {
                 int recvcol= (i / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_width;
                 int recvrow= (j / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_height;
                 int recvrank = this->executionSettings->programSettings->torus_width * recvrow + recvcol;
                 if (recvrank > 0) {
-                    MPI_Recv(&total_a[j * n + i],executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 0, MPI_COMM_WORLD, &status);
+                    MPI_Recv(&total_a[j * n + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 0, MPI_COMM_WORLD, &status);
+                    MPI_Recv(&total_a_old[j * n + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 1, MPI_COMM_WORLD, &status);
                 }
                 else {
                     for (int k=0; k < this->executionSettings->programSettings->blockSize; k++) {
                         total_a[j * n + i + k] = data.A[current_offset + k];
+                        total_a_old[j * n + i + k] = base_data->A[current_offset + k];
                     }
                     current_offset += this->executionSettings->programSettings->blockSize;
                 }
@@ -397,6 +402,22 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark
             resid = (resid > std::abs(total_b[i] - 1)) ? resid : std::abs(total_b[i] - 1);
             normx = (normx > std::abs(total_b_original[i])) ? normx : std::abs(total_b_original[i]);
         }
+
+#ifndef NDEBUG
+        double residn = resid / (static_cast<double>(n)*normx*eps);
+        if (residn > 1.0) {
+            gefa_ref_nopvt(total_a_old.data(), n, n);
+
+            for (int i=0; i < n; i++) {
+                for (int j=0; j < n; j++) {
+                    double error = std::abs(total_a[i * n + j] - total_a_old[i * n + j]);
+                    std::cout << ((error > 1.0e-6) ? error : 0.0) << ",";
+                }
+                std::cout << std::endl;
+            }
+            std::cout << std::endl;
+        }
+#endif
     }
 #else
     double local_resid = 0;

From cf8113a020a443108aca9902468b94db42c5d279 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Fri, 20 May 2022 14:51:04 +0100
Subject: [PATCH 072/318] Add first C++ kernel version

---
 LINPACK/src/device/hpl_torus_PCIE.cpp | 799 ++++++++++++++++++++++++++
 1 file changed, 799 insertions(+)
 create mode 100644 LINPACK/src/device/hpl_torus_PCIE.cpp

diff --git a/LINPACK/src/device/hpl_torus_PCIE.cpp b/LINPACK/src/device/hpl_torus_PCIE.cpp
new file mode 100644
index 00000000..391ee48d
--- /dev/null
+++ b/LINPACK/src/device/hpl_torus_PCIE.cpp
@@ -0,0 +1,799 @@
+/*
+Copyright (c) 2019 Marius Meyer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#include "parameters.h" + +const unsigned block_size = (1 << LOCAL_MEM_BLOCK_LOG); +const unsigned gemm_block = (1 << REGISTER_BLOCK_LOG); +const unsigned gemm_block_mm = (1 << REGISTER_BLOCK_MM_LOG); + +#ifdef KERNEL_lu +/** +Executes a single step of the LU factorization. + +This method takes a partially solved 8x8 matrix and calculates the next step of +the LU factorization The method needs 7 (gemm_block-1) calls to perform a single +LU factorization. This is done to reduce resource usage, since all upcomng calls +are anyway depending on the results of the previous call and there is no way to +pipeline multiple executions. + +A is the input block that might be partially computed +step is the current step and must be a value between 0 to gemm_block-2. After +step gemm_block-2, the block is factorized + */ +void lu_block(const DEVICE_DATA_TYPE A[gemm_block][gemm_block], const int step, + DEVICE_DATA_TYPE A_out[gemm_block][gemm_block]) { + + // Read current line from input + DEVICE_DATA_TYPE line[gemm_block]; + for (int i = 0; i < gemm_block; i++) { + line[i] = A[step][i]; + } + + // calculate the inverse of the diagonal element for the scaling + DEVICE_DATA_TYPE inv_scale_a = -1.0 / line[step]; + + // Scale the current row + for (int i = 0; i < gemm_block; i++) { + if (i > step) { + line[i] = line[i] * inv_scale_a; + } + } + line[step] = inv_scale_a; + + // Update all rows fully unrolled + // The multiply adds are fully independent + //__attribute__((opencl_unroll_hint(gemm_block))) + // Unrolling disabled for this loop to save resources + for (int j = 0; j < gemm_block; j++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE curr_scale = A[j][step]; + // Update a single row. If it is already updated, just write back the value, + // if it is the current row write back the value in "line", else update the + // value + if (j != step) { + for (int i = 0; i < gemm_block; i++) { + A_out[j][i] = + (i > step && j > step) ? A[j][i] + line[i] * curr_scale : A[j][i]; + } + } else { + for (int i = 0; i < gemm_block; i++) { + A_out[j][i] = line[i]; + } + } + } +} + +/** +This function can be used to update blocks using with three different +operations. It will execute the update for a single row in the block. 
The update +is completed after gemm_block calls of this update function + +operation_type: 0 for top = the top row of blocks will need a triangular MM + 1 for left = the left column of blocks will need +a triangular MM, matrices have to be transposed 2 for inner block == all inner +blocks will be updated with a MM + */ +void update_block(const DEVICE_DATA_TYPE a[gemm_block][gemm_block], + const DEVICE_DATA_TYPE top[gemm_block], + const DEVICE_DATA_TYPE left_or_lu[gemm_block], + DEVICE_DATA_TYPE out[gemm_block][gemm_block], + const int current_row, const int operation_type) { + + // Define different operation types of function + const int op_top = 0; + const int op_left = 1; + const int op_inner = 2; + + // Transpose the input matrices if the target is a left block + DEVICE_DATA_TYPE current_block[gemm_block][gemm_block]; + if (operation_type == op_left) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + current_block[ii][jj] = a[jj][ii]; + } + } + } else { + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + current_block[ii][jj] = a[ii][jj]; + } + } + } + + // Generate the first scalling array depending on the operation type + DEVICE_DATA_TYPE scale_row[gemm_block]; + if (operation_type == op_inner) { + for (int jj = 0; jj < gemm_block; jj++) { + scale_row[jj] = top[jj]; + } + } else { + for (int jj = 0; jj < gemm_block; jj++) { + scale_row[jj] = current_block[current_row][jj]; + } + } + if (operation_type == op_top) { + for (int jj = 0; jj < gemm_block; jj++) { + scale_row[jj] *= left_or_lu[current_row]; + } + } + + DEVICE_DATA_TYPE tmp[gemm_block][gemm_block]; + // scale all values with the pre calculated scaling array and the second input + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + // left_or_lu_block are stored transposed to simplify the data access here + tmp[ii][jj] = current_block[ii][jj] + scale_row[jj] * left_or_lu[ii]; + } + } + + // overwrite results that were calculated altough they are not needed for the + // triangular operations left and top + if (operation_type != op_inner) { + for (int ii = 0; ii < gemm_block; ii++) { + if (ii == current_row) { + for (int jj = 0; jj < gemm_block; jj++) { + tmp[ii][jj] = scale_row[jj]; + } + } else if (ii < current_row) { + for (int jj = 0; jj < gemm_block; jj++) { + tmp[ii][jj] = current_block[ii][jj]; + } + } + } + } + + // write result back and transpose if necessary + if (operation_type == op_left) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + out[ii][jj] = tmp[jj][ii]; + } + } + } else { + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + out[ii][jj] = tmp[ii][jj]; + } + } + } +} + +#endif + +extern "C" { + +#ifdef KERNEL_lu +void lu(DEVICE_DATA_TYPE *a, DEVICE_DATA_TYPE *a_block_trans, + DEVICE_DATA_TYPE *a_block, const unsigned int block_col, const unsigned int block_row, + const unsigned int blocks_per_row) { + + DEVICE_DATA_TYPE a_buffer[block_size / gemm_block][block_size / gemm_block] + [gemm_block][gemm_block]; + + // Store current row and column in separate buffers for + // easier access in the deep pipeline + // need to be declared as local to prevent the compiler from + DEVICE_DATA_TYPE top_buffer[block_size / gemm_block][gemm_block]; + DEVICE_DATA_TYPE left_buffer[block_size / gemm_block][gemm_block]; + + // Load block to local memory +load_a_block: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 
0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { +#pragma HLS PIPELINE + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj]; + } + } + } + } + + // For each row in the matrix update whole matrix. + // The iterations depend on each other, so loop pipelining is disabled here +loop_diag: + for (int gk = 0; gk < block_size; gk++) { + + int k = gk / gemm_block; + int kk = gk & (gemm_block - 1); + + // Read in current LU block + DEVICE_DATA_TYPE lu_a_buffer_in[gemm_block][gemm_block]; +load_a_sb: + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + lu_a_buffer_in[ii][jj] = a_buffer[k][k][ii][jj]; + } + } + + DEVICE_DATA_TYPE lu_a_buffer_out[gemm_block][gemm_block]; + DEVICE_DATA_TYPE lu_a_buffer_out_row[gemm_block]; + DEVICE_DATA_TYPE lu_a_buffer_out_col[gemm_block]; + // Calculate next row and column of LU factorization and store in local + // memory buffer + lu_block(lu_a_buffer_in, kk, lu_a_buffer_out); +write_lu_sb: + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[k][k][ii][jj] = lu_a_buffer_out[ii][jj]; + } + } +write_lu_row: + for (int jj = 0; jj < gemm_block; jj++) { + lu_a_buffer_out_row[jj] = lu_a_buffer_out[kk][jj]; + } +write_lu_col: + for (int jj = 0; jj < gemm_block; jj++) { + lu_a_buffer_out_col[jj] = lu_a_buffer_out[jj][kk]; + } + + // The update pipeline does not need to be executed for the last + // row of blocks + if (gk < block_size - gemm_block) { + +update_inner: + // update all left blocks + for (int tj = 1; tj < block_size / gemm_block; tj++) { +#pragma HLS PIPELINE II=1 + + int j = k; + int i = tj; + + if (i > k) { + // copy the correct block in the second input buffer + // this depends on the operations that has to be executed + DEVICE_DATA_TYPE second_input[gemm_block]; + + // left matrix block will be calculated + for (int jj = 0; jj < gemm_block; jj++) { + second_input[jj] = lu_a_buffer_out_row[jj]; + } + DEVICE_DATA_TYPE a_input[gemm_block][gemm_block]; + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_input[ii][jj] = a_buffer[i][j][ii][jj]; + } + } + DEVICE_DATA_TYPE top_input[gemm_block]; + DEVICE_DATA_TYPE out[gemm_block][gemm_block]; + update_block(a_input, top_input, second_input, out, kk, 1); + + for (int ii = 0; ii < gemm_block; ii++) { + left_buffer[i][ii] = out[ii][kk]; + } + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = out[ii][jj]; + } + } + } + } + + // Update all other blocks with the new calculated row and column + // First update top blocks, then update left blocks, then all inner blocks + // ti == 0: top blocks + // ti == 1: left blocks + // ti > 1: inner blocks +update_inner_2: + for (int ti = 0; ti < block_size / gemm_block - k; ti++) { + for (int tj = 1; tj < block_size / gemm_block; tj++) { +#pragma HLS PIPELINE II=1 + + int j = tj; + int i = ti + k; + // always execute the pipeline for whole rows of matrix blocks. + // Only execute update for blocks that are required. 
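+          // (Concretely, the guard below, (i > k || ti == 0) && j > k, masks
+          //  out tiles that need no update in the current step while the loop
+          //  bounds stay fixed.)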
+ // This helps to keep constant latencies between data dependencies of + // the pipeline stages + if ((i > k || ti == 0) && j > k) { + + // copy the correct block in the second input buffer + // this depends on the operations that has to be executed + DEVICE_DATA_TYPE second_input[gemm_block]; + if (ti == 0) { + // top matrix block will be calculated + for (int jj = 0; jj < gemm_block; jj++) { + second_input[jj] = lu_a_buffer_out_col[jj]; + } + } else { + // inner block will be calculated + for (int jj = 0; jj < gemm_block; jj++) { + second_input[jj] = left_buffer[i][jj]; + } + } + DEVICE_DATA_TYPE a_input[gemm_block][gemm_block]; + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_input[ii][jj] = a_buffer[i][j][ii][jj]; + } + } + DEVICE_DATA_TYPE top_input[gemm_block]; + for (int jj = 0; jj < gemm_block; jj++) { + top_input[jj] = top_buffer[j][jj]; + } + DEVICE_DATA_TYPE out[gemm_block][gemm_block]; + update_block(a_input, top_input, second_input, out, kk, + (ti == 0) ? 0 : 2); + if (ti == 0) { + // only update in the first row + for (int jj = 0; jj < gemm_block; jj++) { + top_buffer[j][jj] = out[kk][jj]; + } + } + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = out[ii][jj]; + } + } + } + } + } + } + } + + // Store block to global memory +store_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } + // Store current block in global memory also transposed to allow easier access + // from the top kernel + store_a_bt: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_block_trans[(i * gemm_block + ii) * block_size + j * gemm_block + + jj] = a_buffer[j][i][jj][ii]; + } + } + } + } + +store_a_b: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_block[(i * gemm_block + ii) * block_size + j * gemm_block + jj] = + a_buffer[i][j][ii][jj]; + } + } + } + } +} +#endif + +#ifdef KERNEL_top_update +/** +Update the blocks to the right of the current LU block + + */ +void top_update(DEVICE_DATA_TYPE *a, DEVICE_DATA_TYPE *top_block, + const DEVICE_DATA_TYPE *lu_global_buffer_transposed, + const unsigned int is_first_block, const unsigned int block_col, + const unsigned int block_row, const unsigned int blocks_per_row) { + + // Store current block in local memory + DEVICE_DATA_TYPE + a_buffer[block_size / gemm_block][block_size / gemm_block][gemm_block] + [gemm_block]; + + // Load block to local memory +load_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj]; + } + } + } + } + +// For each row in the matrix update whole matrix. 
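+// (This kernel reads the LU factors from lu_global_buffer_transposed: the lu
+// kernel stores its block a second time in transposed layout (store_a_bt
+// above) so that a logical LU column can be fetched here as one contiguous,
+// burst-friendly row.)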
+// The iterations depend on each other, so loop pipelining is disabled here +diag_exe: + for (int gk = 0; gk < block_size; gk++) { + + int k = gk / gemm_block; + int kk = gk & (gemm_block - 1); + + DEVICE_DATA_TYPE current_lu_col[block_size / gemm_block][gemm_block]; + DEVICE_DATA_TYPE current_row[block_size / gemm_block][gemm_block]; + DEVICE_DATA_TYPE current_scale; + +scale_row: + for (int col = 0; col < block_size / gemm_block; col++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE col_in[gemm_block]; +#pragma HLS array_partition variable=col_in type=complete dim=0 + DEVICE_DATA_TYPE scale_chunk[gemm_block]; +#pragma HLS array_partition variable=col_in type=complete dim=0 + + // get current row chunk + for (int i = 0; i < gemm_block; i++) { + scale_chunk[i] = a_buffer[k][col][kk][i]; + } + + // if current column data is still available read it in and store it in + // buffer + if (col < block_size / gemm_block - k) { + // Load LU data from global memory instead of receiving it from the + // channel + for (int i = 0; i < gemm_block; i++) { + col_in[i] = + lu_global_buffer_transposed[gk * block_size + + (col + k) * gemm_block + i]; + } + if (col == 0) { + current_scale = col_in[kk]; + } + for (int i = 0; i < gemm_block; i++) { + current_lu_col[col][i] = (col > 0 || i > kk) ? col_in[i] : 0.f; + } + } + + // scale current row chunk with the rows scale factor received over the + // external channel + for (int i = 0; i < gemm_block; i++) { + scale_chunk[i] = scale_chunk[i] * current_scale; + } + + for (int i = 0; i < gemm_block; i++) { + current_row[col][i] = scale_chunk[i]; + } + + // Update local memory buffer with chunk + for (int i = 0; i < gemm_block; i++) { + a_buffer[k][col][kk][i] = scale_chunk[i]; + } + } + +// Update all remaining rows +update_rows: + for (int row = k; row < block_size / gemm_block; row++) { +#pragma HLS loop_tripcount min=0 max=block_size/gemm_block avg=block_size/gemm_block/2 + // Update whole rows! 
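+      // (Each pipelined trip below is a register-tile rank-1 update: the tile
+      //  a_buffer[row][curr_col] is incremented by the outer product of the
+      //  LU column chunk current_lu_col[row - k] with the freshly scaled row
+      //  chunk current_row[curr_col].)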
+ for (int curr_col = 0; curr_col < block_size / gemm_block; curr_col++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE colbuf[gemm_block]; + for (int j = 0; j < gemm_block; j++) { + colbuf[j] = current_lu_col[row - k][j]; + } + for (int i = 0; i < gemm_block; i++) { + for (int j = 0; j < gemm_block; j++) { + a_buffer[row][curr_col][i][j] += + colbuf[i] * current_row[curr_col][j]; + } + } + } + } + } + +// Store block to global memory +store_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } +// Store current block separately for easier transmission over host +store_top: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + top_block[(i * gemm_block + ii) * block_size + j * gemm_block + jj] = + a_buffer[i][j][ii][jj]; + } + } + } + } +} +#endif + +#ifdef KERNEL_left_update +/** +Update the blocks below the current LU block + + */ +void left_update(DEVICE_DATA_TYPE * a, + DEVICE_DATA_TYPE * left_block, + const DEVICE_DATA_TYPE * lu_global_buffer, + const unsigned int is_first_block, const unsigned int block_col, + const unsigned int block_row, const unsigned int blocks_per_row) { + + // Store current block in local memory + DEVICE_DATA_TYPE + a_buffer[block_size / gemm_block][block_size / gemm_block][gemm_block] + [gemm_block]; + + // Load block to local memory +load_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj]; + } + } + } + } + + // For each row in the matrix update whole matrix. + // The iterations depend on each other, so loop pipelining is disabled here +diag: + for (int gk = 0; gk < block_size; gk++) { + + int k = gk / gemm_block; + int kk = gk & (gemm_block - 1); + + DEVICE_DATA_TYPE current_lu_row[block_size / gemm_block][gemm_block]; + DEVICE_DATA_TYPE current_col[block_size / gemm_block][gemm_block]; + +first_col: + for (int col = 0; col < block_size / gemm_block; col++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE chunk[gemm_block]; + // get current row chunk + for (int i = 0; i < gemm_block; i++) { + chunk[i] = a_buffer[col][k][i][kk]; + } + + // Store chunk for later update + for (int i = 0; i < gemm_block; i++) { + current_col[col][i] = chunk[i]; + } + + DEVICE_DATA_TYPE row_in[gemm_block]; + + // if current column data is still available read it in and store it in + // buffer + if (col < block_size / gemm_block - k) { + // Load LU data from global memory + for (int i = 0; i < gemm_block; i++) { + row_in[i] = + lu_global_buffer[gk * block_size + (col + k) * gemm_block + i]; + } + for (int i = 0; i < gemm_block; i++) { + current_lu_row[col][i] = (col > 0 || i > kk) ? 
row_in[i] : 0.f; + } + } + } + + // Update all rows + // Update only remaining row chunks +update: + for (int curr_col = 0; curr_col < block_size / gemm_block - k; curr_col++) { +#pragma HLS loop_tripcount min=0 max=block_size/gemm_block avg=block_size/gemm_block/2 + for (int row = 0; row < block_size / gemm_block; row++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE colbuf[gemm_block]; + for (int j = 0; j < gemm_block; j++) { + colbuf[j] = current_col[row][j]; + } + for (int i = 0; i < gemm_block; i++) { + for (int j = 0; j < gemm_block; j++) { + a_buffer[row][curr_col + k][i][j] += + current_lu_row[curr_col][j] * colbuf[i]; + } + } + } + } + } + + // Store block to global memory +store_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } + + // Store current block separately for easier transmission over host +store_left: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + left_block[(i * gemm_block + ii) * block_size + j * gemm_block + jj] = + a_buffer[j][i][jj][ii]; + } + } + } + } +} +#endif + +#ifdef KERNEL_inner_update_mm0 +/** +Update the inner blocks using the left and right column and rows + + */ +void inner_update_mm0( + DEVICE_DATA_TYPE *a, const DEVICE_DATA_TYPE *left_global_buffer, + const DEVICE_DATA_TYPE *top_global_buffer, const unsigned int block_col, + const unsigned int block_row, const unsigned int blocks_per_row) { + + // Store current block in local memory + DEVICE_DATA_TYPE a_buffer[block_size / gemm_block_mm] + [block_size / gemm_block_mm][gemm_block_mm] + [gemm_block_mm]; + DEVICE_DATA_TYPE top_buffer[block_size / gemm_block_mm] + [block_size / gemm_block_mm][gemm_block_mm] + [gemm_block_mm]; + DEVICE_DATA_TYPE left_buffer[block_size / gemm_block_mm] + [block_size / gemm_block_mm][gemm_block_mm] + [gemm_block_mm]; + + // If Xilinx FPGA, load blocks in separate pipelines to achieve memory bursts! 
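+  // (Keeping a, top and left in three separate load loops means each loop
+  //  streams from a single global pointer, which is what lets the HLS tools
+  //  infer the long memory bursts mentioned above. The compute stage further
+  //  down then forms, per register tile, a_buffer += left_sub^T * top_sub.)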
+ // Load blocks to local memory +load_a_block: + for (int i = 0; i < block_size / gemm_block_mm; i++) { + for (int ii = 0; ii < gemm_block_mm; ii++) { + for (int j = 0; j < block_size / gemm_block_mm; j++) { +#pragma HLS PIPELINE II=1 + for (int jj = 0; jj < gemm_block_mm; jj++) { + a_buffer[i][j][ii][jj] = + a[block_col * block_size + + (block_row * block_size + i * gemm_block_mm + ii) * block_size * + blocks_per_row + + j * gemm_block_mm + jj]; + } + } + } + } + +load_top_block: + for (int i = 0; i < block_size / gemm_block_mm; i++) { + for (int ii = 0; ii < gemm_block_mm; ii++) { + for (int j = 0; j < block_size / gemm_block_mm; j++) { +#pragma HLS PIPELINE II=1 + for (int jj = 0; jj < gemm_block_mm; jj++) { + top_buffer[i][j][ii][jj] = + top_global_buffer[(i * gemm_block_mm + ii) * block_size + + j * gemm_block_mm + jj]; + } + } + } + } + +load_left_block: + for (int i = 0; i < block_size / gemm_block_mm; i++) { + for (int ii = 0; ii < gemm_block_mm; ii++) { + for (int j = 0; j < block_size / gemm_block_mm; j++) { +#pragma HLS PIPELINE II=1 + for (int jj = 0; jj < gemm_block_mm; jj++) { + left_buffer[i][j][ii][jj] = + left_global_buffer[(i * gemm_block_mm + ii) * block_size + + j * gemm_block_mm + jj]; + } + } + } + } + + // Update whole block +calc_subblocks: + for (int c = 0; + c < (block_size / gemm_block_mm) * (block_size / gemm_block_mm) * + (block_size / gemm_block_mm); + c++) { +#pragma HLS PIPELINE II=1 + + int mcol = + c / ((block_size / gemm_block_mm) * (block_size / gemm_block_mm)); + int row = + (c / (block_size / gemm_block_mm)) % (block_size / gemm_block_mm); + int curr_col = c & ((block_size / gemm_block_mm) - 1); + + DEVICE_DATA_TYPE top_sub[gemm_block_mm][gemm_block_mm]; + DEVICE_DATA_TYPE left_sub[gemm_block_mm][gemm_block_mm]; + +load_top_sb: + for (int i = 0; i < gemm_block_mm; i++) { + for (int j = 0; j < gemm_block_mm; j++) { + top_sub[i][j] = top_buffer[mcol][curr_col][i][j]; + } + } + +load_left_sb: + for (int i = 0; i < gemm_block_mm; i++) { + for (int j = 0; j < gemm_block_mm; j++) { + left_sub[i][j] = left_buffer[mcol][row][i][j]; + } + } + + DEVICE_DATA_TYPE result_sub[gemm_block_mm][gemm_block_mm]; +mmul: + for (int i = 0; i < gemm_block_mm; i++) { + for (int j = 0; j < gemm_block_mm; j++) { + // Calculate sum of whole column and only write it back once + DEVICE_DATA_TYPE sum = 0.0; + for (int k = 0; k < gemm_block_mm; k++) { + sum += left_sub[k][i] * top_sub[k][j]; + } + result_sub[i][j] = sum; + } + } + +add_sb: + for (int i = 0; i < gemm_block_mm; i++) { + for (int j = 0; j < gemm_block_mm; j++) { + a_buffer[row][curr_col][i][j] += result_sub[i][j]; + } + } + } + + // Store block to global memory +store_result: + for (int i = 0; i < block_size / gemm_block_mm; i++) { + for (int ii = 0; ii < gemm_block_mm; ii++) { + for (int j = 0; j < block_size / gemm_block_mm; j++) { + for (int jj = 0; jj < gemm_block_mm; jj++) { + a[block_col * block_size + + (block_row * block_size + i * gemm_block_mm + ii) * block_size * + blocks_per_row + + j * gemm_block_mm + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } +} + +#endif +} From ce8f0404c7311c678265b233719af0bdb9eb561d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 20 May 2022 14:52:23 +0100 Subject: [PATCH 073/318] Extend cmake to allow compiling multiple kernels for one link step --- cmake/kernelTargets.cmake | 71 +++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index b7a237a3..86aeeb1c 100644 --- 
a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -21,7 +21,7 @@ set(file_endings "cl" "cpp" ) function(generate_kernel_targets_xilinx) foreach (kernel_file_name ${ARGN}) string(REGEX MATCH "^custom_.*" is_custom_kernel ${kernel_file_name}) - if (is_custom_kernel) + if (is_custom_kernel) string(REPLACE "custom_" "" base_file_name ${kernel_file_name}) set(base_file_part "src/device/custom/${base_file_name}") else() @@ -48,8 +48,17 @@ function(generate_kernel_targets_xilinx) else() set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_copied_xilinx.${selected_file_ending}") endif() - set(bitstream_compile xilinx_tmp_compile/${kernel_file_name}.xo) - set(bitstream_compile_emulate xilinx_tmp_compile/${kernel_file_name}_emulate.xo) + if (DEFINED XILINX_KERNEL_NAMES) + set(bitstream_compile "") + set(bitstream_compile_emulate "") + foreach (kernel ${XILINX_KERNEL_NAMES}) + list(APPEND bitstream_compile xilinx_tmp_compile/${kernel_file_name}/${kernel}.xo) + list(APPEND bitstream_compile_emulate xilinx_tmp_compile/${kernel_file_name}/${kernel}_emulate.xo) + endforeach() + else() + set(bitstream_compile xilinx_tmp_compile/${kernel_file_name}.xo) + set(bitstream_compile_emulate xilinx_tmp_compile/${kernel_file_name}_emulate.xo) + endif() set(bitstream_emulate_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_emulate.xclbin) set(bitstream_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}.xclbin) @@ -62,7 +71,7 @@ function(generate_kernel_targets_xilinx) set(xilinx_link_settings ${CMAKE_BINARY_DIR}/settings/settings.link.xilinx.${kernel_file_name}.ini) endif() if (USE_ACCL AND is_accl_kernel) - list(APPEND additional_xos ${ACCL_XOS}) + list(APPEND additional_xos ${ACCL_XOS}) endif() set(xilinx_report_folder "${EXECUTABLE_OUTPUT_PATH}/xilinx_reports") set(local_CLFLAGS -DXILINX_FPGA) @@ -106,35 +115,47 @@ function(generate_kernel_targets_xilinx) ) endif() - add_custom_command(OUTPUT ${bitstream_compile_emulate} - COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -DEMULATE -t sw_emu ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} -f ${FPGA_BOARD_NAME} -g -c ${XILINX_COMPILE_FLAGS} -o ${bitstream_compile_emulate} ${source_f} - MAIN_DEPENDENCY ${source_f} - DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} - ) + foreach (kernel ${bitstream_compile_emulate}) + if (DEFINED XILINX_KERNEL_NAMES) + string(REGEX MATCH ".+/(.+)_emulate\.xo" kernel_name ${kernel}) + set(kernel_name_flag -k ${CMAKE_MATCH_1}) + endif() + add_custom_command(OUTPUT ${kernel} + COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -DKERNEL_${CMAKE_MATCH_1} -DEMULATE -t sw_emu ${kernel_name_flag} ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} -f ${FPGA_BOARD_NAME} -g -c ${XILINX_COMPILE_FLAGS} -o ${kernel} ${source_f} + MAIN_DEPENDENCY ${source_f} + DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} + ) + endforeach() add_custom_command(OUTPUT ${bitstream_emulate_f} COMMAND ${Vitis_COMPILER} ${local_CL_FLAGS} ${VPP_FLAGS} -DEMULATE -t sw_emu ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} -f ${FPGA_BOARD_NAME} -g -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_emulate_f} ${bitstream_compile_emulate} - MAIN_DEPENDENCY ${bitstream_compile_emulate} + DEPENDS ${bitstream_compile_emulate} DEPENDS ${xilinx_link_settings} ) - add_custom_command(OUTPUT ${bitstream_compile} - COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -c ${XILINX_COMPILE_FLAGS} -o ${bitstream_compile} ${source_f} - 
MAIN_DEPENDENCY ${source_f} - DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} - ) + foreach (kernel ${bitstream_compile}) + if (DEFINED XILINX_KERNEL_NAMES) + string(REGEX MATCH ".+/(.+)\.xo" kernel_name ${kernel}) + set(kernel_name_flag -k ${CMAKE_MATCH_1}) + endif() + add_custom_command(OUTPUT ${kernel} + COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw -DKERNEL_${CMAKE_MATCH_1} ${kernel_name_flag} ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -c ${XILINX_COMPILE_FLAGS} -o ${kernel} ${source_f} + MAIN_DEPENDENCY ${source_f} + DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} + ) + endforeach() add_custom_command(OUTPUT ${bitstream_f} COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} ${local_harware_only_flags} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} -o ${bitstream_f} ${additional_xos} ${bitstream_compile} - MAIN_DEPENDENCY ${bitstream_compile} + DEPENDS ${bitstream_compile} DEPENDS ${xilinx_link_settings} ) - add_custom_target(${kernel_file_name}_emulate_xilinx - DEPENDS ${bitstream_emulate_f} + add_custom_target(${kernel_file_name}_emulate_xilinx + DEPENDS ${bitstream_emulate_f} DEPENDS ${source_f} ${CMAKE_BINARY_DIR}/src/common/parameters.h ${EXECUTABLE_OUTPUT_PATH}/emconfig.json) add_custom_target(${kernel_file_name}_xilinx - DEPENDS ${bitstream_f} + DEPENDS ${bitstream_f} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) add_custom_target(${kernel_file_name}_report_xilinx - DEPENDS ${bitstream_compile} + DEPENDS ${bitstream_compile} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) if(USE_ACCL AND is_accl_kernel) @@ -153,7 +174,7 @@ endfunction() function(generate_kernel_targets_intel) foreach (kernel_file_name ${ARGN}) string(REGEX MATCH "^custom_.*" is_custom_kernel ${kernel_file_name}) - if (is_custom_kernel) + if (is_custom_kernel) string(REPLACE "custom_" "" base_file_name ${kernel_file_name}) set(base_file_part "src/device/custom/${base_file_name}") else() @@ -192,7 +213,7 @@ function(generate_kernel_targets_intel) DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${report_f} ) add_custom_command(OUTPUT ${EXECUTABLE_OUTPUT_PATH}/${bitstream_f} - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bitstream_f} ${EXECUTABLE_OUTPUT_PATH}/${bitstream_f} + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bitstream_f} ${EXECUTABLE_OUTPUT_PATH}/${bitstream_f} COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_BINARY_DIR}/${kernel_file_name}/reports ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_synth_reports COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${kernel_file_name}/acl_quartus_report.txt ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_synth_reports/acl_quartus_report.txt COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${kernel_file_name}/quartus_sh_compile.log ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_synth_reports/quartus_sh_compile.log @@ -217,11 +238,11 @@ function(generate_kernel_targets_intel) MAIN_DEPENDENCY ${source_f} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) - add_custom_target(${kernel_file_name}_report_intel + add_custom_target(${kernel_file_name}_report_intel DEPENDS ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_reports/report.html) - add_custom_target(${kernel_file_name}_intel + add_custom_target(${kernel_file_name}_intel DEPENDS ${EXECUTABLE_OUTPUT_PATH}/${bitstream_f}) - add_custom_target(${kernel_file_name}_emulate_intel + 
add_custom_target(${kernel_file_name}_emulate_intel DEPENDS ${EXECUTABLE_OUTPUT_PATH}/${bitstream_emulate_f}) list(APPEND kernel_emulation_targets_intel ${kernel_file_name}_emulate_intel) set(kernel_emulation_targets_intel ${kernel_emulation_targets_intel} CACHE INTERNAL "Kernel emulation targets used to define dependencies for the tests for intel devices") From d379d0cb50188ff1b86998b11302afc96cbcf55b Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 20 May 2022 14:52:37 +0100 Subject: [PATCH 074/318] Add ACCL kernel link --- LINPACK/src/device/CMakeLists.txt | 2 +- LINPACK/src/device/hpl_torus_ACCL_buffers.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 120000 LINPACK/src/device/hpl_torus_ACCL_buffers.cpp diff --git a/LINPACK/src/device/CMakeLists.txt b/LINPACK/src/device/CMakeLists.txt index 7a28cc56..2e9431a5 100644 --- a/LINPACK/src/device/CMakeLists.txt +++ b/LINPACK/src/device/CMakeLists.txt @@ -10,7 +10,7 @@ if (INTELFPGAOPENCL_FOUND) endif() if (VITIS_FOUND) - generate_kernel_targets_xilinx(hpl_torus_PCIE) + generate_kernel_targets_xilinx(hpl_torus_PCIE hpl_torus_ACCL_buffers) add_test(NAME test_emulation_xilinx COMMAND Linpack_xilinx -f hpl_torus_PCIE_emulate.xclbin -m 2 -n 1 ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_xilinx ${TEST_HOST_FLAGS} -f hpl_torus_PCIE_emulate.xclbin -m 2 -n 1 diff --git a/LINPACK/src/device/hpl_torus_ACCL_buffers.cpp b/LINPACK/src/device/hpl_torus_ACCL_buffers.cpp new file mode 120000 index 00000000..a11753b1 --- /dev/null +++ b/LINPACK/src/device/hpl_torus_ACCL_buffers.cpp @@ -0,0 +1 @@ +hpl_torus_PCIE.cpp \ No newline at end of file From 019aa15f9246a2b66728c6a85a022a947c3d8942 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 20 May 2022 14:56:56 +0100 Subject: [PATCH 075/318] Update PCIE config for U280 --- ...CCL_buffers.cmake => Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake} | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) rename LINPACK/configs/{Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake => Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake} (86%) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake similarity index 86% rename from LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake rename to LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake index 941a1d78..5ddc6b30 100644 --- a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_ACCL_buffers.cmake +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake @@ -14,11 +14,13 @@ set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) - +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE) +set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) # LINPACK specific options set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be 
replicated" FORCE) set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) From 0eab2932bff646600d7daa623a64607219d08d3a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 20 May 2022 18:20:24 +0100 Subject: [PATCH 076/318] Fix build for XRT host without ACCL --- LINPACK/src/host/execution_types/execution_types.hpp | 2 +- LINPACK/src/host/linpack_benchmark.hpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/LINPACK/src/host/execution_types/execution_types.hpp b/LINPACK/src/host/execution_types/execution_types.hpp index 294115ea..457f4e85 100644 --- a/LINPACK/src/host/execution_types/execution_types.hpp +++ b/LINPACK/src/host/execution_types/execution_types.hpp @@ -27,9 +27,9 @@ SOFTWARE. #include "execution_types/execution_iec.hpp" #endif #ifdef USE_XRT_HOST +#include "execution_types/execution_xrt_pcie.hpp" #ifdef USE_ACCL #include "execution_types/execution_accl_buffers.hpp" -#include "execution_types/execution_xrt_pcie.hpp" #endif #endif #endif \ No newline at end of file diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index dd33c0f0..d1d3093c 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -314,7 +314,9 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmarkexecutionSettings, data); break; +#ifdef USE_ACCL case hpcc_base::CommunicationType::accl : timings = execution::accl_buffers::calculate(*this->executionSettings, data); break; +#endif #endif default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(this->executionSettings->programSettings->communicationType)); } From 6d90f70b21aba131d8b0cc3a54391cb2cf0e5a3c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 20 May 2022 18:20:56 +0100 Subject: [PATCH 077/318] Adjust to changed ACCL interface --- LINPACK/src/host/execution_types/execution_accl_buffers.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp index 29645a51..5e26e267 100644 --- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -86,9 +86,9 @@ std::unique_ptr calculate( } // Create communicators from sub-groups - ACCL::CommunicatorId row_comm = config.accl->create_communicator( + ACCL::communicatorId row_comm = config.accl->create_communicator( row_ranks, config.programSettings->torus_col); - ACCL::CommunicatorId col_comm = config.accl->create_communicator( + ACCL::communicatorId col_comm = config.accl->create_communicator( col_ranks, config.programSettings->torus_row); // Create global memory buffers From 41c26033fac43196877d6da317e3fc313db437f8 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 20 May 2022 18:21:43 +0100 Subject: [PATCH 078/318] Use CMake parameter for ACCL buffer size --- LINPACK/CMakeLists.txt | 5 +++++ LINPACK/src/common/parameters.h.in | 1 + 2 files changed, 6 insertions(+) diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt index fb17db96..a33cc82f 100755 --- a/LINPACK/CMakeLists.txt +++ b/LINPACK/CMakeLists.txt @@ -19,6 +19,11 @@ if (TEST_UNIFORM) set(TEST_HOST_FLAGS "--uniform") endif() +if (USE_ACCL) + math(EXPR calculate_accl_buffer_size "(2^${LOCAL_MEM_BLOCK_LOG})^2 * 8") + set(ACCL_BUFFER_SIZE ${calculate_accl_buffer_size} CACHE STRING 
"Size of ACCL buffers in bytes") +endif() + if (TEST_EMULATION) set(TEST_HOST_FLAGS "--emulation") endif() diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in index 4c036fb9..a5bac5e0 100644 --- a/LINPACK/src/common/parameters.h.in +++ b/LINPACK/src/common/parameters.h.in @@ -30,6 +30,7 @@ #cmakedefine USE_SVM #cmakedefine DISTRIBUTED_VALIDATION +#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@ /* Short description of the program From 26a4dea22424b0a62b6ac67f17a7650c6c0e8562 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 24 May 2022 14:11:45 +0100 Subject: [PATCH 079/318] Add initial ACCL stream implementation --- .../configs/Xilinx_U280_DDR_ACCL_stream.cmake | 28 +++ ...nk.xilinx.transpose_pq_accl_stream.ddr.ini | 76 +++++++ PTRANS/src/device/CMakeLists.txt | 2 +- .../src/device/transpose_PQ_ACCL_stream.cpp | 202 ++++++++++++++++++ 4 files changed, 307 insertions(+), 1 deletion(-) create mode 100644 PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini create mode 100644 PTRANS/src/device/transpose_PQ_ACCL_stream.cpp diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake new file mode 100644 index 00000000..89114c4d --- /dev/null +++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake @@ -0,0 +1,28 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(ACCL_STACK_TYPE "TCP" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini new file mode 100644 index 00000000..3c7fcf31 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini @@ -0,0 +1,76 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_sum:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=transpose_read0:1
+nk=transpose_write0:1
+
+# Kernels Floorplanning
+slr=compression_0_0:SLR0
+slr=compression_0_1:SLR0
+slr=compression_0_2:SLR0
+slr=arith_0:SLR0
+slr=ccl_offload_0:SLR0
+slr=hostctrl_0:SLR0
+slr=networklayer_0:SLR1
+slr=cmac_0:SLR2
+slr=transpose_read0_1:SLR0
+slr=transpose_write0_1:SLR0
+
+sp=ccl_offload_0.m_axi_0:DDR[0:1]
+sp=ccl_offload_0.m_axi_1:DDR[0:1]
+sp=transpose_read0_1.m_axi_gmem0:DDR[0:1]
+sp=transpose_write0_1.m_axi_gmem0:DDR[0]
+sp=transpose_write0_1.m_axi_gmem1:DDR[1]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl
+stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl
+

diff --git a/PTRANS/src/device/CMakeLists.txt b/PTRANS/src/device/CMakeLists.txt
index 21176719..34c47551 100644
--- a/PTRANS/src/device/CMakeLists.txt
+++ b/PTRANS/src/device/CMakeLists.txt
@@ -11,7 +11,7 @@ if (INTELFPGAOPENCL_FOUND)
 endif()

 if (VITIS_FOUND)
-    generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_ACCL_buffers)
+    generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_ACCL_buffers transpose_PQ_ACCL_stream)
     add_test(NAME test_emulation_PQ_PCIE_xilinx COMMAND Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
     add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})

diff --git
a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp
new file mode 100644
index 00000000..223dfd53
--- /dev/null
+++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp
@@ -0,0 +1,202 @@
+/******************************************************************************
+ *  Author: Arjun Ramaswami
+ *
+ *  Edited by Marius Meyer:
+ *  - Adapt to used kernel signature
+ *  - Change to row-column loop structure
+ *****************************************************************************/
+#include "parameters.h"
+#include "hls_stream.h"
+#include "ap_int.h"
+#include "ap_utils.h"
+#include "ap_axi_sdata.h"
+
+const unsigned int block_size = BLOCK_SIZE;
+const unsigned int channel_width = CHANNEL_WIDTH;
+
+extern "C" {
+
+// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
+
+/**
+ * Read blocks of matrix A and transpose them in memory.
+ * Write the block into an external channel.
+ *
+ * Will do the following:
+ *
+ * A -> trans(A) -> ext. ch
+ *
+ * @param A Buffer for matrix A
+ * @param offset_a Offset in blocks that is used to read the current block of A. Since A is read column-wise
+                on the block level, the whole matrix A might be written to global memory and the relevant columns
+                need to be picked using this offset.
+ * @param number_of_blocks The number of blocks that will be processed starting from the block offset
+ * @param width_in_blocks The width of matrix A in blocks
+ * @param height_in_blocks The height of matrix A in blocks
+ * @param krnl2cclo Stream that forwards the transposed blocks to the CCLO
+ */
+void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
+            const unsigned int offset_a,
+            const unsigned int number_of_blocks,
+            const unsigned int width_in_blocks,
+            const unsigned int height_in_blocks,
+            hls::stream<ap_axiu<512, 0, 0, 8> > &krnl2cclo) {
+#pragma HLS INTERFACE axis register both port=krnl2cclo
+#pragma HLS INTERFACE ap_ctrl_none port=return
+
+
+    // local memory double buffer for a matrix block
+    DEVICE_DATA_TYPE a_block[2][block_size * block_size / channel_width][channel_width];
+#pragma HLS ARRAY_PARTITION variable = a_block complete dim = 3
+
+    // transpose the matrix block-wise from global memory
+block_loop:
+    for (unsigned int block = 0; block < number_of_blocks + 1; block++) {
+
+read_A:
+        for (unsigned int row = 0; row < block_size; row++) {
+read_A_line:
+            for (unsigned int col = 0; col < block_size / channel_width; col++) {
+#pragma HLS PIPELINE
+                unsigned long block_row_a = (block + offset_a) / width_in_blocks;
+                unsigned long block_col_a = (block + offset_a) % width_in_blocks;
+                unsigned long ls_address_trans = block_col_a * block_size * block_size * height_in_blocks +
+                                                 block_row_a * block_size +
+                                                 row * block_size * height_in_blocks;
+
+#ifdef EMULATE
+                // This condition is actually required to not read out of bounds
+                // but prevents memory bursts, so for hardware this should be removed
+                // In emulation it prevents segfaults
+                if (block < number_of_blocks) {
+#endif
+                    // read in block of A from global memory and store it in a memory efficient manner for transpose
+                    DEVICE_DATA_TYPE rotate_in[channel_width];
+#pragma HLS ARRAY_PARTITION variable = rotate_in complete dim = 0
+
+                    // Blocks of a will be stored columnwise in global memory
+                    for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                        rotate_in[unroll_count] = A[ls_address_trans + col * channel_width + unroll_count];
+                    }
+
+                    unsigned int chunk = row * (block_size / channel_width) + col;
+
+                    unsigned rot = (row) % (channel_width);
+
+                    // rotate temporary buffer to store data into local buffer
+                    for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                        // every block of (N / channel_width), rotates the index by 1
+                        // store in double buffer
+                        a_block[block & 1][chunk][unroll_count] = rotate_in[(unroll_count + channel_width - rot)
+                                                                            % (channel_width)];
+                    }
+#ifdef EMULATE
+                }
+#endif
+                if (block > 0) {
+                    DEVICE_DATA_TYPE data_chunk[channel_width];
+#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0
+                    DEVICE_DATA_TYPE rotate_out[channel_width];
+#pragma HLS ARRAY_PARTITION variable = rotate_out complete dim = 0
+
+                    unsigned int base = col * block_size;
+                    unsigned int offset = row / channel_width;
+
+                    for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                        unsigned rot = ((channel_width + unroll_count - row) * (block_size / channel_width)) %
+                                       (block_size);
+                        unsigned row_rotate = base + offset + rot;
+                        rotate_out[unroll_count] = a_block[(block - 1) & 1][row_rotate][unroll_count];
+                    }
+
+                    unsigned rot_out = row % (channel_width);
+
+                    // undo the rotation to restore the original element order
+                    for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                        data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % (channel_width)];
+                    }
+
+                    ap_uint<512> data = 0;
+
+                    // pack the transposed chunk into a single 512 bit stream word
+                    for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                        data |= ((ap_uint<512>)((ap_uint<8*sizeof(DEVICE_DATA_TYPE)>)data_chunk[unroll_count])) << (unroll_count * 8 * sizeof(DEVICE_DATA_TYPE));
+                    }
+
+                    ap_axiu<512, 0, 0, 8> tmp;
+                    tmp.data = data;
+                    tmp.dest = 0;
+                    tmp.keep = -1;
+                    krnl2cclo.write(tmp);
+                }
+            }
+        }
+    }
+}
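A short aside on the rotation scheme used by transpose_read above (illustrative, not part of the kernel source): each chunk is rotated by row % channel_width before it is stored, so consecutive output rows never read from the same local-memory partition, and the inverse rotation on the way out restores the element order. A scaled-down, host-only sketch of the two rotations cancelling, with channel_width reduced to 4:

// Host-only sketch of the rotate-on-store / rotate-on-load pattern above.
// cw and rot are stand-ins for CHANNEL_WIDTH and row % channel_width.
#include <array>
#include <cassert>

int main() {
    constexpr unsigned cw = 4;
    std::array<int, cw> in{10, 11, 12, 13}, stored{}, out{};
    const unsigned rot = 3;
    for (unsigned u = 0; u < cw; u++)        // store path, as in transpose_read
        stored[u] = in[(u + cw - rot) % cw];
    for (unsigned u = 0; u < cw; u++)        // load path: inverse rotation
        out[u] = stored[(u + rot) % cw];
    assert(out == in);                       // the rotation is undone
    return 0;
}
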
+/**
+ * Read transposed blocks of A from the external channel, add the matching
+ * blocks of B and write the result to A_out:
+ *
+ * ext. channel -> trans(A) + B -> A_out
+ *
+ * @param B Buffer for matrix B
+ * @param A_out Buffer for result matrix
+ * @param offset_b Offset in blocks that is used to read the current block of B. Since A is read column-wise
+                on the block level, the whole matrix A might be written to global memory and the relevant columns
+                need to be picked using this offset.
+ * @param number_of_blocks The number of blocks that will be processed starting from the block offset
+ * @param width_in_blocks The width of matrix A in blocks
+ * @param height_in_blocks The height of matrix A in blocks
+ * @param cclo2krnl Stream of transposed blocks of A arriving from the CCLO
+ */
+void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B,
+            DEVICE_DATA_TYPE *A_out,
+            const unsigned int offset_b,
+            const unsigned int number_of_blocks,
+            const unsigned int width_in_blocks,
+            const unsigned int height_in_blocks,
+            hls::stream<ap_axiu<512, 0, 0, 8> > &cclo2krnl) {
+#pragma HLS INTERFACE axis register both port=cclo2krnl
+#pragma HLS INTERFACE ap_ctrl_none port=return
+
+    // transpose the matrix block-wise from global memory
+block_loop:
+    for (unsigned int block = 0; block < number_of_blocks; block++) {
+
+        // Read transposed A from local memory and add B
+read_B:
+        for (unsigned int row = 0; row < block_size; row++) {
+read_B_line:
+            for (unsigned int col = 0; col < block_size / channel_width; col++) {
+                unsigned long block_row = (block + offset_b) / width_in_blocks;
+                unsigned long block_col = (block + offset_b) % width_in_blocks;
+                unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks +
+                                               block_col * block_size +
+                                               row * block_size * width_in_blocks;
+                unsigned int chunk = row * (block_size / channel_width) + col;
+
+                DEVICE_DATA_TYPE data_chunk[channel_width];
+#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0
+
+                ap_axiu<512, 0, 0, 8> tmp = cclo2krnl.read();
+
+                // unpack the received 512 bit stream word into single values
+                for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                    data_chunk[unroll_count] = (DEVICE_DATA_TYPE)(ap_uint<8*sizeof(DEVICE_DATA_TYPE)>)(tmp.data >> (unroll_count * 8 * sizeof(DEVICE_DATA_TYPE)));
+                }
+
+                // add the matching chunk of B to the received transposed chunk of A
+                for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                    data_chunk[unroll_count] += B[ls_address_row + col * channel_width + unroll_count];
+                }
+
+                for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) {
+                    A_out[ls_address_row + col * channel_width + unroll_count] = data_chunk[unroll_count];
+                }
+            }
+        }
+    }
+}
+
+// PY_CODE_GEN block_end
+
+}
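The two kernels of this patch exchange data as 512 bit ap_axiu words into which channel_width values are packed, each at a bit offset of unroll_count * 8 * sizeof(DEVICE_DATA_TYPE). A minimal host-only sketch of the same pack/unpack arithmetic, scaled down to a 64-bit word holding two 32-bit values so it compiles without the HLS headers:

// Illustrative only: the bit-offset packing used by the stream kernels above.
#include <cassert>
#include <cstdint>

int main() {
    const std::uint32_t chunk[2] = {0xAABBCCDDu, 0x11223344u};
    std::uint64_t word = 0;
    for (unsigned u = 0; u < 2; u++)   // pack, as on the krnl2cclo side
        word |= static_cast<std::uint64_t>(chunk[u]) << (u * 8 * sizeof(std::uint32_t));
    for (unsigned u = 0; u < 2; u++) { // unpack, as on the cclo2krnl side
        std::uint32_t v = static_cast<std::uint32_t>(word >> (u * 8 * sizeof(std::uint32_t)));
        assert(v == chunk[u]);
    }
    return 0;
}
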
From 6c12ead036eb8926dcf2ef3d76208b58469d2744 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 24 May 2022 15:08:48 +0100
Subject: [PATCH 080/318] Adding tripcount for better reports

---
 PTRANS/src/device/transpose_PQ_ACCL_stream.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp
index 223dfd53..2cda216f 100644
--- a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp
+++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp
@@ -53,6 +53,7 @@ void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A,
     // transpose the matrix block-wise from global memory
 block_loop:
     for (unsigned int block = 0; block < number_of_blocks + 1; block++) {
+#pragma HLS loop_tripcount min=1 max=1024 avg=1

 read_A:
         for (unsigned int row = 0; row < block_size; row++) {
@@ -161,7 +162,7 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B,
     // transpose the matrix block-wise from global memory
 block_loop:
     for (unsigned int block = 0; block < number_of_blocks; block++) {
-
+#pragma HLS loop_tripcount min=1 max=1024 avg=1
         // Read transposed A from local memory and add B
 read_B:
         for (unsigned int row = 0; row < block_size; row++) {

From e1bf6e86341a6006bb4b3c818e2e572bc0d7ea0e Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 3 Jun 2022 14:59:18 +0100
Subject: [PATCH 081/318] Add support for multiple CMAC kernels for UDP --- cmake/accl.cmake | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index 2875657d..cdf23b0b 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -1,7 +1,7 @@ # General definitions set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL") -set(ACCL_UDP_ETH_IF 0 CACHE STRING "Ethernet interface used. On ETHZ: 0 = switch, 1 = direct") +set(ACCL_UDP_ETH_IFS 1 CACHE STRING "Number of Ethernet interfaces to synthesize for UDP stack") set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform") set(ACCL_BUFFER_SIZE 8192 CACHE STRING "Size of ACCL buffers in bytes") set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware) @@ -10,7 +10,6 @@ set(ACCL_CCLO_BUILD_ARGS ${ACCL_CCLO_ADDITIONAL_BUILD_ARGS}) # UDP related definitions set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/) set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core) -set(ACCL_UDP_MAC_XO ${ACCL_VNX_DIR}/Ethernet/_x.${FPGA_BOARD_NAME}/cmac_${ACCL_UDP_ETH_IF}.xo) set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo) set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HMB) if (ACCL_STACK_TYPE STREQUAL "UDP") @@ -19,10 +18,17 @@ if (ACCL_STACK_TYPE STREQUAL "UDP") list(APPEND ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE}) endif() -add_custom_command( - OUTPUT ${ACCL_UDP_MAC_XO} - COMMAND make -C ${ACCL_VNX_DIR}/Ethernet DEVICE=${FPGA_BOARD_NAME} INTERFACE=${ACCL_UDP_ETH_IF} all - WORKING_DIRECTORY ${ACCL_HARDWARE_DIR}) +set(ACCL_UDP_MAC_XOS "") + +math(EXPR loopend "${ACCL_UDP_ETH_IFS} - 1") +foreach(i RANGE ${loopend}) + set(CURRENT_MAC_XO ${ACCL_VNX_DIR}/Ethernet/_x.${FPGA_BOARD_NAME}/cmac_${i}.xo) + add_custom_command( + OUTPUT ${CURRENT_MAC_XO} + COMMAND make -C ${ACCL_VNX_DIR}/Ethernet DEVICE=${FPGA_BOARD_NAME} INTERFACE=${i} all + WORKING_DIRECTORY ${ACCL_HARDWARE_DIR}) + list(APPEND ACCL_UDP_MAC_XOS ${CURRENT_MAC_XO}) +endforeach() add_custom_command( OUTPUT ${ACCL_UDP_NET_XO} @@ -31,7 +37,7 @@ add_custom_command( add_custom_target( accl_udp_stack - DEPENDS ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO}) + DEPENDS ${ACCL_UDP_MAC_XOS} ${ACCL_UDP_NET_XO}) # TCP related definitions set(ACCL_TCP_BASE_DIR ${ACCL_HARDWARE_DIR}/Vitis_with_100Gbps_TCP-IP) @@ -114,7 +120,7 @@ add_custom_target( ${ACCL_PLUGINS_COMPRESSION}) set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} - ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XO} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL with UDP") + ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XOS} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL with UDP") set(ACCL_TCP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_TCP_CMAC_XO} ${ACCL_TCP_XO} CACHE INTERNAL "Object files required for ACCL with TCP") From 75ecdb73744149e6e05d69cafa813f324c10706a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 5 Jul 2022 11:00:25 +0200 Subject: [PATCH 082/318] Add ACCL buffer size to b_eff --- b_eff/CMakeLists.txt | 5 +++++ b_eff/src/common/parameters.h.in | 1 + b_eff/src/host/CMakeLists.txt | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/b_eff/CMakeLists.txt 
b/b_eff/CMakeLists.txt
index 13d93b1b..f150bcc9 100755
--- a/b_eff/CMakeLists.txt
+++ b/b_eff/CMakeLists.txt
@@ -19,6 +19,11 @@ set(USE_DEPRECATED_HPP_HEADER No)

 set(COMMUNICATION_TYPE_SUPPORT_ENABLED Yes)

+if (USE_ACCL)
+    math(EXPR calculate_accl_buffer_size "(1 << ${DEFAULT_MAX_MESSAGE_SIZE}) * 4")
+    set(ACCL_BUFFER_SIZE ${calculate_accl_buffer_size} CACHE STRING "Size of ACCL buffers in bytes")
+endif()
+
 set(DATA_TYPE char)
 include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake)
 unset(DATA_TYPE CACHE)

diff --git a/b_eff/src/common/parameters.h.in b/b_eff/src/common/parameters.h.in
index d404bfd7..5c823610 100644
--- a/b_eff/src/common/parameters.h.in
+++ b/b_eff/src/common/parameters.h.in
@@ -23,6 +23,7 @@
 #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@

 #cmakedefine HOST_EMULATION_REORDER
+#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@

 /*
 Short description of the program.

diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt
index d0be57ba..5e22b54a 100755
--- a/b_eff/src/host/CMakeLists.txt
+++ b/b_eff/src/host/CMakeLists.txt
@@ -27,7 +27,7 @@ if (Vitis_FOUND)
     target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base)
     target_link_libraries(${LIB_NAME}_xilinx accl)
     target_include_directories(${LIB_NAME}_xilinx PRIVATE ${ACCL_INCLUDE_PATH})
-    target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx)
+    target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx zmqpp)
     target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA)
     target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA)
     target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")

From 1ac78f8a31788e1cd8dec0cf5ef9473ce9080b85 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 5 Jul 2022 11:00:41 +0200
Subject: [PATCH 083/318] Fix b_eff accl calls

---
 b_eff/src/host/execution_types/execution_accl.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
index 81673835..c4686b29 100644
--- a/b_eff/src/host/execution_types/execution_accl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -79,8 +79,8 @@ namespace network::execution_types::accl {
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
             for (int l = 0; l < looplength; l++) {
-                config.accl->send(0, *acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
-                config.accl->recv(0, *acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                config.accl->send(*acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                config.accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
             }
             auto endCalculation = std::chrono::high_resolution_clock::now();
             calculationTime += std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation - startCalculation).count();

From 3ad2069ba9404646ebd45d8072c447c8572ce75f Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 5 Jul 2022 11:01:02 +0200
Subject: [PATCH 084/318] Switch default accl branch to main

---
 extern/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt
index 341f73cd..5587a0e1 100644
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -62,7 +62,7
@@ FetchContent_Declare( extern_accl GIT_REPOSITORY https://github.com/Xilinx/ACCL.git - GIT_TAG dev) + GIT_TAG main) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) From 67f6967d1a7828a644b0fb9cec7f3a84a967c605 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 5 Jul 2022 11:01:24 +0200 Subject: [PATCH 085/318] Link PTRANS host with zmqpp --- PTRANS/src/host/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 2404394f..5bb10e54 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -39,7 +39,7 @@ if (Vitis_FOUND) add_executable(${HOST_EXE_NAME}_xilinx main.cpp) target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) - target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) + target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx zmqpp) target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") From 5aa44d045b36d57f65dc11dc91681da6366de3a8 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 5 Jul 2022 11:03:14 +0200 Subject: [PATCH 086/318] Add copy-a option to PTRANS --- PTRANS/src/host/transpose_benchmark.hpp | 40 ++++++++++++++++++++++++- PTRANS/src/host/transpose_data.cpp | 2 +- PTRANS/src/host/transpose_data.hpp | 6 ++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 392789c8..585e60be 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -81,7 +81,8 @@ public hpcc_base::HpccFpgaBenchmark()->default_value(std::to_string(DEFAULT_P_VALUE))) ("distribute-buffers", "Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. 
This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs.") ("handler", "Specify the used data handler that distributes the data over devices and memory banks", - cxxopts::value()->default_value(DEFAULT_DIST_TYPE)); + cxxopts::value()->default_value(DEFAULT_DIST_TYPE)) + ("copy-a", "Create a copy of matrix A for each kernel replication"); } std::unique_ptr> dataHandler; @@ -164,6 +165,11 @@ public hpcc_base::HpccFpgaBenchmarkdataHandler->exchangeData(data); +#ifndef NDEBUG + std::vector oldA(this->executionSettings->programSettings->blockSize * this->executionSettings->programSettings->blockSize * data.numBlocks); + std::copy(data.A, data.A + oldA.size(), oldA.data()); +#endif + this->dataHandler->reference_transpose(data); double max_error = 0.0; @@ -175,6 +181,38 @@ public hpcc_base::HpccFpgaBenchmark*>(this->dataHandler.get())->getHeightforRank(); + long width_per_rank = reinterpret_cast*>(this->dataHandler.get())->getWidthforRank(); + if (error_count > 0) { + std::cout << "A:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << oldA[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << "B:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.B[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << "Transposed A:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.A[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + +#endif + double global_max_error = 0; int global_error_count = 0; MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); diff --git a/PTRANS/src/host/transpose_data.cpp b/PTRANS/src/host/transpose_data.cpp index 20d6560f..e8a7c8f0 100644 --- a/PTRANS/src/host/transpose_data.cpp +++ b/PTRANS/src/host/transpose_data.cpp @@ -7,7 +7,7 @@ transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), matrixSize(results["m"].as() * results["b"].as()), blockSize(results["b"].as()), dataHandlerIdentifier(transpose::data_handler::stringToHandler(results["handler"].as())), - distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()) { + distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()), copyA(results["copy-a"].count() > 0) { // auto detect data distribution type if required if (dataHandlerIdentifier == transpose::data_handler::DataHandlerType::automatic) { diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index c73a9959..fed4eff6 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -74,6 +74,12 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { */ bool distributeBuffers; + /** + * @brief If true, create a copy of matrix A for each kernel replication + * + */ + bool copyA; + /** * @brief Construct a new Transpose Program Settings object * From c373e0249523a8813fce693fdfbe17d39976bd30 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 5 Jul 2022 11:03:48 +0200 
Subject: [PATCH 087/318] Implement copy-a for accl --- .../execution_types/execution_xrt_accl_pq.hpp | 54 ++++++++----------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp index 8d3edac5..3fdaeb1f 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -62,7 +62,6 @@ void accl_exchangeData( acclBuffersA.push_back(accl.create_buffer( bo, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); - acclBuffersA.back()->sync_from_device(); } if (pq_width == pq_height) { @@ -82,33 +81,24 @@ void accl_exchangeData( auto acclBufferA_recv = accl.create_buffer( data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); - acclBufferA_recv->sync_to_device(); // Send and receive matrix A using ACCL directly on FPGA - if (mpi_comm_rank < pair_rank) { - for (int block_num = 0; block_num < data.numBlocks; block_num++) { - accl.send(0, - *acclBuffersA[0]->slice( + for (int block_chunk = 0; block_chunk < data.numBlocks; block_chunk+= 16) { + for (int block_num = block_chunk; block_num < std::min(data.numBlocks, block_chunk + 16); block_num++) { + accl.send(*acclBuffersA[0]->slice( data.blockSize * data.blockSize * block_num, data.blockSize * data.blockSize * (block_num + 1)), - data.blockSize * data.blockSize, pair_rank, 0, true, + data.blockSize * data.blockSize, pair_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM); } - accl.recv(0, *acclBufferA_recv, - data.blockSize * data.blockSize * data.numBlocks, pair_rank, - 1, true, ACCL::streamFlags::NO_STREAM); - } else { - accl.recv(0, *acclBufferA_recv, - data.blockSize * data.blockSize * data.numBlocks, pair_rank, - 0, true, ACCL::streamFlags::NO_STREAM); - for (int block_num = 0; block_num < data.numBlocks; block_num++) { - accl.send(0, - *acclBuffersA[0]->slice( + for (int block_num = block_chunk; block_num < std::min(data.numBlocks, block_chunk + 16); block_num++) { + accl.recv(*acclBufferA_recv->slice( data.blockSize * data.blockSize * block_num, data.blockSize * data.blockSize * (block_num + 1)), - data.blockSize * data.blockSize, pair_rank, 1, true, - ACCL::streamFlags::NO_STREAM); + data.blockSize * data.blockSize, pair_rank, + 1, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM); } } + accl.copy(*acclBufferA_recv, *acclBuffersA[0], data.blockSize * data.blockSize * data.numBlocks, true, true); } @@ -275,12 +265,12 @@ void accl_exchangeData( << std::flush; #endif accl_requests[current_parallel_execution] = (accl.send( - 0, *send_buffers[current_parallel_execution], sending_size, - send_rank, 0, true, ACCL::streamFlags::NO_STREAM, + *send_buffers[current_parallel_execution], sending_size, + send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true)); accl_requests[current_parallel_execution + gcd] = (accl.recv( - 0, *recv_buffers[current_parallel_execution], sending_size, - send_rank, 0, true, ACCL::streamFlags::NO_STREAM, + *recv_buffers[current_parallel_execution], sending_size, + send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true)); // Increase the counter for parallel executions current_parallel_execution = (current_parallel_execution + 1) % gcd; @@ -458,10 +448,13 @@ static std::unique_ptr calculate( *config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + 
"}").c_str()); - xrt::bo bufferA(*config.device, data.A, + if (r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); + bufferListA.push_back(bufferA); + } xrt::bo bufferB( *config.device, &data.B[bufferStartList[r] * data.blockSize * data.blockSize], @@ -469,7 +462,6 @@ static std::unique_ptr calculate( xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); - bufferListA.push_back(bufferA); bufferListB.push_back(bufferB); bufferListA_out.push_back(bufferA_out); transposeKernelList.push_back(transposeKernel); @@ -487,7 +479,9 @@ static std::unique_ptr calculate( auto startTransfer = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + if (r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); } auto endTransfer = std::chrono::high_resolution_clock::now(); @@ -501,12 +495,6 @@ static std::unique_ptr calculate( auto startCalculation = std::chrono::high_resolution_clock::now(); // Exchange A data via ACCL - if (bufferListA.size() > 1) { - std::cerr << "WARNING: Only the matrix A of the first kernel replication " - "will be exchanged " - "via ACCL!" - << std::endl; - } #ifndef NDEBUG std::cout << "Start data exchange with ACCL" << std::endl; #endif @@ -519,7 +507,7 @@ static std::unique_ptr calculate( auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { runs.push_back(transposeKernelList[r]( - bufferListA[r], bufferListB[r], bufferListA_out[r], + (config.programSettings->copyA ? 
bufferListA[r] : bufferListA[0]), bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]), static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), From e87731de58a225e8f2dbae59c5e3db12af480ff4 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 5 Jul 2022 11:04:00 +0200 Subject: [PATCH 088/318] Implement copy-a for xrt --- .../execution_types/execution_xrt_pcie_pq.hpp | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index b5788fed..f0d4eeed 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -122,10 +122,13 @@ static std::unique_ptr calculate( xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); - xrt::bo bufferA(*config.device, data.A, + if ( r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); + bufferListA.push_back(bufferA); + } xrt::bo bufferB( *config.device, &data.B[bufferStartList[r] * data.blockSize * data.blockSize], @@ -138,7 +141,6 @@ static std::unique_ptr calculate( xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); - bufferListA.push_back(bufferA); bufferListB.push_back(bufferB); bufferListA_out.push_back(bufferA_out); transposeKernelList.push_back(transposeKernel); @@ -152,7 +154,9 @@ static std::unique_ptr calculate( auto startTransfer = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); } auto endTransfer = std::chrono::high_resolution_clock::now(); @@ -168,7 +172,9 @@ static std::unique_ptr calculate( if (mpi_size > 1) { for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } } // Exchange A data via PCIe and MPI @@ -177,7 +183,9 @@ static std::unique_ptr calculate( std::copy(data.A, data.A + data.numBlocks * data.blockSize * data.blockSize, data.exchange); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } } } @@ -185,7 +193,7 @@ static std::unique_ptr calculate( auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { runs.push_back(transposeKernelList[r]( - bufferListA[r], bufferListB[r], bufferListA_out[r], + (config.programSettings->copyA ? 
bufferListA[r] : bufferListA[0]), bufferListB[r], bufferListA_out[r], static_cast(bufferStartList[r] + bufferOffsetList[r]), static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), From 50be2cf53a85387f387525a784615262bc5f3f83 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 8 Aug 2022 17:45:20 +0200 Subject: [PATCH 089/318] Add proper ACCL UDP support on hardware --- .../execution_types/execution_xrt_accl_pq.hpp | 54 +++++++---------- .../execution_types/execution_xrt_pcie_pq.hpp | 20 +++++-- PTRANS/src/host/transpose_benchmark.hpp | 40 ++++++++++++- PTRANS/src/host/transpose_data.cpp | 2 +- PTRANS/src/host/transpose_data.hpp | 6 ++ shared/CMakeLists.txt | 6 +- shared/setup/fpga_setup_accl.cpp | 58 ++++++++++++++++++- shared/setup/fpga_setup_xrt.cpp | 5 +- 8 files changed, 145 insertions(+), 46 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp index 8d3edac5..3fdaeb1f 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -62,7 +62,6 @@ void accl_exchangeData( acclBuffersA.push_back(accl.create_buffer( bo, data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32)); - acclBuffersA.back()->sync_from_device(); } if (pq_width == pq_height) { @@ -82,33 +81,24 @@ void accl_exchangeData( auto acclBufferA_recv = accl.create_buffer( data.blockSize * data.blockSize * data.numBlocks, ACCL::dataType::float32); - acclBufferA_recv->sync_to_device(); // Send and receive matrix A using ACCL directly on FPGA - if (mpi_comm_rank < pair_rank) { - for (int block_num = 0; block_num < data.numBlocks; block_num++) { - accl.send(0, - *acclBuffersA[0]->slice( + for (int block_chunk = 0; block_chunk < data.numBlocks; block_chunk+= 16) { + for (int block_num = block_chunk; block_num < std::min(data.numBlocks, block_chunk + 16); block_num++) { + accl.send(*acclBuffersA[0]->slice( data.blockSize * data.blockSize * block_num, data.blockSize * data.blockSize * (block_num + 1)), - data.blockSize * data.blockSize, pair_rank, 0, true, + data.blockSize * data.blockSize, pair_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM); } - accl.recv(0, *acclBufferA_recv, - data.blockSize * data.blockSize * data.numBlocks, pair_rank, - 1, true, ACCL::streamFlags::NO_STREAM); - } else { - accl.recv(0, *acclBufferA_recv, - data.blockSize * data.blockSize * data.numBlocks, pair_rank, - 0, true, ACCL::streamFlags::NO_STREAM); - for (int block_num = 0; block_num < data.numBlocks; block_num++) { - accl.send(0, - *acclBuffersA[0]->slice( + for (int block_num = block_chunk; block_num < std::min(data.numBlocks, block_chunk + 16); block_num++) { + accl.recv(*acclBufferA_recv->slice( data.blockSize * data.blockSize * block_num, data.blockSize * data.blockSize * (block_num + 1)), - data.blockSize * data.blockSize, pair_rank, 1, true, - ACCL::streamFlags::NO_STREAM); + data.blockSize * data.blockSize, pair_rank, + 1, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM); } } + accl.copy(*acclBufferA_recv, *acclBuffersA[0], data.blockSize * data.blockSize * data.numBlocks, true, true); } @@ -275,12 +265,12 @@ void accl_exchangeData( << std::flush; #endif accl_requests[current_parallel_execution] = (accl.send( - 0, *send_buffers[current_parallel_execution], sending_size, - send_rank, 0, true, ACCL::streamFlags::NO_STREAM, + *send_buffers[current_parallel_execution], sending_size, + 
send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true)); accl_requests[current_parallel_execution + gcd] = (accl.recv( - 0, *recv_buffers[current_parallel_execution], sending_size, - send_rank, 0, true, ACCL::streamFlags::NO_STREAM, + *recv_buffers[current_parallel_execution], sending_size, + send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM, ACCL::dataType::none, true)); // Increase the counter for parallel executions current_parallel_execution = (current_parallel_execution + 1) % gcd; @@ -458,10 +448,13 @@ static std::unique_ptr calculate( *config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); - xrt::bo bufferA(*config.device, data.A, + if (r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); + bufferListA.push_back(bufferA); + } xrt::bo bufferB( *config.device, &data.B[bufferStartList[r] * data.blockSize * data.blockSize], @@ -469,7 +462,6 @@ static std::unique_ptr calculate( xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); - bufferListA.push_back(bufferA); bufferListB.push_back(bufferB); bufferListA_out.push_back(bufferA_out); transposeKernelList.push_back(transposeKernel); @@ -487,7 +479,9 @@ static std::unique_ptr calculate( auto startTransfer = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + if (r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); } auto endTransfer = std::chrono::high_resolution_clock::now(); @@ -501,12 +495,6 @@ static std::unique_ptr calculate( auto startCalculation = std::chrono::high_resolution_clock::now(); // Exchange A data via ACCL - if (bufferListA.size() > 1) { - std::cerr << "WARNING: Only the matrix A of the first kernel replication " - "will be exchanged " - "via ACCL!" - << std::endl; - } #ifndef NDEBUG std::cout << "Start data exchange with ACCL" << std::endl; #endif @@ -519,7 +507,7 @@ static std::unique_ptr calculate( auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { runs.push_back(transposeKernelList[r]( - bufferListA[r], bufferListB[r], bufferListA_out[r], + (config.programSettings->copyA ? 
bufferListA[r] : bufferListA[0]), bufferListB[r], bufferListA_out[r], static_cast(bufferOffsetList[r]), static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index b5788fed..f0d4eeed 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -122,10 +122,13 @@ static std::unique_ptr calculate( xrt::kernel transposeKernel(*config.device, *config.program, ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); - xrt::bo bufferA(*config.device, data.A, + if ( r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, data.numBlocks * data.blockSize * data.blockSize * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(0)); + bufferListA.push_back(bufferA); + } xrt::bo bufferB( *config.device, &data.B[bufferStartList[r] * data.blockSize * data.blockSize], @@ -138,7 +141,6 @@ static std::unique_ptr calculate( xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(2)); - bufferListA.push_back(bufferA); bufferListB.push_back(bufferB); bufferListA_out.push_back(bufferA_out); transposeKernelList.push_back(transposeKernel); @@ -152,7 +154,9 @@ static std::unique_ptr calculate( auto startTransfer = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); } auto endTransfer = std::chrono::high_resolution_clock::now(); @@ -168,7 +172,9 @@ static std::unique_ptr calculate( if (mpi_size > 1) { for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } } // Exchange A data via PCIe and MPI @@ -177,7 +183,9 @@ static std::unique_ptr calculate( std::copy(data.A, data.A + data.numBlocks * data.blockSize * data.blockSize, data.exchange); for (int r = 0; r < transposeKernelList.size(); r++) { - bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } } } @@ -185,7 +193,7 @@ static std::unique_ptr calculate( auto startKernelCalculation = std::chrono::high_resolution_clock::now(); for (int r = 0; r < transposeKernelList.size(); r++) { runs.push_back(transposeKernelList[r]( - bufferListA[r], bufferListB[r], bufferListA_out[r], + (config.programSettings->copyA ? bufferListA[r] : bufferListA[0]), bufferListB[r], bufferListA_out[r], static_cast(bufferStartList[r] + bufferOffsetList[r]), static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), static_cast(handler.getWidthforRank()), diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 392789c8..585e60be 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -81,7 +81,8 @@ public hpcc_base::HpccFpgaBenchmark()->default_value(std::to_string(DEFAULT_P_VALUE))) ("distribute-buffers", "Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. 
This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs.") ("handler", "Specify the used data handler that distributes the data over devices and memory banks", - cxxopts::value()->default_value(DEFAULT_DIST_TYPE)); + cxxopts::value()->default_value(DEFAULT_DIST_TYPE)) + ("copy-a", "Create a copy of matrix A for each kernel replication"); } std::unique_ptr> dataHandler; @@ -164,6 +165,11 @@ public hpcc_base::HpccFpgaBenchmarkdataHandler->exchangeData(data); +#ifndef NDEBUG + std::vector oldA(this->executionSettings->programSettings->blockSize * this->executionSettings->programSettings->blockSize * data.numBlocks); + std::copy(data.A, data.A + oldA.size(), oldA.data()); +#endif + this->dataHandler->reference_transpose(data); double max_error = 0.0; @@ -175,6 +181,38 @@ public hpcc_base::HpccFpgaBenchmark*>(this->dataHandler.get())->getHeightforRank(); + long width_per_rank = reinterpret_cast*>(this->dataHandler.get())->getWidthforRank(); + if (error_count > 0) { + std::cout << "A:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << oldA[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << "B:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.B[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << "Transposed A:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.A[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + +#endif + double global_max_error = 0; int global_error_count = 0; MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); diff --git a/PTRANS/src/host/transpose_data.cpp b/PTRANS/src/host/transpose_data.cpp index 20d6560f..e8a7c8f0 100644 --- a/PTRANS/src/host/transpose_data.cpp +++ b/PTRANS/src/host/transpose_data.cpp @@ -7,7 +7,7 @@ transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), matrixSize(results["m"].as() * results["b"].as()), blockSize(results["b"].as()), dataHandlerIdentifier(transpose::data_handler::stringToHandler(results["handler"].as())), - distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()) { + distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()), copyA(results["copy-a"].count() > 0) { // auto detect data distribution type if required if (dataHandlerIdentifier == transpose::data_handler::DataHandlerType::automatic) { diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index c73a9959..fed4eff6 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -74,6 +74,12 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { */ bool distributeBuffers; + /** + * @brief If true, create a copy of matrix A for each kernel replication + * + */ + bool copyA; + /** * @brief Construct a new Transpose Program Settings object * diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 43749c0a..43731ce8 100644 --- a/shared/CMakeLists.txt +++ 
b/shared/CMakeLists.txt
@@ -12,8 +12,10 @@ endif()
 list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp)
 add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES})
 if (USE_ACCL)
-    target_include_directories(hpcc_fpga_base PRIVATE ${ACCL_INCLUDE_PATH})
-    target_link_libraries(hpcc_fpga_base accl)
+    add_subdirectory(${extern_accl_SOURCE_DIR}/test/hardware/xup_vitis_network_example/xrt_host_api
+                     ${CMAKE_BINARY_DIR}/libs/xrt_host_api)
+    target_include_directories(hpcc_fpga_base PRIVATE ${VNX_INCLUDE_PATH} ${ACCL_INCLUDE_PATH})
+    target_link_libraries(hpcc_fpga_base accl vnx)
 endif()
 if (USE_XRT_HOST)
     target_link_directories(hpcc_fpga_base PUBLIC ${XRT_SEARCH_PATH})

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index 5ce08a41..1e41b3d6 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -14,13 +14,58 @@
 /* External libraries */
 #include "experimental/xrt_ip.h"
 #include "parameters.h"
+#include <vnx/cmac.hpp>
+#include <vnx/networklayer.hpp>
 #include "xrt/xrt_kernel.h"
 #ifdef _USE_MPI_
 #include "mpi.h"
 #endif

+using namespace vnx;
+
 namespace fpga_setup {

+void configure_vnx(CMAC &cmac, Networklayer &network_layer,
+                   std::vector<ACCL::rank_t> &ranks, int rank) {
+  if (ranks.size() > max_sockets_size) {
+    throw std::runtime_error("Too many ranks. VNX supports up to " +
+                             std::to_string(max_sockets_size) + " sockets.");
+  }
+
+  const auto link_status = cmac.link_status();
+
+  if (link_status.at("rx_status")) {
+    std::cout << "Link successful!" << std::endl;
+  } else {
+    std::cout << "No link found." << std::endl;
+  }
+
+  if (!link_status.at("rx_status")) {
+    // Give time for other ranks to setup link.
+    std::this_thread::sleep_for(std::chrono::seconds(3));
+    exit(1);
+  }
+
+  MPI_Barrier(MPI_COMM_WORLD);
+
+  network_layer.update_ip_address(ranks[rank].ip);
+  for (size_t i = 0; i < ranks.size(); ++i) {
+    if (i == static_cast<size_t>(rank)) {
+      continue;
+    }
+
+    network_layer.configure_socket(i, ranks[i].ip, ranks[i].port,
+                                   ranks[rank].port, true);
+  }
+
+  network_layer.populate_socket_table();
+
+  std::this_thread::sleep_for(std::chrono::seconds(4));
+  network_layer.arp_discovery();
+  std::this_thread::sleep_for(std::chrono::seconds(2));
+  network_layer.arp_discovery();
+}
+
 std::unique_ptr<ACCL::ACCL> fpgaSetupACCL(xrt::device &device, xrt::uuid &progra
     std::vector<ACCL::rank_t> ranks = {};
     for (int i = 0; i < current_size; ++i) {
         // TODO: Replace the ip addresses and ports here for execution of real hardware?
-        ACCL::rank_t new_rank = {"127.0.0.1", 5500 + i, i, ACCL_BUFFER_SIZE};
+        ACCL::rank_t new_rank = {"10.10.10." + std::to_string(i), 5500 + i, i, ACCL_BUFFER_SIZE};
        ranks.emplace_back(new_rank);
     }
     if (!useAcclEmulation) {
+        std::cout << "Create cclo ip" << std::endl;
         auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}");
+        std::cout << "Create hostctrl" << std::endl;
         auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}",
                                        xrt::kernel::cu_access_mode::exclusive);
+
+        auto cmac = CMAC(xrt::ip(device, program, "cmac_0:{cmac_0}"));
+        auto network_layer = Networklayer(
+            xrt::ip(device, program, "networklayer:{networklayer_0}"));
+        configure_vnx(cmac, network_layer, ranks, current_rank);
+
         std::vector<int> mem(1, 0);
+        std::cout << "Create ACCL" << std::endl;
         return std::unique_ptr<ACCL::ACCL>(
-            new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, 0));
+            new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, 0, ACCL::networkProtocol::UDP));
     } else {
         // TODO: Add start port here. Currently hardcoded!
         return std::unique_ptr<ACCL::ACCL>(

diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp
index 0410fd1b..103eda17 100644
--- a/shared/setup/fpga_setup_xrt.cpp
+++ b/shared/setup/fpga_setup_xrt.cpp
@@ -35,6 +35,9 @@ namespace fpga_setup {

 std::unique_ptr<xrt::device> selectFPGADevice(int defaultDevice) {
-    return std::unique_ptr<xrt::device>(new xrt::device(defaultDevice));
+    int current_rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &current_rank);
+
+    return std::unique_ptr<xrt::device>(new xrt::device(current_rank));
 }

 } // namespace fpga_setup
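The setup code of this patch builds one ACCL::rank_t entry per MPI rank and passes the table to configure_vnx, which programs the node's own IP into the network layer and opens a socket to every peer. A simplified, host-only sketch of the rank-table construction follows; rank_t here is a stand-in for ACCL::rank_t, whose ip member is a std::string, so the rank index has to be appended with std::to_string rather than added to the string literal:

// Illustrative only: builds the per-rank address table as fpgaSetupACCL does.
#include <iostream>
#include <string>
#include <vector>

struct rank_t {               // simplified stand-in for ACCL::rank_t
    std::string ip;
    int port;
    int session_id;
    unsigned max_segment_size;
};

int main() {
    const unsigned buffer_size = 8192;  // stands in for ACCL_BUFFER_SIZE
    std::vector<rank_t> ranks;
    for (int i = 0; i < 4; ++i) {
        // "10.10.10." + i would move the char pointer instead of concatenating
        ranks.push_back({"10.10.10." + std::to_string(i), 5500 + i, i, buffer_size});
    }
    for (const auto &r : ranks)
        std::cout << r.ip << ":" << r.port << "\n";
    return 0;
}
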
From 4dbc06eaed24b08727679d39a526869691e47896 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 8 Aug 2022 17:47:53 +0200
Subject: [PATCH 090/318] Add ACCL support to b_eff host code

---
 b_eff/CMakeLists.txt                              | 5 +++++
 b_eff/src/common/parameters.h.in                  | 1 +
 b_eff/src/host/execution_types/execution_accl.hpp | 4 ++--
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt
index 13d93b1b..f150bcc9 100755
--- a/b_eff/CMakeLists.txt
+++ b/b_eff/CMakeLists.txt
@@ -19,6 +19,11 @@ set(USE_DEPRECATED_HPP_HEADER No)

 set(COMMUNICATION_TYPE_SUPPORT_ENABLED Yes)

+if (USE_ACCL)
+    math(EXPR calculate_accl_buffer_size "(1 << ${DEFAULT_MAX_MESSAGE_SIZE}) * 4")
+    set(ACCL_BUFFER_SIZE ${calculate_accl_buffer_size} CACHE STRING "Size of ACCL buffers in bytes")
+endif()
+
 set(DATA_TYPE char)
 include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake)
 unset(DATA_TYPE CACHE)

diff --git a/b_eff/src/common/parameters.h.in b/b_eff/src/common/parameters.h.in
index d404bfd7..5c823610 100644
--- a/b_eff/src/common/parameters.h.in
+++ b/b_eff/src/common/parameters.h.in
@@ -23,6 +23,7 @@
 #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@

 #cmakedefine HOST_EMULATION_REORDER
+#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@

 /*
 Short description of the program.

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
index 81673835..c4686b29 100644
--- a/b_eff/src/host/execution_types/execution_accl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -79,8 +79,8 @@ namespace network::execution_types::accl {
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
             for (int l = 0; l < looplength; l++) {
-                config.accl->send(0, *acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
-                config.accl->recv(0, *acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                config.accl->send(*acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+                config.accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
             }
             auto endCalculation = std::chrono::high_resolution_clock::now();
             calculationTime += std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation - startCalculation).count();
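The send/recv calls fixed above pair up the ranks of a ring: depending on the parity of (current_rank + i), a rank exchanges messages with either its left or its right neighbor, so every send is matched by a receive on the partner and the pattern cannot deadlock. A small sketch that only evaluates the partner expression from the loop:

// Illustrative only: the neighbor computation used by the b_eff exchange loop.
#include <iostream>

int partner(int rank, int size, int i) {
    return (rank - 1 + 2 * ((rank + i) % 2) + size) % size;
}

int main() {
    const int size = 4;
    for (int i = 0; i < 2; i++)      // the loop index i alternates the pairing
        for (int rank = 0; rank < size; rank++)
            std::cout << "i=" << i << ": rank " << rank
                      << " <-> rank " << partner(rank, size, i) << "\n";
    return 0;                        // e.g. i=0 pairs 0<->3 and 1<->2
}
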
From 8e825ae5a4cfe950aa7e25e7a5a8cc4952afed1f Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 19 Sep 2022 17:54:37 +0100
Subject: [PATCH 091/318] Use UDP for ACCL communication

---
 shared/setup/fpga_setup_accl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index 1e41b3d6..2f78366d 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -99,7 +99,7 @@ std::unique_ptr<ACCL::ACCL> fpgaSetupACCL(xrt::device &device, xrt::uuid &progra
     } else {
         // TODO: Add start port here. Currently hardcoded!
         return std::unique_ptr<ACCL::ACCL>(
-            new ACCL::ACCL(ranks, current_rank, 5500, device, ACCL::networkProtocol::TCP, 16, ACCL_BUFFER_SIZE));
+            new ACCL::ACCL(ranks, current_rank, 5500, device, ACCL::networkProtocol::UDP, 16, ACCL_BUFFER_SIZE));
     }
 }

From b8b24d0fc40121c58e2346409ae6309b338a4654 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 19 Sep 2022 17:56:50 +0100
Subject: [PATCH 092/318] Change repo name for VNx

---
 cmake/accl.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/accl.cmake b/cmake/accl.cmake
index cdf23b0b..6b064d95 100644
--- a/cmake/accl.cmake
+++ b/cmake/accl.cmake
@@ -11,7 +11,7 @@ set(ACCL_CCLO_BUILD_ARGS ${ACCL_CCLO_ADDITIONAL_BUILD_ARGS})
 set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/)
 set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core)
 set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo)
-set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HMB)
+set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HBM)
 if (ACCL_STACK_TYPE STREQUAL "UDP")
     list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_VNX_DIR}/Ethernet/post_sys_link.tcl)
     list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_HLS_IP_FOLDER})

From 32423b3e0b8b428424af7857b61e93b543047cd7 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 19 Sep 2022 17:57:26 +0100
Subject: [PATCH 093/318] Add config and settings for U55c

---
 .../Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake    | 25 +++
 ...ings.link.xilinx.accl_buffers.u55c.hbm.ini | 71 +++++++
 2 files changed, 96 insertions(+)
 create mode 100644 b_eff/configs/Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake
 create mode 100644 b_eff/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini

diff --git a/b_eff/configs/Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake b/b_eff/configs/Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake
new file mode 100644
index 00000000..ed6ec1f9
--- /dev/null
+++ b/b_eff/configs/Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake
@@ -0,0 +1,25 @@
+# This file contains the default configuration for the Xilinx U55C board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+#     cmake [...]
-DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# b_eff specific options
+set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)

diff --git a/b_eff/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini b/b_eff/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini
new file mode 100644
index 00000000..61850b2a
--- /dev/null
+++ b/b_eff/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini
@@ -0,0 +1,71 @@
+# /*******************************************************************************
+#  Copyright (C) 2021 Xilinx, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=lb_user_krnl:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 + +sp=ccl_offload_0.m_axi_0:HBM[0:5] +sp=ccl_offload_0.m_axi_1:HBM[0:5] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + From 3ab470b5e2771a7bc803b9ac35e5a419a0451f6d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 19 Sep 2022 18:00:27 +0100 Subject: [PATCH 094/318] Add U55c configs for LINPACK --- .../configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake | 29 ++++++ ...nk.xilinx.hpl_torus_accl.hbm.generator.ini | 89 +++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake new file mode 100644 index 00000000..15e2edeb --- /dev/null +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake @@ -0,0 +1,29 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) + +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) + diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini new file mode 100644 index 00000000..1e75a2eb --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini @@ -0,0 +1,89 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +slr=inner_update_mm0_1:SLR0 +slr=inner_update_mm0_2:SLR2 + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:4] +sp=lu_1.m_axi_gmem1:HBM[5:6] +sp=lu_1.m_axi_gmem2:HBM[5:6] + +sp=top_update_1.m_axi_gmem0:HBM[0:4] +sp=top_update_1.m_axi_gmem1:HBM[5:6] +sp=top_update_1.m_axi_gmem2:HBM[5:6] + +sp=left_update_1.m_axi_gmem0:HBM[0:4] +sp=left_update_1.m_axi_gmem1:HBM[5:6] +sp=left_update_1.m_axi_gmem2:HBM[5:6] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:4] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[5:6] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[5:6] +# PY_CODE_GEN block_end + +#ACCL +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 + +sp=ccl_offload_0.m_axi_0:HBM[5:6] +sp=ccl_offload_0.m_axi_1:HBM[5:6] + + + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl From 0321648a59653d3922923aa4716d966c096d4a2f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 20 Sep 2022 15:30:08 +0100 Subject: [PATCH 095/318] Add b_eff PL scheduler --- b_eff/src/device/CMakeLists.txt | 2 +- b_eff/src/device/communication_ACCL_pl.cpp | 35 ++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 b_eff/src/device/communication_ACCL_pl.cpp diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index e5939572..c5af8b66 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -16,5 +16,5 @@ if (INTELFPGAOPENCL_FOUND) endif() if (VITIS_FOUND) - generate_kernel_targets_xilinx(communication_ACCL) + generate_kernel_targets_xilinx(communication_ACCL 
communication_ACCL_pl) endif() diff --git a/b_eff/src/device/communication_ACCL_pl.cpp b/b_eff/src/device/communication_ACCL_pl.cpp new file mode 100644 index 00000000..528c69cb --- /dev/null +++ b/b_eff/src/device/communication_ACCL_pl.cpp @@ -0,0 +1,35 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#include "accl_hls.h" + +extern "C" { + +void send_recv(char *read_buffer,char *write_buffer, unsigned int size_in_bytes, unsigned int num_iterations, + unsigned int neighbor_rank, addr_t communicator_addr, + hls::stream > &cmd, hls::stream > &sts) { + for (int i = 0; i < num_iterations; i++) { + ACCLCommand accl_cmd(cmd, sts, communicator_addr, 0,0,0); + accl_cmd.send(size_in_bytes, 0, neighbor_rank, read_buffer); + accl_cmd.recv(size_in_bytes, 0, neighbor_rank, write_buffer); + } +} +} \ No newline at end of file From a533fc53d0ab0da7124773aaa3f56be42b7e96ce Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 20 Sep 2022 15:31:38 +0100 Subject: [PATCH 096/318] Add host support for PL scheduler --- b_eff/src/host/CMakeLists.txt | 5 +- b_eff/src/host/execution_types/execution.hpp | 1 + .../execution_types/execution_accl_pl.hpp | 116 ++++++++++++++++++ b_eff/src/host/network_benchmark.cpp | 17 ++- b_eff/src/host/network_benchmark.hpp | 6 + 5 files changed, 140 insertions(+), 5 deletions(-) create mode 100644 b_eff/src/host/execution_types/execution_accl_pl.hpp diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index 5e22b54a..adaa8348 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -25,9 +25,12 @@ if (Vitis_FOUND) add_executable(${HOST_EXE_NAME}_xilinx main.cpp) target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) +if (USE_ACCL) target_link_libraries(${LIB_NAME}_xilinx accl) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${ACCL_INCLUDE_PATH}) - target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx zmqpp) + target_link_libraries(${HOST_EXE_NAME}_xilinx zmqpp) +endif() + target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp index 118f0ebc..e9bea3be 100644 --- 
a/b_eff/src/host/execution_types/execution.hpp +++ b/b_eff/src/host/execution_types/execution.hpp @@ -28,4 +28,5 @@ SOFTWARE. #endif #else #include "execution_types/execution_accl.hpp" +#include "execution_types/execution_accl_pl.hpp" #endif diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp new file mode 100644 index 00000000..e82fa29a --- /dev/null +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -0,0 +1,116 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_PL_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_PL_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" +#include "accl.hpp" + +/* Project's headers */ + +namespace network::execution_types::accl_pl { + + /* + Implementation for the single kernel. + @copydoc bm_execution::calculate() + */ + template + std::shared_ptr + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + dummyBufferContents.clear(); + recvBufferContents.clear(); + acclSendBuffers.clear(); + acclRecvBuffers.clear(); + int size_in_values = (size_in_bytes + 3) / 4; + // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); + acclSendBuffers.push_back(config.accl->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclRecvBuffers.push_back(config.accl->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclSendBuffers.back()->sync_to_device(); + acclRecvBuffers.back()->sync_to_device(); + } + + xrt::kernel sendrecvKernel(*config.device, *config.program, "sendrecv"); + + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + auto run = sendrecvKernel(acclSendBuffers[i]->bo(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.accl->get_communicator_adr()); + run.wait(); + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + acclRecvBuffers.back()->sync_from_device(); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); + } + std::shared_ptr result(new network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }); + return result; + } + +} // namespace bm_execution + +#endif diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 09872106..2eef9621 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -36,8 +36,11 @@ SOFTWARE. 
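// A condensed sketch of the XRT sequence the accl_pl executor above boils down
// to; the device index, xclbin path and helper function are placeholders for
// illustration, not values taken from the patch:
#include <cstdint>
#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

void run_pl_sendrecv(xrt::bo &send_bo, xrt::bo &recv_bo, unsigned size_in_values,
                     unsigned looplength, unsigned neighbor_rank,
                     uint64_t communicator_addr) {
    xrt::device dev{0};                           // first FPGA in the node
    auto uuid = dev.load_xclbin("b_eff.xclbin");  // placeholder bitstream path
    xrt::kernel k{dev, uuid, "sendrecv"};         // same instance name as in the host code above
    // operator() launches the kernel asynchronously; wait() blocks until the PL loop finished
    auto run = k(send_bo, recv_bo, size_in_values, looplength, neighbor_rank,
                 communicator_addr);
    run.wait();
}
// In the benchmark itself, config.device and config.program supply dev/uuid,
// and the ACCL buffers' bo() handles are what get passed as send_bo/recv_bo.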
network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), maxLoopLength(results["u"].as()), minLoopLength(results["l"].as()), maxMessageSize(results["m"].as()), - minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()) { - + minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()) +#ifdef USE_ACCL + , accl_from_programable_logic(results["accl-pl"].count()) +#endif +{ } std::map @@ -86,7 +89,11 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options) ("o", "Offset used before reducing repetitions", cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_OFFSET))) ("d", "Number os steps the repetitions are decreased to its minimum", - cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE))); + cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE))) +#ifdef USE_ACCL + ("accl-pl", "Use second ACCL command kernel to schedule sends and recevs from PL") +#endif +; } std::unique_ptr @@ -113,8 +120,10 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { #ifdef INTEL_FPGA case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; #endif +#else + case hpcc_base::CommunicationType::accl: if (!executionSettings->programSettings->accl_from_programable_logic) { timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); + } else { timing = execution_types::accl_pl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);} break; #endif - case hpcc_base::CommunicationType::accl: timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType)); } timing_results.push_back(timing); diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 8e9e2fc1..964ec5ca 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -140,6 +140,12 @@ class NetworkProgramSettings : public hpcc_base::BaseSettings { */ uint llDecrease; + /** + * @brief Use the second command kernel to schedule sends and receives directly from PL + * + */ + bool accl_from_programable_logic; + /** * @brief Construct a new Network Program Settings object * From e2f20829b2be3982d3fd7c3d2a09500f030fda40 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 20 Sep 2022 15:32:30 +0100 Subject: [PATCH 097/318] Add config for PL scheduler --- .../configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake diff --git a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake new file mode 100644 index 00000000..472957c1 --- /dev/null +++ b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake @@ -0,0 +1,25 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES "sendrecv" CACHE STRING "" FORCE) +# STREAM specific options +# Defaults to a total of ~12GB data +set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) From 8f87e16a883d77c18b201fe08ad0a7ce691857e2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 20 Sep 2022 15:35:06 +0100 Subject: [PATCH 098/318] Set temporary branch for ACCL with recent features (Pl schedule and UDP fix) --- extern/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 5587a0e1..141899f7 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -61,8 +61,8 @@ if (DEFINED USE_ACCL) FetchContent_Declare( extern_accl - GIT_REPOSITORY https://github.com/Xilinx/ACCL.git - GIT_TAG main) + GIT_REPOSITORY https://github.com/Mellich/ACCL.git + GIT_TAG udp_address_fix_and_new_tcp) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) From d202beb42b05d34ddb1c7b6e25029f8e7d94b214 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 20 Sep 2022 16:52:52 +0100 Subject: [PATCH 099/318] Add support for client arbiter --- .../configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake | 3 +- .../settings.link.xilinx.accl_pl.u55c.hbm.ini | 81 +++++++++++++++++++ cmake/accl.cmake | 16 +++- 3 files changed, 96 insertions(+), 4 deletions(-) create mode 100644 b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini diff --git a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake index 472957c1..65516d5b 100644 --- a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake +++ b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake @@ -13,10 +13,11 @@ set(USE_ACCL Yes CACHE BOOL "" FORCE) set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) -set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini CACHE FILEPATH "" FORCE) set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) set(XILINX_KERNEL_NAMES "sendrecv" CACHE STRING "" FORCE) +set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to connect multiple kernels to the CCLO cmd stream" FORCE) # STREAM specific options # Defaults to a total of ~12GB data 
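# (How the arbiter enabled above fits together: with both the host-facing
#  hostctrl_0 and the PL send_recv kernel issuing CCLO calls, the new
#  client_arbiter serializes them onto the offload engine's single command
#  interface. Schematically, per the stream_connect lines in the new link
#  settings file below:
#
#      hostctrl_0.cmd -> client_arbiter.cmd_clients_0
#      sendrecv.cmd   -> client_arbiter.cmd_clients_1
#      client_arbiter.cmd_cclo -> ccl_offload_0.s_axis_call_req
#      ccl_offload_0.m_axis_call_ack -> client_arbiter.ack_cclo
#      client_arbiter.ack_clients_0/1 -> hostctrl_0.sts / sendrecv.sts
#
#  so the CCLO still sees one ordered command/acknowledge stream pair.)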
set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE) diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini new file mode 100644 index 00000000..f344c4bb --- /dev/null +++ b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini @@ -0,0 +1,81 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl +nk=client_arbiter:1:client_arbiter +nk=sendrecv:1:sendrecv + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=lb_user_krnl:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 +slr=client_arbiter:SLR0 +slr=sendrecv:SLR0 + +sp=ccl_offload_0.m_axi_0:HBM[0:5] +sp=ccl_offload_0.m_axi_1:HBM[0:5] +sp=sendrecv.m_axi_gmem:HBM[0:5] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:client_arbiter.cmd_clients_0 +stream_connect=client_arbiter.ack_clients_0:hostctrl_0.sts +stream_connect=sendrecv.cmd:client_arbiter.cmd_clients_1 +stream_connect=client_arbiter.ack_clients_1:sendrecv.sts +stream_connect=client_arbiter.cmd_cclo:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:client_arbiter.ack_cclo + + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + diff --git a/cmake/accl.cmake b/cmake/accl.cmake index 
6b064d95..097d4094 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -93,9 +93,10 @@ add_custom_target( # Build the ACCL Plugins set(ACCL_PLUGINS_DIR ${extern_accl_SOURCE_DIR}/kernels/plugins) set(ACCL_PLUGINS_HOSTCTRL ${ACCL_PLUGINS_DIR}/hostctrl/hostctrl.xo) -set(ACCL_PLUGINS_SUM ${ACCL_PLUGINS_DIR}/reduce_sum/reduce_sum.xo) +set(ACCL_PLUGINS_SUM ${ACCL_PLUGINS_DIR}/reduce_ops/reduce_ops.xo) set(ACCL_PLUGINS_COMPRESSION ${ACCL_PLUGINS_DIR}/hp_compression/hp_compression.xo) set(ACCL_PLUGINS_LOOPBACK ${ACCL_PLUGINS_DIR}/loopback/loopback.xo) +set(ACCL_PLUGINS_ARBITER ${ACCL_PLUGINS_DIR}/client_arbiter/client_arbiter.xo) add_custom_command( OUTPUT ${ACCL_PLUGINS_HOSTCTRL} @@ -104,7 +105,7 @@ add_custom_command( add_custom_command( OUTPUT ${ACCL_PLUGINS_SUM} COMMAND vitis_hls build.tcl -tclargs ip ${ACCL_DEVICE_NAME} - WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/reduce_sum ) + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/reduce_ops ) add_custom_command( OUTPUT ${ACCL_PLUGINS_COMPRESSION} COMMAND vitis_hls build.tcl -tclargs ip ${ACCL_DEVICE_NAME} @@ -113,11 +114,16 @@ add_custom_command( OUTPUT ${ACCL_PLUGINS_LOOPBACK} COMMAND vitis_hls build_loopback.tcl -tclargs ip ${ACCL_DEVICE_NAME} WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/loopback ) +add_custom_command( + OUTPUT ${ACCL_PLUGINS_ARBITER} + COMMAND vitis_hls build_client_arbiter.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/client_arbiter ) + add_custom_target( accl_plugins DEPENDS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} - ${ACCL_PLUGINS_COMPRESSION}) + ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_ARBITER}) set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XOS} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL with UDP") @@ -125,6 +131,10 @@ set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLU set(ACCL_TCP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_TCP_CMAC_XO} ${ACCL_TCP_XO} CACHE INTERNAL "Object files required for ACCL with TCP") +if (DEFINED USE_ACCL_CLIENT_ARBITER) + list(APPEND ${ACCL_UDP_XOS} ${ACCL_PLUGINS_ARBITER}) + list(APPEND ${ACCL_TCP_XOS} ${ACCL_PLUGINS_ARBITER}) +endif() if (ACCL_STACK_TYPE STREQUAL "UDP") set(ACCL_XOS ${ACCL_UDP_XOS} CACHE INTERNAL "Object files required for ACCL") else() From 150fac323fb952eab7de119bbf92bf9766dbaf8c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 21 Sep 2022 13:34:18 +0100 Subject: [PATCH 100/318] Fix build for b_eff with PL --- b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake | 2 +- .../settings.link.xilinx.accl_pl.u55c.hbm.ini | 4 ++-- b_eff/src/device/communication_ACCL_pl.cpp | 14 ++++++-------- .../src/host/execution_types/execution_accl_pl.hpp | 3 ++- cmake/accl.cmake | 6 ++++-- 5 files changed, 15 insertions(+), 14 deletions(-) diff --git a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake index 65516d5b..45e2b5d7 100644 --- a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake +++ b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake @@ -16,7 +16,7 @@ set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini CACHE FILEPATH "" FORCE) set(XILINX_COMPILE_SETTINGS_FILE 
${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) -set(XILINX_KERNEL_NAMES "sendrecv" CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES "send_recv" CACHE STRING "" FORCE) set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to connect multiple kernels to the CCLO cmd stream" FORCE) # STREAM specific options # Defaults to a total of ~12GB data diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini index f344c4bb..a59018d2 100644 --- a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini +++ b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini @@ -20,11 +20,11 @@ nk=networklayer:1:networklayer_0 nk=ccl_offload:1:ccl_offload_0 nk=hostctrl:1:hostctrl_0 nk=cmac_0:1:cmac_0 -nk=reduce_sum:1:arith_0 +nk=reduce_ops:1:arith_0 nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 nk=loopback:1:lb_user_krnl nk=client_arbiter:1:client_arbiter -nk=sendrecv:1:sendrecv +nk=send_recv:1:sendrecv # Kernels Foorplaning slr=compression_0_0:SLR0 diff --git a/b_eff/src/device/communication_ACCL_pl.cpp b/b_eff/src/device/communication_ACCL_pl.cpp index 528c69cb..58fc7fed 100644 --- a/b_eff/src/device/communication_ACCL_pl.cpp +++ b/b_eff/src/device/communication_ACCL_pl.cpp @@ -21,15 +21,13 @@ SOFTWARE. */ #include "accl_hls.h" -extern "C" { -void send_recv(char *read_buffer,char *write_buffer, unsigned int size_in_bytes, unsigned int num_iterations, - unsigned int neighbor_rank, addr_t communicator_addr, - hls::stream > &cmd, hls::stream > &sts) { +void send_recv(float *read_buffer,float *write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + hls::stream &cmd, hls::stream &sts) { + accl_hls::ACCLCommand accl_cmd(cmd, sts, communicator_addr, datapath_cfg,0,0); for (int i = 0; i < num_iterations; i++) { - ACCLCommand accl_cmd(cmd, sts, communicator_addr, 0,0,0); - accl_cmd.send(size_in_bytes, 0, neighbor_rank, read_buffer); - accl_cmd.recv(size_in_bytes, 0, neighbor_rank, write_buffer); + accl_cmd.send(size, 0, neighbor_rank, (ap_uint<64>)read_buffer); + accl_cmd.recv(size, 0, neighbor_rank, (ap_uint<64>)write_buffer); } } -} \ No newline at end of file diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index e82fa29a..3f39cfab 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -80,7 +80,8 @@ namespace network::execution_types::accl_pl { for (int i = 0; i < config.programSettings->kernelReplications; i++) { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); - auto run = sendrecvKernel(acclSendBuffers[i]->bo(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.accl->get_communicator_adr()); + auto run = sendrecvKernel(acclSendBuffers[i]->bo(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.accl->get_communicator_adr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); run.wait(); auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += 
std::chrono::duration_cast>(endCalculation - startCalculation).count(); diff --git a/cmake/accl.cmake b/cmake/accl.cmake index 097d4094..7c3d1f08 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -18,6 +18,8 @@ if (ACCL_STACK_TYPE STREQUAL "UDP") list(APPEND ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE}) endif() +list(APPEND XILINX_ADDITIONAL_COMPILE_FLAGS "-I${extern_accl_SOURCE_DIR}/driver/hls" "-DACCL_SYNTHESIS") + set(ACCL_UDP_MAC_XOS "") math(EXPR loopend "${ACCL_UDP_ETH_IFS} - 1") @@ -132,8 +134,8 @@ set(ACCL_TCP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLU ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_TCP_CMAC_XO} ${ACCL_TCP_XO} CACHE INTERNAL "Object files required for ACCL with TCP") if (DEFINED USE_ACCL_CLIENT_ARBITER) - list(APPEND ${ACCL_UDP_XOS} ${ACCL_PLUGINS_ARBITER}) - list(APPEND ${ACCL_TCP_XOS} ${ACCL_PLUGINS_ARBITER}) + list(APPEND ACCL_UDP_XOS ${ACCL_PLUGINS_ARBITER}) + list(APPEND ACCL_TCP_XOS ${ACCL_PLUGINS_ARBITER}) endif() if (ACCL_STACK_TYPE STREQUAL "UDP") set(ACCL_XOS ${ACCL_UDP_XOS} CACHE INTERNAL "Object files required for ACCL") From 5b2748b09fa39b1e9639d736399723c35586cc5d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 21 Sep 2022 16:54:14 +0100 Subject: [PATCH 101/318] Update settings to use reduce_ops --- .../settings.link.xilinx.hpl_torus_accl.hbm.generator.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini index 1e75a2eb..ec8cbfa6 100644 --- a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini @@ -37,7 +37,7 @@ nk=networklayer:1:networklayer_0 nk=ccl_offload:1:ccl_offload_0 nk=hostctrl:1:hostctrl_0 nk=cmac_0:1:cmac_0 -nk=reduce_sum:1:arith_0 +nk=reduce_ops:1:arith_0 nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 nk=loopback:1:lb_user_krnl From 695912dcfc8616c1bbf14d15d71550d32f7d93bf Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 22 Sep 2022 16:15:37 +0100 Subject: [PATCH 102/318] Update PTRANS stream approach --- .../configs/Xilinx_U55C_HBM_ACCL_stream.cmake | 28 +++++++ ...nk.xilinx.transpose_pq_accl_stream.hbm.ini | 76 +++++++++++++++++++ .../src/device/transpose_PQ_ACCL_stream.cpp | 7 +- 3 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini diff --git a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake new file mode 100644 index 00000000..e75d0ff7 --- /dev/null +++ b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake @@ -0,0 +1,28 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini new file mode 100644 index 00000000..d4c5567e --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini @@ -0,0 +1,76 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
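// Why the transpose_PQ_ACCL_stream.cpp hunk further below switches from
// shift-and-OR packing to ap_uint range assignment: the old code shifted by
// unroll_count * sizeof(DEVICE_DATA_TYPE) -- a byte count, not a bit count --
// and the unpack masked with (1 << 32) - 1, which overflows a 32-bit int.
// Range selection spells the bit positions out. A minimal stand-alone sketch,
// assuming Vitis HLS ap_int.h, 32-bit lanes and the word(high, low) selection
// convention; the function names are invented for the example:
#include <cstdint>
#include "ap_int.h"

// pack 16 32-bit lanes into one 512-bit stream word
ap_uint<512> pack512(const uint32_t lane[16]) {
    ap_uint<512> word;
    for (unsigned i = 0; i < 16; i++) {
        word(32 * i + 31, 32 * i) = lane[i];  // bits [32i+31 : 32i] hold lane i
    }
    return word;
}

// unpack the word again; the range select yields the 32-bit slice directly
void unpack512(ap_uint<512> word, uint32_t lane[16]) {
    for (unsigned i = 0; i < 16; i++) {
        lane[i] = word(32 * i + 31, 32 * i);
    }
}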
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=transpose_read0:1 +nk=transpose_write0:1 + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR2 +slr=transpose_read0_1:SLR0 +slr=transpose_write0_1:SLR0 + +sp=ccl_offload_0.m_axi_0:HBM[31] +sp=ccl_offload_0.m_axi_1:HBM[31] +sp=transpose_read0_1.m_axi_gmem0:HBM[0:7] +sp=transpose_write0_1.m_axi_gmem0:HBM[8:15] +sp=transpose_write0_1.m_axi_gmem1:HBM[16:23] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl +stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl + diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp index 2cda216f..1c136dc8 100644 --- a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp @@ -118,11 +118,12 @@ void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % (channel_width)]; } - ap_uint<512> data = 0; + ap_uint<512> data; // load tranposed A from global memory for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { - data |= ((ap_uint<8*sizeof(DEVICE_DATA_TYPE)>)data_chunk[unroll_count]) << (unroll_count * sizeof(DEVICE_DATA_TYPE)); + data(unroll_count * sizeof(DEVICE_DATA_TYPE)*8, unroll_count * sizeof(DEVICE_DATA_TYPE)*8 + sizeof(DEVICE_DATA_TYPE) * 8 - 1) + = data_chunk[unroll_count]; } ap_axiu<512, 0, 0, 8> tmp; @@ -182,7 +183,7 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, // rotate temporary buffer to store data into local buffer for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { - data_chunk[unroll_count] = (DEVICE_DATA_TYPE)((tmp.data >> (unroll_count * sizeof(DEVICE_DATA_TYPE))) & ((1 << 32) - 1)); + data_chunk[unroll_count] = tmp.data(unroll_count * 
sizeof(DEVICE_DATA_TYPE)*8, unroll_count * sizeof(DEVICE_DATA_TYPE)*8 + sizeof(DEVICE_DATA_TYPE) * 8 - 1); } // load tranposed A from global memory From 8f4d24cf74ca4b8c9af59107c9ae49774226d734 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 22 Sep 2022 16:23:00 +0100 Subject: [PATCH 103/318] Add configs for HPL U55c --- ...cmake => Xilinx_U55C_B8_SB3_R1_ACCL.cmake} | 2 +- .../Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake | 30 ++++++++++++++++ ...nk.xilinx.hpl_torus_pcie.hbm.generator.ini | 34 +++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) rename LINPACK/configs/{Xilinx_U55C_B8_SB3_R2_ACCL.cmake => Xilinx_U55C_B8_SB3_R1_ACCL.cmake} (96%) create mode 100644 LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake similarity index 96% rename from LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake rename to LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake index 15e2edeb..ec9d153b 100644 --- a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_ACCL.cmake +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake @@ -18,7 +18,7 @@ set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) -set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE) set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake new file mode 100644 index 00000000..bbf80c86 --- /dev/null +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake @@ -0,0 +1,30 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE) +set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini new file mode 100644 index 00000000..df381966 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini @@ -0,0 +1,34 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +1) % 3$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:5] +sp=lu_1.m_axi_gmem1:HBM[6] +sp=lu_1.m_axi_gmem2:HBM[7] + +sp=top_update_1.m_axi_gmem0:HBM[0:5] +sp=top_update_1.m_axi_gmem1:HBM[6] +sp=top_update_1.m_axi_gmem2:HBM[8] + +sp=left_update_1.m_axi_gmem0:HBM[0:5] +sp=left_update_1.m_axi_gmem1:HBM[7] +sp=left_update_1.m_axi_gmem2:HBM[9] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] +# PY_CODE_GEN block_end + From 2acae30e61b5aee71354a1b930733408ad1fc414 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 22 Sep 2022 16:32:13 +0100 Subject: [PATCH 104/318] Update the SLR mapping and ACCL kernel names --- ...ink.xilinx.transpose_pq_accl_stream.hbm.ini | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini index d4c5567e..2dadc525 100644 --- a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini @@ -20,21 +20,21 @@ nk=networklayer:1:networklayer_0 nk=ccl_offload:1:ccl_offload_0 nk=hostctrl:1:hostctrl_0 nk=cmac_0:1:cmac_0 -nk=reduce_sum:1:arith_0 +nk=reduce_ops:1:arith_0 nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 nk=transpose_read0:1 nk=transpose_write0:1 # Kernels Foorplaning -slr=compression_0_0:SLR0 -slr=compression_0_1:SLR0 -slr=compression_0_2:SLR0 -slr=arith_0:SLR0 -slr=ccl_offload_0:SLR0 -slr=hostctrl_0:SLR0 +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 slr=networklayer_0:SLR1 -slr=cmac_0:SLR2 -slr=transpose_read0_1:SLR0 +slr=cmac_0:SLR1 +slr=transpose_read0_1:SLR2 slr=transpose_write0_1:SLR0 sp=ccl_offload_0.m_axi_0:HBM[31] From 9c7c5ee5d9875574355467a34ef430df05d279df Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 22 Sep 2022 16:44:28 +0100 Subject: [PATCH 105/318] Add PTRANS baseline configs for U55c --- PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake | 23 +++++++++++++++++++ ...s.compile.xilinx.transpose_pq_pcie.hbm.ini | 3 --- ...xilinx.transpose_pq_pcie.hbm.generator.ini | 17 ++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) create mode 100644 PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini diff --git a/PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake b/PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake new file mode 100644 index 00000000..c2f3cb4d --- /dev/null +++ b/PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake @@ -0,0 +1,23 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini index 7e52533c..8b137891 100644 --- a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini +++ b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini @@ -1,4 +1 @@ -kernel_frequency=450 -[hls] -max_memory_ports=all diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini new file mode 100644 index 00000000..e6f72be5 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini @@ -0,0 +1,17 @@ + +# Set number of available SLRs +# PY_CODE_GEN num_slrs = 3 +# PY_CODE_GEN num_ddrs = 2 + +[connectivity] +nk=transpose0:$PY_CODE_GEN num_replications$ + +# Assign kernels to the SLRs +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=transpose0_$PY_CODE_GEN i + 1$:SLR$PY_CODE_GEN i % num_slrs$ +# PY_CODE_GEN block_end + +# Assign the kernels to the memory ports +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem:HBM[$PY_CODE_GEN i*8$:$PY_CODE_GEN (i+1)*8$] +# PY_CODE_GEN block_end From ef2da9ee51ffbc9fdc2f5301bb7186a1497e9c26 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 28 Sep 2022 09:44:45 +0200 Subject: [PATCH 106/318] Update CI script for Noctua2 --- .gitlab-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a8b41ac6..40ca7a1f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,13 +3,14 @@ stages: - test variables: - SCHEDULER_PARAMETERS: "-A pc2-mitarbeiter -p short" + SCHEDULER_PARAMETERS: "-A pc2-mitarbeiter -p normal -q cont -t 00:30:00 -n 2 -N 1" default: tags: - jacamar before_script: - - module load intelFPGA_pro/21.2.0 bittware_520n/20.4.0_max toolchain/foss/2021a devel/CMake/3.20.1-GCCcore-10.3.0 + - module load fpga/intel/opencl_sdk/21.2.0 fpga/bittware/520n/20.4.0_max toolchain/foss/2021a devel/CMake/3.20.1-GCCcore-10.3.0 lang/Python/3.9.5-GCCcore-10.3.0 + - python -m pip install pandas ### # From 4bb0862cd9126ef3b5a01150a5b8559f3dba72ec Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 29 Sep 2022 13:56:28 +0100 Subject: [PATCH 107/318] Add ACCL to RPATH for convenience (PTRANS) --- 
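The RPATH handling below exists because ACCL's XRT driver is now pulled in via
add_subdirectory() and, judging by the RPATH entries, linked as a shared library
placed under ${CMAKE_BINARY_DIR}/lib/accl/lib; embedding that directory in the
host binaries saves an LD_LIBRARY_PATH export at run time. A quick way to
confirm the path was baked in (the binary name is illustrative):

    readelf -d bin/Transpose_xilinx | grep -E 'RPATH|RUNPATH'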
PTRANS/src/host/CMakeLists.txt | 5 +++++ shared/CMakeLists.txt | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 5bb10e54..1ad17d14 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -33,6 +33,11 @@ if (INTELFPGAOPENCL_FOUND) endif() if (Vitis_FOUND) + if (USE_ACCL) + set(CMAKE_SKIP_BUILD_RPATH No) + set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) + list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) + endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 43731ce8..22b2e1f4 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -3,7 +3,7 @@ project(HPCCBaseLibrary VERSION 1.0.1) set(HPCC_BASE_SOURCES "") if (USE_ACCL) - include(${extern_accl_SOURCE_DIR}/driver/xrt/CMakeLists.txt) + add_subdirectory(${extern_accl_SOURCE_DIR}/driver/xrt ${CMAKE_BINARY_DIR}/lib/accl) list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp) endif() if (USE_XRT_HOST) From a786b0919154118ff908cf428232da2c82b1c756 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Sun, 4 Sep 2022 21:16:18 +0200 Subject: [PATCH 108/318] Add platform_str parameter add option for passing platform string insted of platform index overwrites all index options when used necessary as order of platforms is not deterministic anymore --- shared/include/hpcc_benchmark.hpp | 11 ++++++++++- shared/include/setup/fpga_setup.hpp | 2 +- shared/setup/fpga_setup.cpp | 16 ++++++++++++++-- shared/tests/hpcc_base_benchmark_test.cpp | 13 ++++++++++--- 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 17e17bb9..aed3f901 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -89,6 +89,12 @@ class BaseSettings { */ int defaultPlatform; + /** + * @brief The platform string of the platform that should be used + * + */ + std::string platformString; + /** * @brief The default device that should be used for execution. * A number representing the index in the list of available devices @@ -134,6 +140,7 @@ class BaseSettings { skipValidation(static_cast(results.count("skip-validation"))), defaultPlatform(results["platform"].as()), defaultDevice(results["device"].as()), + platformString(results["platform_str"].as()), kernelFileName(results["f"].as()), #ifdef NUM_REPLICATIONS kernelReplications(results.count("r") > 0 ? 
results["r"].as() : NUM_REPLICATIONS), @@ -380,6 +387,7 @@ class HpccFpgaBenchmark { "you will be asked which platform to use if there are multiple "\ "platforms available.", cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) + ("platform_str", "Name of the platform that has to be used", cxxopts::value()->default_value(std::string())) #ifdef NUM_REPLICATIONS ("r", "Number of used kernel replications", cxxopts::value()->default_value(std::to_string(NUM_REPLICATIONS))) @@ -478,7 +486,8 @@ class HpccFpgaBenchmark { if (!programSettings->testOnly) { usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); + programSettings->defaultDevice, + programSettings->platformString); context = std::unique_ptr(new cl::Context(*usedDevice)); program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, diff --git a/shared/include/setup/fpga_setup.hpp b/shared/include/setup/fpga_setup.hpp index 0799900c..7f88f8b1 100644 --- a/shared/include/setup/fpga_setup.hpp +++ b/shared/include/setup/fpga_setup.hpp @@ -157,7 +157,7 @@ choose a device. @return A list containing a single selected device */ std::unique_ptr - selectFPGADevice(int defaultPlatform, int defaultDevice); + selectFPGADevice(int defaultPlatform, int defaultDevice, std::string platformString); } // namespace fpga_setup #endif // SRC_HOST_FPGA_SETUP_H_ diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index dd1ddd28..70125df0 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -224,7 +224,7 @@ choose a device. @return A list containing a single selected device */ std::unique_ptr - selectFPGADevice(int defaultPlatform, int defaultDevice) { + selectFPGADevice(int defaultPlatform, int defaultDevice, std::string platformString) { // Integer used to store return codes of OpenCL library calls int err; @@ -243,7 +243,19 @@ choose a device. 
// Choose the target platform long unsigned int chosenPlatformId = 0; if (defaultPlatform >= 0) { - if (defaultPlatform < static_cast(platformList.size())) { + if (platformString.size() > 0) { + bool found = false; + for (int i = 0; i < platformList.size(); i++) { + if (platformList[i].getInfo() == platformString) { + chosenPlatformId = i; + found = true; + break; + } + } + if (!found) { + throw FpgaSetupException("Invalid platform string specified: " + platformString); + } + } else if (defaultPlatform < static_cast(platformList.size())) { chosenPlatformId = defaultPlatform; } else { std::cerr << "Default platform " << defaultPlatform diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp index a93a2a69..1c491b49 100644 --- a/shared/tests/hpcc_base_benchmark_test.cpp +++ b/shared/tests/hpcc_base_benchmark_test.cpp @@ -170,21 +170,28 @@ TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenTestOnlyAndSetupSuccess) { * Checks if using default platform and device is successful */ TEST_F(BaseHpccBenchmarkTest, SuccessUseDefaultPlatform) { - EXPECT_NE(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, bm->getExecutionSettings().programSettings->defaultDevice).get(), nullptr); + EXPECT_NE(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, bm->getExecutionSettings().programSettings->defaultDevice, bm->getExecutionSettings().programSettings->platformString).get(), nullptr); } /** * Checks if non existing platform leads to an error */ TEST_F(BaseHpccBenchmarkTest, FindNonExistingPlatform) { - ASSERT_THROW(fpga_setup::selectFPGADevice(100, bm->getExecutionSettings().programSettings->defaultDevice).get(), fpga_setup::FpgaSetupException); + ASSERT_THROW(fpga_setup::selectFPGADevice(100, bm->getExecutionSettings().programSettings->defaultDevice, bm->getExecutionSettings().programSettings->platformString).get(), fpga_setup::FpgaSetupException); } /** * Checks if non existing device leads to an error */ TEST_F(BaseHpccBenchmarkTest, FindNonExistingDevice) { - ASSERT_THROW(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, 100).get(), fpga_setup::FpgaSetupException); + ASSERT_THROW(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, 100, bm->getExecutionSettings().programSettings->platformString).get(), fpga_setup::FpgaSetupException); +} + +/* + * Check if wrong platform string leads to an error + */ +TEST_F(BaseHpccBenchmarkTest, FindNonExistingPlatformString) { + ASSERT_THROW(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, bm->getExecutionSettings().programSettings->defaultDevice, "This is not a platform").get(), fpga_setup::FpgaSetupException); } /** From a39430c8de9f60163a4e66bfd45bebb88f5f693c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 4 Oct 2022 08:47:34 +0100 Subject: [PATCH 109/318] Reduce replications for PTRANS via streams --- PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake index e75d0ff7..e7a5a22e 100644 --- a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake +++ b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake @@ -23,6 +23,6 @@ set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" 
FORCE) set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) -set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) From 1b58508270a4df0d744b1871e14aea7c24828b6a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 4 Oct 2022 08:50:16 +0100 Subject: [PATCH 110/318] Update PTRANS for stream_put transpose --- .../src/device/transpose_PQ_ACCL_stream.cpp | 25 +- PTRANS/src/host/CMakeLists.txt | 2 + .../execution_xrt_accl_stream_pq.hpp | 366 ++++++++++++++++++ PTRANS/src/host/transpose_benchmark.hpp | 5 +- shared/CMakeLists.txt | 9 +- shared/include/hpcc_benchmark.hpp | 4 +- shared/setup/fpga_setup_accl.cpp | 2 +- 7 files changed, 390 insertions(+), 23 deletions(-) create mode 100644 PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp index 1c136dc8..72d3d0cb 100644 --- a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp @@ -6,16 +6,14 @@ * - Change to row-column loop structure *****************************************************************************/ #include "parameters.h" -#include "hls_stream.h" #include "ap_int.h" #include "ap_utils.h" #include "ap_axi_sdata.h" +#include "accl_hls.h" const unsigned int block_size = BLOCK_SIZE; const unsigned int channel_width = CHANNEL_WIDTH; -extern "C" { - // PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] /** @@ -41,10 +39,8 @@ void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, const unsigned int number_of_blocks, const unsigned int width_in_blocks, const unsigned int height_in_blocks, - hls::stream > &krnl2cclo) { + STREAM &krnl2cclo) { #pragma HLS INTERFACE axis register both port=krnl2cclo -#pragma HLS INTERFACE ap_ctrl_none port=return - // local memory double buffer for a matrix block DEVICE_DATA_TYPE a_block[2][block_size * block_size / channel_width][channel_width]; @@ -118,19 +114,16 @@ void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % (channel_width)]; } - ap_uint<512> data; + stream_word tmp; // load tranposed A from global memory for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { - data(unroll_count * sizeof(DEVICE_DATA_TYPE)*8, unroll_count * sizeof(DEVICE_DATA_TYPE)*8 + sizeof(DEVICE_DATA_TYPE) * 8 - 1) + tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8) = data_chunk[unroll_count]; } - - ap_axiu<512, 0, 0, 8> tmp; - tmp.data = data; tmp.dest = 0; tmp.keep = -1; - krnl2cclo.write(tmp); + STREAM_WRITE(krnl2cclo,tmp); } } } @@ -156,9 +149,8 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, const unsigned int number_of_blocks, const unsigned int width_in_blocks, const unsigned int height_in_blocks, - hls::stream > &cclo2krnl) { + STREAM &cclo2krnl) { #pragma HLS INTERFACE axis register both port=cclo2krnl -#pragma HLS INTERFACE ap_ctrl_none port=return // transpose the matrix block-wise from global memory block_loop: 
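
The hunks in this file move the kernel I/O from raw ap_axiu words to ACCL's stream_word type and pack CHANNEL_WIDTH values into each 512-bit word through ap_uint bit slicing. The following sketch shows that packing and its inverse in isolation, assuming a 32-bit DEVICE_DATA_TYPE and the Vitis HLS ap_int types; pack_word and unpack_word are illustrative names, not part of the benchmark sources:

    #include "ap_int.h"

    typedef float DEVICE_DATA_TYPE;     // assumed 32-bit element type
    const unsigned channel_width = 16;  // 16 x 32 bit = one 512-bit stream word

    // Pack one chunk of values into a 512-bit word, preserving the bit pattern.
    ap_uint<512> pack_word(const DEVICE_DATA_TYPE chunk[channel_width]) {
        ap_uint<512> word;
        for (unsigned i = 0; i < channel_width; i++) {
            DEVICE_DATA_TYPE v = chunk[i];
            // reinterpret_cast copies the raw float bits; a plain assignment
            // to the bit range would convert the value to an integer instead
            word((i + 1) * 32 - 1, i * 32) = *reinterpret_cast<ap_uint<32> *>(&v);
        }
        return word;
    }

    // Unpack a 512-bit word back into individual values.
    void unpack_word(ap_uint<512> word, DEVICE_DATA_TYPE chunk[channel_width]) {
        for (unsigned i = 0; i < channel_width; i++) {
            ap_uint<32> v = word((i + 1) * 32 - 1, i * 32);
            chunk[i] = *reinterpret_cast<DEVICE_DATA_TYPE *>(&v);
        }
    }

Note that the hunk above still assigns data_chunk[unroll_count] directly to the bit range, which performs a float-to-integer value conversion; patch 113 below replaces this with the bit-preserving reinterpret_cast round trip sketched here.
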
@@ -179,11 +171,11 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, DEVICE_DATA_TYPE data_chunk[channel_width]; #pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 - ap_axiu<512, 0, 0, 8> tmp = cclo2krnl.read(); + stream_word tmp = STREAM_READ(cclo2krnl); // rotate temporary buffer to store data into local buffer for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { - data_chunk[unroll_count] = tmp.data(unroll_count * sizeof(DEVICE_DATA_TYPE)*8, unroll_count * sizeof(DEVICE_DATA_TYPE)*8 + sizeof(DEVICE_DATA_TYPE) * 8 - 1); + data_chunk[unroll_count] = tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8); } // load tranposed A from global memory @@ -201,4 +193,3 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, // PY_CODE_GEN block_end -} diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 1ad17d14..fe7214c4 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -37,10 +37,12 @@ if (Vitis_FOUND) set(CMAKE_SKIP_BUILD_RPATH No) set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) + list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/transpose_PQ_ACCL_stream.cpp) endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + target_include_directories(${LIB_NAME}_xilinx PRIVATE ${extern_accl_SOURCE_DIR}/hlslib/include/xilinx) add_executable(${HOST_EXE_NAME}_xilinx main.cpp) target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp new file mode 100644 index 00000000..4bc406e1 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -0,0 +1,366 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef SRC_HOST_ACCL_STREAM_PQ_EXECUTION_H_ +#define SRC_HOST_ACCL_STREAM_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "buffer.hpp" +#include "cclo.hpp" +#include "constants.hpp" +#include "data_handlers/data_handler_types.h" +#include "data_handlers/pq.hpp" +#include "fpgabuffer.hpp" +#include "transpose_data.hpp" +#include "cclo_bfm.h" +#include "Simulation.h" +#include "dummybuffer.hpp" + +extern void transpose_write(const DEVICE_DATA_TYPE *B, + DEVICE_DATA_TYPE *A_out, + const unsigned int offset_b, + const unsigned int number_of_blocks, + const unsigned int width_in_blocks, + const unsigned int height_in_blocks, + hlslib::Stream &cclo2krnl); + +extern void transpose_read( const DEVICE_DATA_TYPE *A, + const unsigned int offset_a, + const unsigned int number_of_blocks, + const unsigned int width_in_blocks, + const unsigned int height_in_blocks, + hlslib::Stream &krnl2cclo); + +namespace transpose { +namespace fpga_execution { +namespace accl_stream_pq { + +/** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ + * distribution and PCIe+MPI over the host for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on + * the FPGA + * @param handler data handler instance that should be used to exchange data + * between hosts + * @return std::unique_ptr The measured + * execution times + */ +static std::unique_ptr calculate( + const hpcc_base::ExecutionSettings &config, + transpose::TransposeData &data, + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, bool, xrt::uuid> &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != + transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error( + "Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation " + "of this communication method"); +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + throw new std::runtime_error( + "Using the Write Rect method is not supported in this host " + "implementation of this communication method"); +#endif + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeReadKernelList; + std::vector transposeWriteKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = + local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; +#ifndef NDEBUG + std::cout << "Start kernel creation" << std::endl; +#endif + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to + // process. 
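+    // Illustrative numbers (assumed, not taken from a real run): with 7 blocks
+    // and 3 kernel replications, 7 / 3 = 2 and 7 % 3 = 1, so replication 0
+    // processes 3 blocks while replications 1 and 2 process 2 blocks each.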
+ size_t blocks_per_replication = + (local_matrix_height * local_matrix_width / + config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % + config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the + // number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / + local_matrix_width * local_matrix_width * + data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / + local_matrix_width * local_matrix_width; + + if (!config.programSettings->useAcclEmulation) { + // create the kernels + xrt::kernel transposeReadKernel( + *config.device, *config.program, + ("transpose_read0:{transpose_read0_" + std::to_string(r + 1) + "}").c_str()); + xrt::kernel transposeWriteKernel( + *config.device, *config.program, + ("transpose_write0:{transpose_write0_" + std::to_string(r + 1) + "}").c_str()); + + if (r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, + data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), + transposeReadKernel.group_id(0)); + bufferListA.push_back(bufferA); + } + xrt::bo bufferB( + *config.device, + &data.B[bufferStartList[r] * data.blockSize * data.blockSize], + buffer_size * sizeof(HOST_DATA_TYPE), transposeWriteKernel.group_id(0)); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + transposeWriteKernel.group_id(1)); + + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeReadKernelList.push_back(transposeReadKernel); + transposeWriteKernelList.push_back(transposeWriteKernel); + } + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; + repetition++) { + +#ifndef NDEBUG + std::cout << "Start data transfer" << std::endl; +#endif + auto startTransfer = std::chrono::high_resolution_clock::now(); + + if (!config.programSettings->useAcclEmulation) { + for (int r = 0; r < transposeReadKernelList.size(); r++) { + if (r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast>( + endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + +#ifndef NDEBUG + std::cout << "Start BFM" << std::endl; +#endif + + HLSLIB_DATAFLOW_INIT(); + hlslib::Stream cclo2krnl, krnl2cclo; + hlslib::Stream cmd, sts; + + int pq_width = handler.getP(); + + int mpi_comm_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); + int mpi_comm_size; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size); + int pq_row = mpi_comm_rank / pq_width; + int pq_col = mpi_comm_rank % pq_width; + + int pair_rank = pq_width * pq_col + pq_row; + std::vector dest = {0,9, 18}; + CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + cclo.run(); + MPI_Barrier(MPI_COMM_WORLD); + + auto 
startCalculation = std::chrono::high_resolution_clock::now(); + +#ifndef NDEBUG + std::cout << "Start kernel execution" << std::endl; +#endif + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + if (!config.programSettings->useAcclEmulation) { + runs.push_back(transposeReadKernelList[r]( + (config.programSettings->copyA ? bufferListA[r] : bufferListA[0]), + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + runs.push_back(transposeWriteKernelList[r]( + bufferListB[r], bufferListA_out[r], + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + } else { + HLSLIB_DATAFLOW_FUNCTION(transpose_read, + (config.programSettings->copyA ? data.A : data.A), + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)), + krnl2cclo); + HLSLIB_DATAFLOW_FUNCTION(transpose_write, + data.B, data.result, + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)), + cclo2krnl); + } + } + auto dbuffer = config.accl->create_buffer(1,ACCL::dataType::float32); + // Exchange A data via ACCL + config.accl->stream_put(*dbuffer, data.blockSize * data.blockSize * data.numBlocks, + pair_rank, 9, ACCL::GLOBAL_COMM, + false, ACCL::streamFlags::OP0_STREAM); + // config.accl->send(*dbuffer, data.blockSize * data.blockSize * data.numBlocks, + // pair_rank, 9, ACCL::GLOBAL_COMM, + // false, ACCL::streamFlags::OP0_STREAM | ACCL::streamFlags::RES_STREAM ); +#ifndef NDEBUG + std::cout << "Wait for kernels to complete" << std::endl; +#endif + for (int r = 0; r < runs.size(); r++) { + runs[r].wait(); + } + cclo.stop(); + HLSLIB_DATAFLOW_FINALIZE(); + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " + << std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() + << "s (" + << ((config.programSettings->matrixSize * + config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * + 3) / + std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() * + 1.0e-9) + << " GB/s)" << std::endl; +#endif + + std::chrono::duration calculationTime = + std::chrono::duration_cast>( + endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer( + local_matrix_height * local_matrix_width * data.blockSize * + data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + if (!config.programSettings->useAcclEmulation) { + for (int r = 0; r < transposeReadKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + 
bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; + col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + + col]; + } + } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + + local_matrix_width * data.blockSize * data.blockSize, + tmp_write_buffer.begin() + bufferSizeList[r], + &data.result[(bufferStartList[r] + local_matrix_width) * + data.blockSize * data.blockSize]); + } else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read( + &data.result[bufferStartList[r] * data.blockSize * data.blockSize]); + } + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + + transferTime += std::chrono::duration_cast>( + endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result( + new transpose::TransposeExecutionTimings{transferTimings, + calculationTimings}); + + return result; +} + +} // namespace accl_pq +} // namespace fpga_execution +} // namespace transpose + +#endif diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 585e60be..f2b06965 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -41,6 +41,7 @@ SOFTWARE. #include "execution_types/execution_xrt_pcie_pq.hpp" #ifdef USE_ACCL #include "execution_types/execution_xrt_accl_pq.hpp" +#include "execution_types/execution_xrt_accl_stream_pq.hpp" #endif #endif #include "execution_types/execution_cpu.hpp" @@ -141,8 +142,10 @@ public hpcc_base::HpccFpgaBenchmarkexecutionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; #ifdef USE_ACCL + // case hpcc_base::CommunicationType::accl: + // return transpose::fpga_execution::accl_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; case hpcc_base::CommunicationType::accl: - return transpose::fpga_execution::accl_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; + return transpose::fpga_execution::accl_stream_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; #endif #endif #ifdef MKL_FOUND diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 22b2e1f4..19ab7ff2 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -5,16 +5,19 @@ set(HPCC_BASE_SOURCES "") if (USE_ACCL) add_subdirectory(${extern_accl_SOURCE_DIR}/driver/xrt ${CMAKE_BINARY_DIR}/lib/accl) list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp) + if (CMAKE_BUILD_TYPE EQUAL "Debug") + set(ACCL_DEBUG Yes) + endif() endif() if (USE_XRT_HOST) - list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp) + list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp ${extern_accl_SOURCE_DIR}/test/model/bfm/cclo_bfm.cpp) endif() list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES}) if (USE_ACCL) add_subdirectory(${extern_accl_SOURCE_DIR}/test/hardware/xup_vitis_network_example/xrt_host_api - ${CMAKE_BINARY_DIR}/libs/xrt_host_api) - target_include_directories(hpcc_fpga_base PRIVATE ${VNX_INCLUDE_PATH} ${ACCL_INCLUDE_PATH}) + 
${CMAKE_BINARY_DIR}/lib/xrt_host_api) + target_include_directories(hpcc_fpga_base PUBLIC ${VNX_INCLUDE_PATH} ${ACCL_INCLUDE_PATH} ${extern_accl_SOURCE_DIR}/test/model/bfm ${extern_accl_SOURCE_DIR}/driver/hls ${extern_hlslib_SOURCE_DIR}/include/hlslib/xilinx) target_link_libraries(hpcc_fpga_base accl vnx) endif() if (USE_XRT_HOST) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index bece837c..7ff91bae 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -517,7 +517,9 @@ class HpccFpgaBenchmark { #ifdef USE_XRT_HOST usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultDevice); context = std::unique_ptr(new bool(false)); - program = fpga_setup::fpgaSetup(*usedDevice, programSettings->kernelFileName); + if (!programSettings->useAcclEmulation) { + program = fpga_setup::fpgaSetup(*usedDevice, programSettings->kernelFileName); + } #endif #ifdef USE_OCL_HOST usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 2f78366d..51c7f87e 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -99,7 +99,7 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra } else { // TODO: Add start port here. Currenty hardcoded! return std::unique_ptr( - new ACCL::ACCL(ranks, current_rank, 5500, device, ACCL::networkProtocol::UDP, 16, ACCL_BUFFER_SIZE)); + new ACCL::ACCL(ranks, current_rank, 6000, device, ACCL::networkProtocol::UDP, 16, ACCL_BUFFER_SIZE)); } } From 610071e65de239d4e44de42ffcd58999355cff6f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 4 Oct 2022 08:50:37 +0100 Subject: [PATCH 111/318] Update ACCL source to dev branch --- extern/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 141899f7..341f73cd 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -61,8 +61,8 @@ if (DEFINED USE_ACCL) FetchContent_Declare( extern_accl - GIT_REPOSITORY https://github.com/Mellich/ACCL.git - GIT_TAG udp_address_fix_and_new_tcp) + GIT_REPOSITORY https://github.com/Xilinx/ACCL.git + GIT_TAG dev) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) From ff89927672206f974982707d00fbffe15b171bab Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 4 Oct 2022 10:44:06 +0100 Subject: [PATCH 112/318] Fix settings for PTRANS ACCL stream --- PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake | 2 +- .../settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini | 3 ++- .../settings.link.xilinx.transpose_pq_accl_stream.hbm.ini | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake index e7a5a22e..f84f73a2 100644 --- a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake +++ b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake @@ -15,7 +15,7 @@ set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE) set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini CACHE FILEPATH "" FORCE) -set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE 
${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE) set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) # STREAM specific options diff --git a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini index 8b137891..d259f88f 100644 --- a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini +++ b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini @@ -1 +1,2 @@ - +[hls] +max_memory_ports=all diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini index 2dadc525..f9be4bec 100644 --- a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini @@ -22,6 +22,7 @@ nk=hostctrl:1:hostctrl_0 nk=cmac_0:1:cmac_0 nk=reduce_ops:1:arith_0 nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:0 nk=transpose_read0:1 nk=transpose_write0:1 From f475d4cf1a96ef23b04abf4b95e3f8d1b06e5175 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 5 Oct 2022 13:44:20 +0100 Subject: [PATCH 113/318] Fix PTRANS emulation results --- PTRANS/src/device/transpose_PQ_ACCL_stream.cpp | 11 +++++++---- .../execution_types/execution_xrt_accl_stream_pq.hpp | 10 ++++------ shared/setup/fpga_setup_accl.cpp | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp index 72d3d0cb..739792e0 100644 --- a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp @@ -118,12 +118,14 @@ void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, // load tranposed A from global memory for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + DEVICE_DATA_TYPE v = data_chunk[unroll_count]; tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8) - = data_chunk[unroll_count]; + = *reinterpret_cast*>(&v); } - tmp.dest = 0; + tmp.dest = 9; + tmp.last = 1; tmp.keep = -1; - STREAM_WRITE(krnl2cclo,tmp); + STREAM_WRITE(krnl2cclo,tmp); } } } @@ -175,7 +177,8 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, // rotate temporary buffer to store data into local buffer for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { - data_chunk[unroll_count] = tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8); + ap_uint v = tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8); + data_chunk[unroll_count] = *reinterpret_cast(&v); } // load tranposed A from global memory diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp index 4bc406e1..8fac597c 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -206,7 +206,7 @@ static std::unique_ptr calculate( #endif HLSLIB_DATAFLOW_INIT(); - hlslib::Stream cclo2krnl, krnl2cclo; + hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); hlslib::Stream cmd, sts; int pq_width = handler.getP(); @@ -219,7 +219,7 @@ static std::unique_ptr calculate( int pq_col = mpi_comm_rank 
% pq_width; int pair_rank = pq_width * pq_col + pq_row; - std::vector dest = {0,9, 18}; + std::vector dest = {0}; CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); cclo.run(); MPI_Barrier(MPI_COMM_WORLD); @@ -275,17 +275,15 @@ static std::unique_ptr calculate( config.accl->stream_put(*dbuffer, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 9, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::OP0_STREAM); - // config.accl->send(*dbuffer, data.blockSize * data.blockSize * data.numBlocks, - // pair_rank, 9, ACCL::GLOBAL_COMM, - // false, ACCL::streamFlags::OP0_STREAM | ACCL::streamFlags::RES_STREAM ); #ifndef NDEBUG std::cout << "Wait for kernels to complete" << std::endl; #endif for (int r = 0; r < runs.size(); r++) { runs[r].wait(); } - cclo.stop(); + MPI_Barrier(MPI_COMM_WORLD); HLSLIB_DATAFLOW_FINALIZE(); + cclo.stop(); auto endCalculation = std::chrono::high_resolution_clock::now(); #ifndef NDEBUG int mpi_rank; diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 51c7f87e..58fc3f67 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -99,7 +99,7 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra } else { // TODO: Add start port here. Currenty hardcoded! return std::unique_ptr( - new ACCL::ACCL(ranks, current_rank, 6000, device, ACCL::networkProtocol::UDP, 16, ACCL_BUFFER_SIZE)); + new ACCL::ACCL(ranks, current_rank, 6000, device, ACCL::networkProtocol::TCP, 16, ACCL_BUFFER_SIZE)); } } From 9828b49fd9651538ea59d06c107c80dd7d42cb37 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 5 Oct 2022 13:48:40 +0100 Subject: [PATCH 114/318] Make start/stop BFM optional --- .../host/execution_types/execution_xrt_accl_stream_pq.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp index 8fac597c..9a43fef9 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -221,7 +221,9 @@ static std::unique_ptr calculate( int pair_rank = pq_width * pq_col + pq_row; std::vector dest = {0}; CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); - cclo.run(); + if (config.programSettings->useAcclEmulation) { + cclo.run(); + } MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); @@ -283,7 +285,9 @@ static std::unique_ptr calculate( } MPI_Barrier(MPI_COMM_WORLD); HLSLIB_DATAFLOW_FINALIZE(); - cclo.stop(); + if (config.programSettings->useAcclEmulation) { + cclo.stop(); + } auto endCalculation = std::chrono::high_resolution_clock::now(); #ifndef NDEBUG int mpi_rank; From f046192909eced188d141c5264683f6c68301ab2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 6 Oct 2022 10:54:42 +0100 Subject: [PATCH 115/318] Add send/recv implementation for PTRANS with ACCL --- .../transpose_PQ_ACCL_stream_sendrecv.cpp | 252 ++++++++++ .../execution_xrt_accl_stream_pq_sendrecv.hpp | 456 ++++++++++++++++++ 2 files changed, 708 insertions(+) create mode 100644 PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp create mode 100644 PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp new file mode 100644 index 
00000000..4c9452b2 --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp @@ -0,0 +1,252 @@ +/****************************************************************************** + * Author: Arjun Ramaswami + * + * Edited by Marius Meyer: + * - Adapt to used kernel signature + * - Change to row-column loop structure + *****************************************************************************/ +#include "parameters.h" +#include "ap_int.h" +#include "ap_utils.h" +#include "ap_axi_sdata.h" +#include "accl_hls.h" + + +const int block_size = BLOCK_SIZE; +const int channel_width = CHANNEL_WIDTH; + +/** + * @brief Modulo operation that always produces positive values in range [0,op-1]. This is required for the PQ transpose algorithm and is different from the usual remainder calculation done with %! + * + * @tparam T Data type used for the modulo operation. + * @param number Number the modulo is calculated from + * @param op Modulo operator + * @return T number mod op + */ +template +T mod(T number, T op) { + T result = number % op; + // result >= op required for unsinged data types + return (result < 0 || result >= op) ? op + result : result; +} + + +void transpose_block_transpose(const DEVICE_DATA_TYPE *A, + DEVICE_DATA_TYPE a_block[][channel_width], + const unsigned int offset_a, + const unsigned int width_in_blocks, + const unsigned int height_in_blocks) { + +#pragma HLS INTERFACE axis register both port=krnl2cclo + + // transpose the matrix block-wise from global memory +read_A: + for (unsigned int row = 0; row < block_size; row++) { +read_A_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { +#pragma HLS PIPELINE + unsigned long block_row_a = (offset_a) / width_in_blocks; + unsigned long block_col_a = (offset_a) % width_in_blocks; + unsigned long ls_address_trans = block_col_a * block_size * block_size * height_in_blocks + + block_row_a * block_size + + row * block_size * height_in_blocks; + + + // read in block of A from global memory and store it in a memory efficient manner for transpose + DEVICE_DATA_TYPE rotate_in[channel_width]; +#pragma HLS ARRAY_PARTITION variable = rotate_in complete dim = 0 + + // Blocks of a will be stored columnwise in global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + rotate_in[unroll_count] = A[ls_address_trans + col * channel_width + unroll_count]; + } + + unsigned int chunk = row * (block_size / channel_width) + col; + + unsigned rot = (row) % (channel_width); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + // every block of (N / channel_width), rotates the index by 1 + // store in double buffer + a_block[chunk][unroll_count] = rotate_in[(unroll_count + channel_width - rot) + % (channel_width)]; + } + } + } +} + +void transpose_block_forward(DEVICE_DATA_TYPE a_block[][channel_width], + STREAM &krnl2cclo) { + +read_A: + for (unsigned int row = 0; row < block_size; row++) { +read_A_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { + DEVICE_DATA_TYPE data_chunk[channel_width]; +#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 + DEVICE_DATA_TYPE rotate_out[channel_width]; +#pragma HLS ARRAY_PARTITION variable = rotate_out complete dim = 0 + + unsigned int base = col * block_size; + unsigned int offset = row / channel_width; + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + unsigned rot = 
((channel_width + unroll_count - row) * (block_size / channel_width)) % + (block_size); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = a_block[row_rotate][unroll_count]; + } + + unsigned rot_out = row % (channel_width); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % (channel_width)]; + } + + stream_word tmp; + + // load tranposed A from global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + DEVICE_DATA_TYPE v = data_chunk[unroll_count]; + tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8) + = *reinterpret_cast*>(&v); + } + tmp.dest = 9; + tmp.last = 1; + tmp.keep = -1; + STREAM_WRITE(krnl2cclo,tmp); + } + } +} + +/** + * + * ext. channel -> trans(A) + B -> A_out + * + * @param B Buffer for matrix B + * @param A_out Buffer for result matrix + * @param offset Offset in blocks that is used to read the current block of A. Since A is read column-wise + on the block level, the whole matrix A might be written to global memory and the relevant columns + need to be picked using this offset. + * @param number_of_blocks The number of blocks that will be processed starting from the block offset + * @param width_in_blocks The with of matrix A in blocks + * @param height_in_blocks The height of matix A in blocks + */ +void transpose_block_receive(const DEVICE_DATA_TYPE *B, + DEVICE_DATA_TYPE *A_out, + const unsigned int offset_b, + const unsigned int width_in_blocks, + STREAM &cclo2krnl) { +#pragma HLS INTERFACE axis register both port=cclo2krnl + + // transpose the matrix block-wise from global memory +#pragma HLS loop_tripcount min=1 max=1024 avg=1 + // Read transposed A from local memory and add B +read_B: + for (unsigned int row = 0; row < block_size; row++) { +read_B_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { + unsigned long block_row = (offset_b) / width_in_blocks; + unsigned long block_col = (offset_b) % width_in_blocks; + unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks + + block_col * block_size + + row * block_size * width_in_blocks; + unsigned int chunk = row * (block_size / channel_width) + col; + + DEVICE_DATA_TYPE data_chunk[channel_width]; +#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 + + stream_word tmp = STREAM_READ(cclo2krnl); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + ap_uint v = tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8); + data_chunk[unroll_count] = *reinterpret_cast(&v); + } + + // load tranposed A from global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] += B[ls_address_row + col * channel_width + unroll_count]; + } + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + A_out[ls_address_row + col * channel_width + unroll_count] = data_chunk[unroll_count]; + } + } + } +} + +void transpose_read(const DEVICE_DATA_TYPE* A, + const int* target_list, + int pq_row, int pq_col, + int pq_width, int pq_height, + int gcd, int least_common_multiple, + int height_per_rank, + int width_per_rank, + STREAM &krnl2cclo) { + + // Begin algorithm from Figure 14 
for general case + int g = mod(pq_row - pq_col, gcd); + int p = mod(pq_col + g, pq_width); + int q = mod(pq_row - g, pq_height); + + for (int j = 0; j < least_common_multiple/pq_width; j++) { + for (int i = 0; i < least_common_multiple/pq_height; i++) { + // Determine sender and receiver rank of current rank for current communication step + int send_rank = mod(p + i * gcd, pq_width) + mod(q - j * gcd, pq_height) * pq_width; + + for (int col = 0; col < least_common_multiple/pq_width; col++) { + for (int row = 0; row < least_common_multiple/pq_height; row++) { + if (target_list[row * least_common_multiple/pq_width + col] == send_rank) { + for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { + for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) { + unsigned int matrix_buffer_offset = (col + lcm_col * least_common_multiple/pq_width) + (row + lcm_row * least_common_multiple/pq_height) * width_per_rank; + DEVICE_DATA_TYPE a_block[block_size * block_size / channel_width][channel_width]; + transpose_block_transpose(A, a_block, matrix_buffer_offset, width_per_rank, height_per_rank); + transpose_block_forward(a_block, krnl2cclo); + } + } + } + } + } + } + } +} + +void transpose_write(const DEVICE_DATA_TYPE* B, + DEVICE_DATA_TYPE* C, + const int* target_list, + int pq_row, int pq_col, + int pq_width, int pq_height, + int gcd, int least_common_multiple, + int height_per_rank, + int width_per_rank, + STREAM &cclo2krnl) { + + // Begin algorithm from Figure 14 for general case + int g = mod(pq_row - pq_col, gcd); + int p = mod(pq_col + g, pq_width); + int q = mod(pq_row - g, pq_height); + for (int j = 0; j < least_common_multiple/pq_width; j++) { + for (int i = 0; i < least_common_multiple/pq_height; i++) { + + int recv_rank = mod(p - i * gcd, pq_width) + mod(q + j * gcd, pq_height) * pq_width; + + for (int col = 0; col < least_common_multiple/pq_width; col++) { + for (int row = 0; row < least_common_multiple/pq_height; row++) { + if (target_list[row * least_common_multiple/pq_width + col] == recv_rank) { + for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { + for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) { + unsigned int matrix_buffer_offset = (col + lcm_col * least_common_multiple/pq_width) + (row + lcm_row * least_common_multiple/pq_height) * width_per_rank; + transpose_block_receive(B,C,matrix_buffer_offset,width_per_rank, cclo2krnl); + } + } + } + } + } + } + } +} + diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp new file mode 100644 index 00000000..b51d8120 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp @@ -0,0 +1,456 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_ACCL_STREAM_PQ_SENDRECV_EXECUTION_H_ +#define SRC_HOST_ACCL_STREAM_PQ_SENDRECV_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "buffer.hpp" +#include "cclo.hpp" +#include "constants.hpp" +#include "data_handlers/data_handler_types.h" +#include "data_handlers/pq.hpp" +#include "fpgabuffer.hpp" +#include "transpose_data.hpp" +#include "cclo_bfm.h" +#include "Simulation.h" +#include "dummybuffer.hpp" + +extern void transpose_write(const DEVICE_DATA_TYPE* B, + DEVICE_DATA_TYPE* C, + const int* target_list, + int pq_row, int pq_col, + int pq_width, int pq_height, + int gcd, int least_common_multiple, + int height_per_rank, + int width_per_rank, + STREAM &cclo2krnl); + +extern void transpose_read(const DEVICE_DATA_TYPE* A, + const int* target_list, + int pq_row, int pq_col, + int pq_width, int pq_height, + int gcd, int least_common_multiple, + int height_per_rank, + int width_per_rank, + STREAM &krnl2cclo); + +namespace transpose { +namespace fpga_execution { +namespace accl_stream_pq { + +/** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ + * distribution and PCIe+MPI over the host for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on + * the FPGA + * @param handler data handler instance that should be used to exchange data + * between hosts + * @return std::unique_ptr The measured + * execution times + */ +static std::unique_ptr calculate( + const hpcc_base::ExecutionSettings &config, + transpose::TransposeData &data, + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, bool, xrt::uuid> &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != + transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error( + "Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation " + "of this communication method"); +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + throw new std::runtime_error( + "Using the Write Rect method is not supported in this host " + "implementation of this communication method"); +#endif + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector>> bufferListTargets; + std::vector transposeReadKernelList; + std::vector transposeWriteKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = + local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; + + // Algorithm defines + int pq_width = handler.getP(); + int pq_height = handler.getQ(); + + int mpi_comm_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); + int mpi_comm_size; + 
MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size); + int pq_row = mpi_comm_rank / pq_width; + int pq_col = mpi_comm_rank % pq_width; + + int gcd = std::__gcd(pq_height, pq_width); + int least_common_multiple = pq_height * pq_width / gcd; + +#ifndef NDEBUG + std::cout << "Start kernel creation" << std::endl; +#endif + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to + // process. + size_t blocks_per_replication = + (local_matrix_height * local_matrix_width / + config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % + config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the + // number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / + local_matrix_width * local_matrix_width * + data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / + local_matrix_width * local_matrix_width; + + // Pre-calculate target ranks in LCM block + // The vector list variable can be interpreted as 2D matrix. Every entry + // represents the target rank of the sub-block Since the LCM block will + // repeat, we only need to store this small amount of data! 
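+    // Illustrative numbers (assumed): for pq_width = 4 and pq_height = 2,
+    // gcd = 2 and least_common_multiple = 4, so the list below holds
+    // 2 x 1 entries per rank. Rank 0 (pq_row = 0, pq_col = 0) computes
+    // (0 % 2) * 4 + (0 % 4) = 0 for row 0 and (0 % 2) * 4 + (2 % 4) = 2 for
+    // row 1, i.e. its sub-blocks alternate between target ranks 0 and 2.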
+ auto target_list = config.accl->create_buffer(least_common_multiple / pq_height * + least_common_multiple / pq_width, ACCL::dataType::int32); + for (int row = 0; row < least_common_multiple / pq_height; row++) { + for (int col = 0; col < least_common_multiple / pq_width; col++) { + int global_block_col = pq_col + col * pq_width; + int global_block_row = pq_row + row * pq_height; + int destination_rank = (global_block_col % pq_height) * pq_width + + (global_block_row % pq_width); + target_list->buffer()[row * least_common_multiple / pq_width + col] = + destination_rank; + } + } + target_list->sync_to_device(); + bufferListTargets.push_back(std::move(target_list)); + + if (!config.programSettings->useAcclEmulation) { + // create the kernels + xrt::kernel transposeReadKernel( + *config.device, *config.program, + ("transpose_read0:{transpose_read0_" + std::to_string(r + 1) + "}").c_str()); + xrt::kernel transposeWriteKernel( + *config.device, *config.program, + ("transpose_write0:{transpose_write0_" + std::to_string(r + 1) + "}").c_str()); + + if (r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, + data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), + transposeReadKernel.group_id(0)); + bufferListA.push_back(bufferA); + } + + xrt::bo bufferB( + *config.device, + &data.B[bufferStartList[r] * data.blockSize * data.blockSize], + buffer_size * sizeof(HOST_DATA_TYPE), transposeWriteKernel.group_id(0)); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + transposeWriteKernel.group_id(1)); + + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeReadKernelList.push_back(transposeReadKernel); + transposeWriteKernelList.push_back(transposeWriteKernel); + } + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; + repetition++) { + +#ifndef NDEBUG + std::cout << "Start data transfer" << std::endl; +#endif + auto startTransfer = std::chrono::high_resolution_clock::now(); + + if (!config.programSettings->useAcclEmulation) { + for (int r = 0; r < transposeReadKernelList.size(); r++) { + if (r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast>( + endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + +#ifndef NDEBUG + std::cout << "Start BFM" << std::endl; +#endif + + HLSLIB_DATAFLOW_INIT(); + hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); + hlslib::Stream cmd, sts; + + std::vector dest = {0}; + CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + if (config.programSettings->useAcclEmulation) { + cclo.run(); + } + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + +#ifndef NDEBUG + std::cout << "Start kernel execution" << std::endl; + std::cout << bufferListTargets[0]->buffer()[0] << std::endl; +#endif + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + if (!config.programSettings->useAcclEmulation) { + runs.push_back(transposeReadKernelList[r]( + (config.programSettings->copyA ? 
bufferListA[r] : bufferListA[0]), + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + runs.push_back(transposeWriteKernelList[r]( + bufferListB[r], bufferListA_out[r], + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + } else { + HLSLIB_DATAFLOW_FUNCTION(transpose_read, + (config.programSettings->copyA ? data.A : data.A), + bufferListTargets[r]->buffer(), + pq_row, pq_col, pq_width, pq_height, + gcd, least_common_multiple, + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)), + krnl2cclo); + HLSLIB_DATAFLOW_FUNCTION(transpose_write, + data.B, data.result, + bufferListTargets[r]->buffer(), + pq_row, pq_col, pq_width, pq_height, + gcd, least_common_multiple, + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)), + cclo2krnl); + } + } +#ifndef NDEBUG + std::cout << "Start ACCL send/recv" << std::endl; +#endif + auto dbuffer = config.accl->create_buffer(1,ACCL::dataType::float32); + int g = transpose::data_handler::mod(pq_row - pq_col, gcd); + int p = transpose::data_handler::mod(pq_col + g, pq_width); + int q = transpose::data_handler::mod(pq_row - g, pq_height); + // Exchange A data via ACCL + for (int k=0; k < 2; k++) { + for (int j = 0; j < least_common_multiple/pq_width; j++) { + for (int i = 0; i < least_common_multiple/pq_height; i++) { + // Determine sender and receiver rank of current rank for current communication step + int send_rank = transpose::data_handler::mod(p + i * gcd, pq_width) + transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width; + int recv_rank = transpose::data_handler::mod(p - i * gcd, pq_width) + transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width; + + // Also count receiving buffer size because sending and receiving buffer size may differ in certain scenarios! + int receiving_size = 0; + int sending_size = 0; + + std::vector send_rows; + std::vector send_cols; + // Look up which blocks are affected by the current rank + for (int row = 0; row < least_common_multiple/pq_height; row++) { + for (int col = 0; col < least_common_multiple/pq_width; col++) { +#ifndef NDEBUG + std::cout << "Check" << row * least_common_multiple/pq_width + col << std::endl; +#endif + if (bufferListTargets[0]->buffer()[row * least_common_multiple/pq_width + col] == send_rank) { + send_rows.push_back(row); + send_cols.push_back(col); + sending_size += data.blockSize * data.blockSize; + } + if (bufferListTargets[0]->buffer()[row * least_common_multiple/pq_width + col] == recv_rank) { + receiving_size += data.blockSize * data.blockSize; + } + } + } + receiving_size *= (local_matrix_height)/(least_common_multiple/pq_height) * ((local_matrix_width)/(least_common_multiple/pq_width)); + sending_size *= (local_matrix_height)/(least_common_multiple/pq_height) * ((local_matrix_width)/(least_common_multiple/pq_width)); + + // Do actual MPI communication + if (k==0) { + // First schedule all sends, then all receives. This works if communication rounds <= ACCL buffers. 
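+          // Illustration (assumed values): on a 2 x 2 rank grid, gcd = 2 and
+          // lcm = 2, so each phase issues a single communication round, well
+          // below the 16 buffers requested from the ACCL constructor in
+          // fpga_setup_accl.cpp (assuming that argument is the buffer count).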
+ // Non-blocking communication would not offer many benefits, because the CCLO can only execute send OR recv +#ifndef NDEBUG + std::cout << "Send blocks " << sending_size / (data.blockSize * data.blockSize) << " to " << send_rank << std::endl << std::flush; +#endif + config.accl->send(*dbuffer, sending_size, send_rank, ACCL::TAG_ANY, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::OP0_STREAM); + // TODO Use stream_put to simulate this implementation approach on single FPGA since send/recv to same rank is not working! + // config.accl->stream_put(*dbuffer, sending_size, send_rank, 9, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::OP0_STREAM); + } else { + #ifndef NDEBUG + std::cout << "Recv blocks " << receiving_size / (data.blockSize * data.blockSize) << " from " << recv_rank << std::endl << std::flush; + #endif + config.accl->recv(*dbuffer, receiving_size, recv_rank, ACCL::TAG_ANY, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::RES_STREAM); + } + } + } + } + +#ifndef NDEBUG + std::cout << "Wait for kernels to complete" << std::endl; +#endif + for (int r = 0; r < runs.size(); r++) { + runs[r].wait(); + } + MPI_Barrier(MPI_COMM_WORLD); + HLSLIB_DATAFLOW_FINALIZE(); + if (config.programSettings->useAcclEmulation) { + cclo.stop(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " + << std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() + << "s (" + << ((config.programSettings->matrixSize * + config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * + 3) / + std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() * + 1.0e-9) + << " GB/s)" << std::endl; +#endif + + std::chrono::duration calculationTime = + std::chrono::duration_cast>( + endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer( + local_matrix_height * local_matrix_width * data.blockSize * + data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + if (!config.programSettings->useAcclEmulation) { + for (int r = 0; r < transposeReadKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; + col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + + col]; + } + } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + + local_matrix_width * data.blockSize * data.blockSize, + tmp_write_buffer.begin() + bufferSizeList[r], + &data.result[(bufferStartList[r] + local_matrix_width) * + data.blockSize * data.blockSize]); + } else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read( + &data.result[bufferStartList[r] * data.blockSize * data.blockSize]); + } + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + + transferTime += std::chrono::duration_cast>( + endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::unique_ptr result( + new 
transpose::TransposeExecutionTimings{transferTimings, + calculationTimings}); + + return result; +} + +} // namespace accl_pq +} // namespace fpga_execution +} // namespace transpose + +#endif From b782ebad4cca5d4c410c7576ecc295ac7888bbfd Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 6 Oct 2022 12:07:57 +0100 Subject: [PATCH 116/318] ACCL emulation for b_eff --- b_eff/src/device/communication_ACCL_pl.cpp | 4 +- b_eff/src/host/CMakeLists.txt | 6 +++ .../execution_types/execution_accl_pl.hpp | 43 ++++++++++++++++--- shared/setup/fpga_setup_accl.cpp | 2 +- 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl.cpp b/b_eff/src/device/communication_ACCL_pl.cpp index 58fc7fed..97a21907 100644 --- a/b_eff/src/device/communication_ACCL_pl.cpp +++ b/b_eff/src/device/communication_ACCL_pl.cpp @@ -22,9 +22,9 @@ SOFTWARE. #include "accl_hls.h" -void send_recv(float *read_buffer,float *write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +void send_recv(const float *read_buffer,float *write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, - hls::stream &cmd, hls::stream &sts) { + STREAM &cmd, STREAM &sts) { accl_hls::ACCLCommand accl_cmd(cmd, sts, communicator_addr, datapath_cfg,0,0); for (int i = 0; i < num_iterations; i++) { accl_cmd.send(size, 0, neighbor_rank, (ap_uint<64>)read_buffer); diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index adaa8348..e5e09aed 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -19,6 +19,12 @@ if (INTELFPGAOPENCL_FOUND) add_test(NAME test_intel_host_executable COMMAND $ -h) endif() if (Vitis_FOUND) +if (USE_ACCL) + set(CMAKE_SKIP_BUILD_RPATH No) + set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) + list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) + list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl.cpp) +endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index 3f39cfab..5bbda303 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -30,11 +30,18 @@ SOFTWARE. /* External library headers */ #include "mpi.h" #include "accl.hpp" +#include "cclo_bfm.h" +#include "accl_hls.h" /* Project's headers */ +extern void send_recv(const float *read_buffer,float *write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &cmd, STREAM &sts); + namespace network::execution_types::accl_pl { + /* Implementation for the single kernel. 
@copydoc bm_execution::calculate() @@ -57,13 +64,23 @@ namespace network::execution_types::accl_pl { int current_size; MPI_Comm_size(MPI_COMM_WORLD, & current_size); + hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); + hlslib::Stream cmd, sts; + + std::vector dest = {0}; + CCLO_BFM cclo(6000, current_rank, current_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + if (config.programSettings->useAcclEmulation) { + cclo.run(); + } + MPI_Barrier(MPI_COMM_WORLD); + std::vector calculationTimings; for (uint r =0; r < config.programSettings->numRepetitions; r++) { dummyBufferContents.clear(); - recvBufferContents.clear(); - acclSendBuffers.clear(); - acclRecvBuffers.clear(); - int size_in_values = (size_in_bytes + 3) / 4; + recvBufferContents.clear(); + acclSendBuffers.clear(); + acclRecvBuffers.clear(); + int size_in_values = (size_in_bytes + 3) / 4; // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); @@ -74,15 +91,21 @@ namespace network::execution_types::accl_pl { acclRecvBuffers.back()->sync_to_device(); } - xrt::kernel sendrecvKernel(*config.device, *config.program, "sendrecv"); + xrt::kernel sendrecvKernel(*config.device, *config.program, "send_recv"); double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); + if (!config.programSettings->useAcclEmulation) { auto run = sendrecvKernel(acclSendBuffers[i]->bo(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, - config.accl->get_communicator_adr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); + config.accl->get_communicator_addr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); run.wait(); + } else { + send_recv(reinterpret_cast(acclSendBuffers[i]->buffer()), reinterpret_cast(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.accl->get_communicator_addr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32}), + cmd, sts); + } auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); #ifndef NDEBUG @@ -98,10 +121,16 @@ namespace network::execution_types::accl_pl { std::cout << "Rank " << current_rank << ": Done " << r << std::endl; #endif } + + if (config.programSettings->useAcclEmulation) { + cclo.stop(); + } // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! 
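+        // In emulation the received data already resides in the host-side buffers, so the device sync below is only needed on real hardware.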
for (int r = 0; r < config.programSettings->kernelReplications; r++) { - acclRecvBuffers.back()->sync_from_device(); + if (!config.programSettings->useAcclEmulation) { + acclRecvBuffers.back()->sync_from_device(); + } std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); } std::shared_ptr result(new network::ExecutionTimings{ diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 58fc3f67..51c7f87e 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -99,7 +99,7 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra } else { // TODO: Add start port here. Currenty hardcoded! return std::unique_ptr( - new ACCL::ACCL(ranks, current_rank, 6000, device, ACCL::networkProtocol::TCP, 16, ACCL_BUFFER_SIZE)); + new ACCL::ACCL(ranks, current_rank, 6000, device, ACCL::networkProtocol::UDP, 16, ACCL_BUFFER_SIZE)); } } From e22128ab3d10d6ba70d9623f4fd6ac247a744cac Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 7 Oct 2022 16:08:00 +0100 Subject: [PATCH 117/318] Update LINPACK --- .../settings.link.xilinx.hpl_torus_accl.hbm.generator.ini | 3 +-- LINPACK/src/host/CMakeLists.txt | 5 +++++ .../host/execution_types/execution_xrt_accl_stream_pq.hpp | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini index ec8cbfa6..4783d320 100644 --- a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini @@ -9,8 +9,7 @@ nk=inner_update_mm0:$PY_CODE_GEN num_replications$ slr=lu_1:SLR0 slr=left_update_1:SLR0 slr=top_update_1:SLR0 -slr=inner_update_mm0_1:SLR0 -slr=inner_update_mm0_2:SLR2 +slr=inner_update_mm0_1:SLR2 # matrix ports sp=lu_1.m_axi_gmem0:HBM[0:4] diff --git a/LINPACK/src/host/CMakeLists.txt b/LINPACK/src/host/CMakeLists.txt index 5422f31f..72abdf1c 100755 --- a/LINPACK/src/host/CMakeLists.txt +++ b/LINPACK/src/host/CMakeLists.txt @@ -23,6 +23,11 @@ if (INTELFPGAOPENCL_FOUND) endif() if (Vitis_FOUND) + if (USE_ACCL) + set(CMAKE_SKIP_BUILD_RPATH No) + set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) + list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) + endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp index 9a43fef9..9bf1aaf6 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -219,7 +219,7 @@ static std::unique_ptr calculate( int pq_col = mpi_comm_rank % pq_width; int pair_rank = pq_width * pq_col + pq_row; - std::vector dest = {0}; + std::vector dest = {0, 9}; CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); if (config.programSettings->useAcclEmulation) { cclo.run(); @@ -283,8 +283,8 @@ static std::unique_ptr calculate( for (int r = 0; r < runs.size(); r++) { runs[r].wait(); 
} - MPI_Barrier(MPI_COMM_WORLD); HLSLIB_DATAFLOW_FINALIZE(); + MPI_Barrier(MPI_COMM_WORLD); if (config.programSettings->useAcclEmulation) { cclo.stop(); } From d8be6afbb852ab12c4d5cdf763343735ad8c0ca0 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 17 Oct 2022 13:54:46 +0100 Subject: [PATCH 118/318] Update PTRANS stream settings --- PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake | 4 ++-- ...gs.link.xilinx.transpose_pq_accl_stream.hbm.ini | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake index 89114c4d..6b196634 100644 --- a/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake +++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake @@ -14,7 +14,7 @@ set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(ACCL_STACK_TYPE "TCP" CACHE STRING "" FORCE) set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) -set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini CACHE FILEPATH "" FORCE) set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) @@ -23,6 +23,6 @@ set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. 
Also specifies the width of memory" FORCE) -set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini index f9be4bec..559ff34f 100644 --- a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini @@ -27,12 +27,12 @@ nk=transpose_read0:1 nk=transpose_write0:1 # Kernels Foorplaning -slr=compression_0_0:SLR1 -slr=compression_0_1:SLR1 -slr=compression_0_2:SLR1 -slr=arith_0:SLR1 -slr=ccl_offload_0:SLR1 -slr=hostctrl_0:SLR1 +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 slr=networklayer_0:SLR1 slr=cmac_0:SLR1 slr=transpose_read0_1:SLR2 @@ -73,5 +73,5 @@ stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 # Tie off user kernel interface stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl -stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl +stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:32 From 1d70455c52019775b244512594ec97dfb54ceca6 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Oct 2022 11:34:40 +0100 Subject: [PATCH 119/318] Fix ACCL and XRT setup --- shared/setup/fpga_setup_accl.cpp | 2 +- shared/setup/fpga_setup_xrt.cpp | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 51c7f87e..54b78c5c 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -77,7 +77,7 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra std::vector ranks = {}; for (int i = 0; i < current_size; ++i) { // TODO: Replace the ip addresses and ports here for execution of real hardware? - ACCL::rank_t new_rank = {"10.10.10." + current_rank, 5500 + i, i, ACCL_BUFFER_SIZE}; + ACCL::rank_t new_rank = {"10.10.10." 
+ std::to_string(current_rank), 5500 + i, i, ACCL_BUFFER_SIZE};
         ranks.emplace_back(new_rank);
     }
     if (!useAcclEmulation) {
diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp
index 103eda17..eae39fe8 100644
--- a/shared/setup/fpga_setup_xrt.cpp
+++ b/shared/setup/fpga_setup_xrt.cpp
@@ -35,9 +35,11 @@ namespace fpga_setup {
 std::unique_ptr
 selectFPGADevice(int defaultDevice) {
-    int current_rank;
-    MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
-
-    return std::unique_ptr(new xrt::device(current_rank));
+    int current_device;
+    MPI_Comm_rank(MPI_COMM_WORLD, & current_device);
+    if (defaultDevice >= 0) {
+        current_device = defaultDevice;
+    }
+    return std::unique_ptr(new xrt::device(current_device));
 }
 } // namespace fpga_setup

From fb6895655415d30e3daa96e1ef1c5649d61eab51 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 21 Oct 2022 17:29:02 +0100
Subject: [PATCH 120/318] Add ACCL stream flag to PTRANS

---
 PTRANS/src/host/transpose_benchmark.hpp | 54 ++++++++++++++++---------
 PTRANS/src/host/transpose_data.cpp      |  3 +-
 PTRANS/src/host/transpose_data.hpp      |  5 +++
 3 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp
index f2b06965..553ce002 100644
--- a/PTRANS/src/host/transpose_benchmark.hpp
+++ b/PTRANS/src/host/transpose_benchmark.hpp
@@ -41,6 +41,7 @@ SOFTWARE.
 #include "execution_types/execution_xrt_pcie_pq.hpp"
 #ifdef USE_ACCL
 #include "execution_types/execution_xrt_accl_pq.hpp"
+#include "execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp"
 #include "execution_types/execution_xrt_accl_stream_pq.hpp"
 #endif
 #endif
@@ -83,7 +84,8 @@ public hpcc_base::HpccFpgaBenchmark()->default_value(DEFAULT_DIST_TYPE))
-            ("copy-a", "Create a copy of matrix A for each kernel replication");
+            ("copy-a", "Create a copy of matrix A for each kernel replication")
+            ("accl-stream", "Use design with user kernels directly connected to CCLO");
     }

     std::unique_ptr> dataHandler;
@@ -144,8 +146,18 @@ public hpcc_base::HpccFpgaBenchmarkexecutionSettings), data, reinterpret_cast&>(*this->dataHandler)); break;
-            case hpcc_base::CommunicationType::accl:
-                return transpose::fpga_execution::accl_stream_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break;
+            case hpcc_base::CommunicationType::accl:
+                if (this->executionSettings->programSettings->useAcclStreams) {
+                    auto h = reinterpret_cast&>(*this->dataHandler);
+                    if (h.getP() != h.getQ()) {
+                        return transpose::fpga_execution::accl_stream_sendrecv_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler));
+                    }
+                    else {
+                        return transpose::fpga_execution::accl_stream_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler));
+                    }
+                } else {
+                    return transpose::fpga_execution::accl_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler));
+                } break;
 #endif
 #endif
 #ifdef MKL_FOUND
@@ -188,30 +200,32 @@ public hpcc_base::HpccFpgaBenchmark*>(this->dataHandler.get())->getHeightforRank();
         long width_per_rank = reinterpret_cast*>(this->dataHandler.get())->getWidthforRank();
         if (error_count > 0) {
-            std::cout << "A:" << std::endl;
-            for (size_t j = 0; j < height_per_rank * data.blockSize; j++) {
-                for (size_t i = 0; i < width_per_rank * data.blockSize; i++) {
-                    std::cout << oldA[j * width_per_rank * data.blockSize + i] << ", ";
+            if (this->mpi_comm_rank == 0) {
+                std::cout << "A:" << std::endl;
+                for (size_t j = 0; j <
height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << oldA[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; } std::cout << std::endl; - } - std::cout << std::endl; - std::cout << "B:" << std::endl; - for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { - for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { - std::cout << data.B[j * width_per_rank * data.blockSize + i] << ", "; + std::cout << "B:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.B[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; } std::cout << std::endl; - } - std::cout << std::endl; - std::cout << "Transposed A:" << std::endl; - for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { - for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { - std::cout << data.A[j * width_per_rank * data.blockSize + i] << ", "; + std::cout << "Transposed A:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.A[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; } std::cout << std::endl; } - std::cout << std::endl; } #endif diff --git a/PTRANS/src/host/transpose_data.cpp b/PTRANS/src/host/transpose_data.cpp index e8a7c8f0..36979413 100644 --- a/PTRANS/src/host/transpose_data.cpp +++ b/PTRANS/src/host/transpose_data.cpp @@ -7,7 +7,8 @@ transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), matrixSize(results["m"].as() * results["b"].as()), blockSize(results["b"].as()), dataHandlerIdentifier(transpose::data_handler::stringToHandler(results["handler"].as())), - distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()), copyA(results["copy-a"].count() > 0) { + distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()), copyA(results["copy-a"].count() > 0), + useAcclStreams(results["accl-stream"].count() > 0) { // auto detect data distribution type if required if (dataHandlerIdentifier == transpose::data_handler::DataHandlerType::automatic) { diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index fed4eff6..cd9020e4 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -80,6 +80,11 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { */ bool copyA; + /** + * @brief Indicate, if a design is used where the user kernels are directly connected to the ACCL CCLO + */ + bool useAcclStreams; + /** * @brief Construct a new Transpose Program Settings object * From 3a1cc5f32ed107974489f3fd2992bafd58ba1556 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Oct 2022 17:31:27 +0100 Subject: [PATCH 121/318] Apply ACCL API changes to PTRANS --- .../execution_types/execution_xrt_accl_pq.hpp | 14 ++--- .../execution_xrt_accl_stream_pq.hpp | 26 ++++----- .../execution_xrt_accl_stream_pq_sendrecv.hpp | 55 ++++++++++++------- 3 files changed, 49 insertions(+), 46 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp index 3fdaeb1f..8e6c0f5b 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp +++ 
b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -28,12 +28,9 @@ SOFTWARE. #include /* Project's headers */ -#include "buffer.hpp" -#include "cclo.hpp" -#include "constants.hpp" +#include "accl.hpp" #include "data_handlers/data_handler_types.h" #include "data_handlers/pq.hpp" -#include "fpgabuffer.hpp" #include "transpose_data.hpp" namespace transpose { @@ -87,15 +84,14 @@ void accl_exchangeData( accl.send(*acclBuffersA[0]->slice( data.blockSize * data.blockSize * block_num, data.blockSize * data.blockSize * (block_num + 1)), - data.blockSize * data.blockSize, pair_rank, 0, ACCL::GLOBAL_COMM, true, - ACCL::streamFlags::NO_STREAM); + data.blockSize * data.blockSize, pair_rank, 0, ACCL::GLOBAL_COMM, true); } for (int block_num = block_chunk; block_num < std::min(data.numBlocks, block_chunk + 16); block_num++) { accl.recv(*acclBufferA_recv->slice( data.blockSize * data.blockSize * block_num, data.blockSize * data.blockSize * (block_num + 1)), data.blockSize * data.blockSize, pair_rank, - 1, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM); + 1, ACCL::GLOBAL_COMM, true); } } @@ -266,11 +262,11 @@ void accl_exchangeData( #endif accl_requests[current_parallel_execution] = (accl.send( *send_buffers[current_parallel_execution], sending_size, - send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM, + send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::dataType::none, true)); accl_requests[current_parallel_execution + gcd] = (accl.recv( *recv_buffers[current_parallel_execution], sending_size, - send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::streamFlags::NO_STREAM, + send_rank, 0, ACCL::GLOBAL_COMM, true, ACCL::dataType::none, true)); // Increase the counter for parallel executions current_parallel_execution = (current_parallel_execution + 1) % gcd; diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp index 9bf1aaf6..50a07998 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -28,16 +28,12 @@ SOFTWARE. 
#include /* Project's headers */ -#include "buffer.hpp" -#include "cclo.hpp" -#include "constants.hpp" #include "data_handlers/data_handler_types.h" #include "data_handlers/pq.hpp" -#include "fpgabuffer.hpp" #include "transpose_data.hpp" #include "cclo_bfm.h" #include "Simulation.h" -#include "dummybuffer.hpp" +#include "accl.hpp" extern void transpose_write(const DEVICE_DATA_TYPE *B, DEVICE_DATA_TYPE *A_out, @@ -201,10 +197,6 @@ static std::unique_ptr calculate( MPI_Barrier(MPI_COMM_WORLD); -#ifndef NDEBUG - std::cout << "Start BFM" << std::endl; -#endif - HLSLIB_DATAFLOW_INIT(); hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); hlslib::Stream cmd, sts; @@ -220,9 +212,13 @@ static std::unique_ptr calculate( int pair_rank = pq_width * pq_col + pq_row; std::vector dest = {0, 9}; - CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + std::unique_ptr cclo; if (config.programSettings->useAcclEmulation) { - cclo.run(); +#ifndef NDEBUG + std::cout << "Start BFM" << std::endl; +#endif + cclo = std::make_unique(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + cclo->run(); } MPI_Barrier(MPI_COMM_WORLD); @@ -272,11 +268,9 @@ static std::unique_ptr calculate( cclo2krnl); } } - auto dbuffer = config.accl->create_buffer(1,ACCL::dataType::float32); // Exchange A data via ACCL - config.accl->stream_put(*dbuffer, data.blockSize * data.blockSize * data.numBlocks, - pair_rank, 9, ACCL::GLOBAL_COMM, - false, ACCL::streamFlags::OP0_STREAM); + config.accl->stream_put(ACCL::dataType::float32, data.blockSize * data.blockSize * data.numBlocks, + pair_rank, 0); #ifndef NDEBUG std::cout << "Wait for kernels to complete" << std::endl; #endif @@ -286,7 +280,7 @@ static std::unique_ptr calculate( HLSLIB_DATAFLOW_FINALIZE(); MPI_Barrier(MPI_COMM_WORLD); if (config.programSettings->useAcclEmulation) { - cclo.stop(); + cclo->stop(); } auto endCalculation = std::chrono::high_resolution_clock::now(); #ifndef NDEBUG diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp index b51d8120..c01bab4c 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp @@ -28,18 +28,14 @@ SOFTWARE. 
#include /* Project's headers */ -#include "buffer.hpp" -#include "cclo.hpp" -#include "constants.hpp" #include "data_handlers/data_handler_types.h" #include "data_handlers/pq.hpp" -#include "fpgabuffer.hpp" #include "transpose_data.hpp" #include "cclo_bfm.h" #include "Simulation.h" -#include "dummybuffer.hpp" +#include "accl.hpp" -extern void transpose_write(const DEVICE_DATA_TYPE* B, +void transpose_write_sendrecv(const DEVICE_DATA_TYPE* B, DEVICE_DATA_TYPE* C, const int* target_list, int pq_row, int pq_col, @@ -49,7 +45,7 @@ extern void transpose_write(const DEVICE_DATA_TYPE* B, int width_per_rank, STREAM &cclo2krnl); -extern void transpose_read(const DEVICE_DATA_TYPE* A, +void transpose_read_sendrecv(const DEVICE_DATA_TYPE* A, const int* target_list, int pq_row, int pq_col, int pq_width, int pq_height, @@ -60,7 +56,7 @@ extern void transpose_read(const DEVICE_DATA_TYPE* A, namespace transpose { namespace fpga_execution { -namespace accl_stream_pq { +namespace accl_stream_sendrecv_pq { /** * @brief Transpose and add the matrices using the OpenCL kernel using a PQ @@ -104,6 +100,7 @@ static std::unique_ptr calculate( std::vector bufferListB; std::vector bufferListA_out; std::vector>> bufferListTargets; + std::vector>> bufferListCopy; std::vector transposeReadKernelList; std::vector transposeWriteKernelList; std::vector blocksPerReplication; @@ -159,6 +156,10 @@ static std::unique_ptr calculate( bufferStartList.push_back(total_offset); bufferOffsetList.push_back(row_offset); +#ifndef NDEBUG + std::cout << "Blocks per replication: " << blocks_per_replication << std::endl; +#endif + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; total_offset += (bufferOffsetList.back() + blocks_per_replication) / @@ -170,6 +171,7 @@ static std::unique_ptr calculate( // repeat, we only need to store this small amount of data! 
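+    // target_list maps each block position of the least-common-multiple tile to the rank that receives this block during the exchange.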
auto target_list = config.accl->create_buffer(least_common_multiple / pq_height * least_common_multiple / pq_width, ACCL::dataType::int32); + bufferListCopy.push_back(config.accl->create_buffer(buffer_size, ACCL::dataType::float32)); for (int row = 0; row < least_common_multiple / pq_height; row++) { for (int col = 0; col < least_common_multiple / pq_width; col++) { int global_block_col = pq_col + col * pq_width; @@ -187,10 +189,10 @@ static std::unique_ptr calculate( // create the kernels xrt::kernel transposeReadKernel( *config.device, *config.program, - ("transpose_read0:{transpose_read0_" + std::to_string(r + 1) + "}").c_str()); + ("transpose_read_sendrecv0:{transpose_read_sendrecv0_" + std::to_string(r + 1) + "}").c_str()); xrt::kernel transposeWriteKernel( *config.device, *config.program, - ("transpose_write0:{transpose_write0_" + std::to_string(r + 1) + "}").c_str()); + ("transpose_write_sendrecv0:{transpose_write_sendrecv0_" + std::to_string(r + 1) + "}").c_str()); if (r == 0 || config.programSettings->copyA) { xrt::bo bufferA(*config.device, data.A, @@ -249,10 +251,14 @@ static std::unique_ptr calculate( hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); hlslib::Stream cmd, sts; - std::vector dest = {0}; - CCLO_BFM cclo(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + std::vector dest = {0, 9}; + std::unique_ptr cclo; if (config.programSettings->useAcclEmulation) { - cclo.run(); +#ifndef NDEBUG + std::cout << "Start BFM" << std::endl; +#endif + cclo = std::make_unique(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + cclo->run(); } MPI_Barrier(MPI_COMM_WORLD); @@ -260,7 +266,6 @@ static std::unique_ptr calculate( #ifndef NDEBUG std::cout << "Start kernel execution" << std::endl; - std::cout << bufferListTargets[0]->buffer()[0] << std::endl; #endif std::vector runs; auto startKernelCalculation = std::chrono::high_resolution_clock::now(); @@ -283,7 +288,7 @@ static std::unique_ptr calculate( (bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); } else { - HLSLIB_DATAFLOW_FUNCTION(transpose_read, + HLSLIB_DATAFLOW_FUNCTION(transpose_read_sendrecv, (config.programSettings->copyA ? data.A : data.A), bufferListTargets[r]->buffer(), pq_row, pq_col, pq_width, pq_height, @@ -293,7 +298,7 @@ static std::unique_ptr calculate( (bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)), krnl2cclo); - HLSLIB_DATAFLOW_FUNCTION(transpose_write, + HLSLIB_DATAFLOW_FUNCTION(transpose_write_sendrecv, data.B, data.result, bufferListTargets[r]->buffer(), pq_row, pq_col, pq_width, pq_height, @@ -352,14 +357,22 @@ static std::unique_ptr calculate( #ifndef NDEBUG std::cout << "Send blocks " << sending_size / (data.blockSize * data.blockSize) << " to " << send_rank << std::endl << std::flush; #endif - config.accl->send(*dbuffer, sending_size, send_rank, ACCL::TAG_ANY, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::OP0_STREAM); - // TODO Use stream_put to simulate this implementation approach on single FPGA since send/recv to same rank is not working! 
-                            // config.accl->stream_put(*dbuffer, sending_size, send_rank, 9, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::OP0_STREAM);
+                            if (send_rank == mpi_comm_rank) {
+                                //TODO copy from and to stream not implemented in driver yet
+                                // config.accl->copy_from_stream(*bufferListCopy[0], sending_size);
+                            } else {
+                                config.accl->send(ACCL::dataType::float32, sending_size, send_rank, 0);
+                            }
                         } else {
 #ifndef NDEBUG
                             std::cout << "Recv blocks " << receiving_size / (data.blockSize * data.blockSize) << " from " << recv_rank << std::endl << std::flush;
 #endif
-                            config.accl->recv(*dbuffer, receiving_size, recv_rank, ACCL::TAG_ANY, ACCL::GLOBAL_COMM, false, ACCL::streamFlags::RES_STREAM);
+                            if (recv_rank == mpi_comm_rank) {
+                                //TODO copy from and to stream not implemented in driver yet
+                                // config.accl->copy_to_stream(*bufferListCopy[0], receiving_size);
+                            } else {
+                                config.accl->recv(ACCL::dataType::float32, receiving_size, recv_rank, 0);
+                            }
                         }
                     }
                 }
@@ -374,7 +387,7 @@ static std::unique_ptr calculate(
     MPI_Barrier(MPI_COMM_WORLD);
     HLSLIB_DATAFLOW_FINALIZE();
     if (config.programSettings->useAcclEmulation) {
-        cclo.stop();
+        cclo->stop();
     }
     auto endCalculation = std::chrono::high_resolution_clock::now();
 #ifndef NDEBUG

From 539047ca17044c9b6cd0b568c2025ab7c36153cd Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 21 Oct 2022 17:31:52 +0100
Subject: [PATCH 122/318] Add more ACCL debug output

---
 shared/setup/fpga_setup_accl.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index 54b78c5c..8a5f685c 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -86,10 +86,12 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra
         std::cout << "Create hostctrl" << std::endl;
         auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}",
                                        xrt::kernel::cu_access_mode::exclusive);
-
+        std::cout << "Create CMAC" << std::endl;
         auto cmac = CMAC(xrt::ip(device, program, "cmac_0:{cmac_0}"));
+        std::cout << "Create Network Layer" << std::endl;
         auto network_layer = Networklayer(
             xrt::ip(device, program, "networklayer:{networklayer_0}"));
+        std::cout << "Configure VNX" << std::endl;
         configure_vnx(cmac, network_layer, ranks, current_rank);

         std::vector mem(1, 0);

From 599228b700a84a7277118bf9fe70591a9c20f087 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 21 Oct 2022 17:32:46 +0100
Subject: [PATCH 123/318] Add sendrecv PTRANS kernel to build

---
 PTRANS/src/device/CMakeLists.txt | 2 +-
 PTRANS/src/host/CMakeLists.txt   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/PTRANS/src/device/CMakeLists.txt b/PTRANS/src/device/CMakeLists.txt
index 34c47551..bbee1bff 100644
--- a/PTRANS/src/device/CMakeLists.txt
+++ b/PTRANS/src/device/CMakeLists.txt
@@ -11,7 +11,7 @@ if (INTELFPGAOPENCL_FOUND)
 endif()

 if (VITIS_FOUND)
-    generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_ACCL_buffers transpose_PQ_ACCL_stream)
+    generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_ACCL_buffers transpose_PQ_ACCL_stream transpose_PQ_ACCL_stream_sendrecv)
     add_test(NAME test_emulation_PQ_PCIE_xilinx COMMAND Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
     add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY
${EXECUTABLE_OUTPUT_PATH}) diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index fe7214c4..e162b809 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -37,7 +37,7 @@ if (Vitis_FOUND) set(CMAKE_SKIP_BUILD_RPATH No) set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) - list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/transpose_PQ_ACCL_stream.cpp) + list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp ${CMAKE_SOURCE_DIR}/src/device/transpose_PQ_ACCL_stream.cpp) endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) From 1a4a69f55a4ba888c61b88d584a484498f6d9183 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Oct 2022 17:33:31 +0100 Subject: [PATCH 124/318] Rename sendrecv PTRANS kernels --- PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp index 4c9452b2..13e1c300 100644 --- a/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp @@ -178,7 +178,7 @@ void transpose_block_receive(const DEVICE_DATA_TYPE *B, } } -void transpose_read(const DEVICE_DATA_TYPE* A, +void transpose_read_sendrecv(const DEVICE_DATA_TYPE* A, const int* target_list, int pq_row, int pq_col, int pq_width, int pq_height, @@ -215,7 +215,7 @@ void transpose_read(const DEVICE_DATA_TYPE* A, } } -void transpose_write(const DEVICE_DATA_TYPE* B, +void transpose_write_sendrecv(const DEVICE_DATA_TYPE* B, DEVICE_DATA_TYPE* C, const int* target_list, int pq_row, int pq_col, From 2e1405ed27fb29dbe1d4e3cbbe29d7cca1939266 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Oct 2022 17:46:38 +0100 Subject: [PATCH 125/318] Attempt to fix PTRANS sendrecv kernels --- .../src/device/transpose_PQ_ACCL_stream_sendrecv.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp index 13e1c300..c43736d9 100644 --- a/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp @@ -200,9 +200,9 @@ void transpose_read_sendrecv(const DEVICE_DATA_TYPE* A, for (int col = 0; col < least_common_multiple/pq_width; col++) { for (int row = 0; row < least_common_multiple/pq_height; row++) { if (target_list[row * least_common_multiple/pq_width + col] == send_rank) { - for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) { - for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) { - unsigned int matrix_buffer_offset = (col + lcm_col * least_common_multiple/pq_width) + (row + lcm_row * least_common_multiple/pq_height) * width_per_rank; + for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_height); lcm_col++) { + for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_width); lcm_row++) { + unsigned int matrix_buffer_offset = (row + lcm_col * least_common_multiple/pq_height) + (col + lcm_row * least_common_multiple/pq_width) * width_per_rank; DEVICE_DATA_TYPE a_block[block_size * 
block_size / channel_width][channel_width];
                             transpose_block_transpose(A, a_block, matrix_buffer_offset, width_per_rank, height_per_rank);
                             transpose_block_forward(a_block, krnl2cclo);
@@ -237,9 +237,9 @@ void transpose_write_sendrecv(const DEVICE_DATA_TYPE* B,
     for (int col = 0; col < least_common_multiple/pq_width; col++) {
         for (int row = 0; row < least_common_multiple/pq_height; row++) {
             if (target_list[row * least_common_multiple/pq_width + col] == recv_rank) {
-                for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_height); lcm_row++) {
-                    for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_width); lcm_col++) {
-                        unsigned int matrix_buffer_offset = (col + lcm_col * least_common_multiple/pq_width) + (row + lcm_row * least_common_multiple/pq_height) * width_per_rank;
+                for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_width); lcm_row++) {
+                    for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_height); lcm_col++) {
+                        unsigned int matrix_buffer_offset = (row + lcm_col * least_common_multiple/pq_height) + (col + lcm_row * least_common_multiple/pq_width) * width_per_rank;
                         transpose_block_receive(B,C,matrix_buffer_offset,width_per_rank, cclo2krnl);
                     }
                 }

From 9a03cf2c59dfcee7419758d32823e0044f15eda9 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 21 Oct 2022 18:08:55 +0100
Subject: [PATCH 126/318] Extend memory alignment for b_eff

---
 b_eff/src/host/network_benchmark.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index 964ec5ca..472ab15d 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -42,7 +42,7 @@ struct aligned_allocator {
     pointer allocate(size_t pCount, const_pointer = 0){
         T* mem = 0;
-        if (posix_memalign(reinterpret_cast(&mem), 1024 , sizeof(T) * pCount) != 0) {
+        if (posix_memalign(reinterpret_cast(&mem), 4096, sizeof(T) * pCount) != 0) {
             throw std::bad_alloc();
         }
         return mem;

From 8455a7d7082b0a8bab70e8d644f9df3223dad726 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 21 Oct 2022 18:26:36 +0100
Subject: [PATCH 127/318] Add profiling config for b_eff

---
 .../Xilinx_U55C_HBM_ACCL_pl_profile.cmake     | 27 ++++++
 ...s.link.xilinx.accl_pl.u55c.hbm.profile.ini | 88 +++++++++++++++++++
 2 files changed, 115 insertions(+)
 create mode 100644 b_eff/configs/Xilinx_U55C_HBM_ACCL_pl_profile.cmake
 create mode 100644 b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini

diff --git a/b_eff/configs/Xilinx_U55C_HBM_ACCL_pl_profile.cmake b/b_eff/configs/Xilinx_U55C_HBM_ACCL_pl_profile.cmake
new file mode 100644
index 00000000..5cd3ed0a
--- /dev/null
+++ b/b_eff/configs/Xilinx_U55C_HBM_ACCL_pl_profile.cmake
@@ -0,0 +1,27 @@
+# This file contains the default configuration for the Xilinx Alveo U55C board
+# for use with single-precision floating-point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...]
-DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE)
+set(XILINX_KERNEL_NAMES "send_recv" CACHE STRING "" FORCE)
+set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to connect multiple kernels to the CCLO cmd stream" FORCE)
+# STREAM specific options
+# Defaults to a total of ~12GB data
+set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini
new file mode 100644
index 00000000..96e13497
--- /dev/null
+++ b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini
@@ -0,0 +1,88 @@
+# /*******************************************************************************
+# Copyright (C) 2021 Xilinx, Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+nk=client_arbiter:1:client_arbiter
+nk=send_recv:1:sendrecv
+
+# Kernels Floorplanning
+slr=compression_0_0:SLR0
+slr=compression_0_1:SLR0
+slr=compression_0_2:SLR0
+slr=lb_user_krnl:SLR0
+slr=arith_0:SLR0
+slr=ccl_offload_0:SLR0
+slr=hostctrl_0:SLR0
+slr=networklayer_0:SLR1
+slr=cmac_0:SLR1
+slr=client_arbiter:SLR0
+slr=sendrecv:SLR0
+
+sp=ccl_offload_0.m_axi_0:HBM[0:5]
+sp=ccl_offload_0.m_axi_1:HBM[0:5]
+sp=sendrecv.m_axi_gmem:HBM[0:5]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:client_arbiter.cmd_clients_0
+stream_connect=client_arbiter.ack_clients_0:hostctrl_0.sts
+stream_connect=sendrecv.cmd:client_arbiter.cmd_clients_1
+stream_connect=client_arbiter.ack_clients_1:sendrecv.sts
+stream_connect=client_arbiter.cmd_cclo:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:client_arbiter.ack_cclo
+
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
+
+[profile]
+data=ccl_offload:all:all # Monitor data on all instances of kernel k1
+data=send_recv:all:all # Specific CU master
+memory=all # Monitor transfers for all memories
+stall=ccl_offload:all # Monitor stalls for all CUs of all kernels
+stall=send_recv:all # Stalls only for cu2
+exec=all:all # Monitor execution times for all CUs

From 79967790aa4010f539d06a1c44161ef260128db6 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 24 Oct 2022 08:34:53 +0100
Subject: [PATCH 128/318] Fix IP address for hardware execution

---
 shared/setup/fpga_setup_accl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index 8a5f685c..c560ee29 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -77,7 +77,7 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra
     std::vector ranks = {};
     for (int i = 0; i < current_size; ++i) {
         // TODO: Replace the ip addresses and ports here for execution of real hardware?
-        ACCL::rank_t new_rank = {"10.10.10."
+ std::to_string(current_rank), 5500 + i, i, ACCL_BUFFER_SIZE};
+        ACCL::rank_t new_rank = {"10.10.10." + std::to_string(i), 5500 + i, i, ACCL_BUFFER_SIZE};
         ranks.emplace_back(new_rank);
     }
     if (!useAcclEmulation) {

From 7237139d14ea8427ffab29350009969a38183b77 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 24 Oct 2022 08:53:33 +0100
Subject: [PATCH 129/318] Make debug output optional

---
 b_eff/src/host/execution_types/execution_accl.hpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp
index c4686b29..8d1638d9 100644
--- a/b_eff/src/host/execution_types/execution_accl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl.hpp
@@ -79,8 +79,19 @@ namespace network::execution_types::accl {
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
             for (int l = 0; l < looplength; l++) {
+#ifndef NDEBUG
+                std::cout << "Send " << size_in_values << " values to "
+                          << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl;
+#endif
                 config.accl->send(*acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
-                config.accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+#ifndef NDEBUG
+                std::cout << "Recv " << size_in_values << " values from "
+                          << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl;
+#endif
+                config.accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0);
+#ifndef NDEBUG
+                std::cout << "Done" << std::endl;
+#endif
             }
             auto endCalculation = std::chrono::high_resolution_clock::now();
             calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count();

From 2f565a7a031055984cd5497975db0e9bc4f44cec Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 24 Oct 2022 10:32:33 +0200
Subject: [PATCH 130/318] Explicitly set ACCL buffer size in config

---
 b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake
index 45e2b5d7..81c20e1d 100644
--- a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake
+++ b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake
@@ -22,5 +22,6 @@ set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to con
 # Defaults to a total of ~12GB data
 set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
 set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+set(ACCL_BUFFER_SIZE 4194304 CACHE STRING "Size of the ACCL buffers" FORCE)

 set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE)

From d675b98444b20da78c84b745ce9d45b33242e8db Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 24 Oct 2022 10:56:45 +0200
Subject: [PATCH 131/318] Make BFM optional in ACCL PL

---
 b_eff/src/host/execution_types/execution_accl_pl.hpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp
index 5bbda303..0bac2fa8 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl.hpp
+++
b/b_eff/src/host/execution_types/execution_accl_pl.hpp
@@ -68,9 +68,10 @@ namespace network::execution_types::accl_pl {
         hlslib::Stream cmd, sts;

         std::vector dest = {0};
-        CCLO_BFM cclo(6000, current_rank, current_size, dest, cmd, sts, cclo2krnl, krnl2cclo);
+        std::unique_ptr cclo;
         if (config.programSettings->useAcclEmulation) {
-            cclo.run();
+            cclo = std::make_unique(6000, current_rank, current_size, dest, cmd, sts, cclo2krnl, krnl2cclo);
+            cclo->run();
         }
         MPI_Barrier(MPI_COMM_WORLD);

@@ -123,7 +124,7 @@ namespace network::execution_types::accl_pl {
         }

         if (config.programSettings->useAcclEmulation) {
-            cclo.stop();
+            cclo->stop();
         }
         // Read validation data from FPGA will be placed sequentially in buffer for all replications
         // The data order should not matter, because every byte should have the same value!

From 456a7825735d9f96f00fd602b28251aa27eac2c4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 25 Oct 2022 14:56:44 +0100
Subject: [PATCH 132/318] Fix ACCL pl emulation w/o bitstream

---
 b_eff/src/host/execution_types/execution_accl_pl.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp
index 0bac2fa8..ed35d552 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp
@@ -92,7 +92,10 @@ namespace network::execution_types::accl_pl {
             acclRecvBuffers.back()->sync_to_device();
         }

-        xrt::kernel sendrecvKernel(*config.device, *config.program, "send_recv");
+        xrt::kernel sendrecvKernel;
+        if (!config.programSettings->useAcclEmulation) {
+            sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv");
+        }

         double calculationTime = 0.0;
         for (int i = 0; i < config.programSettings->kernelReplications; i++) {

From 7ec36450a885c17005673de699d0bed4b4984736 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 25 Oct 2022 14:57:10 +0100
Subject: [PATCH 133/318] Profile everything

---
 .../settings.link.xilinx.accl_pl.u55c.hbm.profile.ini | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini
index 96e13497..9a1ce41d 100644
--- a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini
+++ b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini
@@ -80,9 +80,7 @@ stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
 stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl

 [profile]
-data=ccl_offload:all:all # Monitor data on all instances of kernel k1
-data=send_recv:all:all # Specific CU master
-memory=all # Monitor transfers for all memories
-stall=ccl_offload:all # Monitor stalls for all CUs of all kernels
-stall=send_recv:all # Stalls only for cu2
-exec=all:all # Monitor execution times for all CUs
+data=all:all:all
+memory=all
+stall=all:all
+exec=all:all

From 70a7ef200654c20d31309d518c8913db82cea3a3 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 25 Oct 2022 18:56:53 +0200
Subject: [PATCH 134/318] Fix memory bank for top kernel

---
 LINPACK/src/host/execution_types/execution_xrt_pcie.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
index 6de18915..1a2610e0 100644
--- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
@@
-111,7 +111,7 @@ std::unique_ptr calculate(
 *config.device,
 sizeof(HOST_DATA_TYPE) *
 (config.programSettings->blockSize) *
 (config.programSettings->blockSize),
- kernel_lu.group_id(0));
+ kernel_lu.group_id(1));
 }
 for (int i = 0; i < blocks_per_col; i++) {

From 30fdfc35075451af44373a4ef39ce6f53da402bb Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 26 Oct 2022 15:23:47 +0100
Subject: [PATCH 135/318] Fix memory bank in HPL ACCL host code

---
 LINPACK/src/host/execution_types/execution_accl_buffers.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp
index 5e26e267..e7db12b3 100644
--- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp
+++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp
@@ -149,7 +149,7 @@ std::unique_ptr calculate(
 sizeof(HOST_DATA_TYPE) *
 (config.programSettings->blockSize) *
 (config.programSettings->blockSize),
- lu_tmp_kernel.group_id(0));
+ lu_tmp_kernel.group_id(1));
 Buffer_top_list.back().push_back(
 config.accl->create_buffer(
 tmp_bos.back(),

From 3cbf9e98a5bd49d1c73f48ca8b37dc4422fec0b0 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 26 Oct 2022 15:56:58 +0100
Subject: [PATCH 136/318] Add PTRANS ACCL profile config

---
 .../Xilinx_U55C_HBM_ACCL_stream_profile.cmake | 30 +++++++
 ...x.transpose_pq_accl_stream.hbm.profile.ini | 82 +++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream_profile.cmake
 create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini

diff --git a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream_profile.cmake b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream_profile.cmake
new file mode 100644
index 00000000..a61bd058
--- /dev/null
+++ b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream_profile.cmake
@@ -0,0 +1,30 @@
+# This file contains the default configuration for the Xilinx Alveo U55C board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE)
+set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# PTRANS specific options
+# Defaults to a total of ~12GB data
+set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE)
+set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE)
+set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. 
Also specifies the width of memory" FORCE)
+set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)

diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini
new file mode 100644
index 00000000..1c3a4861
--- /dev/null
+++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini
@@ -0,0 +1,82 @@
+# /*******************************************************************************
+#  Copyright (C) 2021 Xilinx, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:0
+nk=transpose_read0:1
+nk=transpose_write0:1
+
+# Kernel floorplanning
+slr=compression_0_0:SLR0
+slr=compression_0_1:SLR0
+slr=compression_0_2:SLR0
+slr=arith_0:SLR0
+slr=ccl_offload_0:SLR0
+slr=hostctrl_0:SLR0
+slr=networklayer_0:SLR1
+slr=cmac_0:SLR1
+slr=transpose_read0_1:SLR2
+slr=transpose_write0_1:SLR0
+
+sp=ccl_offload_0.m_axi_0:HBM[31]
+sp=ccl_offload_0.m_axi_1:HBM[31]
+sp=transpose_read0_1.m_axi_gmem0:HBM[0:7]
+sp=transpose_write0_1.m_axi_gmem0:HBM[8:15]
+sp=transpose_write0_1.m_axi_gmem1:HBM[16:23]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl
+stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:512
+
+[profile]
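+# Syntax of the entries below is <kernel>:<compute unit>:<interface>, so
+# data=all:all:all instruments every interface of every CU. Each monitor adds
+# extra logic to the design, which is presumably why later revisions of these
+# profile configs narrow the selection down to the interesting ports again.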
+data=all:all:all +memory=all +stall=all:all +exec=all:all \ No newline at end of file From b9709f85360662153485a381eaf3c25ff8982cd5 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Oct 2022 16:02:26 +0100 Subject: [PATCH 137/318] LINPACK ACCL update configs --- .../configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake | 2 +- .../Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake | 2 +- ...linx.hpl_torus_accl.hbm.u55c.generator.ini | 88 +++++++++++++++++++ 3 files changed, 90 insertions(+), 2 deletions(-) create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake index ec9d153b..800a33e0 100644 --- a/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake @@ -22,7 +22,7 @@ set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication k set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE) set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) -set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini" CACHE STRING "Link settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini" CACHE STRING "Link settings file" FORCE) set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake index bbf80c86..dfd8611b 100644 --- a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake @@ -9,7 +9,7 @@ set(USE_MPI Yes CACHE BOOL "" FORCE) set(USE_SVM No CACHE BOOL "" FORCE) set(USE_HBM No CACHE BOOL "" FORCE) -set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_ACCL No CACHE BOOL "" FORCE) set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini new file mode 100644 index 00000000..4783d320 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini @@ -0,0 +1,88 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR
+slr=lu_1:SLR0
+slr=left_update_1:SLR0
+slr=top_update_1:SLR0
+slr=inner_update_mm0_1:SLR2
+
+# matrix ports
+sp=lu_1.m_axi_gmem0:HBM[0:4]
+sp=lu_1.m_axi_gmem1:HBM[5:6]
+sp=lu_1.m_axi_gmem2:HBM[5:6]
+
+sp=top_update_1.m_axi_gmem0:HBM[0:4]
+sp=top_update_1.m_axi_gmem1:HBM[5:6]
+sp=top_update_1.m_axi_gmem2:HBM[5:6]
+
+sp=left_update_1.m_axi_gmem0:HBM[0:4]
+sp=left_update_1.m_axi_gmem1:HBM[5:6]
+sp=left_update_1.m_axi_gmem2:HBM[5:6]
+
+# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:4]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[5:6]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[5:6]
+# PY_CODE_GEN block_end
+
+#ACCL
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+
+# Kernel floorplanning
+slr=compression_0_0:SLR1
+slr=compression_0_1:SLR1
+slr=compression_0_2:SLR1
+slr=lb_user_krnl:SLR1
+slr=arith_0:SLR1
+slr=ccl_offload_0:SLR1
+slr=hostctrl_0:SLR1
+slr=networklayer_0:SLR1
+slr=cmac_0:SLR1
+
+sp=ccl_offload_0.m_axi_0:HBM[5:6]
+sp=ccl_offload_0.m_axi_1:HBM[5:6]
+
+
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl

From 41bd60c4d61abf6c306e6b65bff8e3e1a95098d9 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 26 Oct 2022 16:07:00 +0100
Subject: [PATCH 138/318] LINPACK add profile config for U55c

---
 .../Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake | 31 ++++++
 ..._torus_accl.hbm.u55c.profile.generator.ini | 94 +++++++++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake
 create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini

diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake
new file mode 100644
index 00000000..ed8cc15a
--- /dev/null
+++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake
@@ -0,0 +1,31 @@
+# 
This file contains the default configuration for the Xilinx Alveo U55C board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE)
+
+# LINPACK specific options
+set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
+set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
+set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
+
+set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE)
+set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE)
+set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini" CACHE STRING "Link settings file" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
+set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE)
+

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini
new file mode 100644
index 00000000..d4d128dc
--- /dev/null
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini
@@ -0,0 +1,94 @@
+[connectivity]
+nk=lu:1
+nk=left_update:1
+nk=top_update:1
+nk=inner_update_mm0:$PY_CODE_GEN num_replications$
+
+# slrs
+# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR
+slr=lu_1:SLR0
+slr=left_update_1:SLR0
+slr=top_update_1:SLR0
+slr=inner_update_mm0_1:SLR2
+
+# matrix ports
+sp=lu_1.m_axi_gmem0:HBM[0:4]
+sp=lu_1.m_axi_gmem1:HBM[5:6]
+sp=lu_1.m_axi_gmem2:HBM[5:6]
+
+sp=top_update_1.m_axi_gmem0:HBM[0:4]
+sp=top_update_1.m_axi_gmem1:HBM[5:6]
+sp=top_update_1.m_axi_gmem2:HBM[5:6]
+
+sp=left_update_1.m_axi_gmem0:HBM[0:4]
+sp=left_update_1.m_axi_gmem1:HBM[5:6]
+sp=left_update_1.m_axi_gmem2:HBM[5:6]
+
+# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:4]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[5:6]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[5:6]
+# PY_CODE_GEN block_end
+
+#ACCL
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+
+# Kernel floorplanning
+slr=compression_0_0:SLR1
+slr=compression_0_1:SLR1
+slr=compression_0_2:SLR1
+slr=lb_user_krnl:SLR1
+slr=arith_0:SLR1
+slr=ccl_offload_0:SLR1
+slr=hostctrl_0:SLR1
+slr=networklayer_0:SLR1
+slr=cmac_0:SLR1
+
+sp=ccl_offload_0.m_axi_0:HBM[5:6]
+sp=ccl_offload_0.m_axi_1:HBM[5:6]
+
+
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
+
+[profile]
+data=all:all:all
+memory=all
+stall=all:all
+exec=all:all
\ No newline at end of file

From ecf3714d222dd19be119b6b1fa98b8caca6bca24 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 26 Oct 2022 16:37:00 +0100
Subject: [PATCH 139/318] Add profiling config for PTRANS ACCL on U280

---
 .../Xilinx_U280_HBM_ACCL_stream_profile.cmake | 30 +++++++
 ...nspose_pq_accl_stream.hbm.u280.profile.ini | 82 +++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 PTRANS/configs/Xilinx_U280_HBM_ACCL_stream_profile.cmake
 create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini

diff --git a/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream_profile.cmake b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream_profile.cmake
new file mode 100644
index 
00000000..1b1aa691
--- /dev/null
+++ b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream_profile.cmake
@@ -0,0 +1,30 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_gen3x16_xdma_1_202211_1" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE)
+set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# PTRANS specific options
+# Defaults to a total of ~12GB data
+set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE)
+set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE)
+set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE)
+set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)

diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini
new file mode 100644
index 00000000..3860eb41
--- /dev/null
+++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini
@@ -0,0 +1,82 @@
+# /*******************************************************************************
+#  Copyright (C) 2021 Xilinx, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License. 
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:0
+nk=transpose_read0:1
+nk=transpose_write0:1
+
+# Kernel floorplanning
+slr=compression_0_0:SLR1
+slr=compression_0_1:SLR1
+slr=compression_0_2:SLR1
+slr=arith_0:SLR1
+slr=ccl_offload_0:SLR1
+slr=hostctrl_0:SLR1
+slr=networklayer_0:SLR2
+slr=cmac_0:SLR2
+slr=transpose_read0_1:SLR1
+slr=transpose_write0_1:SLR0
+
+sp=ccl_offload_0.m_axi_0:HBM[31]
+sp=ccl_offload_0.m_axi_1:HBM[31]
+sp=transpose_read0_1.m_axi_gmem0:HBM[0:7]
+sp=transpose_write0_1.m_axi_gmem0:HBM[8:15]
+sp=transpose_write0_1.m_axi_gmem1:HBM[16:23]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl
+stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:512
+
+[profile]
+data=all:all:all
+memory=all
+stall=all:all
+exec=all:all
\ No newline at end of file

From d4739966fdca21f2a30557bc552b8fb8341bb683 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 26 Oct 2022 16:49:57 +0100
Subject: [PATCH 140/318] Set LINPACK ACCL buffer size sufficiently large

---
 LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake
index 800a33e0..20afd309 100644
--- a/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake
+++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake
@@ -23,6 +23,7 @@ set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication k
 set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE)
 set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE)
 set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini" CACHE STRING "Link settings file" FORCE)
+set(ACCL_BUFFER_SIZE 524288 CACHE STRING "Set ACCL buffer size to fit single matrix block" FORCE)
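+# Sizing note: 524288 B = 2^19 B = 256 * 256 * 8 B, i.e. one block of
+# 2^LOCAL_MEM_BLOCK_LOG x 2^LOCAL_MEM_BLOCK_LOG entries at an assumed
+# 8 B per value, so a complete block fits into a single ACCL buffer.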
set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
 set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE)

From 350bb3bea8e617d8751c363279f7c9edb664a350 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 27 Oct 2022 09:08:36 +0100
Subject: [PATCH 141/318] Add profile config for b_eff U280

---
 .../Xilinx_U280_HBM_ACCL_pl_profile.cmake | 27 ++++++
 ...s.link.xilinx.accl_pl.u280.hbm.profile.ini | 86 +++++++++++++++++++
 2 files changed, 113 insertions(+)
 create mode 100644 b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake
 create mode 100644 b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini

diff --git a/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake b/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake
new file mode 100644
index 00000000..94489fba
--- /dev/null
+++ b/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake
@@ -0,0 +1,27 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE)
+set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE)
+set(XILINX_KERNEL_NAMES "send_recv" CACHE STRING "" FORCE)
+set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to connect multiple kernels to the CCLO cmd stream" FORCE)
+# b_eff specific options
+# Defaults to a total of ~12GB data
+set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)

diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini
new file mode 100644
index 00000000..2f284b1c
--- /dev/null
+++ b/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini
@@ -0,0 +1,86 @@
+# /*******************************************************************************
+#  Copyright (C) 2021 Xilinx, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License. 
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+nk=client_arbiter:1:client_arbiter
+nk=send_recv:1:sendrecv
+
+# Kernel floorplanning
+slr=compression_0_0:SLR1
+slr=compression_0_1:SLR1
+slr=compression_0_2:SLR1
+slr=lb_user_krnl:SLR1
+slr=arith_0:SLR1
+slr=ccl_offload_0:SLR1
+slr=hostctrl_0:SLR1
+slr=networklayer_0:SLR2
+slr=cmac_0:SLR2
+slr=client_arbiter:SLR1
+slr=sendrecv:SLR1
+
+sp=ccl_offload_0.m_axi_0:HBM[0:5]
+sp=ccl_offload_0.m_axi_1:HBM[0:5]
+sp=sendrecv.m_axi_gmem:HBM[0:5]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:client_arbiter.cmd_clients_0
+stream_connect=client_arbiter.ack_clients_0:hostctrl_0.sts
+stream_connect=sendrecv.cmd:client_arbiter.cmd_clients_1
+stream_connect=client_arbiter.ack_clients_1:sendrecv.sts
+stream_connect=client_arbiter.cmd_cclo:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:client_arbiter.ack_cclo
+
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
+
+[profile]
+data=all:all:all
+memory=all
+stall=all:all
+exec=all:all

From 64b3602abecd37b3835967773298719ee59fcd77 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 28 Oct 2022 10:50:51 +0100
Subject: [PATCH 142/318] Fixes in b_eff ACCL PL host code

---
 b_eff/src/host/execution_types/execution_accl_pl.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp
index ed35d552..4b3ff2ee 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp
@@ -94,7 +94,7 @@ namespace network::execution_types::accl_pl {

 xrt::kernel sendrecvKernel;
 if (!config.programSettings->useAcclEmulation) {
- sendrecvKernel(*config.device, *config.program, "send_recv");
+ sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv");
 }

 double calculationTime = 0.0;
@@ -102,7 +102,7 @@ namespace network::execution_types::accl_pl {
 MPI_Barrier(MPI_COMM_WORLD);
 auto startCalculation = 
std::chrono::high_resolution_clock::now(); if (!config.programSettings->useAcclEmulation) { - auto run = sendrecvKernel(acclSendBuffers[i]->bo(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + auto run = sendrecvKernel(*(acclSendBuffers[i]->bo()), *(acclRecvBuffers[i]->bo()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.accl->get_communicator_addr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); run.wait(); } else { From 69bf7aaa7e5150e337b8023b62c699c6c97e9563 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 28 Oct 2022 11:10:13 +0100 Subject: [PATCH 143/318] Update b_eff ACCL profile link config --- .../settings.link.xilinx.accl_pl.u55c.hbm.profile.ini | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini index 9a1ce41d..778054e5 100644 --- a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini +++ b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini @@ -80,7 +80,12 @@ stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl [profile] -data=all:all:all +data=send_recv:all:all +data=client_arbiter:all:all +data=ccl_offload:all:m_axis_eth_tx_data +data=networklayer:all:M_AXIS_nl2sk +data=networklayer:all:M_AXIS_nl2eth +data=cmac_0:all:M_AXIS memory=all -stall=all:all +stall=all exec=all:all From ebf3c8a1f1c260b0ceefc3566d01457d6c6892aa Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 28 Oct 2022 11:26:39 +0100 Subject: [PATCH 144/318] Specify the important profiling metrics for PTRANS ACCL --- ....link.xilinx.transpose_pq_accl_stream.hbm.profile.ini | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini index 1c3a4861..9dec51d7 100644 --- a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini @@ -76,7 +76,8 @@ stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:512 [profile] -data=all:all:all -memory=all -stall=all:all -exec=all:all \ No newline at end of file +data=transpose_read0:all:all +data=transpose_write0:all:all +memory=transpose_read0_1.m_axi_gmem0 +memory=transpose_write0_1.m_axi_gmem0 +memory=transpose_write0_1.m_axi_gmem1 From 0dffc714c7464b053d8bd68f7cf9d92e74eb10a5 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 8 Nov 2022 11:16:14 +0100 Subject: [PATCH 145/318] Add R3 config for Linpack on U280 --- .../Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake | 30 ++++++++++++++++ ..._pcie.distribute_kernels.hbm.generator.ini | 34 +++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 LINPACK/configs/Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake new file mode 100644 index 00000000..de080ee7 --- /dev/null +++ 
b/LINPACK/configs/Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake
@@ -0,0 +1,30 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE)
+set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE)
+# LINPACK specific options
+set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
+set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
+set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(NUM_REPLICATIONS 3 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
+
+set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE)
+set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini" CACHE STRING "Link settings file" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
+set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini
new file mode 100644
index 00000000..2815cc38
--- /dev/null
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini
@@ -0,0 +1,34 @@
+[connectivity]
+nk=lu:1
+nk=left_update:1
+nk=top_update:1
+nk=inner_update_mm0:$PY_CODE_GEN num_replications$
+
+# slrs
+# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR1 +slr=top_update_1:SLR2 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN i % 3$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:5] +sp=lu_1.m_axi_gmem1:HBM[6] +sp=lu_1.m_axi_gmem2:HBM[7] + +sp=top_update_1.m_axi_gmem0:HBM[0:5] +sp=top_update_1.m_axi_gmem1:HBM[6] +sp=top_update_1.m_axi_gmem2:HBM[8] + +sp=left_update_1.m_axi_gmem0:HBM[0:5] +sp=left_update_1.m_axi_gmem1:HBM[7] +sp=left_update_1.m_axi_gmem2:HBM[9] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:5] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] +# PY_CODE_GEN block_end + From 1cb75def2fb56dc6d2e3e8b39e4151787ac84f83 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 8 Nov 2022 13:12:44 +0100 Subject: [PATCH 146/318] Profiling U280 b_eff PL --- b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake | 2 +- .../settings.link.xilinx.accl_pl.u280.hbm.profile.ini | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake b/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake index 94489fba..c40efff7 100644 --- a/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake +++ b/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake @@ -12,7 +12,7 @@ set(USE_HBM No CACHE BOOL "" FORCE) set(USE_ACCL Yes CACHE BOOL "" FORCE) set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) -set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini CACHE FILEPATH "" FORCE) set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE) diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini index 2f284b1c..374a41c9 100644 --- a/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini +++ b/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini @@ -80,7 +80,12 @@ stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl [profile] -data=all:all:all +data=send_recv:all:all +data=client_arbiter:all:all +data=ccl_offload:all:m_axis_eth_tx_data +data=networklayer:all:M_AXIS_nl2sk +data=networklayer:all:M_AXIS_nl2eth +data=cmac_0:all:M_AXIS memory=all -stall=all:all +stall=all exec=all:all From 809eb38d0165570cb3b3523235ec89a06cd0c79d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 8 Nov 2022 13:19:10 +0100 Subject: [PATCH 147/318] Fix config naming --- ...SB3_R3_DDR_PCIE.cmake => Xilinx_U280_B8_SB3_R3_HBM_PCIE.cmake} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename LINPACK/configs/{Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake => Xilinx_U280_B8_SB3_R3_HBM_PCIE.cmake} (100%) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R3_HBM_PCIE.cmake similarity index 100% rename from LINPACK/configs/Xilinx_U280_B8_SB3_R3_DDR_PCIE.cmake rename to 
LINPACK/configs/Xilinx_U280_B8_SB3_R3_HBM_PCIE.cmake

From cf5561b5c5c819705552831ebda8ea37a6acd2ac Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 8 Nov 2022 14:33:16 +0100
Subject: [PATCH 148/318] Add PTRANS U280 ACCL stream config

---
 .../configs/Xilinx_U280_HBM_ACCL_stream.cmake | 29 +++++++
 ...linx.transpose_pq_accl_stream.hbm.u280.ini | 76 +++++++++++++++++++
 2 files changed, 105 insertions(+)
 create mode 100644 PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake
 create mode 100644 PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini

diff --git a/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake
new file mode 100644
index 00000000..827da9a9
--- /dev/null
+++ b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake
@@ -0,0 +1,29 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_gen3x16_xdma_1_202211_1" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE)
+set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
+
+# PTRANS specific options
+# Defaults to a total of ~12GB data
+set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE)
+set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE)
+set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE)
+set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)

diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini
new file mode 100644
index 00000000..83150287
--- /dev/null
+++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini
@@ -0,0 +1,76 @@
+# /*******************************************************************************
+#  Copyright (C) 2021 Xilinx, Inc
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License. 
+#
+# *******************************************************************************/
+[connectivity]
+# Define number of kernels and their name
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:0
+nk=transpose_read0:1
+nk=transpose_write0:1
+
+# Kernel floorplanning
+slr=compression_0_0:SLR1
+slr=compression_0_1:SLR1
+slr=compression_0_2:SLR1
+slr=arith_0:SLR1
+slr=ccl_offload_0:SLR1
+slr=hostctrl_0:SLR1
+slr=networklayer_0:SLR2
+slr=cmac_0:SLR2
+slr=transpose_read0_1:SLR1
+slr=transpose_write0_1:SLR0
+
+sp=ccl_offload_0.m_axi_0:HBM[31]
+sp=ccl_offload_0.m_axi_1:HBM[31]
+sp=transpose_read0_1.m_axi_gmem0:HBM[0:7]
+sp=transpose_write0_1.m_axi_gmem0:HBM[8:15]
+sp=transpose_write0_1.m_axi_gmem1:HBM[16:23]
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl
+stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:512

From c1b697ee40c69e07093521cb4c37ac6fd1ab56be Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 8 Nov 2022 14:59:05 +0100
Subject: [PATCH 149/318] Fix PTRANS PCIE DDR config for U280

---
 .../settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini
index 882d5af1..3b7b0497 100644
--- a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini
+++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini
@@ -13,7 +13,5 @@ slr=transpose0_$PY_CODE_GEN i + 1$:SLR$PY_CODE_GEN i % num_slrs$

 # Assign the kernels to the memory ports
 # PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
-sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem0:DDR[$PY_CODE_GEN i % num_ddrs$]
-sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem1:DDR[$PY_CODE_GEN i % num_ddrs$]
-sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem2:DDR[$PY_CODE_GEN i % num_ddrs$]
+sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem:DDR[$PY_CODE_GEN i % num_ddrs$]
 # PY_CODE_GEN block_end

From bdc93c880494e23f181c84b3d2f67f790e1703a3 Mon Sep 17 00:00:00 
2001 From: Marius Meyer Date: Tue, 8 Nov 2022 17:51:20 +0100 Subject: [PATCH 150/318] Make zmq optional in PTRANS w/o ACCL --- PTRANS/src/host/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index e162b809..d17422ca 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -46,7 +46,9 @@ if (Vitis_FOUND) add_executable(${HOST_EXE_NAME}_xilinx main.cpp) target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) - target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx zmqpp) + if (USE_ACCL) + target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx zmqpp) + endif() target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") From 58a5b2139ed4c59c219b55a8ec3b7222258f9288 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 8 Nov 2022 18:43:56 +0100 Subject: [PATCH 151/318] Use old platform for synthesis --- PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake index 827da9a9..d5223408 100644 --- a/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake +++ b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake @@ -13,7 +13,7 @@ set(USE_ACCL Yes CACHE BOOL "" FORCE) set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE) -set(FPGA_BOARD_NAME "xilinx_u280_gen3x16_xdma_1_202211_1" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini CACHE FILEPATH "" FORCE) set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE) set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) From 4bb1fd23ae7ee62305e11af7b1f98adbc7200a6f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 9 Nov 2022 09:21:57 +0100 Subject: [PATCH 152/318] Reduce target clock frequency of design --- LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake index 5ddc6b30..37f843d3 100644 --- a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake @@ -13,7 +13,7 @@ set(USE_ACCL Yes CACHE BOOL "" FORCE) set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) -set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 200 CACHE STRING "" FORCE) set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE) set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) # LINPACK specific options From f2d148b27245ff240caae632ba7113e0d10f94d2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 9 Nov 2022 13:54:55 +0100 Subject: [PATCH 153/318] Fix device selection --- 
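Notes: without an explicit device index the XRT device is now derived from
the MPI rank, assuming three FPGAs per node (the hardcoded modulus below) and
consecutive rank placement per node. A minimal sketch of the intended
selection logic; the helper name is hypothetical and not part of the patch:

    // Round-robin MPI ranks over the FPGAs of one node. The device count is
    // hardcoded until xrt::system::enumerate_devices() can be used to query it.
    int selected_device(int mpi_rank, int devices_per_node = 3) {
        return mpi_rank % devices_per_node;
    }

With this, ranks 0..2 of a node map to devices 0..2, so the scheduler has to
place consecutive ranks on the same node for the mapping to line up.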
 PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake | 1 +
 shared/setup/fpga_setup_xrt.cpp | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake
index 46ef245c..a75d3fd4 100644
--- a/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake
+++ b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake
@@ -12,6 +12,7 @@ set(USE_HBM No CACHE BOOL "" FORCE)
 set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
 set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini CACHE FILEPATH "" FORCE)
 set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE)
+set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE)

 # STREAM specific options
 # Defaults to a total of ~12GB data

diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp
index eae39fe8..f5d7ef32 100644
--- a/shared/setup/fpga_setup_xrt.cpp
+++ b/shared/setup/fpga_setup_xrt.cpp
@@ -39,6 +39,10 @@ namespace fpga_setup {
 MPI_Comm_rank(MPI_COMM_WORLD, & current_device);
 if (defaultDevice >= 0) {
 current_device = defaultDevice;
+ } else {
+ //TODO Use xrt::system::enumerate_devices() in "experimental/xrt_system.h" for future XRT versions
+ // instead of hardcoded number of devices.
+ current_device = current_device % 3;
 }
 return std::unique_ptr<xrt::device>(new xrt::device(current_device));
 }

From 3fb8810fb2437285b2640d89963c378d75a4c3b4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 9 Nov 2022 14:25:13 +0100
Subject: [PATCH 154/318] Switch to XRT host code

---
 PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake
index a75d3fd4..eb878f8d 100644
--- a/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake
+++ b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake
@@ -9,10 +9,13 @@
 set(USE_MPI Yes CACHE BOOL "" FORCE)
 set(USE_SVM No CACHE BOOL "" FORCE)
 set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
 set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
 set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini CACHE FILEPATH "" FORCE)
 set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE)
 set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE)
+set(XILINX_KERNEL_NAMES transpose0 CACHE STRING "" FORCE)

 # STREAM specific options
 # Defaults to a total of ~12GB data

From 5a42ab3c4533a3fcd67e07e68e77824460c39db4 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 9 Nov 2022 16:54:53 +0100
Subject: [PATCH 155/318] Fix build scripts for XRT

---
 PTRANS/src/host/CMakeLists.txt | 3 ++-
 shared/CMakeLists.txt | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt
index d17422ca..b9e0541b 100755
--- a/PTRANS/src/host/CMakeLists.txt
+++ b/PTRANS/src/host/CMakeLists.txt
@@ -46,8 +46,9 @@ if (Vitis_FOUND)
 add_executable(${HOST_EXE_NAME}_xilinx main.cpp)
 target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}")
 target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base)
+ target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx)
 if (USE_ACCL)
- target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx zmqpp)
+ 
target_link_libraries(${HOST_EXE_NAME}_xilinx zmqpp)
 endif()
 target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA)
 target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA)
 target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")

diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt
index 19ab7ff2..64260c94 100644
--- a/shared/CMakeLists.txt
+++ b/shared/CMakeLists.txt
@@ -4,13 +4,13 @@ set(HPCC_BASE_SOURCES "")

 if (USE_ACCL)
 add_subdirectory(${extern_accl_SOURCE_DIR}/driver/xrt ${CMAKE_BINARY_DIR}/lib/accl)
- list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp)
+ list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp ${extern_accl_SOURCE_DIR}/test/model/bfm/cclo_bfm.cpp)
 if (CMAKE_BUILD_TYPE EQUAL "Debug")
 set(ACCL_DEBUG Yes)
 endif()
 endif()
 if (USE_XRT_HOST)
- list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp ${extern_accl_SOURCE_DIR}/test/model/bfm/cclo_bfm.cpp)
+ list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp)
 endif()
 list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp)
 add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES})

From 2157327888e0e2034bc94c0b6aae556a146b170e Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 10 Nov 2022 10:06:51 +0100
Subject: [PATCH 156/318] Remove ACCL deps from PCIE config

---
 LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake
index 37f843d3..9d5cc02f 100644
--- a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake
+++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake
@@ -9,11 +9,11 @@
 set(USE_MPI Yes CACHE BOOL "" FORCE)
 set(USE_SVM No CACHE BOOL "" FORCE)
 set(USE_HBM No CACHE BOOL "" FORCE)
-set(USE_ACCL Yes CACHE BOOL "" FORCE)
+set(USE_ACCL No CACHE BOOL "" FORCE)
 set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
 set(USE_OCL_HOST No CACHE BOOL "" FORCE)
 set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
-set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 200 CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
 set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE)
 set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE)
 # LINPACK specific options

From 242fa19ab0a36fb42d84a2b502dfb14f317e5d14 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 14 Nov 2022 11:41:13 +0100
Subject: [PATCH 157/318] Add config for U280 HPL with ACCL

---
 .../configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake | 30 +++++++
 ...linx.hpl_torus_accl.hbm.u280.generator.ini | 88 +++++++++++++++++++
 2 files changed, 118 insertions(+)
 create mode 100644 LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake
 create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini

diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake
new file mode 100644
index 00000000..186266ca
--- /dev/null
+++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake
@@ -0,0 +1,30 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for the use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) + +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini" CACHE STRING "Link settings file" FORCE) +set(ACCL_BUFFER_SIZE 524288 CACHE STRING "Set ACCL buffer size to fit single matrix block" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) + diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini new file mode 100644 index 00000000..289a6263 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini @@ -0,0 +1,88 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR
+slr=lu_1:SLR0
+slr=left_update_1:SLR0
+slr=top_update_1:SLR0
+slr=inner_update_mm0_1:SLR1
+
+# matrix ports
+sp=lu_1.m_axi_gmem0:HBM[0:4]
+sp=lu_1.m_axi_gmem1:HBM[5:6]
+sp=lu_1.m_axi_gmem2:HBM[5:6]
+
+sp=top_update_1.m_axi_gmem0:HBM[0:4]
+sp=top_update_1.m_axi_gmem1:HBM[5:6]
+sp=top_update_1.m_axi_gmem2:HBM[5:6]
+
+sp=left_update_1.m_axi_gmem0:HBM[0:4]
+sp=left_update_1.m_axi_gmem1:HBM[5:6]
+sp=left_update_1.m_axi_gmem2:HBM[5:6]
+
+# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:4]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[5:6]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[5:6]
+# PY_CODE_GEN block_end
+
+#ACCL
+# Define number of kernels and their names
+nk=networklayer:1:networklayer_0
+nk=ccl_offload:1:ccl_offload_0
+nk=hostctrl:1:hostctrl_0
+nk=cmac_0:1:cmac_0
+nk=reduce_ops:1:arith_0
+nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2
+nk=loopback:1:lb_user_krnl
+
+# Kernels Floorplanning
+slr=compression_0_0:SLR2
+slr=compression_0_1:SLR2
+slr=compression_0_2:SLR2
+slr=lb_user_krnl:SLR2
+slr=arith_0:SLR2
+slr=ccl_offload_0:SLR2
+slr=hostctrl_0:SLR2
+slr=networklayer_0:SLR2
+slr=cmac_0:SLR2
+
+sp=ccl_offload_0.m_axi_0:HBM[5:6]
+sp=ccl_offload_0.m_axi_1:HBM[5:6]
+
+
+
+# Connect host controllers to CCL Offload
+stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req
+stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts
+
+# Connect CCL Offload kernel to UDP Network Kernel
+stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512
+stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512
+
+# Connect UDP Network Kernel to CMAC Kernel
+stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl
+stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS
+
+# arithmetic connections
+stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0
+stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1
+stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res
+
+# caster connections
+stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r
+stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0
+
+stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r
+stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1
+
+stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r
+stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
+
+# Tie off user kernel interface
+stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
+stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl

From 72e66929f1048fa570ba60cf6749921b3d10a33c Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 14 Nov 2022 13:32:15 +0100
Subject: [PATCH 158/318] Profiling for LINPACK U280

---
 ...linx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake | 32 ++++++++++++++++
 ..._torus_pcie.hbm.u280.profile.generator.ini | 38 +++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake
 create mode 100644 LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini

diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake
new file mode 100644
index 00000000..63210758
--- /dev/null
+++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake
@@ 
-0,0 +1,32 @@
+# This file contains the default configuration for the Xilinx Alveo U280 board
+# for use with single precision floating point values.
+# To use this configuration file, call cmake with the parameter
+#
+#     cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(USE_ACCL No CACHE BOOL "" FORCE)
+set(USE_XRT_HOST Yes CACHE BOOL "" FORCE)
+set(USE_OCL_HOST No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE)
+set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE)
+set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE)
+set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE)
+# LINPACK specific options
+set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE)
+set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE)
+set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
+
+set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE)
+set(XILINX_LINK_SETTINGS_FILE
+    "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini" CACHE STRING "Link settings file" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
+set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
new file mode 100644
index 00000000..ca99d858
--- /dev/null
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
@@ -0,0 +1,38 @@
+[connectivity]
+nk=lu:1
+nk=left_update:1
+nk=top_update:1
+nk=inner_update_mm0:$PY_CODE_GEN num_replications$
+
+# slrs
+# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +1) % 3$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:5] +sp=lu_1.m_axi_gmem1:HBM[6] +sp=lu_1.m_axi_gmem2:HBM[7] + +sp=top_update_1.m_axi_gmem0:HBM[0:5] +sp=top_update_1.m_axi_gmem1:HBM[6] +sp=top_update_1.m_axi_gmem2:HBM[8] + +sp=left_update_1.m_axi_gmem0:HBM[0:5] +sp=left_update_1.m_axi_gmem1:HBM[7] +sp=left_update_1.m_axi_gmem2:HBM[9] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] +# PY_CODE_GEN block_end + +[profile] +memory=all +exec=all:all + From d89939f5f0186896a97941409e31e897c10aaccb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 15 Nov 2022 17:01:53 +0100 Subject: [PATCH 159/318] Update config for HPL U280 profile --- LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake | 3 +-- ...s.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake index 63210758..654bc3f3 100644 --- a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake @@ -25,8 +25,7 @@ set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipul set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) -set(XILINX_LINK_SETTINGS_FILE - "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini" CACHE STRING "Link settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini" CACHE STRING "Link settings file" FORCE) set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini index ca99d858..e0bb5aaa 100644 --- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini @@ -27,7 +27,7 @@ sp=left_update_1.m_axi_gmem1:HBM[7] sp=left_update_1.m_axi_gmem2:HBM[9] # PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] -sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:5] sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9] sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] # PY_CODE_GEN block_end From 719353eb927f68f3a7b4a2f16c67e33edc95b7db Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 16 Nov 2022 11:30:52 +0100 Subject: [PATCH 160/318] Fix HBM 
link config for LINPACK

---
 ...ttings.link.xilinx.hpl_torus_pcie.hbm.generator.ini | 10 +++++-----
 ...ilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini
index df381966..fe68d728 100644
--- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini
@@ -19,15 +19,15 @@ sp=lu_1.m_axi_gmem1:HBM[6]
 sp=lu_1.m_axi_gmem2:HBM[7]
 
 sp=top_update_1.m_axi_gmem0:HBM[0:5]
-sp=top_update_1.m_axi_gmem1:HBM[6]
-sp=top_update_1.m_axi_gmem2:HBM[8]
+sp=top_update_1.m_axi_gmem1:HBM[8]
+sp=top_update_1.m_axi_gmem2:HBM[6]
 
 sp=left_update_1.m_axi_gmem0:HBM[0:5]
-sp=left_update_1.m_axi_gmem1:HBM[7]
-sp=left_update_1.m_axi_gmem2:HBM[9]
+sp=left_update_1.m_axi_gmem1:HBM[9]
+sp=left_update_1.m_axi_gmem2:HBM[7]
 
 # PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
-sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0]
+sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:5]
 sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9]
 sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8]
 # PY_CODE_GEN block_end

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
index e0bb5aaa..5a7bbbf0 100644
--- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
@@ -19,12 +19,12 @@ sp=lu_1.m_axi_gmem1:HBM[6]
 sp=lu_1.m_axi_gmem2:HBM[7]
 
 sp=top_update_1.m_axi_gmem0:HBM[0:5]
-sp=top_update_1.m_axi_gmem1:HBM[6]
-sp=top_update_1.m_axi_gmem2:HBM[8]
+sp=top_update_1.m_axi_gmem1:HBM[8]
+sp=top_update_1.m_axi_gmem2:HBM[6]
 
 sp=left_update_1.m_axi_gmem0:HBM[0:5]
-sp=left_update_1.m_axi_gmem1:HBM[7]
-sp=left_update_1.m_axi_gmem2:HBM[9]
+sp=left_update_1.m_axi_gmem1:HBM[9]
+sp=left_update_1.m_axi_gmem2:HBM[7]
 
 # PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
 sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:5]

From 3ffe0c4ab6198a1d5aa6390a1da5cfb116133653 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Wed, 16 Nov 2022 12:12:43 +0100
Subject: [PATCH 161/318] document new argument of selectFPGADevice

---
 shared/setup/fpga_setup.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp
index 70125df0..e6039973 100644
--- a/shared/setup/fpga_setup.cpp
+++ b/shared/setup/fpga_setup.cpp
@@ -220,6 +220,9 @@ choose a device.
 @param defaultDevice The index of the device that has to be used. If a
                      value < 0 is given, the device can be chosen
                      interactively
+@param platformString The platform string which should be chosen.
+                      If it is empty, it will be ignored. If it is not empty,
+                      but the string is not found, an exception is thrown.
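+                      As an illustration, hypothetical values could be
+                      "Intel(R) FPGA SDK for OpenCL(TM)" or "Xilinx"; the exact
+                      platform names depend on the installed OpenCL runtimes.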
@return A list containing a single selected device
*/

From a50dfe47ec49a64b852dada0d27c9754eda42256 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 17 Nov 2022 09:40:57 +0100
Subject: [PATCH 162/318] Update ACCL constructor

---
 shared/setup/fpga_setup_accl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index c560ee29..4d3207af 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -97,7 +97,7 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra
         std::vector mem(1, 0);
         std::cout << "Create ACCL" << std::endl;
         return std::unique_ptr(
-            new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, 0, ACCL::networkProtocol::UDP));
+            new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, ACCL::networkProtocol::UDP));
     } else {
         // TODO: Add start port here. Currenty hardcoded!
         return std::unique_ptr(

From af24978072c02e55328f2988676ff3889c846981 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 17 Nov 2022 11:12:32 +0100
Subject: [PATCH 163/318] Disable LINPACK AllBlockExternResult test

---
 LINPACK/tests/test_kernel_communication.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/LINPACK/tests/test_kernel_communication.cpp b/LINPACK/tests/test_kernel_communication.cpp
index dfcb8867..603bedef 100644
--- a/LINPACK/tests/test_kernel_communication.cpp
+++ b/LINPACK/tests/test_kernel_communication.cpp
@@ -1206,8 +1206,10 @@ class LinpackKernelCommunicationTestAll : public LinpackKernelCommunicationTest
     }
 };
 
-
-TEST_F(LinpackKernelCommunicationTestAll, AllBlockExternalResultisCorrect) {
+// TODO: This test is disabled because it fails non-deterministically although
+// calculations with benchmark host are correct.
+// Maybe this is related to a problem with Intel external channels in emulation.
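+// Note: the DISABLED_ prefix is the regular GoogleTest mechanism for skipping a
+// test; it can still be executed on demand by passing
+// --gtest_also_run_disabled_tests to the test binary.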
+TEST_F(LinpackKernelCommunicationTestAll, DISABLED_AllBlockExternalResultisCorrect) { uint matrix_size = bm->getExecutionSettings().programSettings->matrixSize; auto ref_data = bm->generateInputData(); From 49fde98adbd0b7f9c2b39c6684bac186ad2e846b Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 17 Nov 2022 14:20:30 +0100 Subject: [PATCH 164/318] Fix memory mapping of buffers in XRT HPL base version --- LINPACK/src/host/execution_types/execution_xrt_pcie.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp index 1a2610e0..f35df7b9 100644 --- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -111,7 +111,7 @@ std::unique_ptr calculate( *config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), - kernel_lu.group_id(1)); + kernel_top.group_id(1)); } for (int i = 0; i < blocks_per_col; i++) { @@ -119,7 +119,7 @@ std::unique_ptr calculate( *config.device, sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * (config.programSettings->blockSize), - kernel_lu.group_id(2)); + kernel_left.group_id(1)); } } From d55e450cd9c9fae41b442d77f70a49d0b2f0b279 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 17 Nov 2022 14:31:30 +0100 Subject: [PATCH 165/318] Extend profile options in config --- LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake | 2 +- ...s.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake index 654bc3f3..9bc20f5c 100644 --- a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake @@ -13,7 +13,7 @@ set(USE_ACCL No CACHE BOOL "" FORCE) set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) -set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_COMPILE_FLAGS -g --profile.stall all:all CACHE STRING "" FORCE) set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE) set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE) set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini index 5a7bbbf0..4ac80a17 100644 --- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini @@ -33,6 +33,7 @@ sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] # PY_CODE_GEN block_end [profile] -memory=all +stall=all:all +data=all:all:all exec=all:all From 993f919359a1bfe2369d07c45823c22c4820be3c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 18 Nov 2022 12:29:24 +0100 Subject: [PATCH 166/318] Add 250MHz target to config --- LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake index 186266ca..94a9c4f6 100644 --- 
a/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake
+++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake
@@ -20,6 +20,7 @@ set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of t
 set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE)
 set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE)
 
+set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE)
 set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE)
 set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE)
 set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini" CACHE STRING "Link settings file" FORCE)

From f0d446f938aaf9c370fc9cbabcec03ed1a70275d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 18 Nov 2022 12:34:34 +0100
Subject: [PATCH 167/318] Fix clock for cclo in HPL

---
 .../settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini
index 289a6263..d20bdb66 100644
--- a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini
@@ -86,3 +86,6 @@ stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
 # Tie off user kernel interface
 stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
 stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
+
+[clock]
+freqHz=250000000:ccl_offload_0
\ No newline at end of file

From efdac5daf78faafef8b3f20c549d7c2710fddb0d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 18 Nov 2022 13:42:54 +0100
Subject: [PATCH 168/318] Remove fixed clock since it is not supported with the platform

---
 .../settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini
index d20bdb66..289a6263 100644
--- a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini
@@ -86,6 +86,3 @@ stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2
 # Tie off user kernel interface
 stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in
 stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl
-
-[clock]
-freqHz=250000000:ccl_offload_0
\ No newline at end of file

From 83a0867cf1f0beba8ec979dc3f9fbedc1f1e9b57 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 23 Nov 2022 14:27:19 +0100
Subject: [PATCH 169/318] Add trace memory in HBM

---
 ...k.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
index 4ac80a17..aeea6acf 100644
--- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
+++ 
b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini
@@ -33,7 +33,9 @@ sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8]
 # PY_CODE_GEN block_end
 
 [profile]
-stall=all:all
+stall=all:all:all
 data=all:all:all
-exec=all:all
-
+exec=all:all:all
+trace_memory=HBM[16]:SLR0
+trace_memory=HBM[17]:SLR1
+trace_memory=HBM[18]:SLR2

From d17a0a6356edd3397bf53bc58f22049ef935d512 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Fri, 25 Nov 2022 13:38:59 +0100
Subject: [PATCH 170/318] Extend documentation for communication types

---
 docs/source/index.rst                              | 11 +++++++++++
 .../source/technical_support/Basic Setup/index.rst | 14 ++++++++------
 .../Host Input Parameters/index.rst                |  3 +++
 shared/include/communication_types.hpp             |  7 -------
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 96fd71bc..13f1de5c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -25,6 +25,17 @@ The pages collected under **Benchmark Descriptions** contain information about t
 **Technical Support** tackles selected topics of configuration, build, and execution of the benchmarks.
 **Benchmark Results** for the base implementations of the benchmarks are listed at the bottom of this page. They are reported together with the used CPU and other relevant infrastructure, as well as the configuration and resource utilization of the bitstreams.
 
+The scalability and performance of applications executed over multiple FPGAs is not least dependent on the communication capabilities of these devices. The benchmark suite supports the implementation of different communication strategies to compare their impact on the overall benchmark performance. This is only available to the benchmarks which rely on communication: b_eff, PTRANS and LINPACK.
+
+The first and most obvious strategy is host-to-host communication using PCIe and MPI. This strategy requires, in most cases, no additional hardware or software and only relies on moving data between the host and FPGA.
+The data is then exchanged via the existing CPU network, which makes it broadly applicable in the HPC context.
+As a consequence, this approach is used for the base implementations in this benchmark suite.
+For comparison, the suite can be extended with different communication types.
+Intel provides external channels for direct communication between the FPGAs.
+This approach is based on point-to-point connections between FPGAs and requires manual routing of data through the network.
+
+Further optimized implementations that use such device-specific communication approaches will be added to the suite in the future.
+
 
 .. toctree::

diff --git a/docs/source/technical_support/Basic Setup/index.rst b/docs/source/technical_support/Basic Setup/index.rst
index ed80740a..7308fc23 100644
--- a/docs/source/technical_support/Basic Setup/index.rst
+++ b/docs/source/technical_support/Basic Setup/index.rst
@@ -103,20 +103,22 @@ You can always get an overview of the available targets by executing the followi
     BENCHMARK_VENDOR, "Builds the host application "
     BENCHMARK_test_VENDOR, "Compile the tests and its dependencies "
 
-Moreover, there are additional targets to generate kernel reports and bitstreams.
+Moreover, there are additional targets to generate device reports and bitstreams.
+
 The kernel targets are:
 
 .. csv-table:: Device code build targets
     :header: "Target","Description"
     :widths: 10, 30
 
-    BENCHMARK_VENDOR , Synthesizes the kernel (takes several hours!)
-    BENCHMARK_report_VENDOR , Just compile the kernel and create logs and reports
-    BENCHMARK_emulate_VENDOR , Create an emulation kernel
+    BASENAME_{COMM_}VENDOR , Synthesizes the device kernels (takes several hours!)
+    BASENAME_{COMM_}report_VENDOR , Just compile the kernels and create logs and reports
+    BASENAME_{COMM_}emulate_VENDOR , Creates the emulation kernels
 
 `VENDOR` is either `intel` or `xilinx` depending if the Intel SDK or Xilinx Vitis should be used.
-`BENCHMARK` is the kernel name.
-A benchmark can provide multiple kernels and thus, these targets will be generated for every kernel file.
+`BASENAME` is the name of the file containing the device code.
+A benchmark can provide multiple kernel implementations and thus, these targets will be generated for every file containing kernel code.
+For all benchmarks using communication between FPGAs, the different communication types are encoded into the device code file name and are therefore part of the target name. These are b_eff, PTRANS and LINPACK.
 
 ------------------------------------------------------
 Configure and Build STREAM for a fictional Xilinx FPGA

diff --git a/docs/source/technical_support/Host Input Parameters/index.rst b/docs/source/technical_support/Host Input Parameters/index.rst
index 46abe6f3..50121964 100644
--- a/docs/source/technical_support/Host Input Parameters/index.rst
+++ b/docs/source/technical_support/Host Input Parameters/index.rst
@@ -43,6 +43,9 @@ Input parameters (or options) can be appended to the host execution call like th
     Please note, that the benchmark will always fail with this option since it assumes the validation failed, so it will return a non-zero exit code!
     For reported measurements, the validation has to be enabled and the host should return with an exit code 0.
 
+``--comm-type COMM``:
+    This parameter chooses the communication strategy which will be used. Current options are "IEC" for using the Intel External Channel, "PCIE" for using the host-to-host communication, and "CPU" for calculating on the CPU.
+
 ``--test``:
     This option will also skip the execution of the benchmark. It can be used to test different data generation schemes or the benchmark summary before the actual execution.
     Please note, that the host will exit with a non-zero exit code, because it will not be able to validate the output.
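As a usage sketch of the documented option (a hypothetical invocation; the executable name, bitstream file, and rank count are placeholders and not taken from the repository), selecting the host-to-host strategy for a run with two MPI ranks could look as follows:

    mpirun -n 2 ./Transpose_xilinx -f transpose.xclbin --comm-type PCIE

Passing "CPU" instead would execute the reference calculation on the host, which is mainly useful for validating results without an FPGA.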
diff --git a/shared/include/communication_types.hpp b/shared/include/communication_types.hpp index bb46bb8d..005f4c03 100644 --- a/shared/include/communication_types.hpp +++ b/shared/include/communication_types.hpp @@ -46,12 +46,6 @@ typedef enum _CommunicationType { */ pcie_mpi, - /** - * @brief Communcation using the Streaming Message Interface - * - */ - smi, - /** * @brief Calculate the benchmark on CPU instead of FPGA * @@ -75,7 +69,6 @@ typedef enum _CommunicationType { static const std::map comm_to_str_map{ {"IEC", CommunicationType::intel_external_channels}, {"PCIE", CommunicationType::pcie_mpi}, - {"SMI", CommunicationType::smi}, {"CPU", CommunicationType::cpu_only}, {"UNSUPPORTED", CommunicationType::unsupported}, {"AUTO", CommunicationType::automatic} From d74711be72feef99fbbb9c77bf66cdfe55f21ae4 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 28 Nov 2022 19:33:06 +0100 Subject: [PATCH 171/318] Add TCP host setup for ACCL --- .../host/execution_types/execution_accl.hpp | 22 +++---- .../execution_types/execution_accl_pl.hpp | 8 +-- b_eff/src/host/network_benchmark.cpp | 4 +- b_eff/src/host/network_benchmark.hpp | 5 +- shared/include/hpcc_benchmark.hpp | 45 ++++++------- shared/include/setup/fpga_setup_accl.hpp | 28 +++++++- shared/setup/fpga_setup_accl.cpp | 64 +++++++++++++++---- 7 files changed, 116 insertions(+), 60 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp index 8d1638d9..2ade570b 100644 --- a/b_eff/src/host/execution_types/execution_accl.hpp +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -45,11 +45,11 @@ namespace network::execution_types::accl { cl::vector &validationData) { int err; - std::vector> dummyBufferContents; - std::vector> recvBufferContents; - std::vector>> acclSendBuffers; - std::vector>> acclRecvBuffers; - cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + size_t size_in_bytes = std::max(static_cast(validationData.size()), static_cast(1 << messageSize)); int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); @@ -66,10 +66,10 @@ namespace network::execution_types::accl { int size_in_values = (size_in_bytes + 3) / 4; // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { - dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); - recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.accl->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); - acclRecvBuffers.push_back(config.accl->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + dummyBufferContents.emplace_back(size_in_values, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_values, static_cast(0)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_values, ACCL::dataType::float32)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_values, ACCL::dataType::float32)); acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } @@ -83,12 +83,12 @@ namespace network::execution_types::accl { std::cout << "Send " << size_in_values << " bytes to " << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; #endif - config.accl->send(*acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); + config.context->accl->send(*acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); #ifndef NDEBUG std::cout << "Recv " << size_in_values << " bytes from " << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; #endif - config.accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); + config.context->accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); #ifndef NDEBUG std::cout << "Done" << std::endl; #endif diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index 4b3ff2ee..eecb552e 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -86,8 +86,8 @@ namespace network::execution_types::accl_pl { for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.accl->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); - acclRecvBuffers.push_back(config.accl->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } @@ -103,11 +103,11 @@ namespace network::execution_types::accl_pl { auto startCalculation = std::chrono::high_resolution_clock::now(); if (!config.programSettings->useAcclEmulation) { auto run = 
sendrecvKernel(*(acclSendBuffers[i]->bo()), *(acclRecvBuffers[i]->bo()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, - config.accl->get_communicator_addr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); run.wait(); } else { send_recv(reinterpret_cast(acclSendBuffers[i]->buffer()), reinterpret_cast(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, - config.accl->get_communicator_addr(), config.accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32}), + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32}), cmd, sts); } auto endCalculation = std::chrono::high_resolution_clock::now(); diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 2eef9621..5265ac46 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -110,7 +110,7 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { for (auto& run : data.items) { if (world_rank == 0) { - std::cout << "Measure for " << (1 << run.messageSize) << " Byte" << std::endl; + std::cout << std::dec << "Measure for " << (1 << run.messageSize) << " Byte" << std::endl; } std::shared_ptr timing; switch (executionSettings->programSettings->communicationType) { @@ -211,7 +211,7 @@ network::NetworkBenchmark::collectAndPrintResults(const network::NetworkExecutio maxBandwidths.push_back(maxCalcBW); - std::cout << std::setw(ENTRY_SPACE) << (1 << msgSizeResults.first) << " " + std::cout << std::dec << std::setw(ENTRY_SPACE) << (1 << msgSizeResults.first) << " " << std::setw(ENTRY_SPACE) << looplength << " " << std::setw(ENTRY_SPACE) << totalMaxMinCalculationTime[i] << " " << std::setw(ENTRY_SPACE) << maxCalcBW diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 472ab15d..cb0c61ea 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -251,8 +251,11 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark #endif #ifdef USE_XRT_HOST +#ifdef USE_ACCL + public hpcc_base::HpccFpgaBenchmark +#else public hpcc_base::HpccFpgaBenchmark - +#endif #endif { protected: diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 7ff91bae..494d18c8 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -126,11 +126,19 @@ class BaseSettings { */ CommunicationType communicationType; +#ifdef USE_ACCL /** * @brief Use ACCL emulation constructor instead of hardware execution */ bool useAcclEmulation; + /** + * @brief Used ACCL network stack + * + */ + ACCL::networkProtocol acclProtocol; +#endif + /** * @brief Construct a new Base Settings object * @@ -153,8 +161,7 @@ class BaseSettings { #endif #ifdef USE_ACCL useAcclEmulation(static_cast(results.count("accl-emulation"))), -#else - useAcclEmulation(false), + acclProtocol(fpga_setup::acclProtocolStringToEnum(results["accl-protocol"].as())), #endif #ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())), @@ -219,14 +226,6 @@ class ExecutionSettings { */ 
std::unique_ptr program;
-#ifdef USE_ACCL
-    /**
-     * @brief Pointer to ACCL instance
-     *
-     */
-    std::unique_ptr accl;
-#endif
-
     /**
      * @brief Construct a new Execution Settings object
      *
@@ -237,16 +236,10 @@ class ExecutionSettings {
      */
     ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_,
                       std::unique_ptr context_, std::unique_ptr program_
-#ifdef USE_ACCL
-                      , std::unique_ptr accl_
-#endif
                       ):
                       programSettings(std::move(programSettings_)), device(std::move(device_)),
-                      context(std::move(context_)), program(std::move(program_))
-#ifdef USE_ACCL
-                      , accl(std::move(accl_))
-#endif
+                      context(std::move(context_)), program(std::move(program_))
                       {}
 
     /**
@@ -406,6 +399,8 @@ class HpccFpgaBenchmark {
 #endif
 #ifdef USE_ACCL
             ("accl-emulation", "Use the accl emulation instead of hardware execution")
+            ("accl-protocol", "Specify the network protocol that should be used with ACCL.",
+                cxxopts::value()->default_value("UDP"))
 #endif
             ("skip-validation", "Skip the validation of the output data. This will speed up execution and helps when working with special data types.")
             ("device", "Index of the device that has to be used. If not given you "\
@@ -510,13 +505,13 @@ class HpccFpgaBenchmark {
         std::unique_ptr context;
         std::unique_ptr program;
         std::unique_ptr usedDevice;
-#ifdef USE_ACCL
-        std::unique_ptr accl;
-#endif
+
         if (!programSettings->testOnly) {
 #ifdef USE_XRT_HOST
             usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultDevice);
+#ifndef USE_ACCL
             context = std::unique_ptr(new bool(false));
+#endif
             if (!programSettings->useAcclEmulation) {
                 program = fpga_setup::fpgaSetup(*usedDevice, programSettings->kernelFileName);
             }
@@ -530,19 +525,17 @@ class HpccFpgaBenchmark {
 #endif
 #ifdef USE_ACCL
             if (programSettings->communicationType == CommunicationType::accl) {
-                accl = fpga_setup::fpgaSetupACCL(*usedDevice, *program, programSettings->useAcclEmulation);
+                context = std::unique_ptr(new fpga_setup::ACCLContext(fpga_setup::fpgaSetupACCL(*usedDevice, *program, programSettings->useAcclEmulation,
+                    programSettings->acclProtocol)));
             } else {
-                accl = std::unique_ptr(nullptr);
+                context = std::unique_ptr(new fpga_setup::ACCLContext());
             }
 #endif
         }
 
         executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice),
-                                                                        std::move(context), std::move(program)
-#ifdef USE_ACCL
-                                                                        , std::move(accl)
-#endif
+                                                                        std::move(context), std::move(program)
                                                                         ));
         if (mpi_comm_rank == 0) {
             if (!checkInputParameters()) {

diff --git a/shared/include/setup/fpga_setup_accl.hpp b/shared/include/setup/fpga_setup_accl.hpp
index dcf2a530..ff493ccc 100644
--- a/shared/include/setup/fpga_setup_accl.hpp
+++ b/shared/include/setup/fpga_setup_accl.hpp
@@ -36,6 +36,29 @@ SOFTWARE.
 
 namespace fpga_setup {
 
+
+struct ACCLContext {
+    std::unique_ptr accl;
+    std::unique_ptr tx_buf_network;
+    std::unique_ptr rx_buf_network;
+};
+
+
+static const std::map acclProtocolMap = {
+    {"UDP", ACCL::networkProtocol::UDP},
+    {"TCP", ACCL::networkProtocol::TCP}
+};
+
+static ACCL::networkProtocol acclProtocolStringToEnum(std::string string_representation) {
+    if (acclProtocolMap.count(string_representation)) {
+        return acclProtocolMap.at(string_representation);
+    }
+    else {
+        throw std::runtime_error("ACCL network protocol could not be parsed from string: " + string_representation);
+    }
+    return ACCL::networkProtocol::UDP;
+}
+
 /**
 Sets up the given FPGA with the kernel in the provided file.
@param useAcclEmulation Construct an ACCL emulation instance instead of hardware execution @return The ACCL instance used for communication */ -std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &program, - bool useAcclEmulation); +ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program, + bool useAcclEmulation, + ACCL::networkProtocol protocol); } // namespace fpga_setup #endif // SRC_HOST_FPGA_SETUP_H_ diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 4d3207af..ed84ea08 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -66,8 +66,23 @@ void configure_vnx(CMAC &cmac, Networklayer &network_layer, network_layer.arp_discovery(); } -std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &program, - bool useAcclEmulation) { +void configure_tcp(ACCL::BaseBuffer &tx_buf_network, ACCL::BaseBuffer &rx_buf_network, + xrt::kernel &network_krnl, std::vector &ranks, + int rank) { + std::cout << "Configure TCP Network Kernel" << std::endl; + tx_buf_network.sync_to_device(); + rx_buf_network.sync_to_device(); + + uint local_fpga_ip = ACCL::ip_encode(ranks[rank].ip); + std::cout << "rank: " << rank << " FPGA IP: " << std::hex << local_fpga_ip + << std::endl; + + network_krnl(local_fpga_ip, static_cast(rank), local_fpga_ip, + *(tx_buf_network.bo()), *(rx_buf_network.bo())); +} + +ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program, + bool useAcclEmulation, ACCL::networkProtocol protocol) { int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, ¤t_rank); @@ -80,29 +95,50 @@ std::unique_ptr fpgaSetupACCL(xrt::device &device, xrt::uuid &progra ACCL::rank_t new_rank = {"10.10.10." + std::to_string(i), 5500 + i, i, ACCL_BUFFER_SIZE}; ranks.emplace_back(new_rank); } + + ACCLContext accl; + if (!useAcclEmulation) { std::cout << "Create cclo ip" << std::endl; auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}"); std::cout << "Create hostctrl" << std::endl; auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", xrt::kernel::cu_access_mode::exclusive); - std::cout << "Create CMAC" << std::endl; - auto cmac = CMAC(xrt::ip(device, program, "cmac_0:{cmac_0}")); - std::cout << "Create Network Layer" << std::endl; - auto network_layer = Networklayer( - xrt::ip(device, program, "networklayer:{networklayer_0}")); - std::cout << "Configure VNX" << std::endl; - configure_vnx(cmac, network_layer, ranks, current_rank); - + if (protocol == ACCL::networkProtocol::UDP) { + std::cout << "Create CMAC" << std::endl; + auto cmac = CMAC(xrt::ip(device, program, "cmac_0:{cmac_0}")); + std::cout << "Create Network Layer" << std::endl; + auto network_layer = Networklayer( + xrt::ip(device, program, "networklayer:{networklayer_0}")); + std::cout << "Configure VNX" << std::endl; + configure_vnx(cmac, network_layer, ranks, current_rank); + } + if (protocol == ACCL::networkProtocol::TCP) { + auto network_krnl = xrt::kernel(device, program, "network_krnl:{network_krnl_0}", + xrt::kernel::cu_access_mode::exclusive); + accl.tx_buf_network = std::unique_ptr(new ACCL::FPGABuffer( + 64 * 1024 * 1024, ACCL::dataType::int8, device, network_krnl.group_id(3))); + accl.rx_buf_network = std::unique_ptr(new ACCL::FPGABuffer( + 64 * 1024 * 1024, ACCL::dataType::int8, device, network_krnl.group_id(4))); + configure_tcp(*accl.tx_buf_network, *accl.rx_buf_network, network_krnl, ranks, current_rank); + } std::vector mem(1, 0); std::cout << "Create ACCL" << std::endl; - 
return std::unique_ptr( - new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, ACCL::networkProtocol::UDP)); + accl.accl = std::unique_ptr( + new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, protocol, 16, ACCL_BUFFER_SIZE)); } else { // TODO: Add start port here. Currenty hardcoded! - return std::unique_ptr( - new ACCL::ACCL(ranks, current_rank, 6000, device, ACCL::networkProtocol::UDP, 16, ACCL_BUFFER_SIZE)); + accl.accl = std::unique_ptr( + new ACCL::ACCL(ranks, current_rank, 6000, device, protocol, 16, ACCL_BUFFER_SIZE)); + } + + if (protocol == ACCL::networkProtocol::TCP) { + MPI_Barrier(MPI_COMM_WORLD); + accl.accl->open_port(); + MPI_Barrier(MPI_COMM_WORLD); + accl.accl->open_con(); } + return accl; } } // namespace fpga_setup From a2e7b37871f2ff5b86691a39ee9c62bb51df1728 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 29 Nov 2022 10:47:53 +0100 Subject: [PATCH 172/318] Fix ACCL configuration bug --- cmake/general_benchmark_build_setup.cmake | 7 +++---- extern/CMakeLists.txt | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 1537b092..cc59db91 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -3,9 +3,6 @@ INCLUDE (CheckTypeSize) set (CMAKE_CXX_STANDARD 14) -# Download build dependencies -add_subdirectory(${CMAKE_SOURCE_DIR}/../extern ${CMAKE_BINARY_DIR}/extern) - if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) enable_testing() endif() @@ -45,12 +42,14 @@ if (NOT KERNEL_REPLICATION_ENABLED) unset(NUM_REPLICATIONS) endif() - if (HPCC_FPGA_CONFIG) message(STATUS "HPCC FPGA configuration defined. Overwrite default values with configuration: ${HPCC_FPGA_CONFIG}") include(${HPCC_FPGA_CONFIG}) endif() +# Download build dependencies +add_subdirectory(${CMAKE_SOURCE_DIR}/../extern ${CMAKE_BINARY_DIR}/extern) + # Set the used data type if (NOT DATA_TYPE) set(DATA_TYPE float CACHE STRING "Data type used for calculation") diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 341f73cd..3bbf1a84 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -55,7 +55,7 @@ if(NOT extern_cxxopts_POPULATED) EXCLUDE_FROM_ALL) endif() -if (DEFINED USE_ACCL) +if (USE_ACCL) # ------------------------------------------------------------------------------- # ACCL Library FetchContent_Declare( From 57febee258edeb89a45e4a18b319a5ff0ee6f466 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 29 Nov 2022 11:34:15 +0100 Subject: [PATCH 173/318] Include ACCL earlier in the config process --- cmake/general_benchmark_build_setup.cmake | 5 +++++ cmake/kernelTargets.cmake | 4 ---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index cc59db91..82ba4ac7 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -50,6 +50,11 @@ endif() # Download build dependencies add_subdirectory(${CMAKE_SOURCE_DIR}/../extern ${CMAKE_BINARY_DIR}/extern) +# Enable ACCL if required +if (USE_ACCL) + include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake) +endif() + # Set the used data type if (NOT DATA_TYPE) set(DATA_TYPE float CACHE STRING "Data type used for calculation") diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 86aeeb1c..4b8adee3 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -9,10 +9,6 @@ else() 
set(VPP_FLAGS "-O3")
 endif()
 
-if (USE_ACCL)
-    include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake)
-endif()
-
 set(file_endings "cl" "cpp" )
 
 ##

From 48976ddfa814557926ad40f8c3b739cb4d2d11c0 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 29 Nov 2022 11:35:16 +0100
Subject: [PATCH 174/318] Build PTRANS with ACCL context

---
 .../execution_types/execution_xrt_accl_pq.hpp | 14 +++++++-------
 .../execution_xrt_accl_stream_pq.hpp          |  8 ++++----
 .../execution_xrt_accl_stream_pq_sendrecv.hpp | 16 ++++++++--------
 .../execution_types/execution_xrt_pcie_pq.hpp |  7 ++++---
 PTRANS/src/host/main.cpp                      |  4 ++++
 PTRANS/src/host/transpose_data.hpp            |  7 +++++--
 6 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
index 8e6c0f5b..3a2111f3 100644
--- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp
@@ -40,8 +40,8 @@ namespace accl_pq {
 void accl_exchangeData(
     ACCL::ACCL &accl,
     transpose::data_handler::DistributedPQTransposeDataHandler<
-        xrt::device, bool, xrt::uuid> &handler,
-    transpose::TransposeData &data, std::vector &bufferAXrt,
+        xrt::device, fpga_setup::ACCLContext, xrt::uuid> &handler,
+    transpose::TransposeData &data, std::vector &bufferAXrt,
     int global_width) {
 
   int pq_width = handler.getP();
@@ -368,10 +368,10 @@ void accl_exchangeData(
  */
 static std::unique_ptr calculate(
     const hpcc_base::ExecutionSettings &config,
-    transpose::TransposeData &data,
+        xrt::device, fpga_setup::ACCLContext, xrt::uuid> &config,
+    transpose::TransposeData &data,
     transpose::data_handler::DistributedPQTransposeDataHandler<
-        xrt::device, bool, xrt::uuid> &handler) {
+        xrt::device, fpga_setup::ACCLContext, xrt::uuid> &handler) {
   int err;
 
   if (config.programSettings->dataHandlerIdentifier !=
@@ -494,7 +494,7 @@ static std::unique_ptr calculate(
 #ifndef NDEBUG
   std::cout << "Start data exchange with ACCL" << std::endl;
 #endif
-  accl_exchangeData(*config.accl, handler, data, bufferListA,
+  accl_exchangeData(*(config.context->accl), handler, data, bufferListA,
                     config.programSettings->matrixSize / data.blockSize);
 #ifndef NDEBUG
   std::cout << "End data exchange with ACCL" << std::endl;
 #endif
@@ -578,7 +578,7 @@ static std::unique_ptr calculate(
     }
     endTransfer = std::chrono::high_resolution_clock::now();
 
-    accl_exchangeData(*config.accl, handler, data, bufferListA,
+    accl_exchangeData(*(config.context->accl), handler, data, bufferListA,
                       config.programSettings->matrixSize / data.blockSize);
 
     transferTime +=
        std::chrono::duration_cast>(

diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp
index 50a07998..27e240e6 100644
--- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp
@@ -68,10 +68,10 @@
  */
 static std::unique_ptr calculate(
     const hpcc_base::ExecutionSettings &config,
-    transpose::TransposeData &data,
+        xrt::device, fpga_setup::ACCLContext, xrt::uuid> &config,
+    transpose::TransposeData &data,
     transpose::data_handler::DistributedPQTransposeDataHandler<
-        xrt::device, bool, xrt::uuid> &handler) {
+        xrt::device, fpga_setup::ACCLContext, xrt::uuid> &handler) {
   int err;
 
   if (config.programSettings->dataHandlerIdentifier !=
@@ -269,7 +269,7 @@ static std::unique_ptr calculate(
       }
     }
     // Exchange A data via ACCL
-
config.accl->stream_put(ACCL::dataType::float32, data.blockSize * data.blockSize * data.numBlocks, + config.context->accl->stream_put(ACCL::dataType::float32, data.blockSize * data.blockSize * data.numBlocks, pair_rank, 0); #ifndef NDEBUG std::cout << "Wait for kernels to complete" << std::endl; diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp index c01bab4c..5282b5da 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp @@ -72,10 +72,10 @@ namespace accl_stream_sendrecv_pq { */ static std::unique_ptr calculate( const hpcc_base::ExecutionSettings &config, - transpose::TransposeData &data, + xrt::device, fpga_setup::ACCLContext, xrt::uuid> &config, + transpose::TransposeData &data, transpose::data_handler::DistributedPQTransposeDataHandler< - xrt::device, bool, xrt::uuid> &handler) { + xrt::device, fpga_setup::ACCLContext, xrt::uuid> &handler) { int err; if (config.programSettings->dataHandlerIdentifier != @@ -169,9 +169,9 @@ static std::unique_ptr calculate( // The vector list variable can be interpreted as 2D matrix. Every entry // represents the target rank of the sub-block Since the LCM block will // repeat, we only need to store this small amount of data! - auto target_list = config.accl->create_buffer(least_common_multiple / pq_height * + auto target_list = config.context->accl->create_buffer(least_common_multiple / pq_height * least_common_multiple / pq_width, ACCL::dataType::int32); - bufferListCopy.push_back(config.accl->create_buffer(buffer_size, ACCL::dataType::float32)); + bufferListCopy.push_back(config.context->accl->create_buffer(buffer_size, ACCL::dataType::float32)); for (int row = 0; row < least_common_multiple / pq_height; row++) { for (int col = 0; col < least_common_multiple / pq_width; col++) { int global_block_col = pq_col + col * pq_width; @@ -313,7 +313,7 @@ static std::unique_ptr calculate( #ifndef NDEBUG std::cout << "Start ACCL send/recv" << std::endl; #endif - auto dbuffer = config.accl->create_buffer(1,ACCL::dataType::float32); + auto dbuffer = config.context->accl->create_buffer(1,ACCL::dataType::float32); int g = transpose::data_handler::mod(pq_row - pq_col, gcd); int p = transpose::data_handler::mod(pq_col + g, pq_width); int q = transpose::data_handler::mod(pq_row - g, pq_height); @@ -361,7 +361,7 @@ static std::unique_ptr calculate( //TODO copy from and to string not implemented in driver yet // config.accl->copy_from_stream(*bufferListCopy[0], sending_size); } else { - config.accl->send(ACCL::dataType::float32, sending_size, send_rank, 0); + config.context->accl->send(ACCL::dataType::float32, sending_size, send_rank, 0); } } else { #ifndef NDEBUG @@ -371,7 +371,7 @@ static std::unique_ptr calculate( //TODO copy from and to string not implemented in driver yet // config.accl->copy_to_stream(*bufferListCopy[0], receiving_size); } else { - config.accl->recv(ACCL::dataType::float32, receiving_size, recv_rank, 0); + config.context->accl->recv(ACCL::dataType::float32, receiving_size, recv_rank, 0); } } } diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index f0d4eeed..0fa0f9c2 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -48,12 +48,13 @@ namespace pcie_pq { * @return 
std::unique_ptr The measured * execution times */ +template static std::unique_ptr calculate( const hpcc_base::ExecutionSettings &config, - transpose::TransposeData &data, + xrt::device, TContext, xrt::uuid> &config, + transpose::TransposeData &data, transpose::data_handler::DistributedPQTransposeDataHandler< - xrt::device, bool, xrt::uuid> &handler) { + xrt::device, TContext, xrt::uuid> &handler) { int err; if (config.programSettings->dataHandlerIdentifier != diff --git a/PTRANS/src/host/main.cpp b/PTRANS/src/host/main.cpp index d4db9803..126f6ff3 100644 --- a/PTRANS/src/host/main.cpp +++ b/PTRANS/src/host/main.cpp @@ -11,7 +11,11 @@ main(int argc, char *argv[]) { #ifdef USE_OCL_HOST TransposeBenchmark bm(argc, argv); #else +#ifndef USE_ACCL TransposeBenchmark bm(argc, argv); +#else + TransposeBenchmark bm(argc, argv); +#endif #endif bool success = bm.executeBenchmark(); if (success) { diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index cd9020e4..9949aede 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -159,8 +159,11 @@ class TransposeData { * @param block_size size of the quadratic blocks that are stored within this object * @param y_size number of blocks that are stored within this object per replication */ - TransposeData(TContext context, uint block_size, uint y_size): context(context), - numBlocks(y_size), blockSize(block_size) { + TransposeData(TContext &context, uint block_size, uint y_size): +#ifdef USE_SVM + context(context), +#endif + numBlocks(y_size), blockSize(block_size) { if (numBlocks * blockSize > 0) { #ifdef USE_SVM A = reinterpret_cast( From ac6b911b5cabfe38835e0380bfa2c432347268f3 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 29 Nov 2022 12:45:25 +0100 Subject: [PATCH 175/318] Compile LINPACK with new ACCL context --- .../execution_accl_buffers.hpp | 26 +++++------ .../host/execution_types/execution_iec.hpp | 2 +- .../host/execution_types/execution_pcie.hpp | 2 +- .../execution_types/execution_xrt_pcie.hpp | 5 ++- LINPACK/src/host/linpack_benchmark.hpp | 14 +++--- LINPACK/src/host/linpack_data.cpp | 31 ------------- LINPACK/src/host/linpack_data.hpp | 43 +++++++++++++++---- LINPACK/src/host/main.cpp | 4 ++ 8 files changed, 64 insertions(+), 63 deletions(-) diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp index e7db12b3..4266f605 100644 --- a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -49,8 +49,8 @@ namespace accl_buffers { */ std::unique_ptr calculate( const hpcc_base::ExecutionSettings &config, - linpack::LinpackData &data) { + xrt::device, fpga_setup::ACCLContext, xrt::uuid> &config, + linpack::LinpackData &data) { cl_int err; @@ -67,7 +67,7 @@ std::unique_ptr calculate( // Get group of global communicator std::vector all_accl_ranks = - config.accl->get_comm_group(ACCL::GLOBAL_COMM); + config.context->accl->get_comm_group(ACCL::GLOBAL_COMM); std::vector row_ranks; std::vector col_ranks; @@ -86,9 +86,9 @@ std::unique_ptr calculate( } // Create communicators from sub-groups - ACCL::communicatorId row_comm = config.accl->create_communicator( + ACCL::communicatorId row_comm = config.context->accl->create_communicator( row_ranks, config.programSettings->torus_col); - ACCL::communicatorId col_comm = config.accl->create_communicator( + ACCL::communicatorId col_comm = config.context->accl->create_communicator( 
col_ranks, config.programSettings->torus_row); // Create global memory buffers @@ -120,7 +120,7 @@ std::unique_ptr calculate( (config.programSettings->blockSize) * (config.programSettings->blockSize), lu_tmp_kernel.group_id(1)); - auto Buffer_lu1 = config.accl->create_buffer( + auto Buffer_lu1 = config.context->accl->create_buffer( tmp_bos.back(), (config.programSettings->blockSize) * (config.programSettings->blockSize), ACCL::dataType::float32); @@ -129,7 +129,7 @@ std::unique_ptr calculate( (config.programSettings->blockSize) * (config.programSettings->blockSize), lu_tmp_kernel.group_id(2)); - auto Buffer_lu2 = config.accl->create_buffer( + auto Buffer_lu2 = config.context->accl->create_buffer( tmp_bos.back(), (config.programSettings->blockSize) * (config.programSettings->blockSize), ACCL::dataType::float32); @@ -151,7 +151,7 @@ std::unique_ptr calculate( (config.programSettings->blockSize), lu_tmp_kernel.group_id(1)); Buffer_top_list.back().push_back( - config.accl->create_buffer( + config.context->accl->create_buffer( tmp_bos.back(), (config.programSettings->blockSize) * (config.programSettings->blockSize), @@ -166,7 +166,7 @@ std::unique_ptr calculate( (config.programSettings->blockSize), lu_tmp_kernel.group_id(2)); Buffer_left_list.back().push_back( - config.accl->create_buffer( + config.context->accl->create_buffer( tmp_bos.back(), (config.programSettings->blockSize) * (config.programSettings->blockSize), @@ -291,12 +291,12 @@ std::unique_ptr calculate( // FPGAs // Broadcast LU block in column to update all left blocks - config.accl->bcast(*Buffer_lu2, + config.context->accl->bcast(*Buffer_lu2, config.programSettings->blockSize * config.programSettings->blockSize, local_block_row_remainder, col_comm, true, true); // Broadcast LU block in row to update all top blocks - config.accl->bcast(*Buffer_lu1, + config.context->accl->bcast(*Buffer_lu1, config.programSettings->blockSize * config.programSettings->blockSize, local_block_col_remainder, row_comm, true, true); @@ -352,7 +352,7 @@ std::unique_ptr calculate( lbi < std::max(static_cast(blocks_per_col - local_block_col), 0); lbi++) { - config.accl->bcast(*Buffer_left_list[block_row % 2][lbi], + config.context->accl->bcast(*Buffer_left_list[block_row % 2][lbi], config.programSettings->blockSize * config.programSettings->blockSize, local_block_col_remainder, row_comm, true, true); @@ -361,7 +361,7 @@ std::unique_ptr calculate( tbi < std::max(static_cast(blocks_per_row - local_block_row), 0); tbi++) { - config.accl->bcast(*Buffer_top_list[block_row % 2][tbi], + config.context->accl->bcast(*Buffer_top_list[block_row % 2][tbi], config.programSettings->blockSize * config.programSettings->blockSize, local_block_row_remainder, col_comm, true, true); diff --git a/LINPACK/src/host/execution_types/execution_iec.hpp b/LINPACK/src/host/execution_types/execution_iec.hpp index b07ed6a6..279db54a 100644 --- a/LINPACK/src/host/execution_types/execution_iec.hpp +++ b/LINPACK/src/host/execution_types/execution_iec.hpp @@ -46,7 +46,7 @@ namespace iec { */ std::unique_ptr calculate(const hpcc_base::ExecutionSettings&config, - linpack::LinpackData& data) { + linpack::LinpackData& data) { int err; diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp index 5ef4ad27..b484a822 100644 --- a/LINPACK/src/host/execution_types/execution_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_pcie.hpp @@ -52,7 +52,7 @@ namespace pcie { */ std::unique_ptr calculate(const 
hpcc_base::ExecutionSettings&config, - linpack::LinpackData& data) { + linpack::LinpackData& data) { cl_int err; diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp index f35df7b9..aa0484e1 100644 --- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -47,10 +47,11 @@ namespace xrt_pcie { @copydoc bm_execution::calculate() */ +template std::unique_ptr calculate( const hpcc_base::ExecutionSettings &config, - linpack::LinpackData &data) { + xrt::device, TContext, xrt::uuid> &config, + linpack::LinpackData &data) { cl_int err; diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index d1d3093c..48819296 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -48,7 +48,7 @@ namespace linpack { * */ template -class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark { +class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark, LinpackExecutionTimings> { protected: @@ -77,7 +77,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark& data) { uint global_matrix_size = this->executionSettings->programSettings->matrixSize; uint matrix_width = data.matrix_width; uint matrix_height = data.matrix_height; @@ -209,7 +209,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark The input and output data of the benchmark */ - std::unique_ptr + std::unique_ptr> generateInputData() override { int local_matrix_width = this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->torus_width; int local_matrix_height = this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->torus_height; @@ -219,7 +219,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark(new linpack::LinpackData(*this->executionSettings->context ,local_matrix_width, local_matrix_height)); + auto d = std::unique_ptr>(new linpack::LinpackData(*this->executionSettings->context ,local_matrix_width, local_matrix_height)); std::mt19937 gen(this->mpi_comm_rank); std::uniform_real_distribution<> dis(0.0, 1.0); d->norma = 0.0; @@ -305,7 +305,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark Measured runtimes of the kernel execution */ std::unique_ptr - executeKernel(LinpackData &data) override { + executeKernel(LinpackData &data) override { std::unique_ptr timings; switch (this->executionSettings->programSettings->communicationType) { #ifdef USE_OCL_HOST @@ -335,7 +335,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark &data) override { uint n= this->executionSettings->programSettings->matrixSize; uint matrix_width = data.matrix_width; uint matrix_height = data.matrix_height; @@ -577,7 +577,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark(argc, argv) { + LinpackBenchmark(int argc, char* argv[]) : hpcc_base::HpccFpgaBenchmark, linpack::LinpackExecutionTimings>(argc, argv) { this->setupBenchmark(argc, argv); } diff --git a/LINPACK/src/host/linpack_data.cpp b/LINPACK/src/host/linpack_data.cpp index 2c724796..f2c3cfa4 100644 --- a/LINPACK/src/host/linpack_data.cpp +++ b/LINPACK/src/host/linpack_data.cpp @@ -62,37 +62,6 @@ linpack::LinpackProgramSettings::getSettingsMap() { return map; } -linpack::LinpackData::LinpackData(cl::Context context, size_t width, size_t height) : norma(0.0), context(context), - matrix_width(width), matrix_height(height) { -#ifdef USE_SVM - A = 
reinterpret_cast<HOST_DATA_TYPE*>(
-                        clSVMAlloc(context(), 0 ,
-                        size * size * sizeof(HOST_DATA_TYPE), 1024));
-    b = reinterpret_cast<HOST_DATA_TYPE*>(
-                        clSVMAlloc(context(), 0 ,
-                        size * sizeof(HOST_DATA_TYPE), 1024));
-    ipvt = reinterpret_cast<cl_int*>(
-                        clSVMAlloc(context(), 0 ,
-                        size * sizeof(cl_int), 1024));
-#else
-    posix_memalign(reinterpret_cast<void**>(&A), 4096, width * height * sizeof(HOST_DATA_TYPE));
-    posix_memalign(reinterpret_cast<void**>(&b), 4096, width * sizeof(HOST_DATA_TYPE));
-    posix_memalign(reinterpret_cast<void**>(&ipvt), 4096, height * sizeof(cl_int));
-#endif
-    }
-
-linpack::LinpackData::~LinpackData() {
-#ifdef USE_SVM
-    clSVMFree(context(), reinterpret_cast<void*>(A));
-    clSVMFree(context(), reinterpret_cast<void*>(b));
-    clSVMFree(context(), reinterpret_cast<void*>(ipvt));
-#else
-    free(A);
-    free(b);
-    free(ipvt);
-#endif
-}
-
 /**
 Standard LU factorization on a block with fixed size
diff --git a/LINPACK/src/host/linpack_data.hpp b/LINPACK/src/host/linpack_data.hpp
index 51324a5c..341ce0a2 100644
--- a/LINPACK/src/host/linpack_data.hpp
+++ b/LINPACK/src/host/linpack_data.hpp
@@ -117,6 +117,7 @@ class LinpackProgramSettings : public hpcc_base::BaseSettings {
  * @brief Data class containing the data the kernel is executed with
  *
  */
+template <class TContext>
 class LinpackData {
 
 public:
@@ -155,7 +156,7 @@ class LinpackData {
      * @brief The context that is used to allocate memory in SVM mode
      *
      */
-    cl::Context context;
+    TContext context;
 
     /**
      * @brief The maximum value of A that will be used for the error calculation
@@ -176,13 +177,39 @@ class LinpackData {
      * @param width width of the local matrix in values
      * @param height height of the local matrix in values
      */
-    LinpackData(cl::Context context, size_t width, size_t height);
-
-    /**
-     * @brief Destroy the Linpack Data object. Free the allocated memory
-     *
-     */
-    ~LinpackData();
+    LinpackData(TContext &context, size_t width, size_t height) : norma(0.0),
+#ifdef USE_SVM
+    context(context),
+#endif
+    matrix_width(width), matrix_height(height) {
+#ifdef USE_SVM
+        A = reinterpret_cast<HOST_DATA_TYPE*>(
+                            clSVMAlloc(context(), 0 ,
+                            width * height * sizeof(HOST_DATA_TYPE), 1024));
+        b = reinterpret_cast<HOST_DATA_TYPE*>(
+                            clSVMAlloc(context(), 0 ,
+                            width * sizeof(HOST_DATA_TYPE), 1024));
+        ipvt = reinterpret_cast<cl_int*>(
+                            clSVMAlloc(context(), 0 ,
+                            height * sizeof(cl_int), 1024));
+#else
+        posix_memalign(reinterpret_cast<void**>(&A), 4096, width * height * sizeof(HOST_DATA_TYPE));
+        posix_memalign(reinterpret_cast<void**>(&b), 4096, width * sizeof(HOST_DATA_TYPE));
+        posix_memalign(reinterpret_cast<void**>(&ipvt), 4096, height * sizeof(cl_int));
+#endif
+    }
+
+    ~LinpackData() {
+#ifdef USE_SVM
+        clSVMFree(context(), reinterpret_cast<void*>(A));
+        clSVMFree(context(), reinterpret_cast<void*>(b));
+        clSVMFree(context(), reinterpret_cast<void*>(ipvt));
+#else
+        free(A);
+        free(b);
+        free(ipvt);
+#endif
+    }
 
 };
diff --git a/LINPACK/src/host/main.cpp b/LINPACK/src/host/main.cpp
index cfd89914..51c4d292 100644
--- a/LINPACK/src/host/main.cpp
+++ b/LINPACK/src/host/main.cpp
@@ -16,7 +16,11 @@ main(int argc, char *argv[]) {
     LinpackBenchmark<cl::Device, cl::Context, cl::Program> bm(argc, argv);
 #endif
 #ifdef USE_XRT_HOST
+#ifndef USE_ACCL
     LinpackBenchmark<xrt::device, bool, xrt::uuid> bm(argc, argv);
+#else
+    LinpackBenchmark<xrt::device, fpga_setup::ACCLContext, xrt::uuid> bm(argc, argv);
+#endif
 #endif
     bool success = bm.executeBenchmark();
     if (success) {
From 0a5981efea2fc2e5cb0cdf1181a0dd60b9b7ecc9 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 29 Nov 2022 15:39:47 +0100
Subject: [PATCH 176/318] Split up parameter.h generation to reduce redundancy

---
 LINPACK/src/common/parameters.h.in        | 15 ++-------------
 PTRANS/src/common/parameters.h.in         | 18 +-----------------
 b_eff/src/common/parameters.h.in          | 18 ++----------------
cmake/general_benchmark_build_setup.cmake | 4 ++++ shared/include/base_parameters.h.in | 22 ++++++++++++++++++++++ 5 files changed, 31 insertions(+), 46 deletions(-) create mode 100644 shared/include/base_parameters.h.in diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in index a5bac5e0..5c7b0331 100644 --- a/LINPACK/src/common/parameters.h.in +++ b/LINPACK/src/common/parameters.h.in @@ -1,14 +1,11 @@ #ifndef SRC_COMMON_PARAMETERS_H_ #define SRC_COMMON_PARAMETERS_H_ +#include "base_parameters.h" + /** * Host specific parameters */ -#define VERSION "@PROJECT_VERSION@" -#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ -#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ -#define DEFAULT_DEVICE @DEFAULT_DEVICE@ -#define HOST_DATA_TYPE @HOST_DATA_TYPE@ #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@ #define DEFAULT_P_VALUE @DEFAULT_P_VALUE@ #cmakedefine _DP @@ -22,15 +19,12 @@ /** * Device specific parameters */ -#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ #define LOCAL_MEM_BLOCK_LOG @LOCAL_MEM_BLOCK_LOG@ #define REGISTER_BLOCK_LOG @REGISTER_BLOCK_LOG@ #define REGISTER_BLOCK_MM_LOG @REGISTER_BLOCK_MM_LOG@ -#define NUM_REPLICATIONS @NUM_REPLICATIONS@ #cmakedefine USE_SVM #cmakedefine DISTRIBUTED_VALIDATION -#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@ /* Short description of the program @@ -39,11 +33,6 @@ Short description of the program " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" -/** -Output separator -*/ -#define HLINE "-------------------------------------------------------------\n" - #define LEFT_BLOCK (1 << 1) #define TOP_BLOCK (1 << 2) #define LU_BLOCK_OUT (1 << 3) diff --git a/PTRANS/src/common/parameters.h.in b/PTRANS/src/common/parameters.h.in index e42792ff..9575d69e 100644 --- a/PTRANS/src/common/parameters.h.in +++ b/PTRANS/src/common/parameters.h.in @@ -1,32 +1,21 @@ #ifndef SRC_COMMON_PARAMETERS_H_ #define SRC_COMMON_PARAMETERS_H_ -#define VERSION "@PROJECT_VERSION@" +#include "base_parameters.h" #define READ_KERNEL_NAME "@READ_KERNEL_NAME@" #define WRITE_KERNEL_NAME "@WRITE_KERNEL_NAME@" -#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@ #define DEFAULT_COMM_TYPE "@DEFAULT_COMM_TYPE@" #define DEFAULT_DIST_TYPE "@DEFAULT_DIST_TYPE@" -#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ -#define DEFAULT_DEVICE @DEFAULT_DEVICE@ #define DEFAULT_P_VALUE @DEFAULT_P_VALUE@ -#define NUM_REPLICATIONS @NUM_REPLICATIONS@ -#cmakedefine HOST_EMULATION_REORDER - -#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@ - /** * Kernel Parameters */ #define BLOCK_SIZE @BLOCK_SIZE@ #define CHANNEL_WIDTH @CHANNEL_WIDTH@ -#define HOST_DATA_TYPE @HOST_DATA_TYPE@ -#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ - #cmakedefine USE_SVM #cmakedefine USE_BUFFER_WRITE_RECT_FOR_A #cmakedefine XILINX_UNROLL_INNER_LOOPS @@ -39,9 +28,4 @@ Moreover the version and build time is also compiled into the description. 
" proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" -/** -Output separator -*/ -#define HLINE "-------------------------------------------------------------\n" - #endif // SRC_COMMON_PARAMETERS_H_ \ No newline at end of file diff --git a/b_eff/src/common/parameters.h.in b/b_eff/src/common/parameters.h.in index 5c823610..3dc3e8a0 100644 --- a/b_eff/src/common/parameters.h.in +++ b/b_eff/src/common/parameters.h.in @@ -1,12 +1,10 @@ #ifndef SRC_COMMON_PARAMETERS_H_ #define SRC_COMMON_PARAMETERS_H_ -#define VERSION "@PROJECT_VERSION@" +#include "base_parameters.h" + #define SEND_KERNEL_NAME "@SEND_KERNEL_NAME@" #define RECV_KERNEL_NAME "@RECV_KERNEL_NAME@" -#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ -#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ -#define DEFAULT_DEVICE @DEFAULT_DEVICE@ #define DEFAULT_MAX_MESSAGE_SIZE @DEFAULT_MAX_MESSAGE_SIZE@ #define DEFAULT_MAX_LOOP_LENGTH @DEFAULT_MAX_LOOP_LENGTH@ #define DEFAULT_MIN_LOOP_LENGTH @DEFAULT_MIN_LOOP_LENGTH@ @@ -17,13 +15,6 @@ * Kernel Parameters */ #define CHANNEL_WIDTH @CHANNEL_WIDTH@ -#define NUM_REPLICATIONS @NUM_REPLICATIONS@ - -#define HOST_DATA_TYPE @HOST_DATA_TYPE@ -#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ - -#cmakedefine HOST_EMULATION_REORDER -#cmakedefine ACCL_BUFFER_SIZE @ACCL_BUFFER_SIZE@ /* Short description of the program. @@ -34,9 +25,4 @@ Moreover the version and build time is also compiled into the description. " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" -/** -Output separator -*/ -#define HLINE "-------------------------------------------------------------\n" - #endif // SRC_COMMON_PARAMETERS_H_ \ No newline at end of file diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 82ba4ac7..427aaab4 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -164,6 +164,10 @@ list(APPEND CMAKE_EXTRA_INCLUDE_FILES "CL/opencl.h") check_type_size("${HOST_DATA_TYPE}" DATA_TYPE_SIZE) # Configure the header file with definitions used by the host code +configure_file( + "${CMAKE_SOURCE_DIR}/../shared/include/base_parameters.h.in" + "${CMAKE_BINARY_DIR}/src/common/base_parameters.h" +) configure_file( "${CMAKE_SOURCE_DIR}/src/common/parameters.h.in" "${CMAKE_BINARY_DIR}/src/common/parameters.h" diff --git a/shared/include/base_parameters.h.in b/shared/include/base_parameters.h.in new file mode 100644 index 00000000..45a1100b --- /dev/null +++ b/shared/include/base_parameters.h.in @@ -0,0 +1,22 @@ +#ifndef BASE_PARAMETERS_H +#define BASE_PARAMETERS_H + +#define VERSION "@PROJECT_VERSION@" +#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ +#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ +#define DEFAULT_DEVICE @DEFAULT_DEVICE@ +#cmakedefine NUM_REPLICATIONS @NUM_REPLICATIONS@ +#define HOST_DATA_TYPE @HOST_DATA_TYPE@ +#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ + +#cmakedefine HOST_EMULATION_REORDER +#cmakedefine DEFAULT_ACCL_BUFFER_SIZE @DEFAULT_ACCL_BUFFER_SIZE@ +#cmakedefine DEFAULT_ACCL_BUFFER_COUNT @DEFAULT_ACCL_BUFFER_COUNT@ +#cmakedefine ACCL_STACK_TYPE "@ACCL_STACK_TYPE@" + +/** +Output separator +*/ +#define HLINE "-------------------------------------------------------------\n" + +#endif \ No newline at end of file From c46559f66bfb4dbd02d000caf2d6e5a21418519a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 29 Nov 2022 15:40:55 +0100 Subject: [PATCH 177/318] Explicitly load ACCL network stacks as dep --- cmake/accl.cmake | 7 ++++--- extern/CMakeLists.txt | 31 
+++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index 7c3d1f08..fd29f4ee 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -3,12 +3,13 @@ set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL") set(ACCL_UDP_ETH_IFS 1 CACHE STRING "Number of Ethernet interfaces to synthesize for UDP stack") set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform") -set(ACCL_BUFFER_SIZE 8192 CACHE STRING "Size of ACCL buffers in bytes") +set(DEFAULT_ACCL_BUFFER_SIZE 8192 CACHE STRING "Size of ACCL buffers in KB") +set(DEFAULT_ACCL_BUFFER_COUNT 16 CACHE STRING "Number of ACCL buffers") set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware) set(ACCL_CCLO_ADDITIONAL_BUILD_ARGS "" CACHE STRING "Add additional build arguments that will be passed to the CCLO makefile") set(ACCL_CCLO_BUILD_ARGS ${ACCL_CCLO_ADDITIONAL_BUILD_ARGS}) # UDP related definitions -set(ACCL_VNX_DIR ${ACCL_HARDWARE_DIR}/xup_vitis_network_example/) +set(ACCL_VNX_DIR ${extern_accl_udp_SOURCE_DIR}) set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core) set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo) set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HBM) @@ -42,7 +43,7 @@ add_custom_target( DEPENDS ${ACCL_UDP_MAC_XOS} ${ACCL_UDP_NET_XO}) # TCP related definitions -set(ACCL_TCP_BASE_DIR ${ACCL_HARDWARE_DIR}/Vitis_with_100Gbps_TCP-IP) +set(ACCL_TCP_BASE_DIR ${extern_accl_tcp_SOURCE_DIR}) set(ACCL_TCP_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/network_krnl.xo) set(ACCL_TCP_CMAC_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/cmac_krnl.xo) if (ACCL_STACK_TYPE STREQUAL "TCP") diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 3bbf1a84..eec6a24d 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -56,6 +56,7 @@ if(NOT extern_cxxopts_POPULATED) endif() if (USE_ACCL) +message(STATUS "ACCL was selected. 
Fetch ACCL dependencies") # ------------------------------------------------------------------------------- # ACCL Library FetchContent_Declare( @@ -70,4 +71,34 @@ if(NOT extern_accl_POPULATED) FetchContent_Populate(extern_accl) set(extern_accl_SOURCE_DIR ${extern_accl_SOURCE_DIR} PARENT_SCOPE) endif() + +# ------------------------------------------------------------------------------- +# UDP Library +FetchContent_Declare( + extern_accl_udp + + GIT_REPOSITORY https://github.com/Xilinx/xup_vitis_network_example.git + GIT_TAG master) + +FetchContent_GetProperties(extern_accl_udp) +if(NOT extern_accl_udp_POPULATED) + message(STATUS "Fetching mandatory build dependency ACCL UDP stack") + FetchContent_Populate(extern_accl_udp) + set(extern_accl_udp_SOURCE_DIR ${extern_accl_udp_SOURCE_DIR} PARENT_SCOPE) +endif() + +# ------------------------------------------------------------------------------- +# TCP Library +FetchContent_Declare( + extern_accl_tcp + + GIT_REPOSITORY https://github.com/fpgasystems/Vitis_with_100Gbps_TCP-IP.git + GIT_TAG vitis_2022_1) + +FetchContent_GetProperties(extern_accl_tcp) +if(NOT extern_accl_tcp_POPULATED) + message(STATUS "Fetching mandatory build dependency ACCL TCP stack") + FetchContent_Populate(extern_accl_tcp) + set(extern_accl_tcp_SOURCE_DIR ${extern_accl_tcp_SOURCE_DIR} PARENT_SCOPE) +endif() endif() From c7172d76a5c5f92ca4f591d3a29b5d622c599851 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 29 Nov 2022 15:42:32 +0100 Subject: [PATCH 178/318] Refactor hpcc base code --- shared/CMakeLists.txt | 2 +- shared/hpcc_settings.cpp | 59 +++++++ shared/include/hpcc_benchmark.hpp | 212 ++--------------------- shared/include/hpcc_settings.hpp | 200 +++++++++++++++++++++ shared/include/setup/fpga_setup_accl.hpp | 6 +- shared/setup/fpga_setup_accl.cpp | 16 +- shared/setup/fpga_setup_xrt.cpp | 2 +- 7 files changed, 282 insertions(+), 215 deletions(-) create mode 100644 shared/hpcc_settings.cpp create mode 100644 shared/include/hpcc_settings.hpp diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 64260c94..70d8184d 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -12,7 +12,7 @@ endif() if (USE_XRT_HOST) list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp) endif() -list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) +list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp ${CMAKE_CURRENT_SOURCE_DIR}/hpcc_settings.cpp) add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES}) if (USE_ACCL) add_subdirectory(${extern_accl_SOURCE_DIR}/test/hardware/xup_vitis_network_example/xrt_host_api diff --git a/shared/hpcc_settings.cpp b/shared/hpcc_settings.cpp new file mode 100644 index 00000000..3751d10f --- /dev/null +++ b/shared/hpcc_settings.cpp @@ -0,0 +1,59 @@ +#include "hpcc_settings.hpp" + +#ifdef USE_ACCL +#include "setup/fpga_setup_accl.hpp" +#endif + + /** + * @brief Construct a new Base Settings object + * + * @param results The resulting map from parsing the program input parameters + */ +hpcc_base::BaseSettings::BaseSettings(cxxopts::ParseResult &results) : numRepetitions(results["n"].as()), +#ifdef INTEL_FPGA + useMemoryInterleaving(static_cast(results.count("i"))), +#else + useMemoryInterleaving(true), +#endif + skipValidation(static_cast(results.count("skip-validation"))), + defaultPlatform(results["platform"].as()), + defaultDevice(results["device"].as()), + kernelFileName(results["f"].as()), +#ifdef NUM_REPLICATIONS + kernelReplications(results.count("r") > 
0 ? results["r"].as() : NUM_REPLICATIONS), +#else + kernelReplications(results.count("r") > 0 ? results["r"].as() : 1), +#endif +#ifdef USE_ACCL + useAcclEmulation(static_cast(results.count("accl-emulation"))), + acclProtocol(fpga_setup::acclProtocolStringToEnum(results["accl-protocol"].as())), + acclBufferSize(results["accl-buffer-size"].as() * 1024), + acclBufferCount(results["accl-buffer-count"].as()), +#endif +#ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED + communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())), +#else + communicationType(retrieveCommunicationType("UNSUPPORTED", results["f"].as())), +#endif + testOnly(static_cast(results.count("test"))) {} + +/** + * @brief Get a map of the settings. This map will be used to print the final configuration. + * Derived classes should override it to add additional configuration options + * + * @return std::map + */ +std::map +hpcc_base::BaseSettings::getSettingsMap() { + int mpi_size = 0; +#ifdef _USE_MPI_ + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); +#endif + std::string str_mpi_ranks = "None"; + if (mpi_size > 0) { + str_mpi_ranks = std::to_string(mpi_size); + } + return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)}, + {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? "Yes" : "No"}, + {"Communication Type", commToString(communicationType)}}; +} \ No newline at end of file diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 494d18c8..6fad3147 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -45,6 +45,7 @@ SOFTWARE. #include "cxxopts.hpp" #include "parameters.h" #include "communication_types.hpp" +#include "hpcc_settings.hpp" #define STR_EXPAND(tok) #tok #define STR(tok) STR_EXPAND(tok) @@ -58,204 +59,6 @@ SOFTWARE. */ namespace hpcc_base { -/** - * @brief This class should be derived and extended for every benchmark. - * It is a pure data object containing the benchmark settings that are - * used to execute the benchmark kernel. - * - */ -class BaseSettings { - -public: - - /** - * @brief Number of times the kernel execution will be repeated - * - */ - uint numRepetitions; - - /** - * @brief Boolean showing if memory interleaving is used that is - * triggered from the host side (Intel specific) - * - */ - bool useMemoryInterleaving; - - /** - * @brief Boolean showing if the output data of the benchmark kernel - * should be validated or not - * - */ - bool skipValidation; - - /** - * @brief The default platform that should be used for execution. - * A number representing the index in the list of available platforms - * - */ - int defaultPlatform; - - /** - * @brief The default device that should be used for execution. - * A number representing the index in the list of available devices - * - */ - int defaultDevice; - - /** - * @brief Path to the kernel file that is used for execution - * - */ - std::string kernelFileName; - - /** - * @brief Number of times the kernel is replicated - * - */ - uint kernelReplications; - - /** - * @brief Only test the given configuration. 
Do not execute the benchmarks - * - */ - bool testOnly; - - /** - * @brief Type of inter-FPGA communication used - * - */ - CommunicationType communicationType; - -#ifdef USE_ACCL - /** - * @brief Use ACCL emulation constructor instead of hardware execution - */ - bool useAcclEmulation; - - /** - * @brief Used ACCL network stack - * - */ - ACCL::networkProtocol acclProtocol; -#endif - - /** - * @brief Construct a new Base Settings object - * - * @param results The resulting map from parsing the program input parameters - */ - BaseSettings(cxxopts::ParseResult &results) : numRepetitions(results["n"].as()), -#ifdef INTEL_FPGA - useMemoryInterleaving(static_cast(results.count("i"))), -#else - useMemoryInterleaving(true), -#endif - skipValidation(static_cast(results.count("skip-validation"))), - defaultPlatform(results["platform"].as()), - defaultDevice(results["device"].as()), - kernelFileName(results["f"].as()), -#ifdef NUM_REPLICATIONS - kernelReplications(results.count("r") > 0 ? results["r"].as() : NUM_REPLICATIONS), -#else - kernelReplications(results.count("r") > 0 ? results["r"].as() : 1), -#endif -#ifdef USE_ACCL - useAcclEmulation(static_cast(results.count("accl-emulation"))), - acclProtocol(fpga_setup::acclProtocolStringToEnum(results["accl-protocol"].as())), -#endif -#ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED - communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())), -#else - communicationType(retrieveCommunicationType("UNSUPPORTED", results["f"].as())), -#endif - testOnly(static_cast(results.count("test"))) {} - - /** - * @brief Get a map of the settings. This map will be used to print the final configuration. - * Derived classes should override it to add additional configuration options - * - * @return std::map - */ - virtual std::map getSettingsMap() { - int mpi_size = 0; -#ifdef _USE_MPI_ - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); -#endif - std::string str_mpi_ranks = "None"; - if (mpi_size > 0) { - str_mpi_ranks = std::to_string(mpi_size); - } - return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)}, - {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? 
"Yes" : "No"}, - {"Communication Type", commToString(communicationType)}}; - } - -}; - -/** - * @brief Settings class that is containing the program settings together with - * additional information about the OpenCL runtime - * - * @tparam TSettings The program settings class that should be used (Must derive from BaseSettings) - */ -template -class ExecutionSettings { -public: - - /** - * @brief Pointer to the additional program settings - * - */ - std::unique_ptr programSettings; - - /** - * @brief The OpenCL device that should be used for execution - * - */ - std::unique_ptr device; - - /** - * @brief The OpenCL context that should be used for execution - * - */ - std::unique_ptr context; - - /** - * @brief The OpenCL program that contains the benchmark kernel - * - */ - std::unique_ptr program; - - /** - * @brief Construct a new Execution Settings object - * - * @param programSettings_ Pointer to an existing program settings object that is derived from BaseSettings - * @param device_ Used OpenCL device - * @param context_ Used OpenCL context - * @param program_ Used OpenCL program - */ - ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_, - std::unique_ptr context_, std::unique_ptr program_ - - ): - programSettings(std::move(programSettings_)), device(std::move(device_)), - context(std::move(context_)), program(std::move(program_)) - {} - - /** - * @brief Destroy the Execution Settings object. Used to specify the order the contained objects are destroyed - * to prevent segmentation faults during exit. - * - */ - ~ExecutionSettings() { - program = nullptr; - context = nullptr; - device = nullptr; - programSettings = nullptr; - } - -}; - /** * @brief Base benchmark class. Every benchmark should be derived from this class and implement its abstract methods. * @@ -263,7 +66,8 @@ class ExecutionSettings { * @tparam TData Class used to represent the benchmark input and output data * @tparam TOutput Class representing the measurements like timings etc */ -template +template ::value>::type> class HpccFpgaBenchmark { private: @@ -400,7 +204,11 @@ class HpccFpgaBenchmark { #ifdef USE_ACCL ("accl-emulation", "Use the accl emulation instead of hardware execution") ("accl-protocol", "Specify the network protocol that should be used with ACCL.", - cxxopts::value()->default_value("UDP")) + cxxopts::value()->default_value(ACCL_STACK_TYPE)) + ("accl-buffer-size", "Specify the size of the ACCL buffers in KB", + cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_SIZE))) + ("accl-buffer-count", "Specify the number of ACCL buffers used within the benchmark", + cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_COUNT))) #endif ("skip-validation", "Skip the validation of the output data. This will speed up execution and helps when working with special data types.") ("device", "Index of the device that has to be used. 
If not given you "\ @@ -525,8 +333,8 @@ class HpccFpgaBenchmark { #endif #ifdef USE_ACCL if (programSettings->communicationType == CommunicationType::accl) { - context = std::unique_ptr(new fpga_setup::ACCLContext(fpga_setup::fpgaSetupACCL(*usedDevice, *program, programSettings->useAcclEmulation, - programSettings->acclProtocol))); + context = std::unique_ptr(new fpga_setup::ACCLContext( + fpga_setup::fpgaSetupACCL(*usedDevice, *program, *programSettings))); } else { context = std::unique_ptr(new fpga_setup::ACCLContext()); diff --git a/shared/include/hpcc_settings.hpp b/shared/include/hpcc_settings.hpp new file mode 100644 index 00000000..defa2892 --- /dev/null +++ b/shared/include/hpcc_settings.hpp @@ -0,0 +1,200 @@ +#ifndef HPCC_BASE_SETTINGS_H_ +#define HPCC_BASE_SETTINGS_H_ + +#include "cxxopts.hpp" +#include "parameters.h" +#include "communication_types.hpp" + +#ifdef _USE_MPI_ +#include "mpi.h" +#endif + +#ifdef USE_ACCL +#include "accl.hpp" +#endif + +/** + * @brief Contains all classes and functions that are used as basis + * for all benchmarks. + * + */ +namespace hpcc_base { + +/** + * @brief This class should be derived and extended for every benchmark. + * It is a pure data object containing the benchmark settings that are + * used to execute the benchmark kernel. + * + */ +class BaseSettings { + +public: + + /** + * @brief Number of times the kernel execution will be repeated + * + */ + uint numRepetitions; + + /** + * @brief Boolean showing if memory interleaving is used that is + * triggered from the host side (Intel specific) + * + */ + bool useMemoryInterleaving; + + /** + * @brief Boolean showing if the output data of the benchmark kernel + * should be validated or not + * + */ + bool skipValidation; + + /** + * @brief The default platform that should be used for execution. + * A number representing the index in the list of available platforms + * + */ + int defaultPlatform; + + /** + * @brief The default device that should be used for execution. + * A number representing the index in the list of available devices + * + */ + int defaultDevice; + + /** + * @brief Path to the kernel file that is used for execution + * + */ + std::string kernelFileName; + + /** + * @brief Number of times the kernel is replicated + * + */ + uint kernelReplications; + + /** + * @brief Only test the given configuration. Do not execute the benchmarks + * + */ + bool testOnly; + + /** + * @brief Type of inter-FPGA communication used + * + */ + CommunicationType communicationType; + +#ifdef USE_ACCL + /** + * @brief Use ACCL emulation constructor instead of hardware execution + */ + bool useAcclEmulation; + + /** + * @brief Used ACCL network stack + * + */ + ACCL::networkProtocol acclProtocol; + + /** + * @brief Size of the ACCL buffers in bytes + * + */ + uint acclBufferSize; + + /** + * @brief Number of ACCL buffers to use + * + */ + uint acclBufferCount; +#endif + + /** + * @brief Construct a new Base Settings object + * + * @param results The resulting map from parsing the program input parameters + */ + BaseSettings(cxxopts::ParseResult &results); + + /** + * @brief Get a map of the settings. This map will be used to print the final configuration. 
+ * Derived classes should override it to add additional configuration options + * + * @return std::map + */ + virtual std::map getSettingsMap(); + +}; + +/** + * @brief Settings class that is containing the program settings together with + * additional information about the OpenCL runtime + * + * @tparam TSettings The program settings class that should be used (Must derive from BaseSettings) + */ +template ::value>::type> +class ExecutionSettings { +public: + + /** + * @brief Pointer to the additional program settings + * + */ + std::unique_ptr programSettings; + + /** + * @brief The OpenCL device that should be used for execution + * + */ + std::unique_ptr device; + + /** + * @brief The OpenCL context that should be used for execution + * + */ + std::unique_ptr context; + + /** + * @brief The OpenCL program that contains the benchmark kernel + * + */ + std::unique_ptr program; + + /** + * @brief Construct a new Execution Settings object + * + * @param programSettings_ Pointer to an existing program settings object that is derived from BaseSettings + * @param device_ Used OpenCL device + * @param context_ Used OpenCL context + * @param program_ Used OpenCL program + */ + ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_, + std::unique_ptr context_, std::unique_ptr program_ + + ): + programSettings(std::move(programSettings_)), device(std::move(device_)), + context(std::move(context_)), program(std::move(program_)) + {} + + /** + * @brief Destroy the Execution Settings object. Used to specify the order the contained objects are destroyed + * to prevent segmentation faults during exit. + * + */ + ~ExecutionSettings() { + program = nullptr; + context = nullptr; + device = nullptr; + programSettings = nullptr; + } + +}; + +} + +#endif \ No newline at end of file diff --git a/shared/include/setup/fpga_setup_accl.hpp b/shared/include/setup/fpga_setup_accl.hpp index ff493ccc..0f451ced 100644 --- a/shared/include/setup/fpga_setup_accl.hpp +++ b/shared/include/setup/fpga_setup_accl.hpp @@ -33,6 +33,7 @@ SOFTWARE. /* External libraries */ #include "accl.hpp" #include "xrt/xrt_device.h" +#include "hpcc_settings.hpp" namespace fpga_setup { @@ -64,12 +65,11 @@ Sets up the given FPGA with the kernel in the provided file. 
@param device The device used for the program
@param program The program used to find the ACCL kernels for hardware execution
-@param useAcclEmulation Construct an ACCL emulation instance instead of hardware execution
+@param programSettings Pass current program settings to configure ACCL according to user specification

@return The ACCL instance used for communication
*/
ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program,
-                          bool useAcclEmulation,
-                          ACCL::networkProtocol protocol);
+                          hpcc_base::BaseSettings &programSettings);

}  // namespace fpga_setup
#endif  // SRC_HOST_FPGA_SETUP_H_
diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index ed84ea08..36561553 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -82,7 +82,7 @@ void configure_tcp(ACCL::BaseBuffer &tx_buf_network, ACCL::BaseBuffer &rx_buf_ne
 }
 
 ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program,
-                          bool useAcclEmulation, ACCL::networkProtocol protocol) {
+                          hpcc_base::BaseSettings &programSettings) {
   int current_rank;
   MPI_Comm_rank(MPI_COMM_WORLD, &current_rank);
 
@@ -92,19 +92,19 @@ ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program,
   std::vector<ACCL::rank_t> ranks = {};
   for (int i = 0; i < current_size; ++i) {
     // TODO: Replace the IP addresses and ports here for execution on real hardware
-    ACCL::rank_t new_rank = {"10.10.10." + std::to_string(i), 5500 + i, i, ACCL_BUFFER_SIZE};
+    ACCL::rank_t new_rank = {"10.10.10." + std::to_string(i), 6000 + i, i, programSettings.acclBufferSize};
     ranks.emplace_back(new_rank);
   }
   ACCLContext accl;
-  if (!useAcclEmulation) {
+  if (!programSettings.useAcclEmulation) {
     std::cout << "Create cclo ip" << std::endl;
     auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}");
     std::cout << "Create hostctrl" << std::endl;
     auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}",
                                    xrt::kernel::cu_access_mode::exclusive);
-    if (protocol == ACCL::networkProtocol::UDP) {
+    if (programSettings.acclProtocol == ACCL::networkProtocol::UDP) {
      std::cout << "Create CMAC" << std::endl;
      auto cmac = CMAC(xrt::ip(device, program, "cmac_0:{cmac_0}"));
      std::cout << "Create Network Layer" << std::endl;
      auto network_layer = Networklayer(
          xrt::ip(device, program, "networklayer:{networklayer_0}"));
      std::cout << "Configure VNX" << std::endl;
      configure_vnx(cmac, network_layer, ranks, current_rank);
     }
-    if (protocol == ACCL::networkProtocol::TCP) {
+    if (programSettings.acclProtocol == ACCL::networkProtocol::TCP) {
      auto network_krnl = xrt::kernel(device, program, "network_krnl:{network_krnl_0}",
                                      xrt::kernel::cu_access_mode::exclusive);
      accl.tx_buf_network = std::unique_ptr<ACCL::BaseBuffer>(new ACCL::FPGABuffer<int8_t>(
@@ -125,14 +125,14 @@ ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program,
     std::vector<int> mem(1, 0);
     std::cout << "Create ACCL" << std::endl;
     accl.accl = std::unique_ptr<ACCL::ACCL>(
-        new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, protocol, 16, ACCL_BUFFER_SIZE));
+        new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize));
   } else {
     // TODO: Add start port here. Currently hardcoded!
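    // Sketch of the emulator wiring assumed here: 6000 is the base port handed
    // to the emulated CCLO, so rank i is expected to listen on port 6000 + i,
    // which is why the rank_t entries above use the same 6000 + i scheme.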
accl.accl = std::unique_ptr<ACCL::ACCL>(
-        new ACCL::ACCL(ranks, current_rank, 6000, device, protocol, 16, ACCL_BUFFER_SIZE));
+        new ACCL::ACCL(ranks, current_rank, 6000, device, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize));
   }
 
-  if (protocol == ACCL::networkProtocol::TCP) {
+  if (programSettings.acclProtocol == ACCL::networkProtocol::TCP) {
     MPI_Barrier(MPI_COMM_WORLD);
     accl.accl->open_port();
     MPI_Barrier(MPI_COMM_WORLD);
diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp
index f5d7ef32..1b41f9e0 100644
--- a/shared/setup/fpga_setup_xrt.cpp
+++ b/shared/setup/fpga_setup_xrt.cpp
@@ -42,7 +42,7 @@ namespace fpga_setup {
         } else {
             //TODO Use xrt::system::enumerate_devices() in "experimental/xrt_system.h" for future XRT versions
            // instead of hardcoded number of devices.
-            current_device = current_device % 3;
+            current_device = (current_device + 1) % 3;
         }
         return std::unique_ptr<xrt::device>(new xrt::device(current_device));
     }
From 43c4f78edc76d84a597b8b876f95497cd8b69ec3 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 29 Nov 2022 18:30:35 +0100
Subject: [PATCH 179/318] Fix base tests to compile with xrt

---
 shared/tests/hpcc_base_benchmark_test.cpp | 167 ++++++++++++++--------
 1 file changed, 108 insertions(+), 59 deletions(-)

diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp
index a93a2a69..42a49a70 100644
--- a/shared/tests/hpcc_base_benchmark_test.cpp
+++ b/shared/tests/hpcc_base_benchmark_test.cpp
@@ -16,7 +16,8 @@
 // and enable the included tests
 void use_hpcc_base_lib() {}
 
-class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int> {
+template <class T>
+class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, typename std::tuple_element<0, T>::type, typename std::tuple_element<1, T>::type, typename std::tuple_element<2, T>::type, int, int> {
 
 protected:
 
@@ -47,12 +48,12 @@ class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark
 
-    MinimalBenchmark() : HpccFpgaBenchmark(0, { nullptr}) {}
+    MinimalBenchmark() : hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, typename std::tuple_element<0, T>::type, typename std::tuple_element<1, T>::type, typename std::tuple_element<2, T>::type, int, int>(0, { nullptr}) {}
 
 };
 
-class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int> {
+template <class TDevice, class TContext, class TProgram>
+class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, TDevice, TContext, TProgram, int, int> {
 
 protected:
 
@@ -102,29 +103,66 @@ class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark
         if (forceSetupFail) {
             return false;
         } else {
-            return hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int>::checkInputParameters();
+            return hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, TDevice, TContext, TProgram, int, int>::checkInputParameters();
         }
     }
 
-    SuccessBenchmark() : HpccFpgaBenchmark(0, { nullptr}) {}
+    SuccessBenchmark() : hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, TDevice, TContext, TProgram, int, int>(0, { nullptr}) {}
 
 };
 
+template <class T>
 class BaseHpccBenchmarkTest :public ::testing::Test {
+using TDevice = typename std::tuple_element<0,T>::type;
+using TContext = typename std::tuple_element<1,T>::type;
+using TProgram = typename std::tuple_element<2,T>::type;
+
 public:
-    std::unique_ptr<SuccessBenchmark> bm;
+    std::unique_ptr<SuccessBenchmark<TDevice, TContext, TProgram>> bm;
 
     BaseHpccBenchmarkTest() {
-        bm = std::unique_ptr<SuccessBenchmark>(new SuccessBenchmark());
+        bm = std::unique_ptr<SuccessBenchmark<TDevice, TContext, TProgram>>(new SuccessBenchmark<TDevice, TContext, TProgram>());
        bm->setupBenchmark(global_argc, global_argv);
    }
};

-
-TEST_F(BaseHpccBenchmarkTest, SetupSucceedsForBenchmarkTest) {
-    bool success = bm->setupBenchmark(global_argc, global_argv);
+template <class T>
+class SetupTest : public ::testing::Test {};
+
+#ifdef USE_OCL_HOST
+typedef ::testing::Types<std::tuple<cl::Device, cl::Context, cl::Program>> cl_types;
+TYPED_TEST_SUITE(
+    BaseHpccBenchmarkTest,
+    cl_types);
+TYPED_TEST_SUITE(
+    SetupTest,
+    cl_types);
+#endif
+#ifdef USE_XRT_HOST
+#ifndef USE_ACCL
+typedef ::testing::Types<std::tuple<xrt::device, bool, xrt::uuid>> xrt_types;
+TYPED_TEST_SUITE(
+    BaseHpccBenchmarkTest,
+    xrt_types);
+TYPED_TEST_SUITE(
+    SetupTest,
+    xrt_types);
+#else
+typedef ::testing::Types<std::tuple<xrt::device, fpga_setup::ACCLContext, xrt::uuid>>
accl_types; +TYPED_TEST_SUITE( + BaseHpccBenchmarkTest, + accl_types); +TYPED_TEST_SUITE( + SetupTest, + accl_types); +#endif +#endif + + +TYPED_TEST(BaseHpccBenchmarkTest, SetupSucceedsForBenchmarkTest) { + bool success = this->bm->setupBenchmark(global_argc, global_argv); EXPECT_TRUE(success); } @@ -132,97 +170,108 @@ TEST_F(BaseHpccBenchmarkTest, SetupSucceedsForBenchmarkTest) { /** * Checks if the testing flag works as expected */ -TEST_F(BaseHpccBenchmarkTest, AllExecutedWhenNotTestOnly) { - bm->getExecutionSettings().programSettings->testOnly = false; - bm->executeBenchmark(); - EXPECT_EQ(bm->validateOutputcalled, 1); - EXPECT_EQ(bm->executeKernelcalled, 1); - EXPECT_EQ(bm->generateInputDatacalled, 1); +TYPED_TEST(BaseHpccBenchmarkTest, AllExecutedWhenNotTestOnly) { + this->bm->getExecutionSettings().programSettings->testOnly = false; + this->bm->executeBenchmark(); + EXPECT_EQ(this->bm->validateOutputcalled, 1); + EXPECT_EQ(this->bm->executeKernelcalled, 1); + EXPECT_EQ(this->bm->generateInputDatacalled, 1); } -TEST_F(BaseHpccBenchmarkTest, NothingExecutedWhenTestOnly) { - bm->getExecutionSettings().programSettings->testOnly = true; - bm->executeBenchmark(); - EXPECT_EQ(bm->validateOutputcalled, 0); - EXPECT_EQ(bm->executeKernelcalled, 0); - EXPECT_EQ(bm->generateInputDatacalled, 0); +TYPED_TEST(BaseHpccBenchmarkTest, NothingExecutedWhenTestOnly) { + this->bm->getExecutionSettings().programSettings->testOnly = true; + this->bm->executeBenchmark(); + EXPECT_EQ(this->bm->validateOutputcalled, 0); + EXPECT_EQ(this->bm->executeKernelcalled, 0); + EXPECT_EQ(this->bm->generateInputDatacalled, 0); } -TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenNotTestOnly) { - bm->getExecutionSettings().programSettings->testOnly = false; - EXPECT_TRUE(bm->executeBenchmark()); +TYPED_TEST(BaseHpccBenchmarkTest, ExecutionSuccessWhenNotTestOnly) { + this->bm->getExecutionSettings().programSettings->testOnly = false; + EXPECT_TRUE(this->bm->executeBenchmark()); } -TEST_F(BaseHpccBenchmarkTest, ExecutionFailsWhenTestOnlyAndSetupFails) { - bm->getExecutionSettings().programSettings->testOnly = true; - bm->forceSetupFail = true; - bm->setupBenchmark(global_argc, global_argv); - EXPECT_FALSE(bm->executeBenchmark()); +TYPED_TEST(BaseHpccBenchmarkTest, ExecutionFailsWhenTestOnlyAndSetupFails) { + this->bm->getExecutionSettings().programSettings->testOnly = true; + this->bm->forceSetupFail = true; + this->bm->setupBenchmark(global_argc, global_argv); + EXPECT_FALSE(this->bm->executeBenchmark()); } -TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenTestOnlyAndSetupSuccess) { - bm->getExecutionSettings().programSettings->testOnly = true; - EXPECT_TRUE(bm->executeBenchmark()); +TYPED_TEST(BaseHpccBenchmarkTest, ExecutionSuccessWhenTestOnlyAndSetupSuccess) { + this->bm->getExecutionSettings().programSettings->testOnly = true; + EXPECT_TRUE(this->bm->executeBenchmark()); } /** - * Checks if using default platform and device is successful + * Checks if non existing device leads to an error */ -TEST_F(BaseHpccBenchmarkTest, SuccessUseDefaultPlatform) { - EXPECT_NE(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, bm->getExecutionSettings().programSettings->defaultDevice).get(), nullptr); +TYPED_TEST(BaseHpccBenchmarkTest, FindNonExistingDevice) { +#ifdef USE_OCL_HOST + ASSERT_THROW(fpga_setup::selectFPGADevice(this->bm->getExecutionSettings().programSettings->defaultPlatform, 100).get(), fpga_setup::FpgaSetupException); +#else + 
ASSERT_THROW(fpga_setup::selectFPGADevice(100).get(), fpga_setup::FpgaSetupException);
+#endif
 }
 
 /**
- * Checks if non-existing platform leads to an error
+ * Checks if using default platform and device is successful
 */
-TEST_F(BaseHpccBenchmarkTest, SuccessUseDefaultPlatform) {
-    EXPECT_NE(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, bm->getExecutionSettings().programSettings->defaultDevice).get(), nullptr);
+TYPED_TEST(BaseHpccBenchmarkTest, SuccessUseDefaultPlatformAndDevice) {
+#ifdef USE_OCL_HOST
+    EXPECT_NE(fpga_setup::selectFPGADevice(this->bm->getExecutionSettings().programSettings->defaultPlatform, this->bm->getExecutionSettings().programSettings->defaultDevice).get(), nullptr);
+#else
+    EXPECT_NE(fpga_setup::selectFPGADevice(this->bm->getExecutionSettings().programSettings->defaultDevice).get(), nullptr);
+#endif
 }
 
+#ifdef USE_OCL_HOST
 /**
 * Checks if non-existing platform leads to an error
 */
-TEST_F(BaseHpccBenchmarkTest, FindNonExistingPlatform) {
-    ASSERT_THROW(fpga_setup::selectFPGADevice(100, bm->getExecutionSettings().programSettings->defaultDevice).get(), fpga_setup::FpgaSetupException);
+TYPED_TEST(BaseHpccBenchmarkTest, FindNonExistingPlatform) {
+    ASSERT_THROW(fpga_setup::selectFPGADevice(100, this->bm->getExecutionSettings().programSettings->defaultDevice).get(), fpga_setup::FpgaSetupException);
 }
+#endif
 
 /**
 * Execute kernel and validation is successful
 */
-TEST_F(BaseHpccBenchmarkTest, SuccessfulExeAndVal) {
-    EXPECT_TRUE(bm->executeBenchmark());
+TYPED_TEST(BaseHpccBenchmarkTest, SuccessfulExeAndVal) {
+    EXPECT_TRUE(this->bm->executeBenchmark());
 }
 
 /**
 * Execute kernel succeeds, but validation fails
 */
-TEST_F(BaseHpccBenchmarkTest, SuccessfulExeFailedVal) {
-    bm->returnValidate = false;
-    EXPECT_FALSE(bm->executeBenchmark());
+TYPED_TEST(BaseHpccBenchmarkTest, SuccessfulExeFailedVal) {
+    this->bm->returnValidate = false;
+    EXPECT_FALSE(this->bm->executeBenchmark());
 }
 
 /**
 * Execute kernel fails
 */
-TEST_F(BaseHpccBenchmarkTest, FailedExe) {
-    bm->returnExecuteKernel = false;
-    EXPECT_FALSE(bm->executeBenchmark());
+TYPED_TEST(BaseHpccBenchmarkTest, FailedExe) {
+    this->bm->returnExecuteKernel = false;
+    EXPECT_FALSE(this->bm->executeBenchmark());
 }
 
 /**
 * Benchmark Setup is successful with default data
 */
-TEST(SetupTest, BenchmarkSetupIsSuccessful) {
-    std::unique_ptr<MinimalBenchmark> bm = std::unique_ptr<MinimalBenchmark>(new MinimalBenchmark());
+TYPED_TEST(SetupTest, BenchmarkSetupIsSuccessful) {
+    std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>());
     EXPECT_TRUE(bm->setupBenchmark(global_argc, global_argv));
 }
 
 /**
 * Benchmark Setup fails because of failing configuration check
 */
-TEST(SetupTest, BenchmarkConfigurationFailsSetup) {
-    std::unique_ptr<MinimalBenchmark> bm = std::unique_ptr<MinimalBenchmark>(new MinimalBenchmark());
+TYPED_TEST(SetupTest, BenchmarkConfigurationFailsSetup) {
+    std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>());
     bm->configurationCheckSucceeds = false;
     EXPECT_FALSE(bm->setupBenchmark(global_argc, global_argv));
 }
 
 /**
 * Benchmark Execution fails if configuration check failed
 */
-TEST(SetupTest, BenchmarkConfigurationFailsExecution) {
-    std::unique_ptr<MinimalBenchmark> bm = std::unique_ptr<MinimalBenchmark>(new MinimalBenchmark());
+TYPED_TEST(SetupTest, BenchmarkConfigurationFailsExecution) {
+    std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>());
     bm->configurationCheckSucceeds = false;
     bm->setupBenchmark(global_argc, global_argv);
     EXPECT_FALSE(bm->executeBenchmark());
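The conversion in these hunks follows one recipe throughout: every TEST_F/TEST
becomes a TYPED_TEST over a tuple of device, context, and program types, fixture
members are reached through this-> because the fixture is now a template, and
TypeParam names the active tuple. A condensed, self-contained sketch of the
pattern (ExampleTest, ExampleTriple, and example_types are illustrative names,
not part of the patch; plain types stand in for the device/context/program triple):

    #include <tuple>
    #include "gtest/gtest.h"

    // stand-ins for a (device, context, program) triple such as
    // cl::Device, cl::Context, cl::Program
    using ExampleTriple = std::tuple<int, float, double>;

    template <class T>
    class ExampleTest : public ::testing::Test {
    public:
        // unpack one slot of the tuple type parameter
        using TDevice = typename std::tuple_element<0, T>::type;
    };

    typedef ::testing::Types<ExampleTriple> example_types;
    TYPED_TEST_SUITE(ExampleTest, example_types);

    TYPED_TEST(ExampleTest, RunsOncePerTriple) {
        // TypeParam is the tuple currently under test
        typename std::tuple_element<0, TypeParam>::type device{};
        (void)device;
    }
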
@@ -240,8 +289,8 @@ TEST(SetupTest, BenchmarkConfigurationFailsExecution) {
 /**
 * Benchmark Setup fails with empty data
 */
-TEST(SetupTest, BenchmarkSetupFails) {
-    std::unique_ptr<MinimalBenchmark> bm = std::unique_ptr<MinimalBenchmark>(new MinimalBenchmark());
+TYPED_TEST(SetupTest, BenchmarkSetupFails) {
+    std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>());
     char** tmp_argv = new char*[2];
     char* name_str = new char[5];
     strcpy(name_str, "name");
From b4a1c610d212a89d1646afb5d5d94779d7102258 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 30 Nov 2022 14:10:23 +0100
Subject: [PATCH 180/318] Fix test build for network kernel

---
 cmake/kernelTargets.cmake   |  4 ++--
 cmake/unitTestTargets.cmake | 11 ++++++++---
 shared/tests/CMakeLists.txt | 11 +++++++++++
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake
index 4b8adee3..50da445f 100644
--- a/cmake/kernelTargets.cmake
+++ b/cmake/kernelTargets.cmake
@@ -117,13 +117,13 @@ function(generate_kernel_targets_xilinx)
             set(kernel_name_flag -k ${CMAKE_MATCH_1})
         endif()
         add_custom_command(OUTPUT ${kernel}
-                COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -DKERNEL_${CMAKE_MATCH_1} -DEMULATE -t sw_emu ${kernel_name_flag} ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} -f ${FPGA_BOARD_NAME} -g -c ${XILINX_COMPILE_FLAGS} -o ${kernel} ${source_f}
+                COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -DKERNEL_${CMAKE_MATCH_1} -DEMULATE -t sw_emu ${kernel_name_flag} ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} -f ${FPGA_BOARD_NAME} -c ${XILINX_COMPILE_FLAGS} -o ${kernel} ${source_f}
                 MAIN_DEPENDENCY ${source_f}
                 DEPENDS ${XILINX_COMPILE_SETTINGS_FILE}
                 )
     endforeach()
     add_custom_command(OUTPUT ${bitstream_emulate_f}
-            COMMAND ${Vitis_COMPILER} ${local_CL_FLAGS} ${VPP_FLAGS} -DEMULATE -t sw_emu ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} -f ${FPGA_BOARD_NAME} -g -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_emulate_f} ${bitstream_compile_emulate}
+            COMMAND ${Vitis_COMPILER} ${local_CL_FLAGS} ${VPP_FLAGS} -DEMULATE -t sw_emu ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} -f ${FPGA_BOARD_NAME} -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_emulate_f} ${bitstream_compile_emulate}
            DEPENDS ${bitstream_compile_emulate}
            DEPENDS ${xilinx_link_settings}
            )
diff --git a/cmake/unitTestTargets.cmake b/cmake/unitTestTargets.cmake
index 776269e7..263d4033 100644
--- a/cmake/unitTestTargets.cmake
+++ b/cmake/unitTestTargets.cmake
@@ -21,14 +21,19 @@ endif()
 
 if (Vitis_FOUND)
     include_directories(SYSTEM ${Vitis_INCLUDE_DIRS})
+    if (USE_ACCL)
+        set(CMAKE_SKIP_BUILD_RPATH No)
+        set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes)
+        list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib)
+    endif()
    add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES})
    target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}")
    target_link_libraries(${HOST_EXE_NAME}_test_xilinx hpcc_fpga_base_test)
-    if (NOT "${kernel_emulation_targets_xilinx}" STREQUAL "")
-        add_dependencies(${HOST_EXE_NAME}_test_xilinx ${kernel_emulation_targets_xilinx})
-    endif()
    target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA)
    target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")
+    if (USE_ACCL)
+        target_link_libraries(${HOST_EXE_NAME}_test_xilinx zmqpp)
+    endif()
    foreach (kernel_target ${kernel_emulation_targets_xilinx})
        string(REPLACE "_xilinx" ".xclbin" kernel_name ${kernel_target})
        add_test(NAME test_unit_${kernel_target} COMMAND $<TARGET_FILE:${HOST_EXE_NAME}_test_xilinx> -f ${kernel_name} ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
diff --git a/shared/tests/CMakeLists.txt b/shared/tests/CMakeLists.txt
index a4ea0a4d..5d4c441b 100644
--- a/shared/tests/CMakeLists.txt
+++ b/shared/tests/CMakeLists.txt
@@ -1,6 +1,14 @@
 set(HPCC_BASE_TEST_SOURCES main.cpp
                 hpcc_base_benchmark_test.cpp)
 
+if (USE_ACCL)
+    set(ACCL_EMULATOR_DIR ${CMAKE_BINARY_DIR}/lib/accl-emulator CACHE STRING "Directory of ACCL emulator")
+    add_subdirectory(${extern_accl_SOURCE_DIR}/test/model/emulator ${ACCL_EMULATOR_DIR})
+    if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+        set(ACCL_DEBUG Yes)
+    endif()
+endif()
+
 add_library(hpcc_fpga_base_test STATIC ${HPCC_BASE_TEST_SOURCES})
 target_link_libraries(hpcc_fpga_base_test gtest gmock hpcc_fpga_base)
 target_include_directories(hpcc_fpga_base_test PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
@@ -11,6 +19,9 @@ if (INTELFPGAOPENCL_FOUND)
 elseif(Vitis_FOUND)
     target_include_directories(hpcc_fpga_base_test PUBLIC ${Vitis_INCLUDE_DIRS})
     target_link_libraries(hpcc_fpga_base_test ${Vitis_LIBRARIES})
+if (USE_ACCL)
+    add_dependencies(hpcc_fpga_base_test cclo_emu)
+endif()
 else()
     message(ERROR "No OpenCL header found on system!")
 endif()
From 401465ea7e5361888206a72c8e7f72392fe0f0d9 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 1 Dec 2022 16:56:06 +0100
Subject: [PATCH 181/318] Fix performance issue with XRT PCIe HPL host

---
 LINPACK/src/host/execution_types/execution_xrt_pcie.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
index aa0484e1..0269a8e3 100644
--- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
@@ -413,11 +413,13 @@ std::unique_ptr<linpack::LinpackExecutionTimings> calculate(
     for (auto &run : outer_mms) {
       run.wait();
     }
+
+#ifndef NDEBUG
+    // Wait for inner MMs in this communication round to keep
+    // in sync with the prints
     for (auto &run : inner_mms) {
       run.wait();
     }
-
-#ifndef NDEBUG
     MPI_Barrier(MPI_COMM_WORLD);
     if (is_calulating_lu_block) std::cout << "---------------" << std::endl;
From bc5f38e958dfc74cbb56535e823933f3371a707a Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Tue, 22 Mar 2022 11:08:49 +0100
Subject: [PATCH 182/318] add generator2.py v0.1

---
 STREAM/src/device/stream_kernels_single.cl | 17 +++----
 cmake/general_benchmark_build_setup.cmake  |  2 +-
 scripts/code_generator/generator2.py       | 55 ++++++++++++++++++++++
 3 files changed, 62 insertions(+), 12 deletions(-)
 create mode 100644 scripts/code_generator/generator2.py

diff --git a/STREAM/src/device/stream_kernels_single.cl b/STREAM/src/device/stream_kernels_single.cl
index 678d4fc1..6d421b94 100644
--- a/STREAM/src/device/stream_kernels_single.cl
+++ b/STREAM/src/device/stream_kernels_single.cl
@@ -15,19 +15,14 @@ KERNEL_NUMBER will be replaced by the build script with the ID of the current re
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
 
-/* PY_CODE_GEN
-try:
-    kernel_param_attributes = generate_attributes(num_replications)
-except:
-    kernel_param_attributes = ["" for i in range(num_replications)]
-*/
+{% set kernel_param_attributes = generate_attributes(num_replications) %}
 
-// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
+{% for i in range(num_replications) %}
 
 __kernel
 __attribute__((uses_global_work_offset(0)))
-void calc_/*PY_CODE_GEN i*/(__global
/*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_ARRAY_DATA_TYPE *restrict in1, - __global /*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_ARRAY_DATA_TYPE *restrict in2, - __global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_ARRAY_DATA_TYPE *restrict out, +void calc_{{ i }}(__global {{ kernel_param_attributes[i] }} const DEVICE_ARRAY_DATA_TYPE *restrict in1, + __global {{ kernel_param_attributes[i] }} const DEVICE_ARRAY_DATA_TYPE *restrict in2, + __global {{ kernel_param_attributes[i] }} DEVICE_ARRAY_DATA_TYPE *restrict out, const DEVICE_SCALAR_DATA_TYPE scalar, const uint array_size, const uint operation_type) { @@ -126,4 +121,4 @@ void calc_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 64aa8d0a..0a3f8d66 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -214,7 +214,7 @@ if (INTELFPGAOPENCL_FOUND) separate_arguments(AOC_FLAGS) endif() -set(CODE_GENERATOR "${CMAKE_SOURCE_DIR}/../scripts/code_generator/generator.py" CACHE FILEPATH "Path to the code generator executable") +set(CODE_GENERATOR "${CMAKE_SOURCE_DIR}/../scripts/code_generator/generator2.py" CACHE FILEPATH "Path to the code generator executable") set(CUSTOM_KERNEL_FOLDER ${CMAKE_SOURCE_DIR}/src/device/custom/) diff --git a/scripts/code_generator/generator2.py b/scripts/code_generator/generator2.py new file mode 100644 index 00000000..09a0c142 --- /dev/null +++ b/scripts/code_generator/generator2.py @@ -0,0 +1,55 @@ +import argparse +import sys +from jinja2 import Environment, PackageLoader, BaseLoader, TemplateNotFound, select_autoescape +from os.path import join, exists, getmtime + +parser = argparse.ArgumentParser(description='Preprocessor for code replication and advanced code modification.') +parser.add_argument('file', metavar='CODE_FILE', type=str, + help='Path to the file that is used as input') +parser.add_argument("-o", dest="output_file", default=None, help="Path to the output file. If not given, output will printed to stdout.") +parser.add_argument("-p", dest="params", default=[], action="append", help="Python statement that is parsed before modifying the files. 
Can be used to define global variables.") + +# create a simple loader to load templates from the file system +class SimpleLoader(BaseLoader): + def __init__(self, path): + self.path = path + + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path) as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) + +env = Environment( + loader=SimpleLoader("./"), + autoescape=select_autoescape() +) + +if __name__ == '__main__': + args = parser.parse_args() + if not args.file: + print('no input file given') + exit(1) + if not args.output_file: + print('no output file given') + exit(1) + for p in args.params: + print("Parse statement: %s" % p) + exec(p, globals()) + + template = env.get_template(args.file) + + try: + template.globals.update({"generate_attributes": generate_attributes}) + except: + generate_attributes = lambda r : ["" for i in range(r)] + template.globals.update({"generate_attributes": generate_attributes}) + + if num_replications is None: + num_replications = 1 + + with open(args.output_file, 'w') as f: + f.write(template.render(num_replications=num_replications)) \ No newline at end of file From 9b703894f809b1321ef9b06b974bd6a0f7618ba5 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 23 Mar 2022 13:30:47 +0100 Subject: [PATCH 183/318] use new generator for all benchmarks --- .../settings.gen.intel.fft1d_float_8.hbm.py | 2 +- .../settings.gen.intel.fft1d_float_8.svm.py | 2 +- FFT/src/device/fft1d_float_8.cl | 83 ++++--- .../settings.gen.intel.gemm_base.520n_mx.py | 2 +- .../settings.gen.intel.gemm_base.hbm.py | 2 +- GEMM/src/device/gemm_base.cl | 29 ++- LINPACK/src/device/hpl_torus_IEC.cl | 6 +- LINPACK/src/device/hpl_torus_PCIE.cl | 6 +- PTRANS/src/device/transpose_DIAG_IEC.cl | 24 +-- PTRANS/src/device/transpose_DIAG_PCIE.cl | 6 +- PTRANS/src/device/transpose_PQ_IEC.cl | 24 +-- PTRANS/src/device/transpose_PQ_PCIE.cl | 20 +- PTRANS/src/device/transpose_c2_DIAG_IEC.cl | 32 +-- .../device/random_access_kernels_single.cl | 15 +- STREAM/src/device/stream_kernels.cl | 12 +- b_eff/src/device/communication_bw520n_IEC.cl | 40 ++-- cmake/general_benchmark_build_setup.cmake | 2 +- scripts/code_generator/README.md | 4 +- scripts/code_generator/generator.py | 202 +++++------------- scripts/code_generator/generator2.py | 55 ----- 20 files changed, 201 insertions(+), 367 deletions(-) mode change 100755 => 100644 scripts/code_generator/generator.py delete mode 100644 scripts/code_generator/generator2.py diff --git a/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py b/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py index b4775387..c72f4081 100644 --- a/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py +++ b/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py @@ -1,7 +1,7 @@ global_memory_name = "HBM" -def generate_attributes(num_replications, num_global_memory_banks=32): +def generate_bi_map_attributes(num_replications, num_global_memory_banks=32): """ Generates the kernel attributes for the global memory. They specify in which global memory the buffer is located. 
The buffers will be placed using a
diff --git a/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py b/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py
index 86e3cc3a..2cb14bde 100644
--- a/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py
+++ b/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py
@@ -1,5 +1,5 @@

-def generate_attributes(num_replications, num_global_memory_banks=32):
+def generate_bi_map_attributes(num_replications, num_global_memory_banks=32):
     """
     Generates the kernel attributes for the global memory.
     They specify in which global memory the buffer is located.
     The buffers will be placed using a
diff --git a/FFT/src/device/fft1d_float_8.cl b/FFT/src/device/fft1d_float_8.cl
index 69da1432..9cfe70c7 100644
--- a/FFT/src/device/fft1d_float_8.cl
+++ b/FFT/src/device/fft1d_float_8.cl
@@ -51,12 +51,7 @@
 // code generation expects an array of maps of size num_replications with the keys "in" and "out".
 // The value of the keys have to be strings containing the attributes that
 // have to be assigned to input and output buffers in global memory
-/* PY_CODE_GEN
-try:
-    kernel_param_attributes = generate_attributes(num_replications)
-except:
-    kernel_param_attributes = [{"in": "", "out": ""} for i in range(num_replications)]
-*/
+{% set kernel_param_attributes = generate_bi_map_attributes(num_replications) %}

 #define min(a,b) (a<b?a:b)
> LOGPOINTS)][(current_index + shift) & (POINTS - 1)];
         }
 #ifdef XILINX_FPGA
-      buf2x8.i0 = write_chunk[0];
-      buf2x8.i1 = write_chunk[1];
-      buf2x8.i2 = write_chunk[2];
-      buf2x8.i3 = write_chunk[3];
-      buf2x8.i4 = write_chunk[4];
+        buf2x8.i0 = write_chunk[0];
+        buf2x8.i1 = write_chunk[1];
+        buf2x8.i2 = write_chunk[2];
+        buf2x8.i3 = write_chunk[3];
+        buf2x8.i4 = write_chunk[4];
         buf2x8.i5 = write_chunk[5];
         buf2x8.i6 = write_chunk[6];
         buf2x8.i7 = write_chunk[7];

         // Start in the second iteration to forward the buffered data over the pipe
-        write_pipe_block(chanin/*PY_CODE_GEN i*/, &buf2x8);
+        write_pipe_block(chanin{{ i }}, &buf2x8);
 #endif
 #ifdef INTEL_FPGA
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[0], write_chunk[0]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[1], write_chunk[1]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[2], write_chunk[2]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[3], write_chunk[3]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[4], write_chunk[4]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[5], write_chunk[5]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[6], write_chunk[6]);
-        write_channel_intel(chanin/*PY_CODE_GEN i*/[7], write_chunk[7]);
+        write_channel_intel(chanin{{ i }}[0], write_chunk[0]);
+        write_channel_intel(chanin{{ i }}[1], write_chunk[1]);
+        write_channel_intel(chanin{{ i }}[2], write_chunk[2]);
+        write_channel_intel(chanin{{ i }}[3], write_chunk[3]);
+        write_channel_intel(chanin{{ i }}[4], write_chunk[4]);
+        write_channel_intel(chanin{{ i }}[5], write_chunk[5]);
+        write_channel_intel(chanin{{ i }}[6], write_chunk[6]);
+        write_channel_intel(chanin{{ i }}[7], write_chunk[7]);
 #endif
         }
     }

@@ -193,10 +188,10 @@ void fetch/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["i
 __attribute__ ((max_global_work_dim(0)))
 __attribute__((reqd_work_group_size(1,1,1)))
-kernel void fft1d/*PY_CODE_GEN i*/(
+kernel void fft1d{{ i }}(
 #ifdef INTEL_FPGA
     // Intel does not need a store kernel and directly writes back the result to global memory
-              __global /*PY_CODE_GEN kernel_param_attributes[i]["out"]*/ float2 * restrict dest,
+              __global {{ kernel_param_attributes[i]["out"] }} float2 * restrict dest,
 #endif
int count, int inverse) { @@ -235,17 +230,17 @@ kernel void fft1d/*PY_CODE_GEN i*/( // Perform memory transfers only when reading data in range if (i < count * (N / POINTS)) { #ifdef INTEL_FPGA - data.i0 = read_channel_intel(chanin/*PY_CODE_GEN i*/[0]); - data.i1 = read_channel_intel(chanin/*PY_CODE_GEN i*/[1]); - data.i2 = read_channel_intel(chanin/*PY_CODE_GEN i*/[2]); - data.i3 = read_channel_intel(chanin/*PY_CODE_GEN i*/[3]); - data.i4 = read_channel_intel(chanin/*PY_CODE_GEN i*/[4]); - data.i5 = read_channel_intel(chanin/*PY_CODE_GEN i*/[5]); - data.i6 = read_channel_intel(chanin/*PY_CODE_GEN i*/[6]); - data.i7 = read_channel_intel(chanin/*PY_CODE_GEN i*/[7]); + data.i0 = read_channel_intel(chanin{{ i }}[0]); + data.i1 = read_channel_intel(chanin{{ i }}[1]); + data.i2 = read_channel_intel(chanin{{ i }}[2]); + data.i3 = read_channel_intel(chanin{{ i }}[3]); + data.i4 = read_channel_intel(chanin{{ i }}[4]); + data.i5 = read_channel_intel(chanin{{ i }}[5]); + data.i6 = read_channel_intel(chanin{{ i }}[6]); + data.i7 = read_channel_intel(chanin{{ i }}[7]); #endif #ifdef XILINX_FPGA - read_pipe_block(chanin/*PY_CODE_GEN i*/, &data); + read_pipe_block(chanin{{ i }}, &data); #endif } else { data.i0 = data.i1 = data.i2 = data.i3 = @@ -274,7 +269,7 @@ kernel void fft1d/*PY_CODE_GEN i*/( #endif #ifdef XILINX_FPGA // For Xilinx send the data to the store kernel to enable memory bursts - write_pipe_block(chanout/*PY_CODE_GEN i*/, &data); + write_pipe_block(chanout{{ i }}, &data); #endif } } @@ -287,14 +282,14 @@ This kernel works without conditional branches which enables memory bursts. */ __kernel __attribute__ ((max_global_work_dim(0), reqd_work_group_size(1,1,1))) -void store/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["out"]*/ float2 * restrict dest, int iter) { +void store{{ i }}(__global {{ kernel_param_attributes[i]["out"] }} float2 * restrict dest, int iter) { const int N = (1 << LOGN); // write the data back to global memory using memory bursts for(unsigned k = 0; k < iter * (N / POINTS); k++){ float2x8 buf2x8; - read_pipe_block(chanout/*PY_CODE_GEN i*/, &buf2x8); + read_pipe_block(chanout{{ i }}, &buf2x8); dest[(k << LOGPOINTS)] = buf2x8.i0; dest[(k << LOGPOINTS) + 1] = buf2x8.i1; @@ -308,4 +303,4 @@ void store/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["o } #endif -//PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/GEMM/settings/settings.gen.intel.gemm_base.520n_mx.py b/GEMM/settings/settings.gen.intel.gemm_base.520n_mx.py index ba180d5c..023500c0 100644 --- a/GEMM/settings/settings.gen.intel.gemm_base.520n_mx.py +++ b/GEMM/settings/settings.gen.intel.gemm_base.520n_mx.py @@ -1,7 +1,7 @@ global_memory_name = "HBM" -def generate_attributes(num_replications, num_global_memory_banks=32): +def generate_map_attributes(num_replications, num_global_memory_banks=32): """ Generates the kernel attributes for the global memory. They specify in which global memory the buffer is located. The buffers will be placed using a diff --git a/GEMM/settings/settings.gen.intel.gemm_base.hbm.py b/GEMM/settings/settings.gen.intel.gemm_base.hbm.py index ab88f63a..4b3f1813 100644 --- a/GEMM/settings/settings.gen.intel.gemm_base.hbm.py +++ b/GEMM/settings/settings.gen.intel.gemm_base.hbm.py @@ -1,7 +1,7 @@ global_memory_name = "HBM" -def generate_attributes(num_replications, num_global_memory_banks=32): +def generate_map_attributes(num_replications, num_global_memory_banks=32): """ Generates the kernel attributes for the global memory. 
They specify in which global memory the buffer is located. The buffers will be placed using a diff --git a/GEMM/src/device/gemm_base.cl b/GEMM/src/device/gemm_base.cl index 3599e6cd..6511a221 100644 --- a/GEMM/src/device/gemm_base.cl +++ b/GEMM/src/device/gemm_base.cl @@ -33,12 +33,7 @@ SOFTWARE. // code generation expects an array of maps of size num_replications with the keys a,b,c,out. // The value of the keys have to be strings containing the attributes that // have to be assigned to input and output buffers in global memory -/* PY_CODE_GEN -try: - kernel_param_attributes = generate_attributes(num_replications) -except: - kernel_param_attributes = [{"a": "", "b": "", "c": "", "out": ""} for i in range(num_replications)] -*/ +{% set kernel_param_attributes = generate_map_attributes(num_replications) %} /** Calculate for the Level 2 block: @@ -260,7 +255,7 @@ to BRAM. // Here we use the total replications. This will also create three kernels for the Xilinx compiler because they all // use different hard-coded ranges in the outer loop -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** Two level blocked GEMM kernel @@ -277,21 +272,21 @@ calculates C_OUT = alpha * A.dot(B) + beta * C */ __attribute__((uses_global_work_offset(0))) __kernel -void gemm/*PY_CODE_GEN i*/( +void gemm{{ i }}( #ifdef ENABLE_MIXED_PRECISION // In mixed precision convert the values accordingly // from single precision to the target precision on the FPGA - __global /*PY_CODE_GEN kernel_param_attributes[i]["a"]*/ const float* restrict a, - __global /*PY_CODE_GEN kernel_param_attributes[i]["b"]*/ const float* restrict b, - __global /*PY_CODE_GEN kernel_param_attributes[i]["c"]*/ const float* restrict c, - __global /*PY_CODE_GEN kernel_param_attributes[i]["out"]*/ float* restrict c_out, + __global {{ kernel_param_attributes[i]["a"] }} const float* restrict a, + __global {{ kernel_param_attributes[i]["b"] }} const float* restrict b, + __global {{ kernel_param_attributes[i]["c"] }} const float* restrict c, + __global {{ kernel_param_attributes[i]["out"] }} float* restrict c_out, const float alpha, const float beta, #else - __global /*PY_CODE_GEN kernel_param_attributes[i]["a"]*/ const DEVICE_DATA_TYPE* restrict a, - __global /*PY_CODE_GEN kernel_param_attributes[i]["b"]*/ const DEVICE_DATA_TYPE* restrict b, - __global /*PY_CODE_GEN kernel_param_attributes[i]["c"]*/ const DEVICE_DATA_TYPE* restrict c, - __global /*PY_CODE_GEN kernel_param_attributes[i]["out"]*/ DEVICE_DATA_TYPE* restrict c_out, + __global {{ kernel_param_attributes[i]["a"] }} const DEVICE_DATA_TYPE* restrict a, + __global {{ kernel_param_attributes[i]["b"] }} const DEVICE_DATA_TYPE* restrict b, + __global {{ kernel_param_attributes[i]["c"] }} const DEVICE_DATA_TYPE* restrict c, + __global {{ kernel_param_attributes[i]["out"] }} DEVICE_DATA_TYPE* restrict c_out, const DEVICE_DATA_TYPE alpha, const DEVICE_DATA_TYPE beta, #endif @@ -445,4 +440,4 @@ __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) } } -// PY_CODE_GEN block_end +{% endfor %} diff --git a/LINPACK/src/device/hpl_torus_IEC.cl b/LINPACK/src/device/hpl_torus_IEC.cl index fc3d0257..7e8f57ea 100644 --- a/LINPACK/src/device/hpl_torus_IEC.cl +++ b/LINPACK/src/device/hpl_torus_IEC.cl @@ -839,7 +839,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a, } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** Update the 
inner blocks using the left and right column and rows @@ -847,7 +847,7 @@ Update the inner blocks using the left and right column and rows */ __attribute__((uses_global_work_offset(0))) __kernel -void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, +void inner_update_mm{{ i }}(__global DEVICE_DATA_TYPE* restrict a, __global DEVICE_DATA_TYPE* restrict left_global_buffer, __global DEVICE_DATA_TYPE* restrict top_global_buffer, const uint block_col, @@ -945,4 +945,4 @@ void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, } } -// PY_CODE_GEN block_end +{% endfor %} diff --git a/LINPACK/src/device/hpl_torus_PCIE.cl b/LINPACK/src/device/hpl_torus_PCIE.cl index 2b3d312d..2b86657d 100644 --- a/LINPACK/src/device/hpl_torus_PCIE.cl +++ b/LINPACK/src/device/hpl_torus_PCIE.cl @@ -708,7 +708,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a, } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** Update the inner blocks using the left and right column and rows @@ -716,7 +716,7 @@ Update the inner blocks using the left and right column and rows */ __attribute__((uses_global_work_offset(0))) __kernel -void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, +void inner_update_mm{{ i }}(__global DEVICE_DATA_TYPE* restrict a, __global DEVICE_DATA_TYPE* restrict left_global_buffer, __global DEVICE_DATA_TYPE* restrict top_global_buffer, const uint block_col, @@ -862,4 +862,4 @@ void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/PTRANS/src/device/transpose_DIAG_IEC.cl b/PTRANS/src/device/transpose_DIAG_IEC.cl index 513b39e8..94077736 100644 --- a/PTRANS/src/device/transpose_DIAG_IEC.cl +++ b/PTRANS/src/device/transpose_DIAG_IEC.cl @@ -16,11 +16,11 @@ typedef struct { DEVICE_DATA_TYPE data[CHANNEL_WIDTH]; } ch_data; -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(i) + "\""*/), depth(1))); -channel ch_data chan_a_in/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2 * (i // 2) + ((i + 1) % 2)) + "\""*/), depth(1))); -// PY_CODE_GEN block_end +channel ch_data chan_a_out{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(i) }}), depth(1))); +channel ch_data chan_a_in{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2 * (i // 2) + ((i + 1) % 2)) }}), depth(1))); +{% endfor %} #endif /** @@ -64,7 +64,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} /** * send a chunk of A into local memory in a reordered fashion @@ -77,7 +77,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) * */ void -send_chunk_of_a/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], +send_chunk_of_a{{ i }}(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], const ulong row, const ulong col) { @@ -104,7 +104,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) data.data[unroll_count] = rotate_out[(unroll_count + rot_out) & 
(CHANNEL_WIDTH - 1)]; } - write_channel_intel(chan_a_out/*PY_CODE_GEN i*/, data); + write_channel_intel(chan_a_out{{ i }}, data); } /** @@ -121,7 +121,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, +void transpose_read{{ i }}(__global DEVICE_DATA_TYPE *restrict A, const ulong block_offset, const ulong number_of_blocks) { @@ -139,7 +139,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, load_chunk_of_a(A, a_block[block & 1], block, row, col); } if (block > 0) { - send_chunk_of_a/*PY_CODE_GEN i*/(a_block[(block - 1) & 1], row, col); + send_chunk_of_a{{ i }}(a_block[(block - 1) & 1], row, col); } } } @@ -162,7 +162,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, +void transpose_write{{ i }}(__global DEVICE_DATA_TYPE *restrict B, __global DEVICE_DATA_TYPE *restrict A_out, const ulong block_offset, const ulong number_of_blocks) { @@ -173,7 +173,7 @@ void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, for (ulong row = 0; row < BLOCK_SIZE; row++) { for (ulong col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { - ch_data data = read_channel_intel(chan_a_in/*PY_CODE_GEN i*/); + ch_data data = read_channel_intel(chan_a_in{{ i }}); unsigned rot_out = row & (CHANNEL_WIDTH - 1); // rotate temporary buffer to store data into local buffer @@ -188,4 +188,4 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_end +{% endfor %} diff --git a/PTRANS/src/device/transpose_DIAG_PCIE.cl b/PTRANS/src/device/transpose_DIAG_PCIE.cl index 614800f3..b443803d 100644 --- a/PTRANS/src/device/transpose_DIAG_PCIE.cl +++ b/PTRANS/src/device/transpose_DIAG_PCIE.cl @@ -127,7 +127,7 @@ store_a(__global DEVICE_DATA_TYPE *restrict A_out, } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** * Read blocks of matrix A and transpose them in memory. 
@@ -144,7 +144,7 @@ store_a(__global DEVICE_DATA_TYPE *restrict A_out, */ __attribute__((max_global_work_dim(0))) __kernel -void transpose/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, +void transpose{{ i }}(__global DEVICE_DATA_TYPE *restrict A, __global DEVICE_DATA_TYPE *restrict B, __global DEVICE_DATA_TYPE *restrict A_out, const uint number_of_blocks) { @@ -172,4 +172,4 @@ void transpose/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, } } -// PY_CODE_GEN block_end +{% endfor %} diff --git a/PTRANS/src/device/transpose_PQ_IEC.cl b/PTRANS/src/device/transpose_PQ_IEC.cl index e219ae1c..5bce8ab7 100644 --- a/PTRANS/src/device/transpose_PQ_IEC.cl +++ b/PTRANS/src/device/transpose_PQ_IEC.cl @@ -16,11 +16,11 @@ typedef struct { DEVICE_DATA_TYPE data[CHANNEL_WIDTH]; } ch_data; -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(i) + "\""*/), depth(1))); -channel ch_data chan_a_in/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2 * (i // 2) + ((i + 1) % 2)) + "\""*/), depth(1))); -// PY_CODE_GEN block_end +channel ch_data chan_a_out{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(i) }}), depth(1))); +channel ch_data chan_a_in{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2 * (i // 2) + ((i + 1) % 2)) }}), depth(1))); +{% endfor %} #endif /** @@ -69,7 +69,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} /** * send a chunk of A into local memory in a reordered fashion @@ -82,7 +82,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) * */ void -send_chunk_of_a/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], +send_chunk_of_a{{ i }}(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], const ulong row, const ulong col) { @@ -109,7 +109,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) data.data[unroll_count] = rotate_out[(unroll_count + rot_out) & (CHANNEL_WIDTH - 1)]; } - write_channel_intel(chan_a_out/*PY_CODE_GEN i*/, data); + write_channel_intel(chan_a_out{{ i }}, data); } /** @@ -126,7 +126,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, +void transpose_read{{ i }}(__global DEVICE_DATA_TYPE *restrict A, const ulong offset, const ulong width_in_blocks, const ulong height_in_blocks, @@ -148,7 +148,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, load_chunk_of_a(A, a_block[block & 1], block_row, block_col, width_in_blocks, row, col); } if (block > offset) { - send_chunk_of_a/*PY_CODE_GEN i*/(a_block[(block - 1) & 1], row, col); + send_chunk_of_a{{ i }}(a_block[(block - 1) & 1], row, col); } } } @@ -171,7 +171,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, +void transpose_write{{ i }}(__global DEVICE_DATA_TYPE *restrict B, __global DEVICE_DATA_TYPE *restrict A_out, const ulong offset, const ulong 
width_in_blocks, @@ -183,7 +183,7 @@ void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, for (ulong row = 0; row < BLOCK_SIZE; row++) { for (ulong col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { - ch_data data = read_channel_intel(chan_a_in/*PY_CODE_GEN i*/); + ch_data data = read_channel_intel(chan_a_in{{ i }}); ulong block_col = block % width_in_blocks; ulong block_row = block / width_in_blocks; @@ -202,4 +202,4 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cl b/PTRANS/src/device/transpose_PQ_PCIE.cl index 161fcb88..caa20143 100644 --- a/PTRANS/src/device/transpose_PQ_PCIE.cl +++ b/PTRANS/src/device/transpose_PQ_PCIE.cl @@ -8,14 +8,9 @@ #include "parameters.h" -/* PY_CODE_GEN -try: - kernel_param_attributes = generate_attributes(num_replications) -except: - kernel_param_attributes = ["" for i in range(num_replications)] -*/ +{% set kernel_param_attributes = generate_attributes(num_replications) %} -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** * Read blocks of matrix A and transpose them in memory. @@ -37,11 +32,10 @@ except: */ __attribute__((max_global_work_dim(0))) __kernel -void transpose/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE *restrict A, - __global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE *restrict B, - __global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE *restrict A_out, - const uint offset_a, - const uint offset_b, +void transpose{{ i }}(__global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict A, + __global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict B, + __global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict A_out, + const uint offset, const uint number_of_blocks, const uint width_in_blocks, const uint height_in_blocks) { @@ -190,4 +184,4 @@ void transpose/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/PTRANS/src/device/transpose_c2_DIAG_IEC.cl b/PTRANS/src/device/transpose_c2_DIAG_IEC.cl index dfad9f87..cf2455e7 100644 --- a/PTRANS/src/device/transpose_c2_DIAG_IEC.cl +++ b/PTRANS/src/device/transpose_c2_DIAG_IEC.cl @@ -16,13 +16,13 @@ typedef struct { DEVICE_DATA_TYPE data[CHANNEL_WIDTH/2]; } ch_data; -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out1/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(2*i) + "\""*/), depth(1))); -channel ch_data chan_a_out2/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(2*i + 1) + "\""*/), depth(1))); -channel ch_data chan_a_in1/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2*i + 1) + "\""*/), depth(1))); -channel ch_data chan_a_in2/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2*i) + "\""*/), depth(1))); -// PY_CODE_GEN block_end +channel ch_data chan_a_out1{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(2*i) }}), depth(1))); +channel ch_data chan_a_out2{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(2*i + 1) }}), depth(1))); +channel ch_data chan_a_in1{{ i }} __attribute((io({{ 
"\"kernel_input_ch{}\"".format(2*i + 1) }}), depth(1))); +channel ch_data chan_a_in2{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2*i) }}), depth(1))); +{% endfor %} #endif /** @@ -65,7 +65,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} /** * send a chunk of A into local memory in a reordered fashion @@ -78,7 +78,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) * */ void -send_chunk_of_a/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], +send_chunk_of_a{{ i }}(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], const ulong row, const ulong col) { @@ -111,7 +111,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH/2; unroll_count++) { data1.data[unroll_count] = channel_data[unroll_count]; } - write_channel_intel(chan_a_out1/*PY_CODE_GEN i*/, data1); + write_channel_intel(chan_a_out1{{ i }}, data1); ch_data data2; // rotate temporary buffer to store data into local buffer @@ -119,7 +119,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH/2; unroll_count++) { data2.data[unroll_count] = channel_data[CHANNEL_WIDTH/2 + unroll_count]; } - write_channel_intel(chan_a_out2/*PY_CODE_GEN i*/, data2); + write_channel_intel(chan_a_out2{{ i }}, data2); } /** @@ -136,7 +136,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, +void transpose_read{{ i }}(__global DEVICE_DATA_TYPE *restrict A, const ulong block_offset, const ulong number_of_blocks) { @@ -154,7 +154,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, load_chunk_of_a(A, a_block[block & 1], block, row, col); } if (block > 0) { - send_chunk_of_a/*PY_CODE_GEN i*/(a_block[(block - 1) & 1], row, col); + send_chunk_of_a{{ i }}(a_block[(block - 1) & 1], row, col); } } } @@ -177,7 +177,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, +void transpose_write{{ i }}(__global DEVICE_DATA_TYPE *restrict B, __global DEVICE_DATA_TYPE *restrict A_out, const ulong block_offset, const ulong number_of_blocks) { @@ -190,13 +190,13 @@ void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, DEVICE_DATA_TYPE channel_data[CHANNEL_WIDTH]; - ch_data data1 = read_channel_intel(chan_a_in1/*PY_CODE_GEN i*/); + ch_data data1 = read_channel_intel(chan_a_in1{{ i }}); __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH/2; unroll_count++) { channel_data[unroll_count] = data1.data[unroll_count]; } - ch_data data2 = read_channel_intel(chan_a_in2/*PY_CODE_GEN i*/); + ch_data data2 = read_channel_intel(chan_a_in2{{ i }}); __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH/2; unroll_count++) { channel_data[CHANNEL_WIDTH/2 + unroll_count] = data2.data[unroll_count]; @@ -217,4 +217,4 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff 
--git a/RandomAccess/src/device/random_access_kernels_single.cl b/RandomAccess/src/device/random_access_kernels_single.cl index f7c59260..16637065 100644 --- a/RandomAccess/src/device/random_access_kernels_single.cl +++ b/RandomAccess/src/device/random_access_kernels_single.cl @@ -34,14 +34,9 @@ Constant used to update the pseudo random number #define BLOCK_SIZE_LOG GLOBAL_MEM_UNROLL_LOG #define BLOCK_SIZE (1 << BLOCK_SIZE_LOG) -/* PY_CODE_GEN -try: - kernel_param_attributes = generate_attributes(num_replications) -except: - kernel_param_attributes = ["" for i in range(num_replications)] -*/ +{% set kernel_param_attributes = generate_attributes(num_replications) %} -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /* Kernel, that will update the given data array accoring to a predefined pseudo- @@ -56,8 +51,8 @@ to the kernel. */ __attribute__((max_global_work_dim(0),uses_global_work_offset(0))) __kernel -void accessMemory_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE_UNSIGNED volatile * restrict data, - __constant /*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_DATA_TYPE_UNSIGNED * restrict random_init, +void accessMemory_{{ i }}(__global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE_UNSIGNED volatile * restrict data, + __constant {{ kernel_param_attributes[i] }} const DEVICE_DATA_TYPE_UNSIGNED * restrict random_init, const DEVICE_DATA_TYPE_UNSIGNED m, const DEVICE_DATA_TYPE_UNSIGNED data_chunk, const uint num_cache_operations, @@ -190,4 +185,4 @@ void accessMemory_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attribut } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/STREAM/src/device/stream_kernels.cl b/STREAM/src/device/stream_kernels.cl index cd569727..c8a99e2b 100644 --- a/STREAM/src/device/stream_kernels.cl +++ b/STREAM/src/device/stream_kernels.cl @@ -6,11 +6,11 @@ KERNEL_NUMBER will be replaced by the build script with the ID of the current re */ #include "parameters.h" -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} __kernel __attribute__((uses_global_work_offset(0))) -void copy_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, +void copy_{{ i }}(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const uint array_size) { uint number_elements = array_size / VECTOR_COUNT; @@ -22,7 +22,7 @@ void copy_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __kernel __attribute__((uses_global_work_offset(0))) -void add_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, +void add_{{ i }}(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, __global const DEVICE_ARRAY_DATA_TYPE * restrict in2, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const uint array_size) { @@ -35,7 +35,7 @@ void add_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, __kernel __attribute__((uses_global_work_offset(0))) -void scale_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, +void scale_{{ i }}(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const DEVICE_SCALAR_DATA_TYPE scalar, const uint array_size) { @@ -48,7 +48,7 @@ void scale_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in __kernel 
__attribute__((uses_global_work_offset(0))) -void triad_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, +void triad_{{ i }}(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, __global const DEVICE_ARRAY_DATA_TYPE * restrict in2, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const DEVICE_SCALAR_DATA_TYPE scalar, @@ -60,4 +60,4 @@ void triad_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/b_eff/src/device/communication_bw520n_IEC.cl b/b_eff/src/device/communication_bw520n_IEC.cl index ce128d8c..e3d61b74 100644 --- a/b_eff/src/device/communication_bw520n_IEC.cl +++ b/b_eff/src/device/communication_bw520n_IEC.cl @@ -49,17 +49,17 @@ typedef struct { /** * Definition of the external channels */ - // PY_CODE_GEN block_start [replace(local_variables=locals()) for r in range(num_replications)] -channel message_part ch_out_/*PY_CODE_GEN 2*r+1*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(r % 4) + "\""*/))); -channel message_part ch_out_/*PY_CODE_GEN 2*r+2*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str((r + 2) % 4) + "\""*/))); -channel message_part ch_in_/*PY_CODE_GEN 2*r+1*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(r % 4) + "\""*/))); -channel message_part ch_in_/*PY_CODE_GEN 2*r+2*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str((r + 2) % 4) + "\""*/))); -channel message_part ch_exchange/*PY_CODE_GEN 2*r+1*/; -channel message_part ch_exchange/*PY_CODE_GEN 2*r+2*/; -// PY_CODE_GEN block_end +{% for i in range(num_replications) %} +channel message_part ch_out_{{ 2*i+1 }} __attribute((io({{ "\"kernel_output_ch{}\"".format(i % 4) }}))); +channel message_part ch_out_{{ 2*i+2 }} __attribute((io({{ "\"kernel_output_ch{}\"".format((i + 2) % 4) }}))); +channel message_part ch_in_{{ 2*i+1 }} __attribute((io({{ "\"kernel_input_ch{}\"".format(i % 4) }}))); +channel message_part ch_in_{{ 2*i+2 }} __attribute((io({{ "\"kernel_input_ch{}\"".format((i + 2) % 4) }}))); +channel message_part ch_exchange{{ 2*i+1 }}; +channel message_part ch_exchange{{ 2*i+2 }}; +{% endfor %} -// PY_CODE_GEN block_start [replace(local_variables=locals()) for r in range(num_replications)] +{% for i in range(num_replications) %} /** * Send kernel that will send messages through two channels * @@ -68,7 +68,7 @@ channel message_part ch_exchange/*PY_CODE_GEN 2*r+2*/; */ __kernel __attribute__ ((max_global_work_dim(0))) -void send/*PY_CODE_GEN r*/(const unsigned data_size, +void send{{ i }}(const unsigned data_size, const unsigned repetitions) { const unsigned send_iterations = ((1 << data_size) + 2 * ITEMS_PER_CHANNEL - 1) / (2 * ITEMS_PER_CHANNEL); message_part send_part1; @@ -85,13 +85,13 @@ void send/*PY_CODE_GEN r*/(const unsigned data_size, for (unsigned i=0; i < repetitions; i++) { // Send a single message sent over two channels split into multiple chunks for (unsigned k=0; k < send_iterations; k++) { - write_channel_intel(ch_out_/*PY_CODE_GEN 2*r+1*/, send_part1); - write_channel_intel(ch_out_/*PY_CODE_GEN 2*r+2*/, send_part2); + write_channel_intel(ch_out_{{ 2*i+1 }}, send_part1); + write_channel_intel(ch_out_{{ 2*i+2 }}, send_part2); } #ifndef EMULATE // Introduce data dependency between loop iterations to prevent coalescing of loop - send_part1 = read_channel_intel(ch_exchange/*PY_CODE_GEN 2*r+1*/); - send_part2 = read_channel_intel(ch_exchange/*PY_CODE_GEN 2*r+2*/); + send_part1 = read_channel_intel(ch_exchange{{ 2*i+1 }}); + send_part2 = 
read_channel_intel(ch_exchange{{ 2*i+2 }});
 #endif
     }
 }

@@ -106,7 +106,7 @@ void send/*PY_CODE_GEN r*/(const unsigned data_size,
  */
 __kernel
 __attribute__ ((max_global_work_dim(0)))
-void recv/*PY_CODE_GEN r*/(__global DEVICE_DATA_TYPE* validation_buffer,
+void recv{{ i }}(__global DEVICE_DATA_TYPE* validation_buffer,
               const unsigned data_size,
               const unsigned repetitions) {
     const unsigned send_iterations = ((1 << data_size) + 2 * ITEMS_PER_CHANNEL - 1) / (2 * ITEMS_PER_CHANNEL);
@@ -117,14 +117,14 @@ void recv/*PY_CODE_GEN r*/(__global DEVICE_DATA_TYPE* validation_buffer,
     for (unsigned i=0; i < repetitions; i++) {
         // Receive a single message sent over two channels split into multiple chunks
        for (unsigned k=0; k < send_iterations; k++) {
-            recv_part1 = read_channel_intel(ch_in_/*PY_CODE_GEN 2*r+1*/);
-            recv_part2 = read_channel_intel(ch_in_/*PY_CODE_GEN 2*r+2*/);
+            recv_part1 = read_channel_intel(ch_in_{{ 2*i+1 }});
+            recv_part2 = read_channel_intel(ch_in_{{ 2*i+2 }});
         }
 #ifndef EMULATE
         // Introduce data dependency between loop iterations to prevent coalescing of loop
         // by sending the data to the send kernel
-        write_channel_intel(ch_exchange/*PY_CODE_GEN 2*r+1*/, recv_part1);
-        write_channel_intel(ch_exchange/*PY_CODE_GEN 2*r+2*/, recv_part2);
+        write_channel_intel(ch_exchange{{ 2*i+1 }}, recv_part1);
+        write_channel_intel(ch_exchange{{ 2*i+2 }}, recv_part2);
 #endif
     }

@@ -139,4 +139,4 @@ void recv/*PY_CODE_GEN r*/(__global DEVICE_DATA_TYPE* validation_buffer,
     }
 }

-//PY_CODE_GEN block_end
+{% endfor %}
diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake
index 0a3f8d66..64aa8d0a 100644
--- a/cmake/general_benchmark_build_setup.cmake
+++ b/cmake/general_benchmark_build_setup.cmake
@@ -214,7 +214,7 @@ if (INTELFPGAOPENCL_FOUND)
     separate_arguments(AOC_FLAGS)
 endif()

-set(CODE_GENERATOR "${CMAKE_SOURCE_DIR}/../scripts/code_generator/generator2.py" CACHE FILEPATH "Path to the code generator executable")
+set(CODE_GENERATOR "${CMAKE_SOURCE_DIR}/../scripts/code_generator/generator.py" CACHE FILEPATH "Path to the code generator executable")
diff --git a/scripts/code_generator/README.md b/scripts/code_generator/README.md
index 20730682..2847e7ac 100644
--- a/scripts/code_generator/README.md
+++ b/scripts/code_generator/README.md
@@ -81,7 +81,7 @@ As an example the dynamic construction of a switch statement:

     switch(i) {
     // PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(replicate)]
-    case /*PY_CODE_GEN i*/: return /*PY_CODE_GEN i+1*/; break;
+    case {{ i }}: return /*PY_CODE_GEN i+1*/; break;
     // PY_CODE_GEN block_end
     }

@@ -94,7 +94,7 @@ would result in:
     case 3: return 4; break;
     }

-Note, that the variables that have to be replaced are written in inline comments `/*PY_CODE_GEN i*/`.
+Note that the variables that have to be replaced are now written as template expressions like `{{ i }}`.
 The given statement will be evaluated and the comment will be replaced by the result. Thus, it is
 also possible to call functions or do arithmetic. 
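For context on the rewrite below: the old generator evaluated PY_CODE_GEN comment pragmas with eval/exec, while the new generator hands the whole file to Jinja2 and only injects a few globals before rendering. A minimal sketch of that rendering model, assuming only that the jinja2 package is installed; the template text and kernel name are illustrative and not taken from the repository:

    from jinja2 import Template

    # Hypothetical replicated-kernel template in the same style as the .cl files above.
    kernel_source = Template("""
    {% for i in range(num_replications) %}
    __kernel void copy_{{ i }}(__global const float *restrict in,
                               __global float *restrict out) { /* ... */ }
    {% endfor %}
    """)

    # Rendering with num_replications=2 emits copy_0 and copy_1, the same effect
    # the PY_CODE_GEN block_start/block_end pragmas used to produce via exec().
    print(kernel_source.render(num_replications=2))
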
diff --git a/scripts/code_generator/generator.py b/scripts/code_generator/generator.py old mode 100755 new mode 100644 index 7b27ee93..b0452121 --- a/scripts/code_generator/generator.py +++ b/scripts/code_generator/generator.py @@ -1,49 +1,33 @@ -#!/usr/bin/env python3 -# -# Copyright (c) 2019 Marius Meyer -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal in -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -# of the Software, and to permit persons to whom the Software is furnished to do -# so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -## - import argparse -import itertools import sys import logging -import re - - -comment_symbol = "//" -ml_comment_symbol_start = "/*" -ml_comment_symbol_end = "*/" -pycodegen_cmd = "PY_CODE_GEN" -pragma_cmd = comment_symbol +"\\s*"+ pycodegen_cmd +from jinja2 import Environment, PackageLoader, BaseLoader, TemplateNotFound, select_autoescape +from os.path import join, exists, getmtime parser = argparse.ArgumentParser(description='Preprocessor for code replication and advanced code modification.') parser.add_argument('file', metavar='CODE_FILE', type=str, help='Path to the file that is used as input') parser.add_argument("-o", dest="output_file", default=None, help="Path to the output file. If not given, output will printed to stdout.") -parser.add_argument("--comment", dest="comment_symbol", default=comment_symbol, help="Symbols that are used to comment out lines in the target language. Default='%s'" % comment_symbol) -parser.add_argument("--comment-ml-start", dest="comment_symbol_ml_start", default=ml_comment_symbol_start, help="Symbols that are used to start a multi line comment in the target language. Default='%s'" % ml_comment_symbol_start) -parser.add_argument("--comment-ml-end", dest="comment_symbol_ml_end", default=ml_comment_symbol_end, help="Symbols that are used to end a multi line comment in the target language. Default='%s'" % ml_comment_symbol_end) parser.add_argument("-p", dest="params", default=[], action="append", help="Python statement that is parsed before modifying the files. 
Can be used to define global variables.") -CODE = "" +# create a simple loader to load templates from the file system +class SimpleLoader(BaseLoader): + def __init__(self, path): + self.path = path + + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path) as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) + +env = Environment( + loader=SimpleLoader("./"), + autoescape=select_autoescape() +) def use_file(file_name): """ @@ -67,124 +51,50 @@ def use_file(file_name): print("Error while parsing external file. See logs for more information.",file=sys.stderr) exit(1) +if __name__ == '__main__': + args = parser.parse_args() -def replace(code_block=None, local_variables=None): - """ - Evaluate or execute inline code and replace the code with the result. - - @param code_block The input code block that will be parsed and modified - @param local_variables A dictionary containing local variables that should also be considered (like locals()) - - @return the modified code - """ - global CODE - if not code_block: - code_block = CODE - if local_variables is not None: - variables = {**globals(), **local_variables} + if args.output_file: + log_file_name = args.output_file + ".log" else: - variables = globals() - matches = itertools.chain(re.finditer("%s\\s*%s\\s+(?P(.|\n)+?)%s" % (ml_comment_symbol_start, pycodegen_cmd, ml_comment_symbol_end), code_block, flags=0), - re.finditer("%s\\s+(?!block_start\\s+)(?!block_end\\s+)(?P(.)+?)\n" % (pragma_cmd), code_block, flags=0)) - for res_ml in matches: - res_ml_code = res_ml.group(0) - try: - evaluated = str(eval(res_ml.groupdict()["code"], variables)) - code_block = code_block.replace(res_ml_code, evaluated) - logging.debug("Evaluated '%s' to '%s'" % (res_ml.groupdict()["code"], evaluated)) - continue - except Exception as e: - logging.debug("Failed to evaluate inline code") - try: - exec(res_ml.groupdict()["code"], globals()) - code_block = code_block.replace(res_ml_code, "") - logging.debug("Executed in global space: '%s'" % res_ml.groupdict()["code"]) - except Exception as e: - logging.warning("Could not execute inline code:\n\tCommand: '''\n%s\n'''\n\tError: %s" % (res_ml.groupdict()["code"], e)) - return code_block - - -def modify_block(code_block, cmd_str, out): - global CODE - CODE = code_block - if cmd_str == "": - cmd_str = "None" - try: - mod_code = eval(cmd_str, {**globals(), **locals()}) - except Exception as e: - logging.error("Block: %s \n %s" % (code_block, e)) - logging.error("Global variables: %s" % globals()) - print( "Block: %s \n %s" % (code_block, e),file=sys.stderr) - exit(1) - if type(mod_code) is list: - mod_code = "".join(mod_code) - elif mod_code is None: - mod_code = "" - elif type(mod_code) is not str: - logging.warning("%s is not a string. Automatic convert to string!" 
% mod_code)
-        mod_code = str(mod_code)
-    return mod_code
-    #logging.debug("Start parsing of modified sub-block")
-    #parse_string(mod_code, out)
-    #logging.debug("Finished parsing of modified sub-block")
-
-
-def parse_string(code_string, out):
-    try:
-        code_string = replace(code_string)
-        for res in re.finditer("%s\\s+block_start\\s+(?P<cmd>.*)\n(?P<code>(.|\n)+?)%s\\s+block_end\\s*\n" % (pragma_cmd, pragma_cmd), code_string, flags=0):
-            logging.debug("Found block match!")
-            d = res.groupdict()
-            code_block = d["code"]
-            logging.debug("Modify the block!")
-            code_block = modify_block(code_block, d["cmd"], out)
-            code_string = code_string.replace(res.group(0), code_block)
-        logging.debug("Parsing complete. Write result to file.")
-        output.write(code_string)
-    except Exception as e:
-        logging.error("Block: %s \n %s" % (code_string, e))
-        logging.error("Global variables: %s" % globals())
-        logging.error("Local variables: %s" % locals())
-        print( "Error while parsing code block: %s \n %s" % (e),file=sys.stderr)
+    log_file_name = "generator.log"
+    logging.basicConfig(filename=log_file_name, filemode='w', level=logging.DEBUG)
+    if not args.file:
+        logging.debug('no input file given')
+        exit(1)
+    if not args.output_file:
+        logging.debug('no output file given')
+        exit(1)
+    for p in args.params:
+        logging.debug("Parse statement: %s" % p)
+        exec(p, globals())

-def parse_file(file_name, out):
-    """
-    Opens a single source code file and applies the changes to it.
+    template = env.get_template(args.file)

-    The function will output the modified source code into the given output stream.
+    try:
+        template.globals.update({"generate_attributes": generate_attributes})
+    except NameError:
+        generate_attributes = lambda r : ["" for i in range(r)]
+        template.globals.update({"generate_attributes": generate_attributes})

-    @param file_name The path to the source code file relative to the current working directory
-    @param out Output stream that is used to output the modified source code
-    """
     try:
-        with open(file_name) as f:
-            parse_string(f.read(), out)
-    except Exception as e:
-        logging.error("Error when opening and parsing file %s: %s" % (file_name, e))
-        print("Error occurred when parsing file. 
See logs for more details.",file=sys.stderr) + template.globals.update({"generate_map_attributes": generate_map_attributes}) + except: + generate_map_attributes = lambda r : [{"a": "", "b": "", "c": "", "out": ""} for i in range(r)] + template.globals.update({"generate_map_attributes": generate_map_attributes}) + try: + template.globals.update({"generate_bi_map_attributes": generate_bi_map_attributes}) + except: + generate_bi_map_attributes = lambda r : [{"in": "", "out": ""} for i in range(r)] + template.globals.update({"generate_bi_map_attributes": generate_bi_map_attributes}) + if num_replications is None: + num_replications = 1 + if num_total_replications is None: + num_total_replications = 1 -if __name__=="__main__": - args = parser.parse_args() - if args.output_file: - log_file_name = args.output_file + ".log" - else: - log_file_name = "generator.log" - logging.basicConfig(filename=log_file_name, filemode='w', level=logging.DEBUG) - output = sys.stdout - for p in args.params: - logging.debug("Parse statement: %s" % p) - exec(p, globals()) - if args.output_file: - logging.debug("Use output file: %s" % args.output_file) - output = open(args.output_file, 'w') - comment_symbol = re.escape(args.comment_symbol) - ml_comment_symbol_start = re.escape(args.comment_symbol_ml_start) - ml_comment_symbol_end = re.escape(args.comment_symbol_ml_end) - pragma_cmd = comment_symbol +"\\s*"+ pycodegen_cmd - logging.debug("Use pragma command: %s", pragma_cmd) - logging.debug("Start parsing file: %s" % args.file) - parse_file(args.file, output) + with open(args.output_file, 'w') as f: + f.write(template.render(num_replications=num_replications, num_total_replications=num_total_replications)) \ No newline at end of file diff --git a/scripts/code_generator/generator2.py b/scripts/code_generator/generator2.py deleted file mode 100644 index 09a0c142..00000000 --- a/scripts/code_generator/generator2.py +++ /dev/null @@ -1,55 +0,0 @@ -import argparse -import sys -from jinja2 import Environment, PackageLoader, BaseLoader, TemplateNotFound, select_autoescape -from os.path import join, exists, getmtime - -parser = argparse.ArgumentParser(description='Preprocessor for code replication and advanced code modification.') -parser.add_argument('file', metavar='CODE_FILE', type=str, - help='Path to the file that is used as input') -parser.add_argument("-o", dest="output_file", default=None, help="Path to the output file. If not given, output will printed to stdout.") -parser.add_argument("-p", dest="params", default=[], action="append", help="Python statement that is parsed before modifying the files. 
Can be used to define global variables.") - -# create a simple loader to load templates from the file system -class SimpleLoader(BaseLoader): - def __init__(self, path): - self.path = path - - def get_source(self, environment, template): - path = join(self.path, template) - if not exists(path): - raise TemplateNotFound(template) - mtime = getmtime(path) - with open(path) as f: - source = f.read() - return source, path, lambda: mtime == getmtime(path) - -env = Environment( - loader=SimpleLoader("./"), - autoescape=select_autoescape() -) - -if __name__ == '__main__': - args = parser.parse_args() - if not args.file: - print('no input file given') - exit(1) - if not args.output_file: - print('no output file given') - exit(1) - for p in args.params: - print("Parse statement: %s" % p) - exec(p, globals()) - - template = env.get_template(args.file) - - try: - template.globals.update({"generate_attributes": generate_attributes}) - except: - generate_attributes = lambda r : ["" for i in range(r)] - template.globals.update({"generate_attributes": generate_attributes}) - - if num_replications is None: - num_replications = 1 - - with open(args.output_file, 'w') as f: - f.write(template.render(num_replications=num_replications)) \ No newline at end of file From 016a19c7bda3f7e0e38115839b41fcfdc716b70d Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 30 Mar 2022 09:15:12 +0200 Subject: [PATCH 184/318] do not use format in template --- PTRANS/src/device/transpose_DIAG_IEC.cl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PTRANS/src/device/transpose_DIAG_IEC.cl b/PTRANS/src/device/transpose_DIAG_IEC.cl index 94077736..a5ab3a03 100644 --- a/PTRANS/src/device/transpose_DIAG_IEC.cl +++ b/PTRANS/src/device/transpose_DIAG_IEC.cl @@ -18,8 +18,8 @@ typedef struct { {% for i in range(num_total_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(i) }}), depth(1))); -channel ch_data chan_a_in{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2 * (i // 2) + ((i + 1) % 2)) }}), depth(1))); +channel ch_data chan_a_out{{ i }} __attribute((io("kernel_output_ch{{ i }}"), depth(1))); +channel ch_data chan_a_in{{ i }} __attribute((io("kernel_input_ch{{ (2 * (i // 2) + ((i + 1) % 2)) }}"), depth(1))); {% endfor %} #endif From 1612e26562f254d39cd9b3449e00cec833c58ea4 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 30 Mar 2022 09:15:57 +0200 Subject: [PATCH 185/318] move default attributes to template --- .../settings.gen.intel.fft1d_float_8.hbm.py | 2 +- .../settings.gen.intel.fft1d_float_8.svm.py | 2 +- FFT/src/device/fft1d_float_8.cl | 11 +++++++++-- .../settings.gen.intel.gemm_base.520n_mx.py | 2 +- GEMM/settings/settings.gen.intel.gemm_base.hbm.py | 2 +- GEMM/src/device/gemm_base.cl | 10 +++++++++- STREAM/src/device/stream_kernels_single.cl | 10 +++++++++- scripts/code_generator/generator.py | 15 +-------------- 8 files changed, 32 insertions(+), 22 deletions(-) diff --git a/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py b/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py index c72f4081..b4775387 100644 --- a/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py +++ b/FFT/settings/settings.gen.intel.fft1d_float_8.hbm.py @@ -1,7 +1,7 @@ global_memory_name = "HBM" -def generate_bi_map_attributes(num_replications, num_global_memory_banks=32): +def generate_attributes(num_replications, num_global_memory_banks=32): """ Generates the kernel attributes for the global memory. 
They specify in which global memory the buffer is located. The buffers will be placed using a diff --git a/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py b/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py index 2cb14bde..86e3cc3a 100644 --- a/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py +++ b/FFT/settings/settings.gen.intel.fft1d_float_8.svm.py @@ -1,5 +1,5 @@ -def generate_bi_map_attributes(num_replications, num_global_memory_banks=32): +def generate_attributes(num_replications, num_global_memory_banks=32): """ Generates the kernel attributes for the global memory. They specify in which global memory the buffer is located. The buffers will be placed using a diff --git a/FFT/src/device/fft1d_float_8.cl b/FFT/src/device/fft1d_float_8.cl index 9cfe70c7..763399cf 100644 --- a/FFT/src/device/fft1d_float_8.cl +++ b/FFT/src/device/fft1d_float_8.cl @@ -51,8 +51,15 @@ // code generation expects an array of maps of size num_replications with the keys "in" and "out". // The value of the keys have to be strings containing the attributes that // have to be assigned to input and output buffers in global memory -{% set kernel_param_attributes = generate_bi_map_attributes(num_replications) %} - +{% macro list(content, count) -%} + [{% for i in range(count) %} content {% if not loop.last %}, {% endif %} {% endfor %} +{%- endmacro %} + +{% if generate_attributes is defined %} + {% set kernel_param_attributes = generate_attributes(num_replications) %} +{% else %} + {% set kernel_param_attributes = list({"in": "", "out": ""}, num_replications) %} +{% endif %} #define min(a,b) (a Date: Wed, 30 Mar 2022 12:45:56 +0200 Subject: [PATCH 186/318] python list comprehension is better than jinja macros --- FFT/src/device/fft1d_float_8.cl | 6 +----- GEMM/src/device/gemm_base.cl | 6 +----- STREAM/src/device/stream_kernels_single.cl | 6 +----- scripts/code_generator/generator.py | 5 +++++ 4 files changed, 8 insertions(+), 15 deletions(-) diff --git a/FFT/src/device/fft1d_float_8.cl b/FFT/src/device/fft1d_float_8.cl index 763399cf..ba451fa8 100644 --- a/FFT/src/device/fft1d_float_8.cl +++ b/FFT/src/device/fft1d_float_8.cl @@ -51,14 +51,10 @@ // code generation expects an array of maps of size num_replications with the keys "in" and "out". 
// The value of the keys have to be strings containing the attributes that // have to be assigned to input and output buffers in global memory -{% macro list(content, count) -%} - [{% for i in range(count) %} content {% if not loop.last %}, {% endif %} {% endfor %} -{%- endmacro %} - {% if generate_attributes is defined %} {% set kernel_param_attributes = generate_attributes(num_replications) %} {% else %} - {% set kernel_param_attributes = list({"in": "", "out": ""}, num_replications) %} + {% set kernel_param_attributes = create_list({"in": "", "out": ""}, num_replications) %} {% endif %} #define min(a,b) (a Date: Wed, 30 Mar 2022 13:04:50 +0200 Subject: [PATCH 187/318] use templating instead of format everywhere --- PTRANS/src/device/transpose_PQ_IEC.cl | 4 ++-- PTRANS/src/device/transpose_c2_DIAG_IEC.cl | 8 ++++---- b_eff/src/device/communication_bw520n_IEC.cl | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/PTRANS/src/device/transpose_PQ_IEC.cl b/PTRANS/src/device/transpose_PQ_IEC.cl index 5bce8ab7..9bfb6485 100644 --- a/PTRANS/src/device/transpose_PQ_IEC.cl +++ b/PTRANS/src/device/transpose_PQ_IEC.cl @@ -18,8 +18,8 @@ typedef struct { {% for i in range(num_total_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(i) }}), depth(1))); -channel ch_data chan_a_in{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2 * (i // 2) + ((i + 1) % 2)) }}), depth(1))); +channel ch_data chan_a_out{{ i }} __attribute((io("kernel_output_ch{{ i }}"), depth(1))); +channel ch_data chan_a_in{{ i }} __attribute((io("kernel_input_ch{{ 2 * (i // 2) + ((i + 1) % 2) }}"), depth(1))); {% endfor %} #endif diff --git a/PTRANS/src/device/transpose_c2_DIAG_IEC.cl b/PTRANS/src/device/transpose_c2_DIAG_IEC.cl index cf2455e7..a40d6bb0 100644 --- a/PTRANS/src/device/transpose_c2_DIAG_IEC.cl +++ b/PTRANS/src/device/transpose_c2_DIAG_IEC.cl @@ -18,10 +18,10 @@ typedef struct { {% for i in range(num_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out1{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(2*i) }}), depth(1))); -channel ch_data chan_a_out2{{ i }} __attribute((io({{ "\"kernel_output_ch{}\"".format(2*i + 1) }}), depth(1))); -channel ch_data chan_a_in1{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2*i + 1) }}), depth(1))); -channel ch_data chan_a_in2{{ i }} __attribute((io({{ "\"kernel_input_ch{}\"".format(2*i) }}), depth(1))); +channel ch_data chan_a_out1{{ i }} __attribute((io("kernel_output_ch{{ 2*i }}"), depth(1))); +channel ch_data chan_a_out2{{ i }} __attribute((io("kernel_output_ch{{ 2*i + 1 }}"), depth(1))); +channel ch_data chan_a_in1{{ i }} __attribute((io("kernel_input_ch{{ 2*i + 1 }}"), depth(1))); +channel ch_data chan_a_in2{{ i }} __attribute((io("kernel_input_ch{{ 2*i }}"), depth(1))); {% endfor %} #endif diff --git a/b_eff/src/device/communication_bw520n_IEC.cl b/b_eff/src/device/communication_bw520n_IEC.cl index e3d61b74..f5dcefe8 100644 --- a/b_eff/src/device/communication_bw520n_IEC.cl +++ b/b_eff/src/device/communication_bw520n_IEC.cl @@ -50,12 +50,12 @@ typedef struct { * Definition of the external channels */ {% for i in range(num_replications) %} -channel message_part ch_out_{{ 2*i+1 }} __attribute((io({{ "\"kernel_output_ch{}\"".format(i % 4) }}))); -channel message_part ch_out_{{ 2*i+2 }} __attribute((io({{ "\"kernel_output_ch{}\"".format((i + 2) % 4) }}))); -channel message_part ch_in_{{ 
2*i+1 }} __attribute((io({{ "\"kernel_input_ch{}\"".format(i % 4) }}))); -channel message_part ch_in_{{ 2*i+2 }} __attribute((io({{ "\"kernel_input_ch{}\"".format((i + 2) % 4) }}))); -channel message_part ch_exchange{{ 2*i+1 }}; -channel message_part ch_exchange{{ 2*i+2 }}; +channel message_part ch_out_{{ 2*i + 1 }} __attribute((io("kernel_output_ch{{ i % 4 }}"))); +channel message_part ch_out_{{ 2*i + 2 }} __attribute((io("kernel_output_ch{{ (i + 2) % 4 }}"))); +channel message_part ch_in_{{ 2*i + 1 }} __attribute((io("kernel_input_ch{{ i % 4 }} "))); +channel message_part ch_in_{{ 2*i + 2 }} __attribute((io("kernel_input_ch{{ (i + 2) % 4 }}"))); +channel message_part ch_exchange{{ 2*i + 1 }}; +channel message_part ch_exchange{{ 2*i + 2 }}; {% endfor %} From 25b6afaeb501ffeeb1eef01a5ea15562a2950b1f Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Thu, 10 Nov 2022 19:04:34 +0100 Subject: [PATCH 188/318] add default attributes for PTRANS and RandomAccess --- PTRANS/src/device/transpose_PQ_PCIE.cl | 9 +++++++-- RandomAccess/src/device/random_access_kernels_single.cl | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cl b/PTRANS/src/device/transpose_PQ_PCIE.cl index caa20143..3fccf79a 100644 --- a/PTRANS/src/device/transpose_PQ_PCIE.cl +++ b/PTRANS/src/device/transpose_PQ_PCIE.cl @@ -8,7 +8,11 @@ #include "parameters.h" -{% set kernel_param_attributes = generate_attributes(num_replications) %} +{% if generate_attributes is defined %} + {% set kernel_param_attributes = generate_attributes(num_replications) %} +{% else %} + {% set kernel_param_attributes = create_list("", num_replications) %} +{% endif %} {% for i in range(num_replications) %} @@ -35,7 +39,8 @@ __kernel void transpose{{ i }}(__global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict A, __global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict B, __global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict A_out, - const uint offset, + const uint offset_a, + const uint offset_b, const uint number_of_blocks, const uint width_in_blocks, const uint height_in_blocks) { diff --git a/RandomAccess/src/device/random_access_kernels_single.cl b/RandomAccess/src/device/random_access_kernels_single.cl index 16637065..5ebc1376 100644 --- a/RandomAccess/src/device/random_access_kernels_single.cl +++ b/RandomAccess/src/device/random_access_kernels_single.cl @@ -34,7 +34,11 @@ Constant used to update the pseudo random number #define BLOCK_SIZE_LOG GLOBAL_MEM_UNROLL_LOG #define BLOCK_SIZE (1 << BLOCK_SIZE_LOG) -{% set kernel_param_attributes = generate_attributes(num_replications) %} +{% if generate_attributes is defined %} + {% set kernel_param_attributes = generate_attributes(num_replications) %} +{% else %} + {% set kernel_param_attributes = create_list("", num_replications) %} +{% endif %} {% for i in range(num_replications) %} From 97b0c829e1f4e9e97fbf30854eee71504fa88d27 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Thu, 17 Nov 2022 19:27:10 +0100 Subject: [PATCH 189/318] add build:docs pipeline --- .gitlab-ci.yml | 21 ++++++++++++++++++++- scripts/code_generator/requirements.txt | 1 + scripts/evaluation/requirements.txt | 2 +- 3 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 scripts/code_generator/requirements.txt diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 40ca7a1f..6d05430b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -10,7 +10,8 @@ default: - jacamar before_script: - module load fpga/intel/opencl_sdk/21.2.0 
fpga/bittware/520n/20.4.0_max toolchain/foss/2021a devel/CMake/3.20.1-GCCcore-10.3.0 lang/Python/3.9.5-GCCcore-10.3.0 - - python -m pip install pandas + - python -m pip install -r scripts/evaluation/requirements.txt + - python -m pip install -r scripts/code_generator/requirements.txt ### # @@ -18,6 +19,23 @@ default: # ### +build:docs: + stage: build + script: + - python -m pip install -r docs/requirements.txt + - module load devel/Doxygen/1.9.1-GCCcore-10.3.0 + - cd docs + - make html + - doxygen doxy.config + only: + changes: + - docs/**/* + - .gitlab-ci.yml + artifacts: + paths: + - docs/build + - docs/xml + build:STREAM: stage: build script: @@ -174,6 +192,7 @@ build:LINPACK_DP: - shared/**/* - scripts/**/* - cmake/**/* + - .gitlab-ci.yml build:GEMM: stage: build diff --git a/scripts/code_generator/requirements.txt b/scripts/code_generator/requirements.txt new file mode 100644 index 00000000..ea18cd6f --- /dev/null +++ b/scripts/code_generator/requirements.txt @@ -0,0 +1 @@ +jinja2==2.11.3 diff --git a/scripts/evaluation/requirements.txt b/scripts/evaluation/requirements.txt index f9ccbaa9..efd4927b 100644 --- a/scripts/evaluation/requirements.txt +++ b/scripts/evaluation/requirements.txt @@ -1 +1 @@ -pandas==0.23.3 +pandas==1.4.3 From 7eb91bf26982c6e331b028240a5cf3f9982af324 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 18 Nov 2022 13:01:53 +0100 Subject: [PATCH 190/318] update code_generator README.md --- scripts/code_generator/README.md | 91 ++++++++++---------------------- 1 file changed, 29 insertions(+), 62 deletions(-) diff --git a/scripts/code_generator/README.md b/scripts/code_generator/README.md index 2847e7ac..9ff4d8ab 100644 --- a/scripts/code_generator/README.md +++ b/scripts/code_generator/README.md @@ -4,85 +4,57 @@ This is a small and highly extendable Python script for Code generation. The main application area is the generation of OpenCL code, but the generator works independently of the used programming language. It can be seen as an extension of the usually used preprocessors to adapt the code before compilation. With this code it is also possible to replicate code sections and do more complex modifications while keeping the code readable. -This is done using inline scripting in code comments. -A generator code line always starts with `PY_CODE_GEN`. +This is done using the [jinja templating engine](https://jinja.palletsprojects.com/en/3.1.x/). ## Execution -The script needs Python3 to run. +The script needs Python3 with the module "jinja2" to run. It will be used by the CMake build system to generate source code and settings for some of the benchmarks. A short summary of the usage of the script that can also be printed by running `./generator.py -h`: - usage: generator.py [-h] [-o OUTPUT_FILE] [--comment COMMENT_SYMBOL] - [--comment-ml-start COMMENT_SYMBOL_ML_START] - [--comment-ml-end COMMENT_SYMBOL_ML_END] [-p PARAMS] - CODE_FILE + usage: generator.py [-h] [-o OUTPUT_FILE] [-p PARAMS] CODE_FILE - Preprocessor for code replication and advanced code modification. + Preprocessor for code replication and advanced code modification using jinja. positional arguments: - CODE_FILE Path to the file that is used as input + CODE_FILE Path to the file that is used as input optional arguments: - -h, --help show this help message and exit - -o OUTPUT_FILE Path to the output file. If not given, output will - printed to stdout. - --comment COMMENT_SYMBOL - Symbols that are used to comment out lines in the - target language. 
Default='//'
-    --comment-ml-start COMMENT_SYMBOL_ML_START
-                        Symbols that are used to start a multi line comment in
-                        the target language. Default='/*'
-    --comment-ml-end COMMENT_SYMBOL_ML_END
-                        Symbols that are used to end a multi line comment in
-                        the target language. Default='*/'
-    -p PARAMS             Python statement that is parsed before modifying the
-                        files. Can be used to define global variables.
-
+      -h, --help      show this help message and exit
+      -o OUTPUT_FILE  Path to the output file. If not given, output will be printed
+                      to stdout.
+      -p PARAMS       Python statement that is parsed before modifying the files.
+                      Can be used to define global variables.
 
 ## Code Examples
 
-The generator takes arbitrary code files as input and only applies changes when specific comment patterns are found.
+The generator takes arbitrary code files as input and only applies changes when the specific jinja templating syntax is used.
 The code insertions have the following syntax:
 
-    // PY_CODE_GEN [block_start STATEMENT|block_end|STATEMENT]
-
-it is also possible to write multiple lines of code:
-
-    /* PY_CODE_GEN
-    STATEMENT1
-    STATEMENT2
-    ...
-    */
-
-Where `STATEMENT`is an arbitrary python statement.
-The input file will be parsed from the beginning to the end and generation statements will be executed immediately.
 Example for the definition of a global variable:
 
-    PY_CODE_GEN replicate=4
+    {% set replicate = 4 %}
 
 This variable can then be used within the following template statements to further modify the code.
 E.g. the defined variable can be used to modify a code block:
 
-    // PY_CODE_GEN block_start CODE.replace("$R", str(replicate))
-    int i = $R;
-    printf("i should be $R");
-    // PY_CODE_GEN block_end
-
-`CODE` is a global variable containing the code within the recent block. It can be modified like every other Python string.
-In most cases it is recommended to use the build-in function `replace()` for replacing variables, but it might be used for more advanced code modifications.
-The result of the given Python statement will then be printed in the modified file.
+    int i = {{ replicate }};
+    printf("i should be {{ replicate }}");
 
 This is functionality which would also be possible using the standard preprocessor.
 A case where this script becomes handy is code replication.
-This can easily be doe using list comprehension.
+This can easily be done using the for-syntax, similar to list comprehension.
 As an example the dynamic construction of a switch statement:
 
     switch(i)
     {
-    // PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(replicate)]
+    {% for i in range(replicate) %}
     case {{ i }}: return {{ i + 1 }}; break;
-    // PY_CODE_GEN block_end
+    {% endfor %}
     }
 
 would result in:
@@ -94,25 +66,20 @@ would result in:
     case 3: return 4; break;
     }
 
-Note, that the variables that have to be replaced are written in inline comments `{{ i }}`.
+Note that the variables that have to be replaced are written in double curly braces `{{ i }}`.
 The given expression will be evaluated and replaced by the result.
 Thus, it is also possible to call functions or do arithmetic.
 
 ## Built-In Functions
 
-The generator can easily be extended by including additional file with the `use_file(FILENAME)` command.
-
-    PY_CODE_GEN use_file(helpers.py)
-
-This will read the file and make all functions and global variables available within following blocks.
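+A typical invocation is sketched below (the output file name is illustrative); template variables such as `num_replications` can be set directly on the command line:
+
+    ./generator.py STREAM/src/device/stream_kernels_single.cl -o stream_kernels_single_gen.cl -p "num_replications=4"
+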
+It is possible to insert variables or function definitions with the -p parameter, but they need to be defined explicitly in the script itself to be available in the template engine.
 
-`replace()` makes it easier to replace global variables within the code:
+To make functions accessible, the `globals` dictionary of the template needs to be updated:
 
-    // PY_CODE_GEN block_start replace(local_variables={"test": 2})
-    int var = /*PY_CODE_GEN test*/
-    // PY_CODE_GEN block_end
+    template.globals.update({'function': function})
+
+Variables need to be passed in the render step:
 
-will generate the code `int var = 2`.
+    template.render(variable=variable)
 
-It is easily possible to add other helper functions and extend the functionality of the generator using the `use_file` method
-or by declaring functions in multi line comments.
+This is very inflexible compared to the previous version of this script. Further evaluation is needed to find out whether an automatic merge of the script's globals with the template's globals is possible.
\ No newline at end of file

From b88cfab356f750b5c686cceb53147565db41f823 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Fri, 18 Nov 2022 13:02:07 +0100
Subject: [PATCH 191/318] write to stdout if no output file is given

---
 scripts/code_generator/generator.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/scripts/code_generator/generator.py b/scripts/code_generator/generator.py
index 2c915876..f8b1da3a 100644
--- a/scripts/code_generator/generator.py
+++ b/scripts/code_generator/generator.py
@@ -4,7 +4,7 @@
 from jinja2 import Environment, PackageLoader, BaseLoader, TemplateNotFound, select_autoescape
 from os.path import join, exists, getmtime
 
-parser = argparse.ArgumentParser(description='Preprocessor for code replication and advanced code modification.')
+parser = argparse.ArgumentParser(description='Preprocessor for code replication and advanced code modification using jinja.')
 parser.add_argument('file', metavar='CODE_FILE', type=str,
                     help='Path to the file that is used as input')
 parser.add_argument("-o", dest="output_file", default=None, help="Path to the output file. 
If not given, output will printed to stdout.") @@ -66,9 +66,6 @@ def create_list(content, count): if not args.file: logging.debug('no input file given') exit(1) - if not args.output_file: - logging.debug('no output file given') - exit(1) for p in args.params: logging.debug("Parse statement: %s" % p) exec(p, globals()) @@ -82,11 +79,15 @@ def create_list(content, count): except: pass - if num_replications is None: + if not 'num_replications' in globals(): num_replications = 1 - if num_total_replications is None: + if not 'num_total_replications' in globals(): num_total_replications = 1 - with open(args.output_file, 'w') as f: - f.write(template.render(num_replications=num_replications, num_total_replications=num_total_replications)) \ No newline at end of file + rendered_template = template.render(num_replications=num_replications, num_total_replications=num_total_replications) + try: + with open(args.output_file, 'w') as f: + f.write(rendered_template) + except: + sys.stdout.write(rendered_template) From abfc85b3c4975358319e217da5e50be8abdd3edb Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 28 Nov 2022 17:05:29 +0100 Subject: [PATCH 192/318] update main README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c8ae0604..1f830da8 100755 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ All benchmarks come with the following build dependencies: - CMake >= 3.13 - C++ compiler with C++11 and support (GCC 4.9.0+) - Intel OpenCL FPGA SDK or Xilinx Vitis -- Python 3 for code generation and with [pandas](https://pandas.pydata.org) installed for the evaluation scripts +- Python 3 with [jinja2](https://jinja.palletsprojects.com) for code generation and [pandas](https://pandas.pydata.org) for the evaluation scripts. 
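+
+  For example, both packages can be installed with pip (version pins, if needed, can be taken from the requirements files under `scripts/`):
+
+      pip3 install jinja2 pandas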
Moreover, additional libraries are fetched by the build system during configuration: From a9ee4fa2a44e1f89c77441b3b4e913bdcbbc2794 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 5 Dec 2022 10:17:02 +0100 Subject: [PATCH 193/318] fix b_eff template --- b_eff/src/device/communication_bw520n_IEC.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/b_eff/src/device/communication_bw520n_IEC.cl b/b_eff/src/device/communication_bw520n_IEC.cl index f5dcefe8..8f43756b 100644 --- a/b_eff/src/device/communication_bw520n_IEC.cl +++ b/b_eff/src/device/communication_bw520n_IEC.cl @@ -52,7 +52,7 @@ typedef struct { {% for i in range(num_replications) %} channel message_part ch_out_{{ 2*i + 1 }} __attribute((io("kernel_output_ch{{ i % 4 }}"))); channel message_part ch_out_{{ 2*i + 2 }} __attribute((io("kernel_output_ch{{ (i + 2) % 4 }}"))); -channel message_part ch_in_{{ 2*i + 1 }} __attribute((io("kernel_input_ch{{ i % 4 }} "))); +channel message_part ch_in_{{ 2*i + 1 }} __attribute((io("kernel_input_ch{{ i % 4 }}"))); channel message_part ch_in_{{ 2*i + 2 }} __attribute((io("kernel_input_ch{{ (i + 2) % 4 }}"))); channel message_part ch_exchange{{ 2*i + 1 }}; channel message_part ch_exchange{{ 2*i + 2 }}; From d7177cadeb948b610c3bd283d40d18cf6b25231e Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Tue, 4 Oct 2022 20:48:11 +0200 Subject: [PATCH 194/318] add check stage (without really checking) --- .gitlab-ci.yml | 675 ++++++++++++++++--------------------------------- 1 file changed, 215 insertions(+), 460 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6d05430b..144932ad 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,5 @@ stages: + - check - build - test @@ -15,7 +16,7 @@ default: ### # -# Build all benchmarks +# Build documentation # ### @@ -36,287 +37,203 @@ build:docs: - docs/build - docs/xml -build:STREAM: - stage: build +### +# +# Check formatting of all benchmarks +# +### + +.check: &check + stage: check script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../STREAM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make -j 40 all - artifacts: - paths: - - build/bin/stream_kernels_single_emulate.aocx - - build/bin/stream_kernels_emulate.aocx - - build/bin/STREAM_FPGA_intel - - build/bin/STREAM_FPGA_test_intel + - module load compiler/Clang/13.0.1-GCCcore-11.2.0 + - find $BENCHMARK_FOLDER -regex '.*\.\(cpp\|hpp\|cc\|cxx\|h\)' -exec clang-format -style=file -i {} \; + - git diff | cat + ## do not test for real yet + #- test -z "$(git status --porcelain)" + only: changes: - - STREAM/**/* + - $BENCHMARK_FOLDER/**/* - shared/**/* - scripts/**/* - cmake/**/* - .gitlab-ci.yml -build:STREAM_HP: +check:STREAM: + <<: *check + variables: + BENCHMARK_FOLDER: STREAM + +check:RandomAccess: + <<: *check + variables: + BENCHMARK_FOLDER: RandomAccess + +check:PTRANS: + <<: *check + variables: + BENCHMARK_FOLDER: PTRANS + +check:LINPACK: + <<: *check + variables: + BENCHMARK_FOLDER: LINPACK + +check:GEMM: + <<: *check + variables: + BENCHMARK_FOLDER: GEMM + +check:FFT: + <<: *check + variables: + BENCHMARK_FOLDER: FFT + +check:b_eff: + <<: *check + variables: + BENCHMARK_FOLDER: b_eff + +### +# +# Build all benchmarks +# +### + +.build: &build stage: build script: - rm -rf build - mkdir -p build - cd build - - cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - cmake ../$BENCHMARK_FOLDER -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 $BENCHMARK_OPTIONS - make -j 40 all artifacts: paths: - - 
build/bin/stream_kernels_single_emulate.aocx - - build/bin/stream_kernels_emulate.aocx - - build/bin/STREAM_FPGA_intel - - build/bin/STREAM_FPGA_test_intel + - build/bin/* only: changes: - - STREAM/**/* + - $BENCHMARK_FOLDER/**/* - shared/**/* - scripts/**/* - cmake/**/* - .gitlab-ci.yml +build:STREAM: + <<: *build + variables: + BENCHMARK_FOLDER: STREAM + dependencies: + - check:STREAM + needs: ["check:STREAM"] + +build:STREAM_HP: + <<: *build + variables: + BENCHMARK_FOLDER: STREAM + BENCHMARK_OPTIONS: -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 + dependencies: + - check:STREAM + needs: ["check:STREAM"] + build:STREAM_DP: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make -j 40 all - artifacts: - paths: - - build/bin/stream_kernels_single_emulate.aocx - - build/bin/stream_kernels_emulate.aocx - - build/bin/STREAM_FPGA_intel - - build/bin/STREAM_FPGA_test_intel - only: - changes: - - STREAM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml - + <<: *build + variables: + BENCHMARK_FOLDER: STREAM + BENCHMARK_OPTIONS: -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 + dependencies: + - check:STREAM + needs: ["check:STREAM"] + build:RandomAccess: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../RandomAccess -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make -j 40 all - artifacts: - paths: - - build/bin/random_access_kernels_single_emulate.aocx - - build/bin/RandomAccess_intel - - build/bin/RandomAccess_test_intel - only: - changes: - - RandomAccess/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml - + <<: *build + variables: + BENCHMARK_FOLDER: RandomAccess + dependencies: + - check:RandomAccess + needs: ["check:RandomAccess"] build:PTRANS: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../PTRANS -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DHOST_EMULATION_REORDER=Yes - - make -j 40 all - artifacts: - paths: - - build/bin/transpose_DIAG_IEC_emulate.aocx - - build/bin/transpose_PQ_IEC_emulate.aocx - - build/bin/transpose_PQ_PCIE_emulate.aocx - - build/bin/transpose_DIAG_PCIE_emulate.aocx - - build/bin/transpose_c2_DIAG_IEC_emulate.aocx - - build/bin/Transpose_intel - - build/bin/Transpose_test_intel - only: - changes: - - PTRANS/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: PTRANS + BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes + dependencies: + - check:PTRANS + needs: ["check:PTRANS"] build:LINPACK: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DUSE_PCIE_MPI_COMMUNICATION=Yes - - make -j 40 all - artifacts: - paths: - - build/bin/hpl_torus_PCIE_emulate.aocx - - build/bin/hpl_torus_IEC_emulate.aocx - - build/bin/Linpack_intel - - build/bin/Linpack_test_intel - only: - changes: - - LINPACK/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml - + <<: *build + variables: + BENCHMARK_FOLDER: LINPACK + BENCHMARK_OPTIONS: -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 + dependencies: + - check:LINPACK + needs: ["check:LINPACK"] build:LINPACK_DP: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../LINPACK 
-DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double - - make -j 40 all - artifacts: - paths: - - build/bin/hpl_torus_PCIE_emulate.aocx - - build/bin/hpl_torus_IEC_emulate.aocx - - build/bin/Linpack_intel - - build/bin/Linpack_test_intel - only: - changes: - - LINPACK/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: LINPACK + BENCHMARK_OPTIONS: -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double + dependencies: + - check:LINPACK + needs: ["check:LINPACK"] build:GEMM: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../GEMM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make -j 40 all - artifacts: - paths: - - build/bin/gemm_base_emulate.aocx - - build/bin/GEMM_intel - - build/bin/GEMM_test_intel - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DBLOCK_SIZE=32 + dependencies: + - check:GEMM + needs: ["check:GEMM"] build:GEMM_HP_REP2: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../GEMM -DDATA_TYPE=half -DNUM_REPLICATIONS=2 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make -j 40 all - artifacts: - paths: - - build/bin/gemm_base_emulate.aocx - - build/bin/GEMM_intel - - build/bin/GEMM_test_intel - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DDATA_TYPE=half -DNUM_REPLICATIONS=2 -DBLOCK_SIZE=32 + dependencies: + - check:GEMM + needs: ["check:GEMM"] build:GEMM_DP_REP2: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../GEMM -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make -j 40 all - artifacts: - paths: - - build/bin/gemm_base_emulate.aocx - - build/bin/GEMM_intel - - build/bin/GEMM_test_intel - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DBLOCK_SIZE=32 + dependencies: + - check:GEMM + needs: ["check:GEMM"] build:FFT: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make -j 40 all - artifacts: - paths: - - build/bin/fft1d_float_8_emulate.aocx - - build/bin/FFT_intel - - build/bin/FFT_test_intel - only: - changes: - - FFT/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: FFT + dependencies: + - check:FFT + needs: ["check:FFT"] build:FFT_small: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 - - make -j 40 all - artifacts: - paths: - - build/bin/fft1d_float_8_emulate.aocx - - build/bin/FFT_intel - - build/bin/FFT_test_intel - only: - changes: - - FFT/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: FFT + BENCHMARK_OPTIONS: -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 + dependencies: + - check:FFT + needs: ["check:FFT"] build:b_eff: - stage: build - script: - - rm -rf build - - mkdir -p build - 
- cd build - - cmake ../b_eff -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DHOST_EMULATION_REORDER=Yes - - make -j 40 all - artifacts: - paths: - - build/bin/* - only: - changes: - - b_eff/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: b_eff + BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes + dependencies: + - check:b_eff + needs: ["check:b_eff"] + ### # @@ -324,300 +241,138 @@ build:b_eff: # ### -test:STREAM: +.test: &test stage: test script: - cd build - - cmake ../STREAM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - cmake ../$BENCHMARK_FOLDER -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - make CTEST_OUTPUT_ON_FAILURE=1 test - dependencies: - - build:STREAM artifacts: when: on_failure paths: - build/Testing/Temporary/LastTest.log only: changes: - - STREAM/**/* + - $BENCHMARK_FOLDER/**/* - shared/**/* - scripts/**/* - cmake/**/* - .gitlab-ci.yml + + +test:STREAM: + <<: *test + variables: + BENCHMARK_FOLDER: STREAM + dependencies: + - build:STREAM needs: ["build:STREAM"] test:STREAM_HP: - stage: test - script: - - cd build - - cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: STREAM + BENCHMARK_OPTIONS: -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 dependencies: - build:STREAM_HP - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - STREAM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:STREAM_HP"] - # Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE) - allow_failure: true test:STREAM_DP: - stage: test - script: - - cd build - - cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: STREAM + BENCHMARK_OPTIONS: -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 dependencies: - build:STREAM_DP - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - STREAM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:STREAM_DP"] - + test:RandomAccess: - stage: test - script: - - cd build - - cmake ../RandomAccess -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: RandomAccess dependencies: - build:RandomAccess - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - RandomAccess/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:RandomAccess"] test:PTRANS: - stage: test - script: - - cd build - - cmake ../PTRANS -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DPTRANS_HOST_EMULATION_REORDER=Yes - - cd bin - - touch kernel_output_ch0 - - touch kernel_output_ch1 - - touch kernel_output_ch2 - - touch kernel_output_ch3 - - ln -s kernel_output_ch0 kernel_input_ch1 - - ln -s kernel_output_ch2 kernel_input_ch3 - - ln -s kernel_output_ch1 kernel_input_ch0 - - ln -s kernel_output_ch3 kernel_input_ch2 - - cd .. 
- - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: PTRANS + BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes dependencies: - build:PTRANS - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - PTRANS/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:PTRANS"] test:LINPACK: - stage: test - script: - - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: LINPACK + BENCHMARK_OPTIONS: -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 dependencies: - build:LINPACK - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - LINPACK/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:LINPACK"] test:LINPACK_DP: - stage: test - script: - - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: LINPACK + BENCHMARK_OPTIONS: -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double dependencies: - build:LINPACK_DP - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - LINPACK/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* needs: ["build:LINPACK_DP"] test:GEMM: - stage: test - script: - - cd build - - cmake ../GEMM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DBLOCK_SIZE=32 dependencies: - build:GEMM - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:GEMM"] test:GEMM_HP_REP2: - stage: test - script: - - cd build - - cmake ../GEMM -DNUM_REPLICATIONS=2 -DATA_TYPE=half -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DDATA_TYPE=half -DNUM_REPLICATIONS=2 -DBLOCK_SIZE=32 dependencies: - build:GEMM_HP_REP2 - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:GEMM_HP_REP2"] - # Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE) - allow_failure: true + test:GEMM_DP_REP2: - stage: test - script: - - cd build - - cmake ../GEMM -DNUM_REPLICATIONS=2 -DATA_TYPE=double -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DBLOCK_SIZE=32 dependencies: - build:GEMM_DP_REP2 - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:GEMM_DP_REP2"] test:FFT: - stage: test - script: - - cd build - - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make 
CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: FFT dependencies: - build:FFT - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - FFT/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:FFT"] test:FFT_small: - stage: test - script: - - cd build - - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: FFT + BENCHMARK_OPTIONS: -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 dependencies: - build:FFT_small - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - FFT/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:FFT_small"] test:b_eff: - stage: test - script: - - cd build - - cmake ../b_eff -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DHOST_EMULATION_REORDER=Yes - - cd bin - - touch kernel_output_ch0 - - touch kernel_output_ch1 - - touch kernel_output_ch2 - - touch kernel_output_ch3 - - ln -s kernel_output_ch0 kernel_input_ch1 - - ln -s kernel_output_ch2 kernel_input_ch3 - - ln -s kernel_output_ch1 kernel_input_ch0 - - ln -s kernel_output_ch3 kernel_input_ch2 - - cd .. - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: b_eff + BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes dependencies: - build:b_eff - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - b_eff/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:b_eff"] From f4552d62fbfbc348aca3a69212d93aa21b36ca31 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 25 Nov 2022 13:10:15 +0100 Subject: [PATCH 195/318] use explicit artifacts --- .gitlab-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 144932ad..015fb208 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -244,8 +244,10 @@ build:b_eff: .test: &test stage: test script: + - mkdir -p build - cd build - - cmake ../$BENCHMARK_FOLDER -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - cmake ../$BENCHMARK_FOLDER $BENCHMARK_OPTIONS -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - $PREPARE_SCRIPT - make CTEST_OUTPUT_ON_FAILURE=1 test artifacts: when: on_failure From 81fb766bbc9c4f0ee304b306ed713367da650f16 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 25 Nov 2022 13:25:43 +0100 Subject: [PATCH 196/318] add prepare_tests script for b_eff and PTRANS --- .gitlab-ci.yml | 3 +++ PTRANS/scripts/prepare_tests.sh | 11 +++++++++++ b_eff/scripts/prepare_tests.sh | 11 +++++++++++ 3 files changed, 25 insertions(+) create mode 100755 PTRANS/scripts/prepare_tests.sh create mode 100755 b_eff/scripts/prepare_tests.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 015fb208..e8d9c70e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -247,6 +247,7 @@ build:b_eff: - mkdir -p build - cd build - cmake ../$BENCHMARK_FOLDER $BENCHMARK_OPTIONS -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - make all -j41 - $PREPARE_SCRIPT - make CTEST_OUTPUT_ON_FAILURE=1 test artifacts: @@ -301,6 +302,7 @@ test:PTRANS: variables: BENCHMARK_FOLDER: PTRANS BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes + PREPARE_SCRIPT: ../$BENCHMARK_FOLDER/scripts/prepare_tests.sh ./bin dependencies: - build:PTRANS needs: ["build:PTRANS"] @@ -373,6 +375,7 @@ test:b_eff: variables: BENCHMARK_FOLDER: b_eff BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes + 
PREPARE_SCRIPT: ../$BENCHMARK_FOLDER/scripts/prepare_tests.sh ./bin dependencies: - build:b_eff needs: ["build:b_eff"] diff --git a/PTRANS/scripts/prepare_tests.sh b/PTRANS/scripts/prepare_tests.sh new file mode 100755 index 00000000..2705d74d --- /dev/null +++ b/PTRANS/scripts/prepare_tests.sh @@ -0,0 +1,11 @@ +#!/usr/bin/bash + +cd $1 +touch kernel_output_ch0 +touch kernel_output_ch1 +touch kernel_output_ch2 +touch kernel_output_ch3 +ln -s kernel_output_ch0 kernel_input_ch1 +ln -s kernel_output_ch2 kernel_input_ch3 +ln -s kernel_output_ch1 kernel_input_ch0 +ln -s kernel_output_ch3 kernel_input_ch2 diff --git a/b_eff/scripts/prepare_tests.sh b/b_eff/scripts/prepare_tests.sh new file mode 100755 index 00000000..2705d74d --- /dev/null +++ b/b_eff/scripts/prepare_tests.sh @@ -0,0 +1,11 @@ +#!/usr/bin/bash + +cd $1 +touch kernel_output_ch0 +touch kernel_output_ch1 +touch kernel_output_ch2 +touch kernel_output_ch3 +ln -s kernel_output_ch0 kernel_input_ch1 +ln -s kernel_output_ch2 kernel_input_ch3 +ln -s kernel_output_ch1 kernel_input_ch0 +ln -s kernel_output_ch3 kernel_input_ch2 From e24873c073a1ccc2aedcf383a13fd644f506a98a Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Sat, 10 Dec 2022 13:24:32 +0100 Subject: [PATCH 197/318] allow failure of STREAM_HP and GEMM_HP_REP2 --- .gitlab-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e8d9c70e..5136df6c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -198,6 +198,7 @@ build:GEMM_HP_REP2: dependencies: - check:GEMM needs: ["check:GEMM"] + allow_failure: true build:GEMM_DP_REP2: <<: *build @@ -279,6 +280,7 @@ test:STREAM_HP: dependencies: - build:STREAM_HP needs: ["build:STREAM_HP"] + allow_failure: true test:STREAM_DP: <<: *test @@ -342,7 +344,7 @@ test:GEMM_HP_REP2: dependencies: - build:GEMM_HP_REP2 needs: ["build:GEMM_HP_REP2"] - + allow_failure: true test:GEMM_DP_REP2: <<: *test From 202edd148c379b12eaac3d0cefdd4d187082abb9 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Sat, 10 Dec 2022 13:35:03 +0100 Subject: [PATCH 198/318] add .clang-format file --- .clang-format | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..5b1cfaa3 --- /dev/null +++ b/.clang-format @@ -0,0 +1,125 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +# https://releases.llvm.org/12.0.1/tools/clang/docs/ClangFormatStyleOptions.html +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: true +BinPackParameters: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: true +BreakBeforeBraces: Linux +BreakBeforeInheritanceComma: false +BreakInheritanceList: 
BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 120 +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Preserve +IndentCaseLabels: false +IndentCaseBlocks: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +IndentRequires: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyIndentedWhitespace: 0 +PointerAlignment: Right +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceAroundPointerQualifiers: Default +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: Both +Standard: Latest +UseCRLF: false +UseTab: Never +CommentPragmas: '^ IWYU pragma:' +ForEachMacros: + - foreach +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +StatementMacros: [] +StatementAttributeLikeMacros: + - Q_EMIT +WhitespaceSensitiveMacros: [] +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false From 5c5f02775d356cc4ce913f51ed3b6a4aaa1b286c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 13 Dec 2022 17:01:23 +0100 Subject: [PATCH 199/318] Fix platform string behavior --- shared/setup/fpga_setup.cpp | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index e6039973..5d0b79d1 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -245,20 +245,23 @@ choose a device. 
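// Selection logic in the block below: a non-empty platform string takes
// precedence and must exactly match the name of an available platform,
// otherwise an FpgaSetupException is thrown; the numeric platform index is
// only consulted as a fallback when no platform string was given.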
// Choose the target platform long unsigned int chosenPlatformId = 0; - if (defaultPlatform >= 0) { - if (platformString.size() > 0) { - bool found = false; - for (int i = 0; i < platformList.size(); i++) { - if (platformList[i].getInfo() == platformString) { - chosenPlatformId = i; - found = true; - break; - } + if (platformString.size() > 0) { + // Platform string has highest priority + bool found = false; + for (int i = 0; i < platformList.size(); i++) { + if (platformList[i].getInfo() == platformString) { + chosenPlatformId = i; + found = true; + break; } - if (!found) { - throw FpgaSetupException("Invalid platform string specified: " + platformString); - } - } else if (defaultPlatform < static_cast(platformList.size())) { + } + if (!found) { + throw FpgaSetupException("Invalid platform string specified: " + platformString); + } + } + else if (defaultPlatform >= 0) { + // Otherwise, select platform by index + if (defaultPlatform < static_cast(platformList.size())) { chosenPlatformId = defaultPlatform; } else { std::cerr << "Default platform " << defaultPlatform From ede7793eae0f85044917dc5dde03ef090e7da6a4 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 23 Sep 2022 11:51:22 +0200 Subject: [PATCH 200/318] first approach --- LINPACK/src/host/linpack_benchmark.cpp | 56 ++++++++----- LINPACK/src/host/linpack_benchmark.hpp | 6 +- extern/CMakeLists.txt | 21 +++++ shared/CMakeLists.txt | 2 +- shared/include/hpcc_benchmark.hpp | 95 ++++++++++++++++++++--- shared/tests/hpcc_base_benchmark_test.cpp | 10 ++- 6 files changed, 158 insertions(+), 32 deletions(-) diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index d60be9d1..f0cd9867 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -126,11 +126,11 @@ linpack::LinpackBenchmark::executeKernel(LinpackData &data) { } void -linpack::LinpackBenchmark::collectAndPrintResults(const linpack::LinpackExecutionTimings &output) { +linpack::LinpackBenchmark::collectResults(const linpack::LinpackExecutionTimings &output) { // Calculate performance for kernel execution plus data transfer - double tmean = 0; - double tlumean = 0; - double tslmean = 0; + double t = 0; + double tlu = 0; + double tsl = 0; double tmin = std::numeric_limits::max(); double lu_min = std::numeric_limits::max(); double sl_min = std::numeric_limits::max(); @@ -154,13 +154,13 @@ linpack::LinpackBenchmark::collectAndPrintResults(const linpack::LinpackExecutio } double total_matrix_size = static_cast(executionSettings->programSettings->matrixSize); - double gflops_lu = ((2.0e0*total_matrix_size * total_matrix_size * total_matrix_size)/ 3.0) / 1.0e9; - double gflops_sl = (2.0*(total_matrix_size * total_matrix_size))/1.0e9; + double gflop_lu = ((2.0e0*total_matrix_size * total_matrix_size * total_matrix_size)/ 3.0) / 1.0e9; + double gflop_sl = (2.0*(total_matrix_size * total_matrix_size))/1.0e9; for (int i =0; i < global_lu_times.size(); i++) { double currentTime = global_lu_times[i] + global_sl_times[i]; - tmean += currentTime; - tlumean += global_lu_times[i]; - tslmean += global_sl_times[i]; + t += currentTime; + tlu += global_lu_times[i]; + tsl += global_sl_times[i]; if (currentTime < tmin) { tmin = currentTime; } @@ -171,29 +171,47 @@ linpack::LinpackBenchmark::collectAndPrintResults(const linpack::LinpackExecutio sl_min = global_sl_times[i]; } } - tmean = tmean / global_lu_times.size(); - tlumean = tlumean / global_lu_times.size(); - tslmean = tslmean / global_sl_times.size(); + + 
results.emplace("t_mean", hpcc_base::HpccResult(t / global_lu_times.size(), "s")); + results.emplace("t_min", hpcc_base::HpccResult(tmin, "?")); + results.emplace("tlu_mean", hpcc_base::HpccResult(tlu / global_lu_times.size(), "s")); + results.emplace("tlu_min", hpcc_base::HpccResult(lu_min, "s")); + results.emplace("tsl_mean", hpcc_base::HpccResult(tsl / global_sl_times.size(), "s")); + results.emplace("tsl_min", hpcc_base::HpccResult(sl_min, "s")); + results.emplace("gflops", hpcc_base::HpccResult((gflop_lu + gflop_sl) / tmin, "GFLOP/s")); + results.emplace("gflops_lu", hpcc_base::HpccResult(gflop_lu / lu_min, "GFLOP/s")); + results.emplace("gflops_sl", hpcc_base::HpccResult(gflop_sl / sl_min, "GFLOP/s")); + + return; +} - std::cout << std::setw(ENTRY_SPACE) +void +linpack::LinpackBenchmark::printResults() { + if (mpi_comm_rank > 0) { + return; + } + + std::cout << std::setw(ENTRY_SPACE) << "Method" << std::setw(ENTRY_SPACE) << "best" << std::setw(ENTRY_SPACE) << "mean" << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; + /* std::cout << std::setw(ENTRY_SPACE) << "total" << std::setw(ENTRY_SPACE) - << tmin << std::setw(ENTRY_SPACE) << tmean - << std::setw(ENTRY_SPACE) << ((gflops_lu + gflops_sl) / tmin) + << results["t_min"] << std::setw(ENTRY_SPACE) << results["t_mean"] + << std::setw(ENTRY_SPACE) << results["gflops"] << std::endl; std::cout << std::setw(ENTRY_SPACE) << "GEFA" << std::setw(ENTRY_SPACE) - << lu_min << std::setw(ENTRY_SPACE) << tlumean - << std::setw(ENTRY_SPACE) << ((gflops_lu) / lu_min) + << results["tlu_min"] << std::setw(ENTRY_SPACE) << results["tlu_mean"] + << std::setw(ENTRY_SPACE) << results["gflops_lu"] << std::endl; std::cout << std::setw(ENTRY_SPACE) << "GESL" << std::setw(ENTRY_SPACE) - << sl_min << std::setw(ENTRY_SPACE) << tslmean - << std::setw(ENTRY_SPACE) << (gflops_sl / sl_min) + << results["tsl_min"] << std::setw(ENTRY_SPACE) << results["tsl_mean"] + << std::setw(ENTRY_SPACE) << results["gflops_sl"] << std::endl; + */ } std::unique_ptr diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index c05b323a..adbae5ef 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -264,8 +264,12 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark +#include /* External library headers */ #ifdef USE_DEPRECATED_HPP_HEADER @@ -37,6 +38,7 @@ SOFTWARE. /* Project's headers */ #include "setup/fpga_setup.hpp" #include "cxxopts.hpp" +#include "nlohmann/json.hpp" #include "parameters.h" #include "communication_types.hpp" @@ -45,6 +47,8 @@ SOFTWARE. #define ENTRY_SPACE 15 +using json = nlohmann::json; + /** * @brief Contains all classes and functions that are used as basis * for all benchmarks. @@ -52,6 +56,25 @@ SOFTWARE. */ namespace hpcc_base { +class HpccResult { + double value; + std::string unit; + +public: + HpccResult(double value, std::string unit): value(value), unit(unit) {} + + friend std::ostream &operator<<(std::ostream &os, const HpccResult &result) { + os << result.value << " " << result.unit; + return os; + } + + std::string to_string() const { + std::ostringstream oss; + oss << *this; + return oss.str(); + } +}; + /** * @brief This class should be derived and extended for every benchmark. 
* It is a pure data object containing the benchmark settings that are @@ -119,6 +142,8 @@ class BaseSettings { * */ bool testOnly; + + std::string dumpfilePath; /** * @brief Type of inter-FPGA communication used @@ -152,6 +177,7 @@ class BaseSettings { #else communicationType(retrieveCommunicationType("UNSUPPORTED", results["f"].as())), #endif + dumpfilePath(results["dump"].as()), testOnly(static_cast(results.count("test"))) {} /** @@ -236,6 +262,17 @@ class ExecutionSettings { programSettings = nullptr; } + std::string + getDeviceName() const { + std::string device_name; + if (!programSettings->testOnly) { + device->getInfo(CL_DEVICE_NAME, &device_name); + } else { + device_name = "TEST RUN: Not selected!"; + } + return device_name; + } + }; /** @@ -294,6 +331,15 @@ class HpccFpgaBenchmark { * */ bool mpi_external_init = true; + + + /** + * + * @brief vector containing the benchmark results + * + */ + std::map results; + public: @@ -331,7 +377,10 @@ class HpccFpgaBenchmark { * @param output The measurement data of the kernel execution */ virtual void - collectAndPrintResults(const TOutput &output) = 0; + collectResults(const TOutput &output) = 0; + + virtual void + printResults() = 0; /** * @brief Method that can be overwritten by inheriting classes to check the validity of input parameters. @@ -396,6 +445,7 @@ class HpccFpgaBenchmark { ("comm-type", "Used communication type for inter-FPGA communication", cxxopts::value()->default_value(DEFAULT_COMM_TYPE)) #endif + ("dump", "dump benchmark configuration and results to this file", cxxopts::value()->default_value(std::string(""))) ("test", "Only test given configuration and skip execution and validation") ("h,help", "Print this help"); @@ -448,6 +498,32 @@ class HpccFpgaBenchmark { std::cout << "Summary:" << std::endl; std::cout << *executionSettings << std::endl; } + + std::map getResultsMap() { + // TODO: nested maps, recursive? + std::map results_string; + for (auto const &result: results) { + results_string[result.first] = result.second.to_string(); + } + return results_string; + } + + void + dumpConfigurationAndResults(std::string file_path) { + std::fstream fs; + fs.open(file_path, std::ios_base::out); + if (!fs.is_open()) { + std::cout << "Unable to open file for dumping configuration and results" << std::endl; + } else { + json dump; + std::string device_name = executionSettings->getDeviceName(); + dump["device"] = device_name; + dump["settings"] = json(executionSettings->programSettings->getSettingsMap()); + dump["results"] = getResultsMap(); + + fs << dump; + } + } /** * @brief Selects and prepares the target device and prints the final configuration. 
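 *
 * A sketch of the JSON emitted by dumpConfigurationAndResults above; only the
 * top-level keys "device", "settings" and "results" are fixed by the code,
 * all values shown here are illustrative:
 *
 *   { "device": "...",                               // CL_DEVICE_NAME of the selected device
 *     "settings": { ... },                           // from programSettings->getSettingsMap()
 *     "results": { "gflops": "1.23 GFLOP/s", ... } } // each HpccResult printed as "value unit"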
@@ -586,7 +662,13 @@ class HpccFpgaBenchmark {
                     std::cout << "Validation Time: " << eval_time.count() << " s" << std::endl;
                 }
             }
-            collectAndPrintResults(*output);
+            collectResults(*output);
+
+            if (executionSettings->programSettings->dumpfilePath.size() > 0) {
+                dumpConfigurationAndResults(executionSettings->programSettings->dumpfilePath);
+            }
+
+            printResults();
 
             if (mpi_comm_rank == 0) {
                 if (!validateSuccess) {
@@ -658,6 +740,7 @@ class HpccFpgaBenchmark {
 
 };
 
+
 /**
  * @brief Prints the execution settings to an output stream
  *
@@ -668,14 +751,8 @@ class HpccFpgaBenchmark {
  */
 template <class TSettings, class TDevice, class TContext, class TProgram>
 std::ostream& operator<<(std::ostream& os, ExecutionSettings<TSettings, TDevice, TContext, TProgram> const& printedExecutionSettings){
-    std::string device_name;
+    std::string device_name = printedExecutionSettings.getDeviceName();
     os << std::left;
-    if (!printedExecutionSettings.programSettings->testOnly) {
-        printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name);
-    }
-    else {
-        device_name = "TEST RUN: Not selected!";
-    }
     for (auto k : printedExecutionSettings.programSettings->getSettingsMap()) {
         os << std::setw(2 * ENTRY_SPACE) << k.first << k.second << std::endl;
     }
diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp
index 1c491b49..05489e17 100644
--- a/shared/tests/hpcc_base_benchmark_test.cpp
+++ b/shared/tests/hpcc_base_benchmark_test.cpp
@@ -45,7 +45,10 @@ class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int> {
 
     void
-    collectAndPrintResults(const int &output) override {}
+    collectResults(const int &output) override {}
+
+    void
+    printResults() override {}
 
     bool

Date: Wed, 5 Oct 2022 09:55:51 +0200
Subject: [PATCH 201/318] dump env, timings, git commit and config time

---
 LINPACK/src/common/parameters.h.in                  |  3 +-
 LINPACK/src/host/execution_types/execution_iec.hpp  | 11 +++--
 LINPACK/src/host/execution_types/execution_pcie.hpp | 10 ++--
 LINPACK/src/host/linpack_benchmark.cpp              | 18 +++----
 LINPACK/src/host/linpack_benchmark.hpp              |  6 +--
 ...nel_functionality_and_host_integration.cpp       |  4 +-
 shared/include/hpcc_benchmark.hpp                   | 47 +++++++++++++------
 shared/tests/hpcc_base_benchmark_test.cpp           | 18 +++----
 8 files changed, 67 insertions(+), 50 deletions(-)

diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in
index 4c036fb9..7d192e56 100644
--- a/LINPACK/src/common/parameters.h.in
+++ b/LINPACK/src/common/parameters.h.in
@@ -34,7 +34,8 @@
 
 /*
 Short description of the program
 */
-#define PROGRAM_DESCRIPTION "Implementation of the LINPACK benchmark"\
+#define PROGRAM_NAME "LINPACK"
+#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\
                             " proposed in the HPCC benchmark suite for FPGA.\n"\
                             "Version: " VERSION "\n"
 
diff --git a/LINPACK/src/host/execution_types/execution_iec.hpp b/LINPACK/src/host/execution_types/execution_iec.hpp
index b98bcc31..1584f8e2 100644
--- a/LINPACK/src/host/execution_types/execution_iec.hpp
+++ b/LINPACK/src/host/execution_types/execution_iec.hpp
@@ -44,7 +44,7 @@ namespace iec {
 /*
 Prepare kernels and execute benchmark for a bitstream that makes use of intel external channels
 */
-std::unique_ptr<linpack::LinpackExecutionTimings>
+std::map<std::string, std::vector<double>>
 calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>& config,
           linpack::LinpackData& data) {
@@ -722,13 +722,14 @@ calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>& config,
     }
     buffer_queue.finish();
 #endif
+
+    std::map<std::string, std::vector<double>> timings;
 
-    std::unique_ptr<linpack::LinpackExecutionTimings> results(
-            new linpack::LinpackExecutionTimings{gefaExecutionTimes, geslExecutionTimes});
+    timings["gefa"] = gefaExecutionTimes;
+    timings["gesl"] = geslExecutionTimes;
 
     MPI_Barrier(MPI_COMM_WORLD);
-
-    return results;
+    return timings;
 }
 
 }  // namespace iec
diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp
index 51b9c546..e86600d2 100644
--- a/LINPACK/src/host/execution_types/execution_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_pcie.hpp
@@ -50,7 +50,7 @@ namespace pcie {
 
 @copydoc bm_execution::calculate()
 */
-std::unique_ptr<linpack::LinpackExecutionTimings>
+std::map<std::string, std::vector<double>>
 calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>& config,
           linpack::LinpackData& data) {
@@ -717,12 +717,14 @@ calculate(const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings>& config,
     MPI_Comm_free(&row_communicator);
     MPI_Comm_free(&col_communicator);
 
-    std::unique_ptr<linpack::LinpackExecutionTimings> results(
-            new linpack::LinpackExecutionTimings{gefaExecutionTimes, geslExecutionTimes});
+    std::map<std::string, std::vector<double>> timings;
+
+    timings["gefa"] = gefaExecutionTimes;
+    timings["gesl"] = geslExecutionTimes;
 
     MPI_Barrier(MPI_COMM_WORLD);
 
-    return results;
+    return timings;
 }
 
 }  // namespace pcie
diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp
index f0cd9867..b4156d57 100644
--- a/LINPACK/src/host/linpack_benchmark.cpp
+++ b/LINPACK/src/host/linpack_benchmark.cpp
@@ -111,9 +111,8 @@ linpack::LinpackBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
         ("emulation", "Use kernel arguments for emulation. This may be necessary to simulate persistent local memory on the FPGA");
 }
 
-std::unique_ptr<linpack::LinpackExecutionTimings>
+void
 linpack::LinpackBenchmark::executeKernel(LinpackData &data) {
-    std::unique_ptr<linpack::LinpackExecutionTimings> timings;
     switch (executionSettings->programSettings->communicationType) {
         case hpcc_base::CommunicationType::pcie_mpi : timings = execution::pcie::calculate(*executionSettings, data); break;
         case hpcc_base::CommunicationType::intel_external_channels: timings = execution::iec::calculate(*executionSettings, data); break;
@@ -122,11 +121,10 @@ linpack::LinpackBenchmark::executeKernel(LinpackData &data) {
 #ifdef DISTRIBUTED_VALIDATION
     distributed_gesl_nopvt_ref(data);
 #endif
-    return timings;
 }
 
 void
-linpack::LinpackBenchmark::collectResults(const linpack::LinpackExecutionTimings &output) {
+linpack::LinpackBenchmark::collectResults() {
     // Calculate performance for kernel execution plus data transfer
     double t = 0;
     double tlu = 0;
@@ -139,10 +137,10 @@ linpack::LinpackBenchmark::collectResults() {
     std::cout << "Rank " << mpi_comm_rank << ": Result collection started" << std::endl;
 #endif
-    std::vector<double> global_lu_times(output.gefaTimings.size());
-    MPI_Reduce(output.gefaTimings.data(), global_lu_times.data(), output.gefaTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-    std::vector<double> global_sl_times(output.geslTimings.size());
-    MPI_Reduce(output.geslTimings.data(), global_sl_times.data(), output.geslTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    std::vector<double> global_lu_times(timings["gefa"].size());
+    MPI_Reduce(timings["gefa"].data(), global_lu_times.data(), timings["gefa"].size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    std::vector<double> global_sl_times(timings["gesl"].size());
+    MPI_Reduce(timings["gesl"].data(), global_sl_times.data(), timings["gesl"].size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
 #ifndef NDEBUG
     std::cout << "Rank " << mpi_comm_rank << ": Result collection done" << std::endl;
 #endif
@@ -187,10 +185,6 @@ linpack::LinpackBenchmark::collectResults() {
 
 void
 linpack::LinpackBenchmark::printResults() {
-    if (mpi_comm_rank > 0) {
-        return;
-    }
-
     std::cout << std::setw(ENTRY_SPACE)
               << "Method" << std::setw(ENTRY_SPACE)
               << "best" << std::setw(ENTRY_SPACE) << "mean"
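All execution types now hand back the same `std::map<std::string, std::vector<double>>` shape instead of per-benchmark timing classes. The payoff appears later in this patch, where the base class serializes all phases with a single `dump["timings"] = timings;`. The sketch below is not from the patch; the phase names and values are hypothetical, but it demonstrates the nlohmann::json container conversion that makes the one-liner work:

    // Hedged sketch: nested STL containers convert to JSON directly, so the
    // base class never needs to know the phase names ("gefa", "gesl", ...).
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>
    #include "nlohmann/json.hpp"

    using json = nlohmann::json;

    int main() {
        std::map<std::string, std::vector<double>> timings;
        timings["gefa"] = {0.51, 0.50, 0.52};  // hypothetical seconds per repetition
        timings["gesl"] = {0.11, 0.10, 0.10};

        json dump;
        dump["timings"] = timings;  // {"gefa":[0.51,0.5,0.52],"gesl":[0.11,0.1,0.1]}
        std::cout << dump.dump(2) << std::endl;
    }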
diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp
index adbae5ef..7c7ce315 100644
--- a/LINPACK/src/host/linpack_benchmark.hpp
+++ b/LINPACK/src/host/linpack_benchmark.hpp
@@ -210,7 +210,7 @@ class LinpackExecutionTimings {
  * @brief Implementation of the Linpack benchmark
  *
 */
-class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark<linpack::LinpackProgramSettings, linpack::LinpackData, linpack::LinpackExecutionTimings> {
+class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark<linpack::LinpackProgramSettings, linpack::LinpackData> {
 
 protected:
 
@@ -246,7 +246,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark<linpack::LinpackProgramSettings, linpack::LinpackData, linpack::LinpackExecutionTimings> {
      * @return std::unique_ptr<linpack::LinpackExecutionTimings> Measured runtimes of the kernel execution
      */
-    std::unique_ptr<linpack::LinpackExecutionTimings>
+    void
     executeKernel(LinpackData &data) override;
 
     /**
@@ -266,7 +266,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark<linpack::LinpackProgramSettings, linpack::LinpackData, linpack::LinpackExecutionTimings> {
      * @param output  The measurement data of the kernel execution
      */
     void
-    collectResults(const linpack::LinpackExecutionTimings &output) override;
+    collectResults() override;
 
     /**
diff --git a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp
--- a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp
@@ -43,7 +43,7 @@
 * Execution returns correct results for a single repetition
 */
 TEST_P(LinpackKernelTest, FPGACorrectResultsOneRepetition) {
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     for (int i = 0; i < array_size; i++) {
         EXPECT_NEAR(data->b[i], 1.0, 1.0e-3);
     }
@@ -50,7 +50,7 @@ TEST_P(LinpackKernelTest, FPGACorrectResultsOneRepetition) {
 * GEFA Execution returns correct results for a single repetition
 */
 TEST_P(LinpackKernelTest, DISABLED_FPGACorrectResultsGEFA) {
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     auto data2 = bm->generateInputData();
     if (bm->getExecutionSettings().programSettings->isDiagonallyDominant) {
         linpack::gefa_ref_nopvt(data2->A, array_size, array_size);
diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 56c060e9..20fecd43 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -282,7 +282,7 @@ class ExecutionSettings {
  * @tparam TData Class used to represent the benchmark input and output data
  * @tparam TOutput Class representing the measurements like timings etc
 */
-template <class TSettings, class TData, class TOutput, class TDevice = cl::Device, class TContext = cl::Context, class TProgram = cl::Program>
+template <class TSettings, class TData, class TDevice = cl::Device, class TContext = cl::Context, class TProgram = cl::Program>
 class HpccFpgaBenchmark {
 
 private:
@@ -332,10 +332,16 @@ class HpccFpgaBenchmark {
      */
     bool mpi_external_init = true;
 
+    /**
+     *
+     * @brief map containing the benchmark timings
+     *
+     */
+    std::map<std::string, std::vector<double>> timings;
 
     /**
      *
-     * @brief vector containing the benchmark results
+     * @brief map containing the benchmark results
      *
      */
     std::map<std::string, HpccResult> results;
@@ -357,7 +363,7 @@ class HpccFpgaBenchmark {
      * @param data The initialized data for the kernel. It will be replaced by the kernel output for validation
      * @return std::unique_ptr<TOutput> A data class containing the measurement results of the execution
      */
-    virtual std::unique_ptr<TOutput>
+    virtual void
     executeKernel(TData &data) = 0;
 
     /**
@@ -377,7 +383,7 @@ class HpccFpgaBenchmark {
      * @param output  The measurement data of the kernel execution
      */
     virtual void
-    collectResults(const TOutput &output) = 0;
+    collectResults() = 0;
 
     virtual void
     printResults() = 0;
@@ -508,6 +514,12 @@ class HpccFpgaBenchmark {
         return results_string;
     }
 
+    std::map<std::string, std::string> getEnvironmentMap() {
+        std::map<std::string, std::string> env;
+        env["LD_LIBRARY_PATH"] = std::getenv("LD_LIBRARY_PATH") ? std::getenv("LD_LIBRARY_PATH") : "";
+        return env;
+    }
+
     void
     dumpConfigurationAndResults(std::string file_path) {
         std::fstream fs;
@@ -516,10 +528,17 @@ class HpccFpgaBenchmark {
             std::cout << "Unable to open file for dumping configuration and results" << std::endl;
         } else {
             json dump;
-            std::string device_name = executionSettings->getDeviceName();
-            dump["device"] = device_name;
-            dump["settings"] = json(executionSettings->programSettings->getSettingsMap());
+            dump["name"] = PROGRAM_NAME;
+#ifdef _USE_MPI_
+            dump["mpi"] = {{"version", MPI_VERSION}, {"subversion", MPI_SUBVERSION}};
+#endif
+            dump["config_time"] = CONFIG_TIME;
+            dump["git_commit"] = GIT_COMMIT_HASH;
+            dump["device"] = executionSettings->getDeviceName();
+            dump["settings"] = executionSettings->programSettings->getSettingsMap();
+            dump["timings"] = timings;
             dump["results"] = getResultsMap();
+            dump["environment"] = getEnvironmentMap();
 
             fs << dump;
         }
@@ -639,7 +658,7 @@ class HpccFpgaBenchmark {
         bool validateSuccess = false;
         auto exe_start = std::chrono::high_resolution_clock::now();
-        std::unique_ptr<TOutput> output = executeKernel(*data);
+        executeKernel(*data);
 
 #ifdef _USE_MPI_
         MPI_Barrier(MPI_COMM_WORLD);
@@ -662,15 +681,15 @@ class HpccFpgaBenchmark {
                     std::cout << "Validation Time: " << eval_time.count() << " s" << std::endl;
                 }
             }
-            collectResults(*output);
+            collectResults();
 
-            if (executionSettings->programSettings->dumpfilePath.size() > 0) {
-                dumpConfigurationAndResults(executionSettings->programSettings->dumpfilePath);
-            }
+            if (mpi_comm_rank == 0) {
+                if (executionSettings->programSettings->dumpfilePath.size() > 0) {
+                    dumpConfigurationAndResults(executionSettings->programSettings->dumpfilePath);
+                }
 
-            printResults();
+                printResults();
 
-            if (mpi_comm_rank == 0) {
                 if (!validateSuccess) {
                     std::cerr << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" << std::endl;
                 }
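A note on the `getEnvironmentMap()` hunk above: the patch originally built the string with `std::string(std::getenv("LD_LIBRARY_PATH"))`, but `std::getenv` returns a null pointer when the variable is unset, and constructing a `std::string` from a null pointer is undefined behavior. The hunk therefore carries a null-safe one-liner; a more general helper could look like the sketch below, where `env_or_empty` is a hypothetical name, not part of the patch:

    // Hedged sketch of a null-safe environment lookup.
    #include <cstdlib>
    #include <iostream>
    #include <string>

    // Returns the variable's value, or an empty string if it is unset.
    std::string env_or_empty(const char *name) {
        const char *value = std::getenv(name);
        return value ? std::string(value) : std::string();
    }

    int main() {
        std::cout << "LD_LIBRARY_PATH=" << env_or_empty("LD_LIBRARY_PATH") << std::endl;
    }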
diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp
index 05489e17..c3cc7c2f 100644
--- a/shared/tests/hpcc_base_benchmark_test.cpp
+++ b/shared/tests/hpcc_base_benchmark_test.cpp
@@ -16,7 +16,7 @@
 // and enable the included tests
 void use_hpcc_base_lib() {}
 
-class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int> {
+class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int> {
 
 protected:
 
@@ -35,8 +35,8 @@ class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int> {
     std::unique_ptr<int> generateInputData() override { return returnInputData ? std::unique_ptr<int>(new int) : std::unique_ptr<int>(nullptr);}
 
-    std::unique_ptr<int>
-    executeKernel(int &data) override { return returnExecuteKernel ? std::unique_ptr<int>(new int) : std::unique_ptr<int>(nullptr);}
+    void
+    executeKernel(int &data) override { return;}
 
     bool
     validateOutputAndPrintError(int &data) override { return returnValidate;}
@@ -45,7 +45,7 @@ class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int> {
 
-    collectResults(const int &output) override {}
+    collectResults() override {}
 
-class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int> {
+class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int> {
 
 protected:
 
@@ -83,13 +83,13 @@ class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int> {
     std::unique_ptr<int> generateInputData() override { return std::unique_ptr<int>(new int);}
 
-    std::unique_ptr<int>
+    void
     executeKernel(int &data) override {
         if (!returnExecuteKernel) {
             throw fpga_setup::FpgaSetupException("Test execute kernel failed");
         }
         executeKernelcalled++;
-        return std::unique_ptr<int>(new int);}
+        return;}
 
     bool
     validateOutputAndPrintError(int &data) override {
@@ -97,7 +97,7 @@ class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int> {
-        return hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int, int>::checkInputParameters();
+        return hpcc_base::HpccFpgaBenchmark<hpcc_base::BaseSettings, int>::checkInputParameters();
     }
 }

From 55c05112a21c4b99224d2b9d063f8695796611eb Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Mon, 17 Oct 2022 13:43:10 +0200
Subject: [PATCH 202/318] output unit and value explicitly

---
 shared/include/hpcc_benchmark.hpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 20fecd43..410800e8 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -57,10 +57,10 @@ using json = nlohmann::json;
 namespace hpcc_base {
 
 class HpccResult {
+public:
     double value;
     std::string unit;
 
-public:
     HpccResult(double value, std::string unit): value(value), unit(unit) {}
 
     friend std::ostream &operator<<(std::ostream &os, const HpccResult &result) {
@@ -505,11 +505,14 @@ class HpccFpgaBenchmark {
         std::cout << *executionSettings << std::endl;
     }
 
-    std::map<std::string, std::string> getResultsMap() {
+    std::map<std::string, json> getResultsJson() {
         // TODO: nested maps, recursive?
-        std::map<std::string, std::string> results_string;
+        std::map<std::string, json> results_string;
         for (auto const &result: results) {
-            results_string[result.first] = result.second.to_string();
+            json j;
+            j["unit"] = result.second.unit;
+            j["value"] = result.second.value;
+            results_string[result.first] = j;
         }
         return results_string;
     }
@@ -537,7 +540,7 @@ class HpccFpgaBenchmark {
             dump["device"] = executionSettings->getDeviceName();
             dump["settings"] = executionSettings->programSettings->getSettingsMap();
             dump["timings"] = timings;
-            dump["results"] = getResultsMap();
+            dump["results"] = getResultsJson();
             dump["environment"] = getEnvironmentMap();
 
             fs << dump;

From d26f3770b9cd806a658829f9dd1238944dfdebc8 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Mon, 17 Oct 2022 13:43:32 +0200
Subject: [PATCH 203/318] output correct unit in json

---
 shared/include/hpcc_benchmark.hpp | 38 +++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 410800e8..e7f6a664 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -517,11 +517,45 @@ class HpccFpgaBenchmark {
         return results_string;
     }
 
-    std::map<std::string, std::string> getEnvironmentMap() {
+    std::map<std::string, std::string>
+    getEnvironmentMap() {
         std::map<std::string, std::string> env;
         env["LD_LIBRARY_PATH"] = std::getenv("LD_LIBRARY_PATH") ? std::getenv("LD_LIBRARY_PATH") : "";
         return env;
     }
+
+    json
+    parseFPGATorusString(std::string str) {
+        json j;
+        size_t space = str.find(" ");
+        std::string p_str = str.substr(0, space);
+        std::string q_str = str.substr(space, str.size());
+        j["P"] = stoi(p_str.substr(p_str.find("=") + 1, p_str.find(",")));
+        j["Q"] = stoi(q_str.substr(q_str.find("=") + 1, q_str.size()));
+        return j;
+    }
+
+    std::map<std::string, json>
+    jsonifySettingsMap(std::map<std::string, std::string> settings_map) {
+        json j;
+        for (const auto& item: settings_map) {
+            std::string key = item.first;
+            std::string value = item.second;
+            try {
+                int value_int = stoi(value);
+                j[key] = value_int;
+            } catch (std::invalid_argument const &ex) {
+                if (key == "FPGA Torus") {
+                    j[key] = parseFPGATorusString(value);
+                } else if (key == "Emulate") {
+                    j[key] = value == "Yes";
+                } else {
+                    j[key] = value;
+                }
+            }
+        }
+        return j;
+    }
 
     void
     dumpConfigurationAndResults(std::string file_path) {
@@ -538,7 +572,7 @@ class HpccFpgaBenchmark {
             dump["device"] = executionSettings->getDeviceName();
-            dump["settings"] = executionSettings->programSettings->getSettingsMap();
+            dump["settings"] = jsonifySettingsMap(executionSettings->programSettings->getSettingsMap());
             dump["timings"] = timings;
             dump["results"] = getResultsJson();
             dump["environment"] = getEnvironmentMap();
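To see what `parseFPGATorusString()` actually does, the following standalone trace replicates it on the input format the key name suggests, assumed here to be "P=<int>, Q=<int>" (the exact rendering is defined elsewhere in the PTRANS host code). Note that `substr`'s second argument is a length, not an end position; the `find()` results merely happen to be large enough, and `stoi` stops at the trailing comma, so the parse works but is fragile:

    // Standalone trace of the torus-string parsing above; input is assumed.
    #include <iostream>
    #include <string>

    int main() {
        std::string str = "P=2, Q=3";
        size_t space = str.find(" ");                      // index 4
        std::string p_str = str.substr(0, space);          // "P=2,"
        std::string q_str = str.substr(space, str.size()); // " Q=3"
        int p = std::stoi(p_str.substr(p_str.find("=") + 1, p_str.find(","))); // stoi("2,") -> 2
        int q = std::stoi(q_str.substr(q_str.find("=") + 1, q_str.size()));    // stoi("3")  -> 3
        std::cout << "P=" << p << " Q=" << q << std::endl; // P=2 Q=3
    }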
results.emplace("tlu_min", hpcc_base::HpccResult(lu_min, "s")); results.emplace("tsl_mean", hpcc_base::HpccResult(tsl / global_sl_times.size(), "s")); @@ -190,22 +190,20 @@ linpack::LinpackBenchmark::printResults() { << "best" << std::setw(ENTRY_SPACE) << "mean" << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; - /* std::cout << std::setw(ENTRY_SPACE) << "total" << std::setw(ENTRY_SPACE) - << results["t_min"] << std::setw(ENTRY_SPACE) << results["t_mean"] - << std::setw(ENTRY_SPACE) << results["gflops"] + << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean") + << std::setw(ENTRY_SPACE) << results.at("gflops") << std::endl; std::cout << std::setw(ENTRY_SPACE) << "GEFA" << std::setw(ENTRY_SPACE) - << results["tlu_min"] << std::setw(ENTRY_SPACE) << results["tlu_mean"] - << std::setw(ENTRY_SPACE) << results["gflops_lu"] + << results.at("tlu_min") << std::setw(ENTRY_SPACE) << results.at("tlu_mean") + << std::setw(ENTRY_SPACE) << results.at("gflops_lu") << std::endl; std::cout << std::setw(ENTRY_SPACE) << "GESL" << std::setw(ENTRY_SPACE) - << results["tsl_min"] << std::setw(ENTRY_SPACE) << results["tsl_mean"] - << std::setw(ENTRY_SPACE) << results["gflops_sl"] + << results.at("tsl_min") << std::setw(ENTRY_SPACE) << results.at("tsl_mean") + << std::setw(ENTRY_SPACE) << results.at("gflops_sl") << std::endl; - */ } std::unique_ptr From c70f32a602720fd825cbc9be6feaec8e72e68525 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 17 Oct 2022 14:51:15 +0200 Subject: [PATCH 205/318] add version to dump --- shared/include/hpcc_benchmark.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index e7f6a664..60291bec 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -571,6 +571,7 @@ class HpccFpgaBenchmark { #endif dump["config_time"] = CONFIG_TIME; dump["git_commit"] = GIT_COMMIT_HASH; + dump["version"] = VERSION; dump["device"] = executionSettings->getDeviceName(); dump["settings"] = jsonifySettingsMap(executionSettings->programSettings->getSettingsMap()); dump["timings"] = timings; From 791831f89ef26004cc8d1f49b23e401f438a8e07 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Tue, 18 Oct 2022 15:11:18 +0200 Subject: [PATCH 206/318] add dump for GEMM and FFT --- FFT/src/common/parameters.h.in | 3 +- FFT/src/host/execution.h | 2 +- FFT/src/host/execution_default.cpp | 11 +++--- FFT/src/host/fft_benchmark.cpp | 29 +++++++++------ FFT/src/host/fft_benchmark.hpp | 9 +++-- FFT/tests/test_execution_functionality.cpp | 24 ++++++------- GEMM/src/common/parameters.h.in | 6 ++-- GEMM/src/host/execution.h | 4 +-- GEMM/src/host/execution_default.cpp | 10 +++--- GEMM/src/host/gemm_benchmark.cpp | 36 +++++++++++-------- GEMM/src/host/gemm_benchmark.hpp | 23 +++--------- ...nel_functionality_and_host_integration.cpp | 20 +++++------ shared/include/hpcc_benchmark.hpp | 7 +++- 13 files changed, 98 insertions(+), 86 deletions(-) diff --git a/FFT/src/common/parameters.h.in b/FFT/src/common/parameters.h.in index 52a87a98..57c85c61 100644 --- a/FFT/src/common/parameters.h.in +++ b/FFT/src/common/parameters.h.in @@ -27,7 +27,8 @@ Short description of the program. Moreover the version and build time is also compiled into the description. 
*/ -#define PROGRAM_DESCRIPTION "Implementation of the FFT benchmark"\ +#define PROGRAM_NAME "FFT" +#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\ " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" diff --git a/FFT/src/host/execution.h b/FFT/src/host/execution.h index 2d588ded..fa44dc38 100644 --- a/FFT/src/host/execution.h +++ b/FFT/src/host/execution.h @@ -45,7 +45,7 @@ simple exchange of the different calculation methods. @return The resulting matrix */ - std::unique_ptr + std::map> calculate(hpcc_base::ExecutionSettings const& config, std::complex* data, std::complex* data_out, unsigned iterations, bool inverse); } // namespace bm_execution diff --git a/FFT/src/host/execution_default.cpp b/FFT/src/host/execution_default.cpp index 59a81f87..d0d565da 100644 --- a/FFT/src/host/execution_default.cpp +++ b/FFT/src/host/execution_default.cpp @@ -44,7 +44,7 @@ namespace bm_execution { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::unique_ptr + std::map> calculate(hpcc_base::ExecutionSettings const& config, std::complex* data, std::complex* data_out, @@ -210,10 +210,11 @@ namespace bm_execution { ASSERT_CL(err) #endif } - std::unique_ptr result(new fft::FFTExecutionTimings{ - calculationTimings - }); - return result; + std::map> timings; + + timings["calculation"] = calculationTimings; + + return timings; } } // namespace bm_execution diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index cf7ad994..ff0710ef 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -86,37 +86,44 @@ fft::FFTBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { ("inverse", "If set, the inverse FFT is calculated instead"); } -std::unique_ptr +void fft::FFTBenchmark::executeKernel(FFTData &data) { - return bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations, + timings = bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations, executionSettings->programSettings->inverse); } void -fft::FFTBenchmark::collectAndPrintResults(const fft::FFTExecutionTimings &output) { +fft::FFTBenchmark::collectResults() { double gflop = static_cast(5 * (1 << LOG_FFT_SIZE) * LOG_FFT_SIZE) * executionSettings->programSettings->iterations * 1.0e-9 * mpi_comm_size; - uint number_measurements = output.timings.size(); + uint number_measurements = timings["calculation"].size(); std::vector avg_measures(number_measurements); #ifdef _USE_MPI_ // Copy the object variable to a local variable to make it accessible to the lambda function int mpi_size = mpi_comm_size; - MPI_Reduce(output.timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + MPI_Reduce(timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); std::for_each(avg_measures.begin(),avg_measures.end(), [mpi_size](double& x) {x /= mpi_size;}); #else - std::copy(output.timings.begin(), output.timings.end(), avg_measures.begin()); + std::copy(timings["calculation"].begin(), timings["calculation"].end(), avg_measures.begin()); #endif if (mpi_comm_rank == 0) { double minTime = *min_element(avg_measures.begin(), avg_measures.end()); double avgTime = accumulate(avg_measures.begin(), avg_measures.end(), 0.0) / avg_measures.size(); + results.emplace("t_min", hpcc_base::HpccResult(minTime / 
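The fft_benchmark.cpp hunk that follows keeps the usual 5·N·log2(N) FLOP estimate for a radix-2 FFT: gflop = 5 · N · log2(N) · iterations · 1e-9 · ranks. A worked example, not taken from the patch (LOG_FFT_SIZE, the iteration count, and the timing are all assumed values), makes the magnitudes concrete:

    // Worked example for the FFT performance model used below; all inputs assumed.
    #include <iostream>

    int main() {
        const int LOG_FFT_SIZE = 12;   // assumed: N = 4096
        const int iterations = 100;    // assumed
        const int mpi_comm_size = 1;   // assumed single rank
        double gflop = static_cast<double>(5 * (1 << LOG_FFT_SIZE) * LOG_FFT_SIZE)
                       * iterations * 1.0e-9 * mpi_comm_size;     // 0.024576 GFLOP
        double minTime = 2.5e-4;       // assumed best total time in seconds
        std::cout << gflop / minTime << " GFLOP/s" << std::endl;  // ~98.3 GFLOP/s
    }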
diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp
index cf7ad994..ff0710ef 100644
--- a/FFT/src/host/fft_benchmark.cpp
+++ b/FFT/src/host/fft_benchmark.cpp
@@ -86,37 +86,44 @@ fft::FFTBenchmark::addAdditionalParseOptions(cxxopts::Options &options) {
         ("inverse", "If set, the inverse FFT is calculated instead");
 }
 
-std::unique_ptr<fft::FFTExecutionTimings>
+void
 fft::FFTBenchmark::executeKernel(FFTData &data) {
-    return bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations,
+    timings = bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations,
                                     executionSettings->programSettings->inverse);
 }
 
 void
-fft::FFTBenchmark::collectAndPrintResults(const fft::FFTExecutionTimings &output) {
+fft::FFTBenchmark::collectResults() {
     double gflop = static_cast<double>(5 * (1 << LOG_FFT_SIZE) * LOG_FFT_SIZE) * executionSettings->programSettings->iterations * 1.0e-9 * mpi_comm_size;
 
-    uint number_measurements = output.timings.size();
+    uint number_measurements = timings["calculation"].size();
     std::vector<double> avg_measures(number_measurements);
 #ifdef _USE_MPI_
     // Copy the object variable to a local variable to make it accessible to the lambda function
     int mpi_size = mpi_comm_size;
-    MPI_Reduce(output.timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(timings["calculation"].data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
     std::for_each(avg_measures.begin(),avg_measures.end(), [mpi_size](double& x) {x /= mpi_size;});
 #else
-    std::copy(output.timings.begin(), output.timings.end(), avg_measures.begin());
+    std::copy(timings["calculation"].begin(), timings["calculation"].end(), avg_measures.begin());
 #endif
     if (mpi_comm_rank == 0) {
         double minTime = *min_element(avg_measures.begin(), avg_measures.end());
         double avgTime = accumulate(avg_measures.begin(), avg_measures.end(), 0.0) / avg_measures.size();
+        results.emplace("t_min", hpcc_base::HpccResult(minTime / (executionSettings->programSettings->iterations * executionSettings->programSettings->kernelReplications), "s"));
+        results.emplace("t_avg", hpcc_base::HpccResult(avgTime / (executionSettings->programSettings->iterations * executionSettings->programSettings->kernelReplications), "s"));
+        results.emplace("gflops_min", hpcc_base::HpccResult(gflop / minTime, "GFLOP/s"));
+        results.emplace("gflops_avg", hpcc_base::HpccResult(gflop / avgTime, "GFLOP/s"));
+    }
+}
 
+void
+fft::FFTBenchmark::printResults() {
     std::cout << std::setw(ENTRY_SPACE) << " " << std::setw(ENTRY_SPACE) << "avg"
               << std::setw(ENTRY_SPACE) << "best" << std::endl;
-    std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << std::setw(ENTRY_SPACE) << avgTime / (executionSettings->programSettings->iterations * executionSettings->programSettings->kernelReplications)
-              << std::setw(ENTRY_SPACE) << minTime / (executionSettings->programSettings->iterations * executionSettings->programSettings->kernelReplications) << std::endl;
-    std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << std::setw(ENTRY_SPACE) << gflop / avgTime
-              << std::setw(ENTRY_SPACE) << gflop / minTime << std::endl;
-    }
+    std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << std::setw(ENTRY_SPACE) << results.at("t_avg")
+              << std::setw(ENTRY_SPACE) << results.at("t_min") << std::endl;
+    std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << std::setw(ENTRY_SPACE) << results.at("gflops_avg")
+              << std::setw(ENTRY_SPACE) << results.at("gflops_min") << std::endl;
 }
 
 std::unique_ptr<fft::FFTData>
diff --git a/FFT/src/host/fft_benchmark.hpp b/FFT/src/host/fft_benchmark.hpp
index 4ee82f12..99fd3458 100644
--- a/FFT/src/host/fft_benchmark.hpp
+++ b/FFT/src/host/fft_benchmark.hpp
@@ -137,7 +137,7 @@ class FFTExecutionTimings {
  * @brief Implementation of the FFT benchmark
  *
 */
-class FFTBenchmark : public hpcc_base::HpccFpgaBenchmark<fft::FFTProgramSettings, fft::FFTData, fft::FFTExecutionTimings> {
+class FFTBenchmark : public hpcc_base::HpccFpgaBenchmark<fft::FFTProgramSettings, fft::FFTData> {
 
 protected:
 
@@ -165,7 +165,7 @@ class FFTBenchmark : public hpcc_base::HpccFpgaBenchmark<fft::FFTProgramSettings, fft::FFTData, fft::FFTExecutionTimings> {
      * @return std::unique_ptr<fft::FFTExecutionTimings> Measured runtimes of the kernel execution
      */
-    std::unique_ptr<fft::FFTExecutionTimings>
+    void
     executeKernel(FFTData &data) override;
 
     /**
@@ -184,7 +184,10 @@ class FFTBenchmark : public hpcc_base::HpccFpgaBenchmark<fft::FFTProgramSettings, fft::FFTData, fft::FFTExecutionTimings> {
      */
     void
-    collectAndPrintResults(const fft::FFTExecutionTimings &output) override;
+    collectResults() override;
+
+    void
+    printResults() override;
 
     /**
diff --git a/FFT/tests/test_execution_functionality.cpp b/FFT/tests/test_execution_functionality.cpp
--- a/FFT/tests/test_execution_functionality.cpp
+++ b/FFT/tests/test_execution_functionality.cpp
@@ -33,8 +33,8 @@
 TEST_F(FFTKernelTest, CalculateReturnsCorrectExecutionResultFor11False) {
     bm->getExecutionSettings().programSettings->numRepetitions = 1;
     data = bm->generateInputData();
-    auto result = bm->executeKernel(*data);
-    EXPECT_EQ(1, result->timings.size());
+    bm->executeKernel(*data);
+    EXPECT_EQ(1, bm->getTimingsMap().at("calculation").size());
 }
 
 /**
@@ -44,8 +44,8 @@ TEST_F(FFTKernelTest, CalculateReturnsCorrectExecutionResultFor11False) {
 TEST_F(FFTKernelTest, CalculateReturnsCorrectExecutionResultFor24True) {
     bm->getExecutionSettings().programSettings->numRepetitions = 2;
     data = bm->generateInputData();
-    auto result = bm->executeKernel(*data);
-    EXPECT_EQ(2, result->timings.size());
+    bm->executeKernel(*data);
+    EXPECT_EQ(2, bm->getTimingsMap().at("calculation").size());
 }
 
 /**
@@ -56,7 +56,7 @@ TEST_F(FFTKernelTest, FFTReturnsZero) {
         data->data[i].real(0.0);
         data->data[i].imag(0.0);
     }
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
         EXPECT_FLOAT_EQ(std::abs(data->data_out[i]), 0.0);
     }
@@ -71,7 +71,7 @@ TEST_F(FFTKernelTest, FFTCloseToZeroForAll1And1) {
         data->data[i].real(1.0);
         data->data[i].imag(1.0);
     }
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     EXPECT_NEAR(data->data_out[0].real(), (1 << LOG_FFT_SIZE), 0.00001);
     EXPECT_NEAR(data->data_out[0].imag(), (1 << LOG_FFT_SIZE), 0.00001);
     for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
@@ -88,7 +88,7 @@ TEST_F(FFTKernelTest, FFTCloseToZeroForAll0And0) {
         data->data[i].real(0.0);
         data->data[i].imag(0.0);
     }
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     for (int i=0; i < (1 << LOG_FFT_SIZE); i++) {
         EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001);
         EXPECT_NEAR(data->data[i].imag(), 0.0, 0.00001);
@@ -104,7 +104,7 @@ TEST_F(FFTKernelTest, IFFTCloseToZeroForAll1And1) {
         data->data[i].real(1.0);
         data->data[i].imag(0.0);
     }
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     EXPECT_NEAR(data->data_out[0].real(), static_cast<HOST_DATA_TYPE>(1 << LOG_FFT_SIZE), 0.00001);
     EXPECT_NEAR(data->data_out[0].imag(), 0.0, 0.00001);
     for (int i=1; i < (1 << LOG_FFT_SIZE); i++) {
@@ -119,7 +119,7 @@ TEST_F(FFTKernelTest, IFFTCloseToZeroForAll1And1) {
 TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) {
     auto verify_data = bm->generateInputData();
 
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
 
     // Normalize iFFT result
     for (int i=0; i<(1 << LOG_FFT_SIZE); i++) {
@@ -135,7 +135,7 @@ TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) {
     }
 
     bm->getExecutionSettings().programSettings->inverse = true;
-    auto result2 = bm->executeKernel(*data);
+    bm->executeKernel(*data);
 
     // Since data was already sorted by iFFT the bit reversal of the kernel has to be undone
     fft::bit_reverse(data->data_out, 1);
@@ -150,7 +150,7 @@ TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) {
 TEST_F(FFTKernelTest, FPGAFFTAndCPUFFTGiveSameResults) {
     auto verify_data = bm->generateInputData();
 
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
 
     fft::fourier_transform_gold(false,LOG_FFT_SIZE,verify_data->data);
     fft::bit_reverse(verify_data->data, 1);
@@ -171,7 +171,7 @@ TEST_F(FFTKernelTest, FPGAiFFTAndCPUiFFTGiveSameResults) {
     auto verify_data = bm->generateInputData();
 
     bm->getExecutionSettings().programSettings->inverse = true;
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
 
     fft::fourier_transform_gold(true,LOG_FFT_SIZE,verify_data->data);
     fft::bit_reverse(verify_data->data, 1);
diff --git a/GEMM/src/common/parameters.h.in b/GEMM/src/common/parameters.h.in
index 3e35bf01..82ca5a25 100644
--- a/GEMM/src/common/parameters.h.in
+++ b/GEMM/src/common/parameters.h.in
@@ -29,7 +29,9 @@
 
 /*
 Short description of the program
 */
-#define PROGRAM_DESCRIPTION "Implementation of the GEMM benchmark"\
+#define PROGRAM_NAME "GEMM"
+
+#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\
                             " proposed in the HPCC benchmark adapted for FPGA\n"\
                             "Version: " VERSION "\n"
 
@@ -49,4 +51,4 @@
 Output separator
 
 #endif
 #endif
-#endif // SRC_COMMON_PARAMETERS_H_
\ No newline at end of file
+#endif // SRC_COMMON_PARAMETERS_H_
diff --git a/GEMM/src/host/execution.h b/GEMM/src/host/execution.h
index 9446c16f..c4ce1412 100644
--- a/GEMM/src/host/execution.h
+++ b/GEMM/src/host/execution.h
@@ -48,9 +48,9 @@ simple exchange of the different calculation methods.
                   execution in number of items
 @param blockSize  Size of a block that is calculated by the kernel
 
-@return The time measurements and the error rate counted from the executions
+@return The time measurements
 */
-std::unique_ptr<gemm::GEMMExecutionTimings>
+std::map<std::string, std::vector<double>>
 calculate(hpcc_base::ExecutionSettings<gemm::GEMMProgramSettings> const& config,
           HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, HOST_DATA_TYPE* c_out, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta);
 }  // namespace bm_execution
 
diff --git a/GEMM/src/host/execution_default.cpp b/GEMM/src/host/execution_default.cpp
index aa89d258..e608a35a 100644
--- a/GEMM/src/host/execution_default.cpp
+++ b/GEMM/src/host/execution_default.cpp
@@ -42,7 +42,7 @@ namespace bm_execution {
 
     @copydoc bm_execution::calculate()
 */
-std::unique_ptr<gemm::GEMMExecutionTimings>
+std::map<std::string, std::vector<double>>
 calculate(hpcc_base::ExecutionSettings<gemm::GEMMProgramSettings> const& config,
           HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, HOST_DATA_TYPE* c_out, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta) {
@@ -257,10 +257,10 @@ calculate(hpcc_base::ExecutionSettings<gemm::GEMMProgramSettings> const& config,
     }
 #endif
 
-    std::unique_ptr<gemm::GEMMExecutionTimings> results(
-            new gemm::GEMMExecutionTimings{executionTimes});
-    return results;
+    std::map<std::string, std::vector<double>> timings;
+
+    timings["execution"] = executionTimes;
+    return timings;
 }
 
 }  // namespace bm_execution
diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp
index 8910aacf..141ea160 100644
--- a/GEMM/src/host/gemm_benchmark.cpp
+++ b/GEMM/src/host/gemm_benchmark.cpp
@@ -99,29 +99,25 @@ gemm::GEMMBenchmark::addAdditionalParseOptions(cxxopts::Options &options) {
         ("replicate-inputs", "Also replicates the input buffer for each kernel");
 }
 
-std::unique_ptr<gemm::GEMMExecutionTimings>
+void
 gemm::GEMMBenchmark::executeKernel(GEMMData &data) {
-    return bm_execution::calculate(*executionSettings, data.A, data.B, data.C, data.C_out, data.alpha, data.beta);
+    timings = bm_execution::calculate(*executionSettings, data.A, data.B, data.C, data.C_out, data.alpha, data.beta);
 }
 
 void
-gemm::GEMMBenchmark::collectAndPrintResults(const gemm::GEMMExecutionTimings &output) {
+gemm::GEMMBenchmark::collectResults() {
 
-    uint number_measurements = output.timings.size();
+    uint number_measurements = timings.at("execution").size();
     std::vector<double> avg_measures(number_measurements);
 #ifdef _USE_MPI_
     // Copy the object variable to a local variable to make it accessible to the lambda function
     int mpi_size = mpi_comm_size;
-    MPI_Reduce(output.timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    MPI_Reduce(timings.at("execution").data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
     std::for_each(avg_measures.begin(),avg_measures.end(), [mpi_size](double& x) {x /= mpi_size;});
 #else
-    std::copy(output.timings.begin(), output.timings.end(), avg_measures.begin());
+    std::copy(timings.at("execution").begin(), timings.at("execution").end(), avg_measures.begin());
 #endif
     if (mpi_comm_rank == 0) {
-        std::cout << std::setw(ENTRY_SPACE)
-                  << "best" << std::setw(ENTRY_SPACE) << "mean"
-                  << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl;
-
         // Calculate performance for kernel execution
         double tmean = 0;
         double tmin = std::numeric_limits<double>::max();
@@ -136,14 +132,24 @@ gemm::GEMMBenchmark::collectResults() {
             }
         }
         tmean = tmean / avg_measures.size();
-
-        std::cout << std::setw(ENTRY_SPACE)
-                  << tmin << std::setw(ENTRY_SPACE) << tmean
-                  << std::setw(ENTRY_SPACE) << gflops / tmin
-                  << std::endl;
+        results.emplace("t_mean", hpcc_base::HpccResult(tmean, "s"));
+        results.emplace("t_min", hpcc_base::HpccResult(tmin, "s"));
+        results.emplace("gflops", hpcc_base::HpccResult(gflops / tmin, "GFLOP/s"));
     }
 }
 
+void
+gemm::GEMMBenchmark::printResults() {
+    std::cout << std::setw(ENTRY_SPACE)
+              << "best" << std::setw(ENTRY_SPACE) << "mean"
+              << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl;
+
+    std::cout << std::setw(ENTRY_SPACE)
+              << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean")
+              << std::setw(ENTRY_SPACE) << results.at("gflops")
+              << std::endl;
+}
+
 std::unique_ptr<gemm::GEMMData>
 gemm::GEMMBenchmark::generateInputData() {
     auto d = std::unique_ptr<gemm::GEMMData>(new gemm::GEMMData(*executionSettings->context, executionSettings->programSettings->matrixSize));
diff --git a/GEMM/src/host/gemm_benchmark.hpp b/GEMM/src/host/gemm_benchmark.hpp
index fde2e2ae..534a5bab 100644
--- a/GEMM/src/host/gemm_benchmark.hpp
+++ b/GEMM/src/host/gemm_benchmark.hpp
@@ -170,25 +170,11 @@ class GEMMData {
 
 };
 
-/**
- * @brief Measured execution timing from the kernel execution
- *
- */
-class GEMMExecutionTimings {
-public:
-    /**
-     * @brief A vector containing the timings for all repetitions for the kernel execution
-     *
-     */
-    std::vector<double> timings;
-
-};
-
 /**
  * @brief Implementation of the GEMM benchmark
  *
 */
-class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark<gemm::GEMMProgramSettings, gemm::GEMMData, gemm::GEMMExecutionTimings> {
+class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark<gemm::GEMMProgramSettings, gemm::GEMMData> {
 
 protected:
 
@@ -203,7 +189,7 @@ class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark<gemm::GEMMProgramSettings, gemm::GEMMData, gemm::GEMMExecutionTimings> {
      * @return std::unique_ptr<GEMMData> The input and output data of the benchmark
      */
@@ -216,7 +202,7 @@ class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark<gemm::GEMMProgramSettings, gemm::GEMMData, gemm::GEMMExecutionTimings> {
      * @return std::unique_ptr<GEMMExecutionTimings> Measured runtimes of the kernel execution
      */
-    std::unique_ptr<gemm::GEMMExecutionTimings>
+    void
     executeKernel(GEMMData &data) override;
 
     /**
@@ -229,13 +215,14 @@ class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark<gemm::GEMMProgramSettings, gemm::GEMMData, gemm::GEMMExecutionTimings> {
      */
     void
-    collectAndPrintResults(const gemm::GEMMExecutionTimings &output) override;
+    collectResults() override;
+
+    void
+    printResults() override;
 
     /**
diff --git a/GEMM/tests/test_kernel_functionality_and_host_integration.cpp b/GEMM/tests/test_kernel_functionality_and_host_integration.cpp
--- a/GEMM/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/GEMM/tests/test_kernel_functionality_and_host_integration.cpp
@@ -43,8 +43,8 @@
 */
TEST_P(GEMMKernelTest, FPGACorrectAmulB) { } data->alpha = 1.0; data->beta = 1.0; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE c_ref_out[matrix_size * matrix_size]; ref_matmul(data->A,data->B,c_ref_out,matrix_size); @@ -150,7 +150,7 @@ TEST_P(GEMMKernelTest, FPGACorrectCplusA) { data->alpha = 1.0; data->beta = 1.0; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { EXPECT_FLOAT_EQ(data->C_out[i * matrix_size + j], data->A[i * matrix_size + j] + data->C[i * matrix_size + j]); @@ -165,7 +165,7 @@ TEST_P(GEMMKernelTest, FPGACorrectCplusA) { TEST_P(GEMMKernelTest, FPGACorrectbetaCplusalphaAB) { HOST_DATA_TYPE c_ref_out[matrix_size * matrix_size]; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { c_ref_out[i * matrix_size + j] = data->C[i * matrix_size + j]; diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 60291bec..4c8454b1 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -505,6 +505,11 @@ class HpccFpgaBenchmark { std::cout << *executionSettings << std::endl; } + std::map> + getTimingsMap() { + return timings; + } + std::map getResultsJson() { // TODO: nested maps, recursive? std::map results_string; @@ -547,7 +552,7 @@ class HpccFpgaBenchmark { } catch (std::invalid_argument const &ex) { if (key == "FPGA Torus") { j[key] = parseFPGATorusString(value); - } else if (key == "Emulate") { + } else if (key == "Emulate" || key == "Replicate Inputs") { j[key] = value == "Yes"; } else { j[key] = value; From 14ad00434dc786c0864c23192c7a626f3479e524 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Tue, 18 Oct 2022 16:10:06 +0200 Subject: [PATCH 207/318] add dump for transpose --- PTRANS/src/common/parameters.h.in | 3 +- .../host/execution_types/execution_cpu.hpp | 10 +-- .../host/execution_types/execution_intel.hpp | 13 ++- .../execution_types/execution_intel_pq.hpp | 11 ++- .../host/execution_types/execution_pcie.hpp | 10 +-- .../execution_types/execution_pcie_pq.hpp | 12 ++- PTRANS/src/host/transpose_benchmark.cpp | 85 ++++++++++--------- PTRANS/src/host/transpose_benchmark.hpp | 10 ++- PTRANS/src/host/transpose_data.hpp | 20 ----- PTRANS/tests/test_host_functionality.cpp | 15 ++-- ...nel_functionality_and_host_integration.cpp | 10 +-- shared/include/hpcc_benchmark.hpp | 5 ++ 12 files changed, 98 insertions(+), 106 deletions(-) diff --git a/PTRANS/src/common/parameters.h.in b/PTRANS/src/common/parameters.h.in index 68b50dd7..2f5f95b3 100644 --- a/PTRANS/src/common/parameters.h.in +++ b/PTRANS/src/common/parameters.h.in @@ -33,7 +33,8 @@ Short description of the program. Moreover the version and build time is also compiled into the description. 
 */
-#define PROGRAM_DESCRIPTION "Implementation of the matrix transposition benchmark"\
+#define PROGRAM_NAME "matrix transposition"
+#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\
                             " proposed in the HPCC benchmark suite for FPGA.\n"\
                             "Version: " VERSION "\n"
 
diff --git a/PTRANS/src/host/execution_types/execution_cpu.hpp b/PTRANS/src/host/execution_types/execution_cpu.hpp
index ab74fdc9..a2775809 100644
--- a/PTRANS/src/host/execution_types/execution_cpu.hpp
+++ b/PTRANS/src/host/execution_types/execution_cpu.hpp
@@ -50,7 +50,7 @@ namespace transpose
      * @param data data object that contains all required data for the execution
      * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
      */
-    static std::unique_ptr<transpose::TransposeExecutionTimings>
+    static std::map<std::string, std::vector<double>>
     calculate(const hpcc_base::ExecutionSettings<transpose::TransposeProgramSettings> &config, transpose::TransposeData &data,
               transpose::data_handler::TransposeDataHandler &handler) {
         int err;
@@ -115,10 +115,10 @@ namespace transpose
             transferTimings.push_back(transferTime.count());
         }
 
-        std::unique_ptr<transpose::TransposeExecutionTimings> result(new transpose::TransposeExecutionTimings{
-            transferTimings,
-            calculationTimings});
-        return result;
+        std::map<std::string, std::vector<double>> timings;
+        timings["transfer"] = transferTimings;
+        timings["calculation"] = calculationTimings;
+        return timings;
     }
 
 }  // namespace bm_execution
diff --git a/PTRANS/src/host/execution_types/execution_intel.hpp b/PTRANS/src/host/execution_types/execution_intel.hpp
index d95bf578..fc752d0f 100644
--- a/PTRANS/src/host/execution_types/execution_intel.hpp
+++ b/PTRANS/src/host/execution_types/execution_intel.hpp
@@ -40,9 +40,9 @@ namespace intel {
  *
  * @param config The program configuration
  * @param data data object that contains all required data for the execution on the FPGA
- * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
+ * @return std::map<std::string, std::vector<double>> The measured execution times
 */
-static std::unique_ptr<transpose::TransposeExecutionTimings>
+static std::map<std::string, std::vector<double>>
 calculate(const hpcc_base::ExecutionSettings<transpose::TransposeProgramSettings>& config, transpose::TransposeData& data) {
     int err;
@@ -264,11 +264,10 @@ static std::unique_ptr<transpose::TransposeExecutionTimings>
         transferTimings.push_back(transferTime.count());
     }
 
-    std::unique_ptr<transpose::TransposeExecutionTimings> result(new transpose::TransposeExecutionTimings{
-        transferTimings,
-        calculationTimings
-    });
-    return result;
+    std::map<std::string, std::vector<double>> timings;
+    timings["transfer"] = transferTimings;
+    timings["calculation"] = calculationTimings;
+    return timings;
 }
 
 }  // namespace transpose
diff --git a/PTRANS/src/host/execution_types/execution_intel_pq.hpp b/PTRANS/src/host/execution_types/execution_intel_pq.hpp
index 431ff40d..8dcc080e 100644
--- a/PTRANS/src/host/execution_types/execution_intel_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_intel_pq.hpp
@@ -43,7 +43,7 @@ namespace intel_pq {
  * @param data data object that contains all required data for the execution on the FPGA
  * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
 */
-static std::unique_ptr<transpose::TransposeExecutionTimings>
+static std::map<std::string, std::vector<double>>
 calculate(const hpcc_base::ExecutionSettings<transpose::TransposeProgramSettings>& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) {
     int err;
@@ -343,11 +343,10 @@ static std::unique_ptr<transpose::TransposeExecutionTimings>
         transferTimings.push_back(transferTime.count());
     }
 
-    std::unique_ptr<transpose::TransposeExecutionTimings> result(new transpose::TransposeExecutionTimings{
-        transferTimings,
-        calculationTimings
-    });
-    return result;
+    std::map<std::string, std::vector<double>> timings;
+    timings["transfer"] = transferTimings;
+    timings["calculation"] = calculationTimings;
+    return timings;
 }
 
 }  // namespace transpose
diff --git a/PTRANS/src/host/execution_types/execution_pcie.hpp b/PTRANS/src/host/execution_types/execution_pcie.hpp
index 5e29ad2e..aa0d589f 100644
--- a/PTRANS/src/host/execution_types/execution_pcie.hpp
+++ b/PTRANS/src/host/execution_types/execution_pcie.hpp
@@ -48,7 +48,7 @@ namespace transpose
      * @param handler data handler instance that should be used to exchange data between hosts
     * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
      */
-    static std::unique_ptr<transpose::TransposeExecutionTimings>
+    static std::map<std::string, std::vector<double>>
     calculate(const hpcc_base::ExecutionSettings<transpose::TransposeProgramSettings> &config, transpose::TransposeData &data,
               transpose::data_handler::TransposeDataHandler &handler) {
         int err;
@@ -227,10 +227,10 @@ namespace transpose
             transferTimings.push_back(transferTime.count());
         }
 
-        std::unique_ptr<transpose::TransposeExecutionTimings> result(new transpose::TransposeExecutionTimings{
-            transferTimings,
-            calculationTimings});
-        return result;
+        std::map<std::string, std::vector<double>> timings;
+        timings["transfer"] = transferTimings;
+        timings["calculation"] = calculationTimings;
+        return timings;
     }
 
 }  // namespace bm_execution
diff --git a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp
index d2cfae7e..6be472a4 100644
--- a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp
+++ b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp
@@ -44,7 +44,7 @@ namespace pcie_pq {
  * @param handler data handler instance that should be used to exchange data between hosts
 * @return std::unique_ptr<transpose::TransposeExecutionTimings> The measured execution times
 */
-static std::unique_ptr<transpose::TransposeExecutionTimings>
+static std::map<std::string, std::vector<double>>
 calculate(const hpcc_base::ExecutionSettings<transpose::TransposeProgramSettings>& config, transpose::TransposeData& data,
           transpose::data_handler::DistributedPQTransposeDataHandler &handler) {
     int err;
@@ -366,12 +366,10 @@ static std::unique_ptr<transpose::TransposeExecutionTimings>
         transferTimings.push_back(transferTime.count());
     }
 
-    std::unique_ptr<transpose::TransposeExecutionTimings> result(new transpose::TransposeExecutionTimings{
-        transferTimings,
-        calculationTimings
-    });
-
-    return result;
+    std::map<std::string, std::vector<double>> timings;
+    timings["transfer"] = transferTimings;
+    timings["calculation"] = calculationTimings;
+    return timings;
 }
 
 }  // namespace transpose
diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp
index 755b11a0..1c2682e3 100644
--- a/PTRANS/src/host/transpose_benchmark.cpp
+++ b/PTRANS/src/host/transpose_benchmark.cpp
@@ -65,22 +65,22 @@ transpose::TransposeBenchmark::addAdditionalParseOptions(cxxopts::Options &optio
         cxxopts::value<std::string>()->default_value(DEFAULT_DIST_TYPE));
 }
 
-std::unique_ptr<transpose::TransposeExecutionTimings>
+void
 transpose::TransposeBenchmark::executeKernel(TransposeData &data) {
     switch (executionSettings->programSettings->communicationType) {
         case hpcc_base::CommunicationType::intel_external_channels:
             if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) {
-                return transpose::fpga_execution::intel::calculate(*executionSettings, data);
+                timings = transpose::fpga_execution::intel::calculate(*executionSettings, data);
             }
             else {
-                return transpose::fpga_execution::intel_pq::calculate(*executionSettings, data, reinterpret_cast<transpose::data_handler::DistributedPQTransposeDataHandler&>(*dataHandler));
+                timings = transpose::fpga_execution::intel_pq::calculate(*executionSettings, data, reinterpret_cast<transpose::data_handler::DistributedPQTransposeDataHandler&>(*dataHandler));
            } break;
         case hpcc_base::CommunicationType::pcie_mpi :
             if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) {
-                return transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler);
+                timings = transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler);
            }
            else {
-                return transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, reinterpret_cast<transpose::data_handler::DistributedPQTransposeDataHandler&>(*dataHandler));
+                timings = transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, reinterpret_cast<transpose::data_handler::DistributedPQTransposeDataHandler&>(*dataHandler));
            } break;
 #ifdef MKL_FOUND
-        case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break;
+        case hpcc_base::CommunicationType::cpu_only : timings = transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break;
 #endif
@@ -90,63 +90,70 @@ transpose::TransposeBenchmark::executeKernel(TransposeData &data) {
 }
 
 void
-transpose::TransposeBenchmark::collectAndPrintResults(const transpose::TransposeExecutionTimings &output) {
+transpose::TransposeBenchmark::collectResults() {
     double flops = static_cast<double>(executionSettings->programSettings->matrixSize) * executionSettings->programSettings->matrixSize;
 
     // Number of experiment repetitions
-    uint number_measurements = output.calculationTimings.size();
+    uint number_measurements = timings.at("calculation").size();
     std::vector<double> max_measures(number_measurements);
     std::vector<double> max_transfers(number_measurements);
 #ifdef _USE_MPI_
     // Copy the object variable to a local variable to make it accessible to the lambda function
     int mpi_size = mpi_comm_size;
-    MPI_Reduce(output.calculationTimings.data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-    MPI_Reduce(output.transferTimings.data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    MPI_Reduce(timings.at("calculation").data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    MPI_Reduce(timings.at("transfer").data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
 #else
-    std::copy(output.calculationTimings.begin(), output.calculationTimings.end(), max_measures.begin());
-    std::copy(output.transferTimings.begin(), output.transferTimings.end(), max_transfers.begin());
+    std::copy(timings.at("calculation").begin(), timings.at("calculation").end(), max_measures.begin());
+    std::copy(timings.at("transfer").begin(), timings.at("transfer").end(), max_transfers.begin());
 #endif
 
     double avgCalculationTime = accumulate(max_measures.begin(), max_measures.end(), 0.0) / max_measures.size();
+    results.emplace("avg_calc_t", hpcc_base::HpccResult(avgCalculationTime, "s"));
+
     double minCalculationTime = *min_element(max_measures.begin(), max_measures.end());
+    results.emplace("min_calc_t", hpcc_base::HpccResult(minCalculationTime, "s"));
 
     double avgTransferTime = accumulate(max_transfers.begin(), max_transfers.end(), 0.0) / max_transfers.size();
-    double minTransferTime = *min_element(max_transfers.begin(), max_transfers.end());
-
-    double avgCalcFLOPS = flops / avgCalculationTime;
-    double maxCalcFLOPS = flops / minCalculationTime;
-    double avgMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime;
-    double maxMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime;
-    double avgTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime;
-    double maxTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime;
-
+    results.emplace("avg_transfer_t", hpcc_base::HpccResult(avgTransferTime, "s"));
+
+    double minTransferTime = *min_element(max_transfers.begin(), max_transfers.end());
+    results.emplace("min_transfer_t", hpcc_base::HpccResult(minTransferTime, "s"));
+
+    results.emplace("avg_t", hpcc_base::HpccResult(avgCalculationTime + avgTransferTime, "s"));
+    results.emplace("min_t", hpcc_base::HpccResult(minCalculationTime + minTransferTime, "s"));
+
+    results.emplace("avg_calc_flops", hpcc_base::HpccResult(flops / avgCalculationTime, "FLOP/s"));
+    results.emplace("max_calc_flops", hpcc_base::HpccResult(flops / minCalculationTime, "FLOP/s"));
+    results.emplace("avg_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime, "B/s"));
+    results.emplace("max_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime, "B/s"));
+    results.emplace("avg_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime, "B/s"));
+    results.emplace("max_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime, "B/s"));
+}
 
-    if (mpi_comm_rank == 0) {
-        std::cout << "       total [s]    transfer [s]  calc [s]      calc FLOPS    Mem [B/s]     PCIe [B/s]" << std::endl;
-        std::cout << "avg:   " << (avgTransferTime + avgCalculationTime)
-                  << "   " << avgTransferTime
-                  << "   " << avgCalculationTime
-                  << "   " << avgCalcFLOPS
-                  << "   " << avgMemBandwidth
-                  << "   " << avgTransferBandwidth
-                  << std::endl;
-        std::cout << "best:  " << (minTransferTime + minCalculationTime)
-                  << "   " << minTransferTime
-                  << "   " << minCalculationTime
-                  << "   " << maxCalcFLOPS
-                  << "   " << maxMemBandwidth
-                  << "   " << maxTransferBandwidth
-                  << std::endl;
-    }
+void
+transpose::TransposeBenchmark::printResults() {
+    std::cout << "       total [s]    transfer [s]  calc [s]      calc FLOPS    Mem [B/s]     PCIe [B/s]" << std::endl;
+    std::cout << "avg:   " << results.at("avg_t")
+              << "   " << results.at("avg_transfer_t")
+              << "   " << results.at("avg_calc_t")
+              << "   " << results.at("avg_calc_flops")
+              << "   " << results.at("avg_mem_bandwidth")
+              << "   " << results.at("avg_transfer_bandwidth")
+              << std::endl;
+    std::cout << "best:  " << results.at("min_t")
+              << "   " << results.at("min_transfer_t")
+              << "   " << results.at("min_calc_t")
+              << "   " << results.at("max_calc_flops")
+              << "   " << results.at("max_mem_bandwidth")
+              << "   " << results.at("max_transfer_bandwidth")
+              << std::endl;
 }
 
 std::unique_ptr<transpose::TransposeData>
 transpose::TransposeBenchmark::generateInputData() {
     return dataHandler->generateData(*executionSettings);
 }
 
 bool
diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp
index 5de333ca..cd595637 100644
--- a/PTRANS/src/host/transpose_benchmark.hpp
+++ b/PTRANS/src/host/transpose_benchmark.hpp
@@ -46,7 +46,7 @@ namespace transpose {
 * @brief Implementation of the transpose benchmark
 *
 */
-class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark<transpose::TransposeProgramSettings, transpose::TransposeData, transpose::TransposeExecutionTimings> {
+class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark<transpose::TransposeProgramSettings, transpose::TransposeData> {
 
 protected:
 
@@ -81,9 +81,8 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark<transpose::TransposeProgramSettings, transpose::TransposeData, transpose::TransposeExecutionTimings> {
      * @param data The data that is used as input and output
-     * @return std::unique_ptr<transpose::TransposeExecutionTimings> Measured runtimes of the kernel execution
      */
-    std::unique_ptr<transpose::TransposeExecutionTimings>
+    void
     executeKernel(TransposeData &data) override;
 
     /**
@@ -102,7 +101,10 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark<transpose::TransposeProgramSettings, transpose::TransposeData, transpose::TransposeExecutionTimings> {
      */
     void
-    collectAndPrintResults(const transpose::TransposeExecutionTimings &output) override;
+    collectResults() override;
+
+    void
+    printResults() override;
 
     /**
diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp
--- a/PTRANS/src/host/transpose_data.hpp
+++ b/PTRANS/src/host/transpose_data.hpp
-/**
- * @brief Measured execution timing from the kernel execution
- *
- */
-class TransposeExecutionTimings {
-public:
-    /**
-     * @brief A vector containing the timings for all repetitions for the data transfer
-     *
-     */
-    std::vector<double> transferTimings;
-
-    /**
-     * @brief A vector containing the timings for all repetitions for the calculation
-     *
-     */
-    std::vector<double> calculationTimings;
-
-};
-
 }
 
 #endif
bm->addTimings("calculation", calculateTimings); // Redirect stout buffer to local buffer to make checks possible std::stringstream newStdOutBuffer; std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - bm->collectAndPrintResults(*results); + bm->collectResults(); + bm->printResults(); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); @@ -50,8 +50,8 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatValues) { std::vector calculateTimings; transferTimings.push_back(1.0); calculateTimings.push_back(1.0); - std::shared_ptr results( - new transpose::TransposeExecutionTimings{transferTimings, calculateTimings}); + bm->addTimings("transfer", transferTimings); + bm->addTimings("calculation", calculateTimings); // Redirect stout buffer to local buffer to make checks possible @@ -59,7 +59,8 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatValues) { std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - bm->collectAndPrintResults(*results); + bm->collectResults(); + bm->printResults(); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); diff --git a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp index d7bc0c7f..985a0698 100644 --- a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp +++ b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp @@ -195,12 +195,12 @@ TEST_F(TransposeKernelTest, FPGAAAndBAreSummedUp4Blocks) { */ TEST_F(TransposeKernelTest, FPGATimingsMeasuredForEveryIteration) { bm->getExecutionSettings().programSettings->numRepetitions = 10; - auto result = bm->executeKernel(*data); - EXPECT_EQ(result->calculationTimings.size(), 10); - EXPECT_EQ(result->transferTimings.size(), 10); + bm->executeKernel(*data); + EXPECT_EQ(bm->getTimingsMap().at("calculation").size(), 10); + EXPECT_EQ(bm->getTimingsMap().at("transfer").size(), 10); for (int t = 0; t < 10; t++) { - EXPECT_GE(result->transferTimings[t], 0.0); - EXPECT_GE(result->calculationTimings[t], 0.0); + EXPECT_GE(bm->getTimingsMap().at("transfer")[t], 0.0); + EXPECT_GE(bm->getTimingsMap().at("calculation")[t], 0.0); } } diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 4c8454b1..68127eab 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -510,6 +510,11 @@ class HpccFpgaBenchmark { return timings; } + void + addTimings(std::string key, std::vector value) { + timings.emplace(key, value); + } + std::map getResultsJson() { // TODO: nested maps, recursive? std::map results_string; From 8ab9c86484b6ea682a4196a43ecd52de7c6aad0f Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Tue, 18 Oct 2022 21:02:58 +0200 Subject: [PATCH 208/318] add dump for RandomAccess --- RandomAccess/src/common/parameters.h.in | 6 +-- RandomAccess/src/host/execution.h | 2 +- RandomAccess/src/host/execution_single.cpp | 9 ++-- .../src/host/random_access_benchmark.cpp | 50 ++++++++++--------- .../src/host/random_access_benchmark.hpp | 24 +++------ ...nel_functionality_and_host_integration.cpp | 10 ++-- 6 files changed, 48 insertions(+), 53 deletions(-) diff --git a/RandomAccess/src/common/parameters.h.in b/RandomAccess/src/common/parameters.h.in index 837d3c74..a47f850e 100644 --- a/RandomAccess/src/common/parameters.h.in +++ b/RandomAccess/src/common/parameters.h.in @@ -35,8 +35,8 @@ Short description of the program. 
 Moreover the version and build time is also compiled into the description.
 */
-
-#define PROGRAM_DESCRIPTION "Implementation of the random access benchmark"\
+#define PROGRAM_NAME "random access"
+#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\
 " proposed in the HPCC benchmark suite for FPGA.\n"\
 "Version: " VERSION "\n"
 
@@ -62,4 +62,4 @@ Output separator
 
 
-#endif // SRC_COMMON_PARAMETERS_H_
\ No newline at end of file
+#endif // SRC_COMMON_PARAMETERS_H_
diff --git a/RandomAccess/src/host/execution.h b/RandomAccess/src/host/execution.h
index 88cf6736..51d1796d 100644
--- a/RandomAccess/src/host/execution.h
+++ b/RandomAccess/src/host/execution.h
@@ -40,7 +40,7 @@ namespace bm_execution {
  * @param data The data that is used as input and output of the random accesses
  * @return std::unique_ptr The measured runtimes of the kernel
  */
-std::unique_ptr<random_access::RandomAccessExecutionTimings>
+std::map<std::string, std::vector<double>>
 calculate(hpcc_base::ExecutionSettings<random_access::RandomAccessProgramSettings> const& config, HOST_DATA_TYPE * data, int mpi_rank, int mpi_size);
 
 } // namespace bm_execution
diff --git a/RandomAccess/src/host/execution_single.cpp b/RandomAccess/src/host/execution_single.cpp
index 486234bf..d4718083 100644
--- a/RandomAccess/src/host/execution_single.cpp
+++ b/RandomAccess/src/host/execution_single.cpp
@@ -40,7 +40,7 @@ namespace bm_execution {
     Implementation for the single kernel.
     @copydoc bm_execution::calculate()
     */
-    std::unique_ptr<random_access::RandomAccessExecutionTimings>
+    std::map<std::string, std::vector<double>>
    calculate(hpcc_base::ExecutionSettings<random_access::RandomAccessProgramSettings> const& config, HOST_DATA_TYPE * data, int mpi_rank, int mpi_size) {
         // int used to check for OpenCL errors
         int err;
@@ -204,7 +204,10 @@ namespace bm_execution {
 
         free(random_inits);
 
-        return std::unique_ptr<random_access::RandomAccessExecutionTimings>(new random_access::RandomAccessExecutionTimings{executionTimes});
-    }
+        std::map<std::string, std::vector<double>> timings;
+
+        timings["execution"] = executionTimes;
+        return timings;
+    }
 
 } // namespace bm_execution
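The map keyed by measurement name is what every refactored calculate() now hands back. A minimal sketch of reducing such a map on the caller side (settings, data, rank and size are placeholders for the real arguments; requires <algorithm> and <numeric>):

    std::map<std::string, std::vector<double>> timings =
        bm_execution::calculate(settings, data, rank, size);
    // one entry per repetition under the key used in execution_single.cpp
    const std::vector<double>& runs = timings.at("execution");
    double t_min  = *std::min_element(runs.begin(), runs.end());
    double t_mean = std::accumulate(runs.begin(), runs.end(), 0.0) / runs.size();

This is exactly the reduction the reworked collectResults() in the next diff performs before wrapping the values into HpccResult entries.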
diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp
index e51e1fe2..a5f06303 100644
--- a/RandomAccess/src/host/random_access_benchmark.cpp
+++ b/RandomAccess/src/host/random_access_benchmark.cpp
@@ -87,45 +87,49 @@ random_access::RandomAccessBenchmark::addAdditionalParseOptions(cxxopts::Options &options) {
             cxxopts::value<uint>()->default_value(std::to_string(HPCC_FPGA_RA_RNG_COUNT_LOG)));
 }
 
-std::unique_ptr<random_access::RandomAccessExecutionTimings>
+void
 random_access::RandomAccessBenchmark::executeKernel(RandomAccessData &data) {
-    return bm_execution::calculate(*executionSettings, data.data, mpi_comm_rank, mpi_comm_size);
+    timings = bm_execution::calculate(*executionSettings, data.data, mpi_comm_rank, mpi_comm_size);
 }
 
 void
-random_access::RandomAccessBenchmark::collectAndPrintResults(const random_access::RandomAccessExecutionTimings &output) {
+random_access::RandomAccessBenchmark::collectResults() {
 
-    std::vector<double> avgTimings(output.times.size());
+    std::vector<double> avgTimings(timings.at("execution").size());
 #ifdef _USE_MPI_
     // Copy the object variable to a local variable to make it accessible to the lambda function
     int mpi_size = mpi_comm_size;
-    MPI_Reduce(output.times.data(),avgTimings.data(),output.times.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-    std::for_each(avgTimings.begin(),avgTimings.end(), [mpi_size](double& x) {x /= mpi_size;});
+    MPI_Reduce(timings.at("execution").data(), avgTimings.data(),timings.at("execution").size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    std::for_each(avgTimings.begin(), avgTimings.end(), [mpi_size](double& x) {x /= mpi_size;});
 #else
-    std::copy(output.times.begin(), output.times.end(), avgTimings.begin());
+    std::copy(timings.at("execution").begin(), timings.at("execution").end(), avgTimings.begin());
 #endif
 
-    if (mpi_comm_rank == 0) {
+    // Calculate performance for kernel execution
+    double tmean = 0;
+    double tmin = std::numeric_limits<double>::max();
+    double gups = static_cast<double>(4 * executionSettings->programSettings->dataSize * mpi_comm_size) / 1000000000;
+    for (double currentTime : avgTimings) {
+        tmean += currentTime;
+        if (currentTime < tmin) {
+            tmin = currentTime;
+        }
+    }
+    tmean = tmean / timings.at("execution").size();
+
+    results.emplace("t_min", hpcc_base::HpccResult(tmin, "s"));
+    results.emplace("t_mean", hpcc_base::HpccResult(tmean, "s"));
+    results.emplace("guops", hpcc_base::HpccResult(gups / tmin, "GUOP/s"));
+}
+
+void random_access::RandomAccessBenchmark::printResults() {
         std::cout << std::setw(ENTRY_SPACE) << "best"
                 << std::setw(ENTRY_SPACE) << "mean"
                 << std::setw(ENTRY_SPACE) << "GUOPS" << std::endl;
 
-        // Calculate performance for kernel execution
-        double tmean = 0;
-        double tmin = std::numeric_limits<double>::max();
-        double gups = static_cast<double>(4 * executionSettings->programSettings->dataSize * mpi_comm_size) / 1000000000;
-        for (double currentTime : avgTimings) {
-            tmean += currentTime;
-            if (currentTime < tmin) {
-                tmin = currentTime;
-            }
-        }
-        tmean = tmean / output.times.size();
-
         std::cout << std::setw(ENTRY_SPACE)
-                << tmin << std::setw(ENTRY_SPACE) << tmean
-                << std::setw(ENTRY_SPACE) << gups / tmin
+                << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean")
+                << std::setw(ENTRY_SPACE) << results.at("guops")
                 << std::endl;
-    }
 }
 
 bool
diff --git a/RandomAccess/src/host/random_access_benchmark.hpp b/RandomAccess/src/host/random_access_benchmark.hpp
index 393c9b53..56c7ff40 100644
--- a/RandomAccess/src/host/random_access_benchmark.hpp
+++ b/RandomAccess/src/host/random_access_benchmark.hpp
@@ -114,25 +114,11 @@ class RandomAccessData {
 
 };
 
-/**
- * @brief Measured execution timing from the kernel execution
- *
- */
-class RandomAccessExecutionTimings {
-public:
-    /**
-     * @brief A vector containing the timings for all repetitions
-     *
-     */
-    std::vector<double> times;
-
-};
-
 /**
  * @brief Implementation of the random access benchmark
  *
  */
-class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark<RandomAccessProgramSettings, RandomAccessData, RandomAccessExecutionTimings> {
+class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark<RandomAccessProgramSettings, RandomAccessData> {
 
 protected:
 
@@ -158,9 +144,8 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark<RandomAccessProgramSettings, RandomAccessData> {
      */
-    std::unique_ptr<random_access::RandomAccessExecutionTimings>
+    void
     executeKernel(RandomAccessData &data) override;
 
     /**
@@ -179,7 +164,10 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark<RandomAccessProgramSettings, RandomAccessData> {
      */
     void
-    collectAndPrintResults(const RandomAccessExecutionTimings &output) override;
+    collectResults() override;
+
+    void
+    printResults() override;
 
diff --git a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
--- a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
@@ -28,8 +28,8 @@
 */
 TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements1Rep) {
-    auto result = bm->executeKernel( *data);
-    EXPECT_EQ(result->times.size(), 1);
+    bm->executeKernel( *data);
+    EXPECT_EQ(bm->getTimingsMap().at("execution").size(), 1);
 }
 
 /**
@@ -37,15 +37,15 @@
 */
 TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements3Rep) {
     bm->getExecutionSettings().programSettings->numRepetitions = 3;
-    auto result = bm->executeKernel(*data);
-    EXPECT_EQ(result->times.size(), 3);
+    bm->executeKernel(*data);
+    EXPECT_EQ(bm->getTimingsMap().at("execution").size(), 3);
 }
 
 /**
 * Execution returns correct results for a single repetition
 */
 TEST_F(RandomAccessKernelTest, FPGAErrorBelow1Percent) {
-    auto result = bm->executeKernel(*data);
+    bm->executeKernel(*data);
     bool success = bm->validateOutputAndPrintError(*data);
     EXPECT_TRUE(success);
 }
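The same refactoring recipe repeats for each benchmark in this series: the per-benchmark ExecutionTimings class disappears, executeKernel() stores its raw measurements in the protected timings map of the base class, and the old collectAndPrintResults() is split into two overridable hooks. A minimal sketch of the resulting host-side call sequence, using only names introduced by these patches (error handling omitted):

    bm->executeKernel(*data);                     // fills the timings map
    bm->collectResults();                         // reduces timings into HpccResult entries in results
    bm->printResults();                           // formats results.at(...) for stdout
    bm->dumpConfigurationAndResults("out.json");  // serializes settings, timings and results as JSON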
From 58a06c4378f1d4d602f8b5218eb2032c570b8682 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Tue, 18 Oct 2022 21:55:01 +0200
Subject: [PATCH 209/318] add dump for STREAM

---
 STREAM/src/common/parameters.h.in             |  5 +-
 STREAM/src/host/execution.hpp                 |  8 ++-
 STREAM/src/host/execution_default.cpp         | 10 +---
 STREAM/src/host/stream_benchmark.cpp          | 57 ++++++++++---------
 STREAM/src/host/stream_benchmark.hpp          | 30 ++--------
 ...nel_functionality_and_host_integration.cpp |  4 +-
 6 files changed, 50 insertions(+), 64 deletions(-)

diff --git a/STREAM/src/common/parameters.h.in b/STREAM/src/common/parameters.h.in
index 57bb0d0a..8d822247 100644
--- a/STREAM/src/common/parameters.h.in
+++ b/STREAM/src/common/parameters.h.in
@@ -33,7 +33,8 @@
 #cmakedefine USE_SVM
 #cmakedefine USE_HBM
 
-#define PROGRAM_DESCRIPTION "Implementation of the STREAM benchmark"\
+#define PROGRAM_NAME "STREAM"
+#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\
 " proposed in the HPCC benchmark suite for FPGA.\n"\
 "Version: " VERSION "\n"
 
@@ -48,4 +49,4 @@ Output separator
 
 #define TRIAD_KERNEL_TYPE 3
 
-#endif // SRC_COMMON_PARAMETERS_H_
\ No newline at end of file
+#endif // SRC_COMMON_PARAMETERS_H_
diff --git a/STREAM/src/host/execution.hpp b/STREAM/src/host/execution.hpp
index 70d6f948..d3e1c31b 100644
--- a/STREAM/src/host/execution.hpp
+++ b/STREAM/src/host/execution.hpp
@@ -35,13 +35,15 @@ SOFTWARE.
 #include "half.hpp"
 
 // Map keys for execution timings
-#define PCIE_WRITE_KEY "PCI write"
-#define PCIE_READ_KEY "PCI read"
+#define PCIE_WRITE_KEY "PCI_write"
+#define PCIE_READ_KEY "PCI_read"
 #define COPY_KEY "Copy"
 #define SCALE_KEY "Scale"
 #define ADD_KEY "Add"
 #define TRIAD_KEY "Triad"
 
+const std::string keys[] = {PCIE_WRITE_KEY, PCIE_READ_KEY, COPY_KEY, SCALE_KEY, ADD_KEY, TRIAD_KEY};
+
 namespace bm_execution {
 
 static std::map<std::string, double> multiplicatorMap = {
@@ -62,7 +64,7 @@ namespace bm_execution {
      * @param C The array C of the stream benchmark
      * @return std::unique_ptr The measured timings for all stream operations
      */
-    std::unique_ptr<stream::StreamExecutionTimings>
+    std::map<std::string, std::vector<double>>
     calculate(const hpcc_base::ExecutionSettings<stream::StreamProgramSettings>& config,
               HOST_DATA_TYPE* A,
               HOST_DATA_TYPE* B,
diff --git a/STREAM/src/host/execution_default.cpp b/STREAM/src/host/execution_default.cpp
index 71a4d04f..a8cc5d83 100644
--- a/STREAM/src/host/execution_default.cpp
+++ b/STREAM/src/host/execution_default.cpp
@@ -67,7 +67,7 @@ namespace bm_execution {
     Implementation for the single kernel.
@copydoc bm_execution::calculate() */ - std::unique_ptr + std::map> calculate(const hpcc_base::ExecutionSettings& config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, @@ -105,7 +105,7 @@ namespace bm_execution { add_kernels, triad_kernels, command_queues); } if (!success) { - return std::unique_ptr(nullptr); + return std::map>(); } // @@ -331,11 +331,7 @@ namespace bm_execution { } - std::unique_ptr result(new stream::StreamExecutionTimings{ - timingMap, - config.programSettings->streamArraySize - }); - return result; + return timingMap; } bool initialize_queues_and_kernels(const hpcc_base::ExecutionSettings &config, diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index 4dac0ea0..07da82b3 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -102,19 +102,18 @@ stream::StreamBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { ("multi-kernel", "Use the legacy multi kernel implementation"); } -std::unique_ptr +void stream::StreamBenchmark::executeKernel(StreamData &data) { - return bm_execution::calculate(*executionSettings, + timings = bm_execution::calculate(*executionSettings, data.A, data.B, data.C); } void -stream::StreamBenchmark::collectAndPrintResults(const stream::StreamExecutionTimings &output) { - +stream::StreamBenchmark::collectResults() { std::map> totalTimingsMap; - for (auto v : output.timings) { + for (auto v : timings) { // Number of experiment repetitions uint number_measurements = v.second.size(); // create a new @@ -127,29 +126,35 @@ stream::StreamBenchmark::collectAndPrintResults(const stream::StreamExecutionTim #else std::copy(v.second.begin(), v.second.end(), avg_measures.begin()); #endif - totalTimingsMap.insert({v.first,avg_measures}); - } - if (mpi_comm_rank == 0) { - std::cout << std::setw(ENTRY_SPACE) << "Function"; - std::cout << std::setw(ENTRY_SPACE) << "Best Rate MB/s"; - std::cout << std::setw(ENTRY_SPACE) << "Avg time s"; - std::cout << std::setw(ENTRY_SPACE) << "Min time" ; - std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::endl; + double minTime = *min_element(v.second.begin(), v.second.end()); + double avgTime = accumulate(v.second.begin(), v.second.end(), 0.0) + / v.second.size(); + double maxTime = *max_element(v.second.begin(), v.second.end()); + + double bestRate = (static_cast(sizeof(HOST_DATA_TYPE)) * executionSettings->programSettings->streamArraySize * bm_execution::multiplicatorMap[v.first] / minTime) * 1.0e-6 * mpi_comm_size; + + results.emplace(v.first + "_min_t", hpcc_base::HpccResult(minTime, "s")); + results.emplace(v.first + "_avg_t", hpcc_base::HpccResult(avgTime, "s")); + results.emplace(v.first + "_max_t", hpcc_base::HpccResult(maxTime, "s")); + results.emplace(v.first + "_best_rate", hpcc_base::HpccResult(bestRate, "MB/s")); + } +} - for (auto v : totalTimingsMap) { - double minTime = *min_element(v.second.begin(), v.second.end()); - double avgTime = accumulate(v.second.begin(), v.second.end(), 0.0) - / v.second.size(); - double maxTime = *max_element(v.second.begin(), v.second.end()); +void +stream::StreamBenchmark::printResults() { + std::cout << std::setw(ENTRY_SPACE) << "Function"; + std::cout << std::setw(ENTRY_SPACE) << "Best Rate"; + std::cout << std::setw(ENTRY_SPACE) << "Avg time"; + std::cout << std::setw(ENTRY_SPACE) << "Min time" ; + std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << v.first; - std::cout << std::setw(ENTRY_SPACE) - << (static_cast(sizeof(HOST_DATA_TYPE)) * 
output.arraySize * bm_execution::multiplicatorMap[v.first] / minTime) * 1.0e-6 * mpi_comm_size - << std::setw(ENTRY_SPACE) << avgTime - << std::setw(ENTRY_SPACE) << minTime - << std::setw(ENTRY_SPACE) << maxTime << std::endl; - } + for (auto key : keys) { + std::cout << std::setw(ENTRY_SPACE) << key; + std::cout << std::setw(ENTRY_SPACE) << results.at(key + "_best_rate") + << std::setw(ENTRY_SPACE) << results.at(key + "_avg_t") + << std::setw(ENTRY_SPACE) << results.at(key + "_min_t") + << std::setw(ENTRY_SPACE) << results.at(key + "_max_t") << std::endl; } } @@ -265,4 +270,4 @@ stream::StreamBenchmark::validateOutputAndPrintError(stream::StreamData &data) { return false; } return true; -} \ No newline at end of file +} diff --git a/STREAM/src/host/stream_benchmark.hpp b/STREAM/src/host/stream_benchmark.hpp index 401a899d..8377b744 100644 --- a/STREAM/src/host/stream_benchmark.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -127,30 +127,11 @@ class StreamData { }; -/** - * @brief Measured execution timing from the kernel execution - * - */ -class StreamExecutionTimings { -public: - /** - * @brief A map containing the timings for all stream operation types - * - */ - std::map> timings; - - /** - * @brief The used array size - * - */ - uint arraySize; -}; - /** * @brief Implementation of the Sream benchmark * */ -class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark { +class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark { protected: @@ -176,9 +157,8 @@ class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark */ - std::unique_ptr + void executeKernel( StreamData &data) override; /** @@ -194,10 +174,12 @@ class StreamBenchmark : public hpcc_base::HpccFpgaBenchmarkgetExecutionSettings().programSettings->numRepetitions = 1; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < bm->getExecutionSettings().programSettings->streamArraySize; i++) { EXPECT_FLOAT_EQ(data->A[i], 30.0); EXPECT_FLOAT_EQ(data->B[i], 6.0); @@ -42,7 +42,7 @@ TEST_F(StreamKernelTest, FPGACorrectResultsOneRepetition) { */ TEST_F(StreamKernelTest, FPGACorrectResultsThreeRepetition) { bm->getExecutionSettings().programSettings->numRepetitions = 3; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < bm->getExecutionSettings().programSettings->streamArraySize; i++) { EXPECT_FLOAT_EQ(data->A[i], 6750.0); EXPECT_FLOAT_EQ(data->B[i], 1350.0); From eddef646d30f566bdaf975333f3e06a31da7634b Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 24 Oct 2022 17:29:29 +0200 Subject: [PATCH 210/318] add dump for b_eff --- b_eff/src/common/parameters.h.in | 4 +- b_eff/src/host/execution.h | 2 +- .../host/execution_types/execution_cpu.hpp | 7 +- .../host/execution_types/execution_iec.hpp | 7 +- .../host/execution_types/execution_pcie.hpp | 7 +- b_eff/src/host/network_benchmark.cpp | 89 ++++++++++--------- b_eff/src/host/network_benchmark.hpp | 66 ++++++++++---- ...nel_functionality_and_host_integration.cpp | 40 ++++----- shared/include/hpcc_benchmark.hpp | 7 +- 9 files changed, 131 insertions(+), 98 deletions(-) diff --git a/b_eff/src/common/parameters.h.in b/b_eff/src/common/parameters.h.in index d404bfd7..a334f7db 100644 --- a/b_eff/src/common/parameters.h.in +++ b/b_eff/src/common/parameters.h.in @@ -29,7 +29,9 @@ Short description of the program. Moreover the version and build time is also compiled into the description. 
*/ -#define PROGRAM_DESCRIPTION "Implementation of the effective bandwidth benchmark"\ +#define PROGRAM_NAME "effective bandwidth" + +#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\ " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" diff --git a/b_eff/src/host/execution.h b/b_eff/src/host/execution.h index 195b97b1..f43c31de 100644 --- a/b_eff/src/host/execution.h +++ b/b_eff/src/host/execution.h @@ -44,7 +44,7 @@ simple exchange of the different calculation methods. @return The resulting matrix */ - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData); } // namespace bm_execution diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp index 778dc2f1..ec37dcb6 100644 --- a/b_eff/src/host/execution_types/execution_cpu.hpp +++ b/b_eff/src/host/execution_types/execution_cpu.hpp @@ -38,7 +38,7 @@ namespace network::execution_types::cpu { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { @@ -105,12 +105,11 @@ namespace network::execution_types::cpu { err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); ASSERT_CL(err); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution diff --git a/b_eff/src/host/execution_types/execution_iec.hpp b/b_eff/src/host/execution_types/execution_iec.hpp index 2ec348e5..4225c783 100644 --- a/b_eff/src/host/execution_types/execution_iec.hpp +++ b/b_eff/src/host/execution_types/execution_iec.hpp @@ -39,7 +39,7 @@ namespace network::execution_types::iec { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { @@ -164,12 +164,11 @@ namespace network::execution_types::iec { err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); ASSERT_CL(err); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp index 73156b7e..50d357e6 100644 --- a/b_eff/src/host/execution_types/execution_pcie.hpp +++ b/b_eff/src/host/execution_types/execution_pcie.hpp @@ -38,7 +38,7 @@ namespace network::execution_types::pcie { Implementation for the single kernel. 
@copydoc bm_execution::calculate() */ - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { @@ -111,12 +111,11 @@ namespace network::execution_types::pcie { err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); ASSERT_CL(err); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 7bf728a2..73cd27a7 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -89,7 +89,7 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options) cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE))); } -std::unique_ptr +void network::NetworkBenchmark::executeKernel(NetworkData &data) { // Get the number of processes int world_size; @@ -99,13 +99,13 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { int world_rank; MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); - std::vector> timing_results; + std::vector timing_results; for (auto& run : data.items) { if (world_rank == 0) { std::cout << "Measure for " << (1 << run.messageSize) << " Byte" << std::endl; } - std::shared_ptr timing; + network::ExecutionTimings timing; switch (executionSettings->programSettings->communicationType) { case hpcc_base::CommunicationType::cpu_only: timing = execution_types::cpu::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; case hpcc_base::CommunicationType::pcie_mpi: timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; @@ -115,16 +115,15 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { timing_results.push_back(timing); } - std::unique_ptr collected_results = std::unique_ptr (new network::NetworkExecutionTimings()); if (world_rank > 0) { for (const auto& t : timing_results) { - MPI_Send(&(t->messageSize), + MPI_Send(&(t.messageSize), 1, MPI_UNSIGNED, 0, 0, MPI_COMM_WORLD); - MPI_Send(&(t->looplength), + MPI_Send(&(t.looplength), 1, MPI_UNSIGNED, 0, 1, MPI_COMM_WORLD); - MPI_Send(&(t->calculationTimings.front()), + MPI_Send(&(t.calculationTimings.front()), executionSettings->programSettings->numRepetitions, MPI_DOUBLE, 0, 2, MPI_COMM_WORLD); } @@ -132,84 +131,86 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { std::cout << "Collect results over MPI."; int k = 0; for (auto& run : data.items) { - std::vector> tmp_timings; + std::vector tmp_timings; std::cout << "."; for (int i=1; i < world_size; i++) { - auto execution_result = std::shared_ptr( new network::ExecutionTimings { + auto execution_result = network::ExecutionTimings{ 0,0,std::vector(executionSettings->programSettings->numRepetitions) - }); + }; MPI_Status status; - MPI_Recv(&(execution_result->messageSize), + MPI_Recv(&(execution_result.messageSize), 1, MPI_UNSIGNED, i, 0, MPI_COMM_WORLD, &status); - MPI_Recv(&(execution_result->looplength), + MPI_Recv(&(execution_result.looplength), 1, MPI_UNSIGNED, i, 1, MPI_COMM_WORLD, &status); - MPI_Recv(&(execution_result->calculationTimings.front()), 
+ MPI_Recv(&(execution_result.calculationTimings.front()), executionSettings->programSettings->numRepetitions, MPI_DOUBLE, i, 2, MPI_COMM_WORLD, &status); tmp_timings.push_back(execution_result); - if (execution_result->messageSize != run.messageSize) { - std::cerr << "Wrong message size: " << execution_result->messageSize << " != " << run.messageSize << " from rank " << i << std::endl; + if (execution_result.messageSize != run.messageSize) { + std::cerr << "Wrong message size: " << execution_result.messageSize << " != " << run.messageSize << " from rank " << i << std::endl; throw std::runtime_error("Wrong message size received! Something went wrong in the MPI communication"); } } tmp_timings.push_back(timing_results[k]); k++; - collected_results->timings.emplace(run.messageSize, std::make_shared>>(tmp_timings)); + collected_timings.emplace(run.messageSize, network::ExecutionResult{tmp_timings, 0.0, 0.0}); } std::cout << " done!" << std::endl; } - - return collected_results; + return; } void -network::NetworkBenchmark::collectAndPrintResults(const network::NetworkExecutionTimings &output) { +network::NetworkBenchmark::collectResults() { std::vector maxBandwidths; if (mpi_comm_rank == 0) { - std::cout << std::setw(ENTRY_SPACE) << "MSize" << " " - << std::setw(ENTRY_SPACE) << "looplength" << " " - << std::setw(ENTRY_SPACE) << "transfer" << " " - << std::setw(ENTRY_SPACE) << "B/s" << std::endl; - std::vector totalMaxMinCalculationTime; - for (long unsigned int i =0; i < output.timings.size(); i++) { - totalMaxMinCalculationTime.push_back(0.0); - } int i = 0; - for (const auto& msgSizeResults : output.timings) { - for (const auto& r : *msgSizeResults.second) { - double localMinCalculationTime = *min_element(r->calculationTimings.begin(), r->calculationTimings.end()); - totalMaxMinCalculationTime[i] = std::max(totalMaxMinCalculationTime[i], localMinCalculationTime); + for (auto& timing : collected_timings) { + for (auto& r : timing.second.execution_timings) { + double localMinCalculationTime = *min_element(r.calculationTimings.begin(), r.calculationTimings.end()); + timing.second.maxMinCalculationTime = std::max(timing.second.maxMinCalculationTime, localMinCalculationTime); } i++; } i = 0; - for (const auto& msgSizeResults : output.timings) { - int looplength = msgSizeResults.second->at(0)->looplength; + for (auto& timing : collected_timings) { + int looplength = timing.second.execution_timings.at(0).looplength; + int messageSize = timing.first; + int num_timings = timing.second.execution_timings.size(); // The total sent data in bytes will be: // #Nodes * message_size * looplength * 2 // the * 2 is because we have two kernels per bitstream that will send and receive simultaneously. // This will be divided by half of the maximum of the minimum measured runtime over all ranks. 
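            // A worked example with illustrative values: for num_timings = 2 ranks,
            // messageSize = 10 (1 KiB messages) and looplength = 100, the volume is
            // 2 * 2 * 1024 B * 100 = 409600 B; with maxMinCalculationTime = 0.0002 s
            // this yields maxCalcBW = 409600 B / 0.0002 s = 2.048e9 B/s.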
- double maxCalcBW = static_cast(msgSizeResults.second->size() * 2 * (1 << msgSizeResults.first) * looplength) - / (totalMaxMinCalculationTime[i]); + timing.second.maxCalcBW = static_cast(num_timings * 2 * (1 << messageSize) * looplength) + / timing.second.maxMinCalculationTime; - maxBandwidths.push_back(maxCalcBW); + maxBandwidths.push_back(timing.second.maxCalcBW); - std::cout << std::setw(ENTRY_SPACE) << (1 << msgSizeResults.first) << " " - << std::setw(ENTRY_SPACE) << looplength << " " - << std::setw(ENTRY_SPACE) << totalMaxMinCalculationTime[i] << " " - << std::setw(ENTRY_SPACE) << maxCalcBW - << std::endl; i++; } + results.emplace("b_eff", hpcc_base::HpccResult(accumulate(maxBandwidths.begin(), maxBandwidths.end(), 0.0) / static_cast(maxBandwidths.size()), "B/s")); + } +} - double b_eff = accumulate(maxBandwidths.begin(), maxBandwidths.end(), 0.0) / static_cast(maxBandwidths.size()); - - std::cout << std::endl << "b_eff = " << b_eff << " B/s" << std::endl; +void network::NetworkBenchmark::printResults() { + std::cout << std::setw(ENTRY_SPACE) << "MSize" << " " + << std::setw(ENTRY_SPACE) << "looplength" << " " + << std::setw(ENTRY_SPACE) << "transfer" << " " + << std::setw(ENTRY_SPACE) << "B/s" << std::endl; + + for (const auto& timing : collected_timings) { + std::cout << std::setw(ENTRY_SPACE) << (1 << timing.first) << " " + << std::setw(ENTRY_SPACE) << timing.second.execution_timings.at(0).looplength << " " + << std::setw(ENTRY_SPACE) << timing.second.maxMinCalculationTime << " " + << std::setw(ENTRY_SPACE) << timing.second.maxCalcBW + << std::endl; } + + std::cout << std::endl << "b_eff = " << results.at("b_eff") << std::endl; } std::unique_ptr diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 0fdf8064..2d8e9ee9 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -64,11 +64,26 @@ namespace network { std::vector calculationTimings; }; + struct ExecutionResult { + std::vector execution_timings; + /** + * @brief maximum of minimum calculation time, filled by collectResults + * + */ + double maxMinCalculationTime; + + /** + * @brief maximum of calculated bandwidths, filled by collectResults + * + */ + double maxCalcBW; + }; + /** * @brief The data structure used to store all measurement results * */ - typedef std::map>>> CollectedResultMap; + typedef std::map CollectedTimingsMap; /** * @brief The Network benchmark specific program settings @@ -194,26 +209,11 @@ class NetworkData { }; -/** - * @brief Measured execution timing from the kernel execution - * - */ -class NetworkExecutionTimings { -public: - - /** - * @brief A vector containing the timings for all repetitions for the kernel execution - * - */ - CollectedResultMap timings; - -}; - /** * @brief Implementation of the Network benchmark * */ -class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark { +class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark { protected: @@ -227,6 +227,31 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark timings_json; + for (const auto& execution_timing: timing.second.execution_timings) { + json single_timing_json; + single_timing_json["looplength"] = execution_timing.looplength; + single_timing_json["messageSize"] = execution_timing.messageSize; + single_timing_json["timings"] = execution_timing.calculationTimings; + timings_json.push_back(single_timing_json); + } + timing_json["timings"] = timings_json; + + j[std::to_string(timing.first)] = timing_json; + } + return j; + } + /** * 
@brief Network specific implementation of the data generation * @@ -241,7 +266,7 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark Measured runtimes of the kernel execution */ - std::unique_ptr + void executeKernel(NetworkData &data) override; /** @@ -259,7 +284,10 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(1,1)); - auto result = bm->executeKernel(*data); - EXPECT_NE(result->timings.end(), result->timings.find(1)); - EXPECT_EQ(1, result->timings.find(1)->second->at(0)->looplength); - EXPECT_EQ(1, result->timings.find(1)->second->at(0)->calculationTimings.size()); + bm->executeKernel(*data); + EXPECT_NE(bm->collected_timings.end(), bm->collected_timings.find(1)); + EXPECT_EQ(1, bm->collected_timings.find(1)->second.execution_timings.at(0).looplength); + EXPECT_EQ(1, bm->collected_timings.find(1)->second.execution_timings.at(0).calculationTimings.size()); } /** @@ -64,10 +64,10 @@ TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { bm->getExecutionSettings().programSettings->numRepetitions = 2; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(8,4)); - auto result = bm->executeKernel(*data); - EXPECT_NE(result->timings.end(), result->timings.find(8)); - EXPECT_EQ(4, result->timings.find(8)->second->at(0)->looplength); - EXPECT_EQ(2, result->timings.find(8)->second->at(0)->calculationTimings.size()); + bm->executeKernel(*data); + EXPECT_NE(bm->collected_timings.end(), bm->collected_timings.find(8)); + EXPECT_EQ(4, bm->collected_timings.find(8)->second.execution_timings.at(0).looplength); + EXPECT_EQ(2, bm->collected_timings.find(8)->second.execution_timings.at(0).calculationTimings.size()); } /** @@ -82,7 +82,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -110,7 +110,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize, looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -135,7 +135,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwo const unsigned looplength = 1; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -160,7 +160,7 @@ TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - 
auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[messageSize * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -180,7 +180,7 @@ TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE cvalue = static_cast(messageSize & 255); EXPECT_EQ(cvalue, data->items[0].validationBuffer[0]); bool all_same = true; @@ -195,7 +195,7 @@ TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE cvalue = static_cast(messageSize & 255); EXPECT_EQ(cvalue, data->items[0].validationBuffer[0]); bool all_same = true; @@ -210,7 +210,7 @@ TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); } @@ -219,7 +219,7 @@ TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { const unsigned looplength = 1; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); } @@ -228,7 +228,7 @@ TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { const unsigned looplength = 1; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); EXPECT_EQ(looplength * CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); } @@ -268,7 +268,7 @@ TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { const unsigned looplength = 4; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); } @@ -281,7 +281,7 @@ TEST_P(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExec data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); } @@ -291,7 +291,7 @@ TEST_P(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); data->items[1].validationBuffer[0] = static_cast(0); EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); } diff --git 
a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 68127eab..3d560ce4 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -527,6 +527,11 @@ class HpccFpgaBenchmark {
         return results_string;
     }
 
+    // override for special benchmarks like b_eff
+    virtual json getTimingsJson() {
+        return timings;
+    }
+
     std::map<std::string, std::string>
     getEnvironmentMap() {
         std::map<std::string, std::string> env;
@@ -584,7 +589,7 @@ class HpccFpgaBenchmark {
         dump["version"] = VERSION;
         dump["device"] = executionSettings->getDeviceName();
         dump["settings"] = jsonifySettingsMap(executionSettings->programSettings->getSettingsMap());
-        dump["timings"] = timings;
+        dump["timings"] = getTimingsJson();
         dump["results"] = getResultsJson();
         dump["environment"] = getEnvironmentMap();
 
From a99cca1c4774690b0b7d835dcd6a62334d30ceb5 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Thu, 10 Nov 2022 19:33:19 +0100
Subject: [PATCH 211/318] add units to timings dump

---
 b_eff/src/host/network_benchmark.hpp |  9 ++++++++-
 shared/include/hpcc_benchmark.hpp    | 13 ++++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index 2d8e9ee9..d86c2f61 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -242,7 +242,14 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark<NetworkProgramSettings, NetworkData> {
                 json single_timing_json;
                 single_timing_json["looplength"] = execution_timing.looplength;
                 single_timing_json["messageSize"] = execution_timing.messageSize;
-                single_timing_json["timings"] = execution_timing.calculationTimings;
+                std::vector<json> calculation_timings;
+                for (const auto& timing: execution_timing.calculationTimings) {
+                    json j;
+                    j["unit"] = "s";
+                    j["value"] = timing;
+                    calculation_timings.push_back(j);
+                }
+                single_timing_json["timings"] = calculation_timings;
                 timings_json.push_back(single_timing_json);
             }
             timing_json["timings"] = timings_json;
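For a single message size, the override above yields entries of roughly the following shape (a sketch with made-up values, here for one rank and two repetitions):

    "10": {
        "timings": [
            {
                "looplength": 512,
                "messageSize": 10,
                "timings": [
                    { "unit": "s", "value": 0.00213 },
                    { "unit": "s", "value": 0.00209 }
                ]
            }
        ]
    }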
validation") ("h,help", "Print this help"); From 3bd17f8d2a7bd538ff33fc4412e5e030a0ea7f0e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 16 Nov 2022 18:35:18 +0100 Subject: [PATCH 213/318] Fix output parser for GEMM, RA, STREAM, HPL --- scripts/evaluation/parse_raw_to_csv.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index 03dfa2f4..833d5391 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -10,12 +10,12 @@ # Regular expressions for the raw output of all fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\n(.*\n)FFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" +gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" +ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+total\\s\\[s\\]\\s+transfer\\s\\[s\\]\\s+calc\\s\\[s\\]\\s+calc\\s+FLOPS\\s+Mem\\s+\\[B/s\\]\\s+PCIe\\s+\\[B/s\\]\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)" -stream_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(\\s*\n)\\s+GEFA\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(\\s*\n)\\s+GESL\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" - +stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\s+Avg\\stime\\s+Min\\stime\\s+Max\\stime\n\\s+PCI_write\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+PCI_read\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" +linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)(\\s+.+\\s*\n)\\s+GEFA\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)(\\s+.+\\s*\n)\\s+GESL\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" + def parse_network(file_content): ''' From 9f99017c590e5ddbf56069933dd01364675d7a17 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 23 Nov 2022 16:03:14 +0100 Subject: [PATCH 214/318] fix map:at panics --- FFT/src/host/fft_benchmark.cpp | 2 +- PTRANS/src/host/transpose_benchmark.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index ff0710ef..4bfed7d2 100644 --- 
a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -123,7 +123,7 @@ fft::FFTBenchmark::printResults() { std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << std::setw(ENTRY_SPACE) << results.at("t_avg") << std::setw(ENTRY_SPACE) << results.at("t_min") << std::endl; std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << std::setw(ENTRY_SPACE) << results.at("gflops_avg") - << std::setw(ENTRY_SPACE) << results.at("gflop_min") << std::endl; + << std::setw(ENTRY_SPACE) << results.at("gflops_min") << std::endl; } std::unique_ptr diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 1c2682e3..0a7e6bc7 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -144,7 +144,7 @@ transpose::TransposeBenchmark::printResults() { << std::endl; std::cout << "best: " << results.at("min_t") << " " << results.at("min_transfer_t") - << " " << results.at("min_calculation_t") + << " " << results.at("min_calc_t") << " " << results.at("max_calc_flops") << " " << results.at("max_mem_bandwidth") << " " << results.at("max_transfer_bandwidth") From 2231e19f0df04f3c87b99bfd7e861958489fad9f Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 23 Nov 2022 21:00:54 +0100 Subject: [PATCH 215/318] add test for json dump feature --- shared/tests/hpcc_base_benchmark_test.cpp | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp index c3cc7c2f..b6378840 100644 --- a/shared/tests/hpcc_base_benchmark_test.cpp +++ b/shared/tests/hpcc_base_benchmark_test.cpp @@ -8,6 +8,7 @@ #include "test_program_settings.h" #include "gmock/gmock.h" #include "hpcc_benchmark.hpp" +#include "nlohmann/json.hpp" // Dirty GoogleTest and static library hack @@ -264,3 +265,31 @@ TEST(SetupTest, BenchmarkSetupFails) { delete [] tmp_argv; delete [] name_str; } + +using json = nlohmann::json; + +/** + * + * Check if dump-json flag produces valid json output + */ +TEST(SetupTest, BenchmarkJsonDump) { + std::unique_ptr bm = std::unique_ptr(new MinimalBenchmark()); + bm->setupBenchmark(global_argc, global_argv); + bm->getExecutionSettings().programSettings->dumpfilePath = "out.json"; + bm->executeBenchmark(); + std::FILE *f = std::fopen("out.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + // json::parse will panic if f is nullptr + json j = json::parse(f); + // check if the expected keys are there + EXPECT_TRUE(j.contains("config_time")); + EXPECT_TRUE(j.contains("device")); + EXPECT_TRUE(j.contains("environment")); + EXPECT_TRUE(j.contains("git_commit")); + EXPECT_TRUE(j.contains("results")); + EXPECT_TRUE(j.contains("settings")); + EXPECT_TRUE(j.contains("timings")); + EXPECT_TRUE(j.contains("version")); + } +} From d52aca158a42a33c6e44f17636e22ff6e94a9307 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 25 Nov 2022 14:23:50 +0100 Subject: [PATCH 216/318] Fix raw parsing for FFT and PTRANS --- PTRANS/src/host/transpose_benchmark.cpp | 14 +++++++------- scripts/evaluation/parse_raw_to_csv.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 0a7e6bc7..9b37f162 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -124,17 +124,17 @@ transpose::TransposeBenchmark::collectResults() { results.emplace("avg_t", hpcc_base::HpccResult(avgCalculationTime + avgTransferTime, 
"s")); results.emplace("min_t", hpcc_base::HpccResult(minCalculationTime + minTransferTime, "s")); - results.emplace("avg_calc_flops", hpcc_base::HpccResult(flops / avgCalculationTime, "GFLOP/s")); - results.emplace("max_calc_flops", hpcc_base::HpccResult(flops / minCalculationTime, "GFLOP/s")); - results.emplace("avg_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime, "GB/s")); - results.emplace("max_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime, "GB/s")); - results.emplace("avg_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime, "GB/s")); - results.emplace("max_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime, "GB/s")); + results.emplace("avg_calc_flops", hpcc_base::HpccResult(flops / avgCalculationTime * 1.0e9, "GFLOP/s")); + results.emplace("max_calc_flops", hpcc_base::HpccResult(flops / minCalculationTime * 1.0e9, "GFLOP/s")); + results.emplace("avg_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime * 1.0e9, "GB/s")); + results.emplace("max_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime * 1.0e9, "GB/s")); + results.emplace("avg_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime * 1.0e9, "GB/s")); + results.emplace("max_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime * 1.0e9, "GB/s")); } void transpose::TransposeBenchmark::printResults() { - std::cout << " total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s]" << std::endl; + std::cout << " total time transfer time calc time calc FLOPS Memory Bandwidth PCIe Bandwidth" << std::endl; std::cout << "avg: " << results.at("avg_t") << " " << results.at("avg_transfer_t") << " " << results.at("avg_calc_t") diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index 833d5391..e5306dc7 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -9,10 +9,10 @@ import sys # Regular expressions for the raw output of all -fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\n(.*\n)FFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" +fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\n(.*\n)FFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" ra_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" -trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+total\\s\\[s\\]\\s+transfer\\s\\[s\\]\\s+calc\\s\\[s\\]\\s+calc\\s+FLOPS\\s+Mem\\s+\\[B/s\\]\\s+PCIe\\s+\\[B/s\\]\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)" +trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+total\\s+time\\s+transfer\\s+time\\s+calc\\s+time\\s+calc\\s+FLOPS\\s+Memory\\s+Bandwidth\\s+PCIe\\s+Bandwidth\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e|inf)+)\\s+.+\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e|inf)+)" stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\s+Avg\\stime\\s+Min\\stime\\s+Max\\stime\n\\s+PCI_write\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+PCI_read\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)(\\s+.+\\s*\n)\\s+GEFA\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)(\\s+.+\\s*\n)\\s+GESL\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" From 6c09f06627a91c43e6b2ae35136de760663e7764 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 25 Nov 2022 14:26:21 +0100 Subject: [PATCH 217/318] Fix metric conversion --- PTRANS/src/host/transpose_benchmark.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 9b37f162..decc9b85 100644 --- 
a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -124,12 +124,12 @@ transpose::TransposeBenchmark::collectResults() { results.emplace("avg_t", hpcc_base::HpccResult(avgCalculationTime + avgTransferTime, "s")); results.emplace("min_t", hpcc_base::HpccResult(minCalculationTime + minTransferTime, "s")); - results.emplace("avg_calc_flops", hpcc_base::HpccResult(flops / avgCalculationTime * 1.0e9, "GFLOP/s")); - results.emplace("max_calc_flops", hpcc_base::HpccResult(flops / minCalculationTime * 1.0e9, "GFLOP/s")); - results.emplace("avg_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime * 1.0e9, "GB/s")); - results.emplace("max_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime * 1.0e9, "GB/s")); - results.emplace("avg_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime * 1.0e9, "GB/s")); - results.emplace("max_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime * 1.0e9, "GB/s")); + results.emplace("avg_calc_flops", hpcc_base::HpccResult(flops / avgCalculationTime * 1.0e-9, "GFLOP/s")); + results.emplace("max_calc_flops", hpcc_base::HpccResult(flops / minCalculationTime * 1.0e-9, "GFLOP/s")); + results.emplace("avg_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime * 1.0e-9, "GB/s")); + results.emplace("max_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime * 1.0e-9, "GB/s")); + results.emplace("avg_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime * 1.0e-9, "GB/s")); + results.emplace("max_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime * 1.0e-9, "GB/s")); } void From 1e3dc1e2a7790d9463c66071b5f2c765078045cf Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 25 Nov 2022 14:35:01 +0100 Subject: [PATCH 218/318] Fix unit tests for output parsing --- PTRANS/tests/test_host_functionality.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PTRANS/tests/test_host_functionality.cpp b/PTRANS/tests/test_host_functionality.cpp index c65019a6..1c671f2b 100644 --- a/PTRANS/tests/test_host_functionality.cpp +++ b/PTRANS/tests/test_host_functionality.cpp @@ -39,7 +39,7 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatHeader) { std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex("(\\s+)total\\s\\[s\\](\\s+)transfer\\s\\[s\\](\\s+)calc\\s\\[s\\](\\s+)calc\\sFLOPS(\\s+)Mem\\s\\[B/s\\](\\s+)PCIe\\s\\[B/s\\]\n.*")); + ::testing::MatchesRegex("(\\s+)total\\stime(\\s+)transfer\\stime(\\s+)calc\\s+time(\\s+)calc\\sFLOPS(\\s+)Memory\\sBandwidth(\\s+)PCIe\\sBandwidth\n.*")); } /** @@ -66,7 +66,7 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatValues) { std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex(".*\navg:\\s+2\\.00000e\\+00\\s+1\\.00000e\\+00\\s+1\\.00000e\\+00.*\n.*\n")); + ::testing::MatchesRegex(".*\navg:\\s+2\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s.*\n.*\n")); } /** From 9ad73dbefad83aa3f36278f63ba729b3acb24694 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 5 Dec 2022 10:29:28 +0100 Subject: [PATCH 219/318] add json tests for all benchmarks --- FFT/tests/test_fft_functionality.cpp | 27 +++++++++- ...nel_functionality_and_host_integration.cpp | 24 +++++++++ 
...nel_functionality_and_host_integration.cpp | 33 +++++++++++- ...nel_functionality_and_host_integration.cpp | 34 ++++++++++++- ...nel_functionality_and_host_integration.cpp | 25 ++++++++- ...nel_functionality_and_host_integration.cpp | 51 ++++++++++++++++++- b_eff/src/host/network_benchmark.hpp | 2 +- ...nel_functionality_and_host_integration.cpp | 34 +++++++++++++ 8 files changed, 224 insertions(+), 6 deletions(-) diff --git a/FFT/tests/test_fft_functionality.cpp b/FFT/tests/test_fft_functionality.cpp index f5818814..df5dba9f 100644 --- a/FFT/tests/test_fft_functionality.cpp +++ b/FFT/tests/test_fft_functionality.cpp @@ -6,6 +6,7 @@ #include "fft_benchmark.hpp" #include "parameters.h" #include "test_program_settings.h" +#include "nlohmann/json.hpp" struct FFTHostTest : testing::Test { @@ -119,4 +120,28 @@ TEST_F(FFTHostTest, FFTandiFFTProduceResultCloseToSource) { for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { EXPECT_NEAR(std::abs(data->data[i]), std::abs(verify_data->data[i]), 0.001); } -} \ No newline at end of file +} + +using json = nlohmann::json; + +TEST_F(FFTHostTest, JsonDump) { + bm->executeKernel(*data); + bm->collectResults(); + bm->dumpConfigurationAndResults("fft.json"); + std::FILE *f = std::fopen("fft.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + json j = json::parse(f); + EXPECT_TRUE(j.contains("timings")); + if (j.contains("timings")) { + EXPECT_TRUE(j["timings"].contains("calculation")); + } + EXPECT_TRUE(j.contains("results")); + if (j.contains("results")) { + EXPECT_TRUE(j["results"].contains("gflops_avg")); + EXPECT_TRUE(j["results"].contains("gflops_min")); + EXPECT_TRUE(j["results"].contains("t_avg")); + EXPECT_TRUE(j["results"].contains("t_min")); + } + } +} diff --git a/GEMM/tests/test_kernel_functionality_and_host_integration.cpp b/GEMM/tests/test_kernel_functionality_and_host_integration.cpp index c3d9723e..41ead85e 100755 --- a/GEMM/tests/test_kernel_functionality_and_host_integration.cpp +++ b/GEMM/tests/test_kernel_functionality_and_host_integration.cpp @@ -7,6 +7,7 @@ #include "gemm_benchmark.hpp" #include "parameters.h" #include "test_program_settings.h" +#include "nlohmann/json.hpp" void ref_matmul(HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, HOST_DATA_TYPE* C, int size) { @@ -179,6 +180,29 @@ TEST_P(GEMMKernelTest, FPGACorrectbetaCplusalphaAB) { } } +using json = nlohmann::json; + +TEST_P(GEMMKernelTest, JsonDump) { + bm->executeKernel(*data); + bm->collectResults(); + bm->dumpConfigurationAndResults("gemm.json"); + std::FILE *f = std::fopen("gemm.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + json j = json::parse(f); + EXPECT_TRUE(j.contains("timings")); + if (j.contains("timings")) { + EXPECT_TRUE(j["timings"].contains("execution")); + } + EXPECT_TRUE(j.contains("results")); + if (j.contains("results")) { + EXPECT_TRUE(j["results"].contains("gflops")); + EXPECT_TRUE(j["results"].contains("t_mean")); + EXPECT_TRUE(j["results"].contains("t_min")); + } + } +} + INSTANTIATE_TEST_CASE_P(Default, GEMMKernelTest, testing::Values(1,2)); diff --git a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp index 77c0fd70..2dbd21f0 100644 --- a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp +++ b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp @@ -5,6 +5,7 @@ #include "parameters.h" #include "test_program_settings.h" #include "linpack_benchmark.hpp" +#include "nlohmann/json.hpp" #ifdef _LAPACK_ #ifdef _DP @@ -94,7 +95,37 @@ 
TEST_P(LinpackKernelTest, DISABLED_ValidationWorksForMKL) {
 #endif
+using json = nlohmann::json;
+
+TEST_P(LinpackKernelTest, JsonDump) {
+ bm->executeKernel(*data);
+ bm->collectResults();
+ bm->dumpConfigurationAndResults("linpack.json");
+ std::FILE *f = std::fopen("linpack.json", "r");
+ EXPECT_NE(f, nullptr);
+ if (f != nullptr) {
+ json j = json::parse(f);
+ EXPECT_TRUE(j.contains("timings"));
+ if (j.contains("timings")) {
+ EXPECT_TRUE(j["timings"].contains("gefa"));
+ EXPECT_TRUE(j["timings"].contains("gesl"));
+ }
+ EXPECT_TRUE(j.contains("results"));
+ if (j.contains("results")) {
+ EXPECT_TRUE(j["results"].contains("gflops"));
+ EXPECT_TRUE(j["results"].contains("gflops_lu"));
+ EXPECT_TRUE(j["results"].contains("gflops_sl"));
+ EXPECT_TRUE(j["results"].contains("t_mean"));
+ EXPECT_TRUE(j["results"].contains("t_min"));
+ EXPECT_TRUE(j["results"].contains("tlu_mean"));
+ EXPECT_TRUE(j["results"].contains("tlu_min"));
+ EXPECT_TRUE(j["results"].contains("tsl_mean"));
+ EXPECT_TRUE(j["results"].contains("tsl_min"));
+ }
+ }
+}
+
 INSTANTIATE_TEST_CASE_P(
 LinpackKernelParametrizedTests, LinpackKernelTest,
- ::testing::Values(1, 2, 3));
\ No newline at end of file
+ ::testing::Values(1, 2, 3));
diff --git a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp
index 985a0698..eaff5c42 100644
--- a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp
@@ -6,7 +6,7 @@
 #include "gtest/gtest.h"
 #include "parameters.h"
 #include "test_program_settings.h"
-
+#include "nlohmann/json.hpp"
 struct TransposeKernelTest : testing::Test {
 std::shared_ptr data;
@@ -204,3 +204,35 @@ TEST_F(TransposeKernelTest, FPGATimingsMeasuredForEveryIteration) {
 }
 }
+using json = nlohmann::json;
+
+TEST_F(TransposeKernelTest, JsonDump) {
+ bm->executeKernel(*data);
+ bm->collectResults();
+ bm->dumpConfigurationAndResults("ptrans.json");
+ std::FILE *f = std::fopen("ptrans.json", "r");
+ EXPECT_NE(f, nullptr);
+ if (f != nullptr) {
+ json j = json::parse(f);
+ EXPECT_TRUE(j.contains("timings"));
+ if (j.contains("timings")) {
+ EXPECT_TRUE(j["timings"].contains("calculation"));
+ EXPECT_TRUE(j["timings"].contains("transfer"));
+ }
+ EXPECT_TRUE(j.contains("results"));
+ if (j.contains("results")) {
+ EXPECT_TRUE(j["results"].contains("avg_calc_flops"));
+ EXPECT_TRUE(j["results"].contains("avg_calc_t"));
+ EXPECT_TRUE(j["results"].contains("avg_mem_bandwidth"));
+ EXPECT_TRUE(j["results"].contains("avg_t"));
+ EXPECT_TRUE(j["results"].contains("avg_transfer_bandwidth"));
+ EXPECT_TRUE(j["results"].contains("avg_transfer_t"));
+ EXPECT_TRUE(j["results"].contains("max_calc_flops"));
+ EXPECT_TRUE(j["results"].contains("max_mem_bandwidth"));
+ EXPECT_TRUE(j["results"].contains("max_transfer_bandwidth"));
+ EXPECT_TRUE(j["results"].contains("min_calc_t"));
+ EXPECT_TRUE(j["results"].contains("min_t"));
+ EXPECT_TRUE(j["results"].contains("min_transfer_t"));
+ }
+ }
+}
diff --git a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
index 0cb30dd1..35c9f229 100644
--- a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
@@ -5,7 +5,7 @@
 #include "parameters.h"
 #include "random_access_benchmark.hpp"
 #include "test_program_settings.h"
-
+#include "nlohmann/json.hpp"
 struct RandomAccessKernelTest : testing::Test {
 std::unique_ptr data;
@@ -49,3 +49,26 @@ TEST_F(RandomAccessKernelTest, FPGAErrorBelow1Percent) {
 bool success = bm->validateOutputAndPrintError(*data);
 EXPECT_TRUE(success);
 }
+
+using json = nlohmann::json;
+
+TEST_F(RandomAccessKernelTest, JsonDump) {
+ bm->executeKernel(*data);
+ bm->collectResults();
+ bm->dumpConfigurationAndResults("random_access.json");
+ std::FILE *f = std::fopen("random_access.json", "r");
+ EXPECT_NE(f, nullptr);
+ if (f != nullptr) {
+ json j = json::parse(f);
+ EXPECT_TRUE(j.contains("timings"));
+ if (j.contains("timings")) {
+ EXPECT_TRUE(j["timings"].contains("execution"));
+ }
+ EXPECT_TRUE(j.contains("results"));
+ if (j.contains("results")) {
+ EXPECT_TRUE(j["results"].contains("guops"));
+ EXPECT_TRUE(j["results"].contains("t_mean"));
+ EXPECT_TRUE(j["results"].contains("t_min"));
+ }
+ }
+}
diff --git a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp
index ec78075e..1aae4c2a 100644
--- a/STREAM/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/STREAM/tests/test_kernel_functionality_and_host_integration.cpp
@@ -5,7 +5,7 @@
 #include "parameters.h"
 #include "test_program_settings.h"
 #include "stream_benchmark.hpp"
-
+#include "nlohmann/json.hpp"
 struct StreamKernelTest :public ::testing::Test {
 std::shared_ptr data;
@@ -49,3 +49,52 @@ TEST_F(StreamKernelTest, FPGACorrectResultsThreeRepetition) {
 EXPECT_FLOAT_EQ(data->C[i], 1800.0);
 }
 }
+
+using json = nlohmann::json;
+
+TEST_F(StreamKernelTest, JsonDump) {
+ bm->executeKernel(*data);
+ bm->collectResults();
+ bm->dumpConfigurationAndResults("stream.json");
+ std::FILE *f = std::fopen("stream.json", "r");
+ EXPECT_NE(f, nullptr);
+ if (f != nullptr) {
+ json j = json::parse(f);
+ EXPECT_TRUE(j.contains("timings"));
+ if (j.contains("timings")) {
+ EXPECT_TRUE(j["timings"].contains("Add"));
+ EXPECT_TRUE(j["timings"].contains("Copy"));
+ EXPECT_TRUE(j["timings"].contains("PCI_read"));
+ EXPECT_TRUE(j["timings"].contains("PCI_write"));
+ EXPECT_TRUE(j["timings"].contains("Scale"));
+ EXPECT_TRUE(j["timings"].contains("Triad"));
+ }
+ EXPECT_TRUE(j.contains("results"));
+ if (j.contains("results")) {
+ EXPECT_TRUE(j["results"].contains("Add_avg_t"));
+ EXPECT_TRUE(j["results"].contains("Add_best_rate"));
+ EXPECT_TRUE(j["results"].contains("Add_max_t"));
+ EXPECT_TRUE(j["results"].contains("Add_min_t"));
+ EXPECT_TRUE(j["results"].contains("Copy_avg_t"));
+ EXPECT_TRUE(j["results"].contains("Copy_best_rate"));
+ EXPECT_TRUE(j["results"].contains("Copy_max_t"));
+ EXPECT_TRUE(j["results"].contains("Copy_min_t"));
+ EXPECT_TRUE(j["results"].contains("PCI_read_avg_t"));
+ EXPECT_TRUE(j["results"].contains("PCI_read_best_rate"));
+ EXPECT_TRUE(j["results"].contains("PCI_read_max_t"));
+ EXPECT_TRUE(j["results"].contains("PCI_read_min_t"));
+ EXPECT_TRUE(j["results"].contains("PCI_write_avg_t"));
+ EXPECT_TRUE(j["results"].contains("PCI_write_best_rate"));
+ EXPECT_TRUE(j["results"].contains("PCI_write_max_t"));
+ EXPECT_TRUE(j["results"].contains("PCI_write_min_t"));
+ EXPECT_TRUE(j["results"].contains("Scale_avg_t"));
+ EXPECT_TRUE(j["results"].contains("Scale_best_rate"));
+ EXPECT_TRUE(j["results"].contains("Scale_max_t"));
+ EXPECT_TRUE(j["results"].contains("Scale_min_t"));
+ EXPECT_TRUE(j["results"].contains("Triad_avg_t"));
+ EXPECT_TRUE(j["results"].contains("Triad_best_rate"));
+ EXPECT_TRUE(j["results"].contains("Triad_max_t"));
+ EXPECT_TRUE(j["results"].contains("Triad_min_t"));
+ }
+ }
+}
diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index d86c2f61..e1b77bc9 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -247,7 +247,7 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmarkvalidateOutputAndPrintError(*data)); }
+TEST_P(NetworkKernelTest, JsonDump) {
+ data->items.clear();
+ data->items.push_back(network::NetworkData::NetworkDataItem(8,4));
+ bm->executeKernel(*data);
+ bm->collectResults();
+ bm->dumpConfigurationAndResults("b_eff.json");
+ std::FILE *f = std::fopen("b_eff.json", "r");
+ EXPECT_NE(f, nullptr);
+ if (f != nullptr) {
+ json j = json::parse(f);
+ EXPECT_TRUE(j.contains("timings"));
+ if (j.contains("timings")) {
+ EXPECT_TRUE(j["timings"].size() > 0);
+ if (j["timings"].size() > 0) {
+ for (const auto& timing: j["timings"].items()) {
+ EXPECT_TRUE(timing.value().contains("maxCalcBW"));
+ EXPECT_TRUE(timing.value().contains("maxMinCalculationTime"));
+ EXPECT_TRUE(timing.value().contains("timings"));
+ if (timing.value().contains("timings")) {
+ for (const auto& timing: timing.value()["timings"]) {
+ EXPECT_TRUE(timing.contains("looplength"));
+ EXPECT_TRUE(timing.contains("messageSize"));
+ EXPECT_TRUE(timing.contains("timings"));
+ }
+ }
+ }
+ }
+ }
+ EXPECT_TRUE(j.contains("results"));
+ if (j.contains("results")) {
+ EXPECT_TRUE(j["results"].contains("b_eff"));
+ }
+ }
+}
 INSTANTIATE_TEST_CASE_P(

From fcdab10ba8295c93b6291772935bfa940900df28 Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Mon, 5 Dec 2022 10:29:52 +0100
Subject: [PATCH 220/318] add documentation for json-dump feature

---
 README.md | 1 +
 .../Host Input Parameters/index.rst | 6 +-
 .../json_output/available_keys.csv | 58 ++++
 .../technical_support/json_output/index.rst | 284 ++++++++++++++++++
 4 files changed, 348 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/technical_support/json_output/available_keys.csv
 create mode 100644 docs/source/technical_support/json_output/index.rst

diff --git a/README.md b/README.md
index 1f830da8..1814c5a0 100755
--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@ Moreover, additional libraries are fetched by the build system during configuration
 - [cxxopts](https://github.com/jarro2783/cxxopts) for option parsing
 - [hlslib](https://github.com/definelicht/hlslib) for CMake FindPackages
 - [Googletest](https://github.com/google/googletest) for unit testing
+- [json](https://github.com/nlohmann/json) for JSON output

 These dependencies will be downloaded automatically when configuring a benchmark for the first time. The exact versions that are used can be found in the `CMakeLists.txt` located in the `extern` directory where all extern dependencies are defined.
diff --git a/docs/source/technical_support/Host Input Parameters/index.rst b/docs/source/technical_support/Host Input Parameters/index.rst
index 50121964..550e8f19 100644
--- a/docs/source/technical_support/Host Input Parameters/index.rst
+++ b/docs/source/technical_support/Host Input Parameters/index.rst
@@ -1,3 +1,4 @@
+.. _execution:
 ========================
 Execution of a Benchmark
 ========================
@@ -46,9 +47,12 @@ Input parameters (or options) can be appended to the host execution call like this
``--comm-type COMM``:
 This parameter chooses the communication strategy which will be used. Current options are "IEC" for using the Intel External Channel, "PCIE" for using the host-to-host communication, and "CPU" for calculating on the CPU.

+``--dump-json PATH``:
+ This parameter enables dumping of the benchmark configuration, settings, timings, and results in a machine-readable JSON format. PATH specifies the file the dump is written to. If the option is not given, no dump is created.
+
``--test``:
 This option will also skip the execution of the benchmark. It can be used to test different data generation schemes or the benchmark summary before the actual execution. Please note that the host will exit with a non-zero exit code because it will not be able to validate the output.

Additionally, every benchmark will have several options to define the size and type of the used input data.
-These options vary between the benchmarks. An easy way to find out more about these options is to use the ``-h`` option with the host.
\ No newline at end of file
+These options vary between the benchmarks. An easy way to find out more about these options is to use the ``-h`` option with the host.
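+
+For illustration, a hypothetical host call that combines some of the options above could look like the following sketch. The binary name, bitstream file, repetition count, and output path are placeholders and depend on the benchmark and the build configuration:
+
+.. code-block:: bash
+
+   ./Transpose_intel -f ./transpose_emulate.aocx -n 10 --comm-type PCIE --dump-json ptrans.json
+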
diff --git a/docs/source/technical_support/json_output/available_keys.csv b/docs/source/technical_support/json_output/available_keys.csv
new file mode 100644
index 00000000..070a21da
--- /dev/null
+++ b/docs/source/technical_support/json_output/available_keys.csv
@@ -0,0 +1,58 @@
+Benchmark,timings,results,settings
+:ref:`FFT `,calculation,gflops_avg,FFT Size
+,,gflops_min,Batch Size
+,,t_avg,
+,,t_min,
+:ref:`GEMM `,execution,gflops,Matrix Size
+,,t_mean,Replicate Inputs
+,,t_min,
+:ref:`LINPACK `,gefa,gflops,Matrix Size
+,gesl,gflops_lu,Block Size
+,,gflops_sl,Emulate
+,,t_mean,Data Type
+,,t_min,FPGA Torus
+,,tlu_mean,
+,,tlu_min,
+,,tsl_mean,
+,,tsl_min,
+:ref:`PTRANS `,calculation,avg_calc_flops,Matrix Size
+,transfer,avg_calc_t,Block Size
+,,avg_mem_bandwidth,Dist. Buffers
+,,avg_t,Data Handler
+,,avg_transfer_bandwidth,
+,,avg_transfer_t,
+,,max_calc_flops,
+,,max_mem_bandwidth,
+,,max_transfer_bandwidth,
+,,min_calc_t,
+,,min_t,
+,,min_transfer_t,
+:ref:`RandomAccess `,execution,guops,Array Size
+,,t_mean,#RNGs
+,,t_min,
+:ref:`STREAM `,Add,Add_avg_t,Data Type
+,,Add_best_rate,Array Size
+,,Add_max_t,Kernel Type
+,,Add_min_t,
+,Copy,Copy_avg_t,
+,,Copy_best_rate,
+,,Copy_max_t,
+,,Copy_min_t,
+,PCI_read,PCI_read_avg_t,
+,,PCI_read_best_rate,
+,,PCI_read_max_t,
+,,PCI_read_min_t,
+,PCI_write,PCI_write_avg_t,
+,,PCI_write_best_rate,
+,,PCI_write_max_t,
+,,PCI_write_min_t,
+,Scale,Scale_avg_t,
+,,Scale_best_rate,
+,,Scale_max_t,
+,,Scale_min_t,
+,Triad,Triad_avg_t,
+,,Triad_best_rate,
+,,Triad_max_t,
+,,Triad_min_t,
+:ref:`b_eff `,**special syntax - see below**,b_eff,Loop Length
+,,,Message Sizes
diff --git a/docs/source/technical_support/json_output/index.rst b/docs/source/technical_support/json_output/index.rst
new file mode 100644
index 00000000..37aa1f68
--- /dev/null
+++ b/docs/source/technical_support/json_output/index.rst
@@ -0,0 +1,284 @@
+===========
+JSON Output
+===========
+
+The output of the configuration, settings, timings, and results in machine-readable JSON format can be enabled as described in :ref:`Execution of a Benchmark `.
+
+When enabled, this creates a JSON file that contains some information for every benchmark. In the following example the benchmark-specific entries are left out, so the keys shown here are the same for all benchmarks.
+
+.. code-block:: javascript
+
+   {
+   "config_time": "Mon Dec 05 15:09:08 UTC 2022",
+   "device": "Intel(R) FPGA Emulation Device",
+   "environment": {
+   "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib"
+   },
+   "git_commit": "c7f3890-dirty",
+   "mpi": {
+   "subversion": 1,
+   "version": 3
+   },
+   "name": "effective bandwidth",
+   "results": {
+   },
+   "settings": {
+   "Communication Type": "IEC",
+   "Kernel File": "./communication_bw520n_IEC_emulate.aocx",
+   "Kernel Replications": 2,
+   "MPI Ranks": 1,
+   "Repetitions": 10,
+   "Test Mode": "No"
+   },
+   "timings": {
+   },
+   "version": "1.3"
+   }
+
+If a benchmark has additional settings, they are added under the ``settings`` key. Every benchmark can track different categories of timings and different results. The following table shows which keys are available for which benchmark.
+
+.. csv-table:: Available keys
+   :file: available_keys.csv
+   :header-rows: 1
+   :class: longtable
+   :widths: 1 1 1 1
+
+The results and timings are given in a special format, which consists of the value and the unit.
+
+.. code-block:: javascript
+
+   {
+   "results": {
+   "b_eff": {
+   "unit": "B/s",
+   "value": 14806691.755972749
+   }
+   }
+   }
+
+The timings are a vector of all the timings which were measured, except for b_eff, where a special format is used. For every message size used in the benchmark, the interim results are saved in the following way.
+
+..
code-block:: javascript + + { + "6": { + "maxCalcBW": 9225059.007945802, + "maxMinCalculationTime": 5.5501e-05, + "timings": [ + { + "looplength": 4, + "messageSize": 6, + "timings": [ + { + "unit": "s", + "value": 0.008889638 + }, + { + "unit": "s", + "value": 0.000115271 + }, + { + "unit": "s", + "value": 0.000149272 + }, + { + "unit": "s", + "value": 0.000163372 + }, + { + "unit": "s", + "value": 7.5731e-05 + }, + { + "unit": "s", + "value": 5.5501e-05 + }, + { + "unit": "s", + "value": 0.000162132 + }, + { + "unit": "s", + "value": 8.2091e-05 + }, + { + "unit": "s", + "value": 6.7621e-05 + }, + { + "unit": "s", + "value": 0.000126891 + } + ] + } + ] + }, + "7": { + "maxCalcBW": 12222341.581026724, + "maxMinCalculationTime": 8.3781e-05, + "timings": [ + { + "looplength": 4, + "messageSize": 7, + "timings": [ + { + "unit": "s", + "value": 0.000296573 + }, + { + "unit": "s", + "value": 0.000136292 + }, + { + "unit": "s", + "value": 0.000320834 + }, + { + "unit": "s", + "value": 0.000130881 + }, + { + "unit": "s", + "value": 8.3781e-05 + }, + { + "unit": "s", + "value": 0.000247252 + }, + { + "unit": "s", + "value": 0.000430356 + }, + { + "unit": "s", + "value": 0.000281403 + }, + { + "unit": "s", + "value": 0.000421565 + }, + { + "unit": "s", + "value": 0.000266754 + } + ] + } + ] + }, + "8": { + "maxCalcBW": 38030862.93662141, + "maxMinCalculationTime": 5.3851e-05, + "timings": [ + { + "looplength": 4, + "messageSize": 8, + "timings": [ + { + "unit": "s", + "value": 0.000157722 + }, + { + "unit": "s", + "value": 0.000121611 + }, + { + "unit": "s", + "value": 0.000217192 + }, + { + "unit": "s", + "value": 9.7101e-05 + }, + { + "unit": "s", + "value": 6.6931e-05 + }, + { + "unit": "s", + "value": 8.6791e-05 + }, + { + "unit": "s", + "value": 0.000145572 + }, + { + "unit": "s", + "value": 0.000143042 + }, + { + "unit": "s", + "value": 8.5281e-05 + }, + { + "unit": "s", + "value": 5.3851e-05 + } + ] + } + ] + } + } + +A full example for FFT looks like this. + +.. 
code-block:: javascript + + { + "config_time": "Mon Dec 05 17:39:57 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "git_commit": "c7f3890-dirty", + "name": "FFT", + "results": { + "gflops_avg": { + "unit": "GFLOP/s", + "value": 0.27772734580591407 + }, + "gflops_min": { + "unit": "GFLOP/s", + "value": 0.28466663597913383 + }, + "t_avg": { + "unit": "s", + "value": 0.0008848966575 + }, + "t_min": { + "unit": "s", + "value": 0.00086332562 + } + }, + "settings": { + "Batch Size": 100, + "Communication Type": "UNSUPPORTED", + "FFT Size": 4096, + "Kernel File": "fft1d_float_8_emulate.aocx", + "Kernel Replications": 1, + "MPI Ranks": "None", + "Repetitions": 4, + "Test Mode": "No" + }, + "timings": { + "calculation": [ + { + "unit": "s", + "value": 0.090789326 + }, + { + "unit": "s", + "value": 0.086332562 + }, + { + "unit": "s", + "value": 0.090089428 + }, + { + "unit": "s", + "value": 0.086747347 + } + ] + }, + "version": "1.4" + } + From afecda016c1db4d163659c40522e1e212cf8af7c Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Sat, 10 Dec 2022 11:56:22 +0100 Subject: [PATCH 221/318] add errors to json output --- FFT/src/host/execution_default.cpp | 2 +- FFT/src/host/fft_benchmark.cpp | 31 ++++--- FFT/src/host/fft_benchmark.hpp | 9 +- FFT/tests/test_execution_functionality.cpp | 4 +- FFT/tests/test_fft_functionality.cpp | 2 +- GEMM/src/host/gemm_benchmark.cpp | 26 +++--- GEMM/src/host/gemm_benchmark.hpp | 9 +- LINPACK/src/host/linpack_benchmark.cpp | 48 +++++----- 
LINPACK/src/host/linpack_benchmark.hpp | 9 +- .../test_host_reference_implementations.cpp | 9 +- LINPACK/tests/test_kernel_communication.cpp | 3 +- ...nel_functionality_and_host_integration.cpp | 4 +- PTRANS/src/host/transpose_benchmark.cpp | 57 +++++++----- PTRANS/src/host/transpose_benchmark.hpp | 16 +++- PTRANS/tests/test_host_functionality.cpp | 6 +- .../src/host/random_access_benchmark.cpp | 22 +++-- .../src/host/random_access_benchmark.hpp | 9 +- RandomAccess/tests/test_host_code.cpp | 10 +-- ...nel_functionality_and_host_integration.cpp | 4 +- STREAM/src/host/stream_benchmark.cpp | 89 +++++++++++++------ STREAM/src/host/stream_benchmark.hpp | 9 +- b_eff/src/host/network_benchmark.cpp | 20 +++-- b_eff/src/host/network_benchmark.hpp | 15 +++- ...nel_functionality_and_host_integration.cpp | 18 ++-- shared/include/hpcc_benchmark.hpp | 66 ++++++++++---- shared/tests/hpcc_base_benchmark_test.cpp | 10 ++- 26 files changed, 334 insertions(+), 173 deletions(-) diff --git a/FFT/src/host/execution_default.cpp b/FFT/src/host/execution_default.cpp index d0d565da..a1ae245a 100644 --- a/FFT/src/host/execution_default.cpp +++ b/FFT/src/host/execution_default.cpp @@ -212,7 +212,7 @@ namespace bm_execution { } std::map> timings; - timings["calculation"] = calculationTimings; + timings["execution"] = calculationTimings; return timings; } diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index 4bfed7d2..fde7c01c 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -96,7 +96,7 @@ void fft::FFTBenchmark::collectResults() { double gflop = static_cast(5 * (1 << LOG_FFT_SIZE) * LOG_FFT_SIZE) * executionSettings->programSettings->iterations * 1.0e-9 * mpi_comm_size; - uint number_measurements = timings["calculation"].size(); + uint number_measurements = timings["execution"].size(); std::vector avg_measures(number_measurements); #ifdef _USE_MPI_ // Copy the object variable to a local variable to make it accessible to the lambda function @@ -104,7 +104,7 @@ fft::FFTBenchmark::collectResults() { MPI_Reduce(timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); std::for_each(avg_measures.begin(),avg_measures.end(), [mpi_size](double& x) {x /= mpi_size;}); #else - std::copy(timings["calculation"].begin(), timings["calculation"].end(), avg_measures.begin()); + std::copy(timings["execution"].begin(), timings["execution"].end(), avg_measures.begin()); #endif if (mpi_comm_rank == 0) { double minTime = *min_element(avg_measures.begin(), avg_measures.end()); @@ -118,12 +118,10 @@ fft::FFTBenchmark::collectResults() { void fft::FFTBenchmark::printResults() { - std::cout << std::setw(ENTRY_SPACE) << " " << std::setw(ENTRY_SPACE) << "avg" - << std::setw(ENTRY_SPACE) << "best" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << std::setw(ENTRY_SPACE) << results.at("t_avg") - << std::setw(ENTRY_SPACE) << results.at("t_min") << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << std::setw(ENTRY_SPACE) << results.at("gflops_avg") - << std::setw(ENTRY_SPACE) << results.at("gflops_min") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << " " << std::left << std::setw(ENTRY_SPACE) << " avg" + << std::setw(ENTRY_SPACE) << " best" << std::right << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << results.at("t_avg") << results.at("t_min") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << results.at("gflops_avg") << results.at("gflops_min") << std::endl; } 
std::unique_ptr @@ -141,7 +139,7 @@ fft::FFTBenchmark::generateInputData() { } bool -fft::FFTBenchmark::validateOutputAndPrintError(fft::FFTData &data) { +fft::FFTBenchmark::validateOutput(fft::FFTData &data) { double residual_max = 0; for (int i = 0; i < executionSettings->programSettings->iterations; i++) { // we have to bit reverse the output data of the FPGA kernel, since it will be provided in bit-reversed order. @@ -159,17 +157,22 @@ fft::FFTBenchmark::validateOutputAndPrintError(fft::FFTData &data) { residual_max = residual_max > tmp_error ? residual_max : tmp_error; } } + // Calculate residual according to paper considering also the used iterations double error = residual_max / (std::numeric_limits::epsilon() * LOG_FFT_SIZE); + + errors.emplace("residual", hpcc_base::HpccResult(error, "")); + errors.emplace("epsilon", hpcc_base::HpccResult(std::numeric_limits::epsilon(), "")); - std::cout << std::setw(ENTRY_SPACE) << "res. error" << std::setw(ENTRY_SPACE) << "mach. eps" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << error << std::setw(ENTRY_SPACE) - << std::numeric_limits::epsilon() << std::endl << std::endl; - - // Calculate residual according to paper considering also the used iterations return error < 1.0; } +void fft::FFTBenchmark::printError() { + std::cout << std::left << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; + std::cout << errors.at("residual") << errors.at("epsilon") << std::endl << std::endl; + +} + void fft::bit_reverse(std::complex *data, unsigned iterations) { auto *tmp = new std::complex[(1 << LOG_FFT_SIZE)]; diff --git a/FFT/src/host/fft_benchmark.hpp b/FFT/src/host/fft_benchmark.hpp index 99fd3458..33ee832a 100644 --- a/FFT/src/host/fft_benchmark.hpp +++ b/FFT/src/host/fft_benchmark.hpp @@ -176,7 +176,14 @@ class FFTBenchmark : public hpcc_base::HpccFpgaBenchmarkgetExecutionSettings().programSettings->numRepetitions = 1; data = bm->generateInputData(); bm->executeKernel(*data); - EXPECT_EQ(1, bm->getTimingsMap().at("calculation").size()); + EXPECT_EQ(1, bm->getTimingsMap().at("execution").size()); } /** @@ -45,7 +45,7 @@ TEST_F(FFTKernelTest, CalculateReturnsCorrectExecutionResultFor24True) { bm->getExecutionSettings().programSettings->numRepetitions = 2; data = bm->generateInputData(); bm->executeKernel(*data); - EXPECT_EQ(2, bm->getTimingsMap().at("calculation").size()); + EXPECT_EQ(2, bm->getTimingsMap().at("execution").size()); } /** diff --git a/FFT/tests/test_fft_functionality.cpp b/FFT/tests/test_fft_functionality.cpp index df5dba9f..4453a695 100644 --- a/FFT/tests/test_fft_functionality.cpp +++ b/FFT/tests/test_fft_functionality.cpp @@ -134,7 +134,7 @@ TEST_F(FFTHostTest, JsonDump) { json j = json::parse(f); EXPECT_TRUE(j.contains("timings")); if (j.contains("timings")) { - EXPECT_TRUE(j["timings"].contains("calculation")); + EXPECT_TRUE(j["timings"].contains("execution")); } EXPECT_TRUE(j.contains("results")); if (j.contains("results")) { diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp index 141ea160..21b8fd99 100644 --- a/GEMM/src/host/gemm_benchmark.cpp +++ b/GEMM/src/host/gemm_benchmark.cpp @@ -140,13 +140,12 @@ gemm::GEMMBenchmark::collectResults() { void gemm::GEMMBenchmark::printResults() { - std::cout << std::setw(ENTRY_SPACE) - << "best" << std::setw(ENTRY_SPACE) << "mean" - << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; + std::cout << std::left << std::setw(ENTRY_SPACE) + << " best" << std::setw(ENTRY_SPACE) << " mean" + << 
std::setw(ENTRY_SPACE) << " GFLOPS" << std::right << std::endl; std::cout << std::setw(ENTRY_SPACE) - << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean") - << std::setw(ENTRY_SPACE) << results.at("gflops") + << results.at("t_min") << results.at("t_mean") << results.at("gflops") << std::endl; } @@ -170,7 +169,7 @@ gemm::GEMMBenchmark::generateInputData() { } bool -gemm::GEMMBenchmark::validateOutputAndPrintError(gemm::GEMMData &data) { +gemm::GEMMBenchmark::validateOutput(gemm::GEMMData &data) { auto ref_data = generateInputData(); gemm_ref(ref_data->A, ref_data->B, ref_data->C, executionSettings->programSettings->matrixSize, OPTIONAL_CAST(0.5), OPTIONAL_CAST(2.0)); @@ -195,19 +194,22 @@ gemm::GEMMBenchmark::validateOutputAndPrintError(gemm::GEMMData &data) { double eps = std::numeric_limits::epsilon(); double residn = resid / (executionSettings->programSettings->matrixSize*executionSettings->programSettings->matrixSize*ref_data->normtotal*normx*eps); - std::cout << " norm. resid resid "\ - "machep" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE) - << resid << std::setw(ENTRY_SPACE) << eps - << std::endl; + errors.emplace("epsilon", hpcc_base::HpccResult(eps, "")); + errors.emplace("residual", hpcc_base::HpccResult(resid, "")); + errors.emplace("residual_norm", hpcc_base::HpccResult(residn, "")); return residn < 1.0; } - // All other ranks are always reporting success of the validation return true; } +void +gemm::GEMMBenchmark::printError() { + std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; + std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; +} + void gemm::gemm_ref(HOST_DATA_TYPE* a,HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, int n, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta) { diff --git a/GEMM/src/host/gemm_benchmark.hpp b/GEMM/src/host/gemm_benchmark.hpp index 534a5bab..c77a212f 100644 --- a/GEMM/src/host/gemm_benchmark.hpp +++ b/GEMM/src/host/gemm_benchmark.hpp @@ -213,7 +213,14 @@ class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark @@ -295,7 +293,7 @@ linpack::LinpackBenchmark::generateInputData() { } bool -linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &data) { +linpack::LinpackBenchmark::validateOutput(linpack::LinpackData &data) { uint n= executionSettings->programSettings->matrixSize; uint matrix_width = data.matrix_width; uint matrix_height = data.matrix_height; @@ -420,19 +418,23 @@ linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &dat } #endif + errors.emplace("epsilon", hpcc_base::HpccResult(eps, "")); + errors.emplace("residual", hpcc_base::HpccResult(resid, "")); + errors.emplace("residual_norm", hpcc_base::HpccResult(residn, "")); + if (mpi_comm_rank == 0) { - //std::cout << resid << ", " << norma << ", " << normx << std::endl; - std::cout << " norm. resid resid "\ - "machep " << std::endl; - std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE) - << resid << std::setw(ENTRY_SPACE) << eps << std::endl; return residn < 1; - } - else { + } else { return true; } } +void +linpack::LinpackBenchmark::printError() { + std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. 
eps" << std::right << std::endl; + std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; +} + void linpack::LinpackBenchmark::distributed_gesl_nopvt_ref(linpack::LinpackData& data) { uint global_matrix_size = executionSettings->programSettings->matrixSize; diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index 7c7ce315..6178230d 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -257,7 +257,14 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmarkb[i] = static_cast(x[i]); } - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } #endif @@ -83,7 +84,8 @@ TEST_F(LinpackHostTest, ReferenceSolveWithPivoting) { data = bm->generateInputData(); linpack::gefa_ref(data->A, array_size, array_size, data->ipvt); linpack::gesl_ref(data->A, data->b, data->ipvt, array_size, array_size); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } @@ -91,7 +93,8 @@ TEST_F(LinpackHostTest, ReferenceSolveWithoutPivoting) { data = bm->generateInputData(); linpack::gefa_ref_nopvt(data->A, array_size, array_size); linpack::gesl_ref_nopvt(data->A, data->b, array_size, array_size); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } diff --git a/LINPACK/tests/test_kernel_communication.cpp b/LINPACK/tests/test_kernel_communication.cpp index 603bedef..2182de1e 100644 --- a/LINPACK/tests/test_kernel_communication.cpp +++ b/LINPACK/tests/test_kernel_communication.cpp @@ -920,7 +920,8 @@ TEST_F(LinpackKernelCommunicationTestLU, LUBlockExternalResultisSameAsRef) { TEST_F(LinpackKernelCommunicationTestLU, LUBlockExternalResultisCorrect) { linpack::gesl_ref_nopvt(data->A, data->b, bm->getExecutionSettings().programSettings->matrixSize,bm->getExecutionSettings().programSettings->matrixSize); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } diff --git a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp index 2dbd21f0..0200a017 100644 --- a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp +++ b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp @@ -88,8 +88,8 @@ TEST_P(LinpackKernelTest, DISABLED_ValidationWorksForMKL) { #else dgesv_(&s, &lrhs, data_cpu->A, &s, data_cpu->ipvt, data_cpu->b, &s, &info); #endif - bool success = bm->validateOutputAndPrintError(*data_cpu); - EXPECT_TRUE(success); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index decc9b85..e0e45c11 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -134,21 +134,30 @@ transpose::TransposeBenchmark::collectResults() { void transpose::TransposeBenchmark::printResults() { - std::cout << " total time transfer time calc time calc FLOPS Memory Bandwidth PCIe Bandwidth" << std::endl; - std::cout << "avg: " << results.at("avg_t") - << " " << results.at("avg_transfer_t") - << " " << results.at("avg_calc_t") - << " " << results.at("avg_calc_flops") - << " " << results.at("avg_mem_bandwidth") - << " " << results.at("avg_transfer_bandwidth") - << std::endl; - std::cout << "best: " << 
results.at("min_t") - << " " << results.at("min_transfer_t") - << " " << results.at("min_calc_t") - << " " << results.at("max_calc_flops") - << " " << results.at("max_mem_bandwidth") - << " " << results.at("max_transfer_bandwidth") - << std::endl; + std::cout << std::setw(ENTRY_SPACE) << " " + << std::left << std::setw(ENTRY_SPACE) << "total time" + << std::setw(ENTRY_SPACE) << "transfer time" + << std::setw(ENTRY_SPACE) << "calc time" + << std::setw(ENTRY_SPACE) << "calc FLOPS" + << std::setw(ENTRY_SPACE) << "Memory Bandwidth" + << std::setw(ENTRY_SPACE) << "PCIe Bandwidth" + << std::right << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "avg: " + << results.at("avg_t") + << results.at("avg_transfer_t") + << results.at("avg_calc_t") + << results.at("avg_calc_flops") + << results.at("avg_mem_bandwidth") + << results.at("avg_transfer_bandwidth") + << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "best: " + << results.at("min_t") + << results.at("min_transfer_t") + << results.at("min_calc_t") + << results.at("max_calc_flops") + << results.at("max_mem_bandwidth") + << results.at("max_transfer_bandwidth") + << std::endl; } std::unique_ptr @@ -157,8 +166,7 @@ return dataHandler->generateData(*executionSettings); } bool -transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeData &data) { - +transpose::TransposeBenchmark::validateOutput(transpose::TransposeData &data) { // exchange the data using MPI depending on the chosen distribution scheme dataHandler->exchangeData(data); @@ -172,14 +180,19 @@ transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeD double global_max_error = 0; MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - if (mpi_comm_rank == 0) { - std::cout << "Maximum error: " << global_max_error << " < " << 100 * std::numeric_limits::epsilon() << std::endl; - std::cout << "Mach. Epsilon: " << std::numeric_limits::epsilon() << std::endl; - } + errors.emplace("epsilon", hpcc_base::HpccResult(std::numeric_limits::epsilon(), "")); + errors.emplace("max_error", hpcc_base::HpccResult(global_max_error, "")); return static_cast(global_max_error) < 100 * std::numeric_limits::epsilon(); } +void +transpose::TransposeBenchmark::printError() { + std::cout << "Maximum error: " << errors.at("epsilon") << " < " << 100 * errors.at("epsilon").value << std::endl; + std::cout << "Mach. 
Epsilon: " << errors.at("epsilon") << std::endl; + +} + void transpose::TransposeBenchmark::setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier) { switch (dataHandlerIdentifier) { @@ -187,6 +200,4 @@ transpose::TransposeBenchmark::setTransposeDataHandler(transpose::data_handler:: case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); } - - } diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index cd595637..57cd0231 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -93,16 +93,26 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmarkvalidateOutputAndPrintError(*data); + bool success = bm->validateOutput(*data); + bm->printError(); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); @@ -128,7 +129,8 @@ TEST_F(TransposeHostTest, ValidationIsSuccess) { std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - bool success = bm->validateOutputAndPrintError(*data); + bool success = bm->validateOutput(*data); + bm->printError(); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp index a5f06303..2c5c449b 100644 --- a/RandomAccess/src/host/random_access_benchmark.cpp +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -122,9 +122,9 @@ random_access::RandomAccessBenchmark::collectResults() { } void random_access::RandomAccessBenchmark::printResults() { - std::cout << std::setw(ENTRY_SPACE) + std::cout << std::left << std::setw(ENTRY_SPACE) << "best" << std::setw(ENTRY_SPACE) << "mean" - << std::setw(ENTRY_SPACE) << "GUOPS" << std::endl; + << std::setw(ENTRY_SPACE) << "GUOPS" << std::right << std::endl; std::cout << std::setw(ENTRY_SPACE) << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean") @@ -159,7 +159,7 @@ random_access::RandomAccessBenchmark::generateInputData() { } bool -random_access::RandomAccessBenchmark::validateOutputAndPrintError(random_access::RandomAccessData &data) { +random_access::RandomAccessBenchmark::validateOutput(random_access::RandomAccessData &data) { HOST_DATA_TYPE* rawdata; if (mpi_comm_size > 1) { @@ -190,19 +190,18 @@ random_access::RandomAccessBenchmark::validateOutputAndPrintError(random_access: rawdata[(temp >> 3) & (executionSettings->programSettings->dataSize * mpi_comm_size - 1)] ^= temp; } - double errors = 0; -#pragma omp parallel for reduction(+:errors) + double error_count = 0; +#pragma omp parallel for reduction(+:error_count) for (HOST_DATA_TYPE i=0; i< executionSettings->programSettings->dataSize * mpi_comm_size; i++) { if (rawdata[i] != i) { // If the array at index i does not contain i, it differs from the initial value and is counted as an error - errors++; + error_count++; } } // The overall error is calculated in percent of the overall array size - double error_ratio = static_cast(errors) / (executionSettings->programSettings->dataSize * mpi_comm_size); - std::cout << "Error: " << error_ratio * 100 - << "%" << std::endl; + double error_ratio = static_cast(error_count) / 
(executionSettings->programSettings->dataSize * mpi_comm_size);
+ errors.emplace("ratio", hpcc_base::HpccResult(error_ratio, ""));
 #ifdef _USE_MPI_
 if (mpi_comm_rank == 0 && mpi_comm_size > 1) {
@@ -216,3 +215,8 @@ random_access::RandomAccessBenchmark::validateOutputAndPrintError(random_access:
 // All other ranks skip validation and always return true
 return true;
 }
+
+void
+random_access::RandomAccessBenchmark::printError() {
+ std::cout << "Error: " << errors.at("ratio") << std::endl;
+}
diff --git a/RandomAccess/src/host/random_access_benchmark.hpp b/RandomAccess/src/host/random_access_benchmark.hpp
index 56c7ff40..3a1eebaa 100644
--- a/RandomAccess/src/host/random_access_benchmark.hpp
+++ b/RandomAccess/src/host/random_access_benchmark.hpp
@@ -156,7 +156,14 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmarkgenerateInputData();
 // do random accesses
- bm->validateOutputAndPrintError(*data);
+ bm->validateOutput(*data);
 // check correctness of random accesses
- bool success = bm->validateOutputAndPrintError(*data);
- EXPECT_TRUE(success);
+ EXPECT_TRUE(bm->validateOutput(*data));
+ bm->printError();
 }
 /**
@@ -53,6 +53,6 @@ TEST_F(RandomAccessHostCodeTest, ValidDataSizeAreDetected) {
 TEST_F(RandomAccessHostCodeTest, ResultValidationWorksForWrongUpdates) {
 auto data = bm->generateInputData();
 // check correctness of random accesses
- bool success = bm->validateOutputAndPrintError( *data);
- EXPECT_FALSE(success);
+ EXPECT_FALSE(bm->validateOutput(*data));
+ bm->printError();
 }
diff --git a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
index 35c9f229..a52ce55f 100644
--- a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp
@@ -46,8 +46,8 @@ TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements3Rep) {
 */
 TEST_F(RandomAccessKernelTest, FPGAErrorBelow1Percent) {
 bm->executeKernel(*data);
- bool success = bm->validateOutputAndPrintError(*data);
- EXPECT_TRUE(success);
+ EXPECT_TRUE(bm->validateOutput(*data));
+ bm->printError();
 }
 using json = nlohmann::json;
diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp
index 07da82b3..bdba29e8 100644
--- a/STREAM/src/host/stream_benchmark.cpp
+++ b/STREAM/src/host/stream_benchmark.cpp
@@ -143,18 +143,19 @@ stream::StreamBenchmark::collectResults() {
 void
 stream::StreamBenchmark::printResults() {
- std::cout << std::setw(ENTRY_SPACE) << "Function";
+ std::cout << std::left << std::setw(ENTRY_SPACE) << "Function";
 std::cout << std::setw(ENTRY_SPACE) << "Best Rate";
 std::cout << std::setw(ENTRY_SPACE) << "Avg time";
 std::cout << std::setw(ENTRY_SPACE) << "Min time" ;
- std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::endl;
+ std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::right << std::endl;
 for (auto key : keys) {
- std::cout << std::setw(ENTRY_SPACE) << key;
- std::cout << std::setw(ENTRY_SPACE) << results.at(key + "_best_rate")
- << std::setw(ENTRY_SPACE) << results.at(key + "_avg_t")
- << std::setw(ENTRY_SPACE) << results.at(key + "_min_t")
- << std::setw(ENTRY_SPACE) << results.at(key + "_max_t") << std::endl;
+ std::cout << std::left << std::setw(ENTRY_SPACE) << key
+ << results.at(key + "_best_rate")
+ << results.at(key + "_avg_t")
+ << results.at(key + "_min_t")
+ << results.at(key + "_max_t")
+ << std::right << std::endl;
 }
 }
@@ -171,7 +172,7 @@
stream::StreamBenchmark::generateInputData() {
 }
 bool
-stream::StreamBenchmark::validateOutputAndPrintError(stream::StreamData &data) {
+stream::StreamBenchmark::validateOutput(stream::StreamData &data) {
 HOST_DATA_TYPE aj,bj,cj,scalar;
 double aSumErr,bSumErr,cSumErr;
 double aAvgErr,bAvgErr,cAvgErr;
@@ -220,54 +221,84 @@ stream::StreamBenchmark::validateOutputAndPrintError(stream::StreamData &data) {
 bAvgErr = totalBAvgErr / mpi_comm_size;
 #endif
+ bool success = true;
 if (mpi_comm_rank == 0) {
+ errors.emplace("a_expected_value", hpcc_base::HpccResult(aj, ""));
+ errors.emplace("a_average_error", hpcc_base::HpccResult(aAvgErr, ""));
+ errors.emplace("a_average_relative_error", hpcc_base::HpccResult(abs(aAvgErr)/aj, ""));
+
+ errors.emplace("b_expected_value", hpcc_base::HpccResult(bj, ""));
+ errors.emplace("b_average_error", hpcc_base::HpccResult(bAvgErr, ""));
+ errors.emplace("b_average_relative_error", hpcc_base::HpccResult(abs(bAvgErr)/bj, ""));
+
+ errors.emplace("c_expected_value", hpcc_base::HpccResult(cj, ""));
+ errors.emplace("c_average_error", hpcc_base::HpccResult(cAvgErr, ""));
+ errors.emplace("c_average_relative_error", hpcc_base::HpccResult(abs(cAvgErr)/cj, ""));
 epsilon = std::numeric_limits::epsilon();
+ errors.emplace("epsilon", hpcc_base::HpccResult(epsilon, ""));
- err = 0;
 if (abs(aAvgErr/aj) > epsilon) {
- err++;
- printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
- printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
+ success = false;
 ierr = 0;
 for (j=0; jprogramSettings->streamArraySize; j++) {
 if (abs(data.A[j]/aj-1.0) > epsilon) {
 ierr++;
 }
 }
- printf(" For array a[], %d errors were found.\n",ierr);
+ errors.emplace("a_error_count", hpcc_base::HpccResult(ierr, ""));
+ ierr = 0;
 }
 if (abs(bAvgErr/bj) > epsilon) {
- err++;
- printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
- printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
- printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+ success = false;
 ierr = 0;
 for (j=0; jprogramSettings->streamArraySize; j++) {
 if (abs(data.B[j]/bj-1.0) > epsilon) {
 ierr++;
 }
 }
- printf(" For array b[], %d errors were found.\n",ierr);
+ errors.emplace("b_error_count", hpcc_base::HpccResult(ierr, ""));
 }
 if (abs(cAvgErr/cj) > epsilon) {
- err++;
- printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
- printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
- printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon);
+ success = false;
 ierr = 0;
 for (j=0; jprogramSettings->streamArraySize; j++) {
 if (abs(data.C[j]/cj-1.0) > epsilon) {
 ierr++;
 }
 }
- printf(" For array c[], %d errors were found.\n",ierr);
+ errors.emplace("c_error_count", hpcc_base::HpccResult(ierr, ""));
 }
- if (err == 0) {
- printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
- return true;
- }
- return false;
 }
- return true;
+ return success;
+}
+
+void
+stream::StreamBenchmark::printError() {
+ int err = 0;
+ double epsilon = errors.at("epsilon").value;
+ if (errors.at("a_average_relative_error").value > epsilon) {
+ err++;
+ printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value);
+ printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("a_expected_value").value, errors.at("a_average_error").value, errors.at("a_average_relative_error").value);
+ printf(" For array a[], %d errors were found.\n", (int) errors.at("a_error_count").value);
a[], %d errors were found.\n", errors.at("a_error_count")); + } + + if (errors.at("b_average_relative_error").value > epsilon) { + err++; + printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("b_expected_value").value, errors.at("b_average_error").value, errors.at("b_average_relative_error").value); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); + printf(" For array b[], %d errors were found.\n", errors.at("b_error_count").value); + } + if (errors.at("c_average_relative_error").value > epsilon) { + err++; + printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("c_expected_value").value, errors.at("c_average_error").value, errors.at("c_average_relative_error").value); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); + printf(" For array c[], %d errors were found.\n", errors.at("c_error_count").value); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n", errors.at("epsilon").value); + } } diff --git a/STREAM/src/host/stream_benchmark.hpp b/STREAM/src/host/stream_benchmark.hpp index 8377b744..50f24b88 100644 --- a/STREAM/src/host/stream_benchmark.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -169,7 +169,14 @@ class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark(item.messageSize & 255u); - unsigned errors = 0; + unsigned error_count = 0; HOST_DATA_TYPE failing_entry = 0; for (const auto& v: item.validationBuffer) { if (v != expected_value) { - errors++; + error_count++; failing_entry = v; } } - total_error += errors; - if (errors > 0) { - std::cerr << "Validation data invalid for message size " << (1 << item.messageSize) << " in " << errors << " cases! Expected: " - << static_cast(expected_value) << ", Value: " << static_cast(failing_entry) << std::endl; + if (error_count > 0) { + errors.emplace(std::to_string(item.messageSize), hpcc_base::HpccResult(error_count, "")); } + total_error += error_count; } // success only, if no error occured return total_error == 0; } +void +network::NetworkBenchmark::printError() { + for (const auto& error: errors) { + std::cerr << "Validation data invalid for message size " << (1 << stoi(error.first)) << " in " << int(error.second.value) << " cases!" 
<< std::endl; + } +} + diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index e1b77bc9..4d47c392 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -283,16 +283,27 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmarkitems.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data;}); data->items[0].validationBuffer[looplength] = expected_data + 1; - EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); + EXPECT_FALSE(bm->validateOutput(*data)); + bm->printError(); } TEST_P(NetworkKernelTest, ValidationDataWrongCheckFails) { @@ -250,7 +251,8 @@ TEST_P(NetworkKernelTest, ValidationDataWrongCheckFails) { data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data - 1;}); - EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); + EXPECT_FALSE(bm->validateOutput(*data)); + bm->printError(); } TEST_P(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { @@ -260,7 +262,8 @@ TEST_P(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data;}); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { @@ -269,7 +272,8 @@ TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); bm->executeKernel(*data); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } // This test is disabled because it does not work with the current implementation of the @@ -282,7 +286,8 @@ TEST_P(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExec data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); bm->executeKernel(*data); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } TEST_P(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { @@ -293,7 +298,8 @@ TEST_P(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); bm->executeKernel(*data); data->items[1].validationBuffer[0] = static_cast(0); - EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); + EXPECT_FALSE(bm->validateOutput(*data)); + bm->printError(); } TEST_P(NetworkKernelTest, JsonDump) { diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index ec611c3d..a2f7de95 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -45,7 +45,9 @@ SOFTWARE. 
#define STR_EXPAND(tok) #tok #define STR(tok) STR_EXPAND(tok) -#define ENTRY_SPACE 15 +#define VALUE_SPACE 11 +#define UNIT_SPACE 8 +#define ENTRY_SPACE (VALUE_SPACE + UNIT_SPACE + 1) using json = nlohmann::json; @@ -64,7 +66,7 @@ class HpccResult { HpccResult(double value, std::string unit): value(value), unit(unit) {} friend std::ostream &operator<<(std::ostream &os, const HpccResult &result) { - os << result.value << " " << result.unit; + os << std::setw(VALUE_SPACE) << result.value << " " << std::left << std::setw(UNIT_SPACE) << result.unit << std::right; return os; } @@ -73,6 +75,7 @@ class HpccResult { oss << *this; return oss.str(); } + // TODO: to_json function }; /** @@ -345,7 +348,13 @@ class HpccFpgaBenchmark { * */ std::map results; - + + /** + * + * @brief map containing the errors of the benchmark + * + */ + std::map errors; public: @@ -374,7 +383,13 @@ class HpccFpgaBenchmark { * @return false If the validation failed */ virtual bool - validateOutputAndPrintError(TData &data) = 0; + validateOutput(TData &data) = 0; + + /** + * @brief Print the error after validating output + */ + virtual void + printError() = 0; /** * @brief Collects the measurment results from all MPI ranks and @@ -515,18 +530,6 @@ class HpccFpgaBenchmark { timings.emplace(key, value); } - std::map getResultsJson() { - // TODO: nested maps, recursive? - std::map results_string; - for (auto const &result: results) { - json j; - j["unit"] = result.second.unit; - j["value"] = result.second.value; - results_string[result.first] = j; - } - return results_string; - } - // override for special benchmarks like b_eff virtual json getTimingsJson() { json j; @@ -542,6 +545,28 @@ class HpccFpgaBenchmark { } return j; } + + std::map getResultsJson() { + std::map results_string; + for (auto const &result: results) { + json j; + j["unit"] = result.second.unit; + j["value"] = result.second.value; + results_string[result.first] = j; + } + return results_string; + } + + std::map getErrorsJson() { + std::map errors_string; + for (auto const &error: errors) { + json j; + j["unit"] = error.second.unit; + j["value"] = error.second.value; + errors_string[error.first] = j; + } + return errors_string; + } std::map getEnvironmentMap() { @@ -602,6 +627,7 @@ class HpccFpgaBenchmark { dump["settings"] = jsonifySettingsMap(executionSettings->programSettings->getSettingsMap()); dump["timings"] = getTimingsJson(); dump["results"] = getResultsJson(); + dump["errors"] = getErrorsJson(); dump["environment"] = getEnvironmentMap(); fs << dump; @@ -738,13 +764,15 @@ class HpccFpgaBenchmark { if (!executionSettings->programSettings->skipValidation) { auto eval_start = std::chrono::high_resolution_clock::now(); - validateSuccess = validateOutputAndPrintError(*data); + validateSuccess = validateOutput(*data); + printError(); std::chrono::duration eval_time = std::chrono::high_resolution_clock::now() - eval_start; if (mpi_comm_rank == 0) { std::cout << "Validation Time: " << eval_time.count() << " s" << std::endl; } } + std::cout << HLINE << "Collect results..." << std::endl << HLINE; collectResults(); if (mpi_comm_rank == 0) { @@ -755,10 +783,10 @@ class HpccFpgaBenchmark { printResults(); if (!validateSuccess) { - std::cerr << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" << std::endl; + std::cerr << HLINE << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" << std::endl; } else { - std::cout << "Validation: SUCCESS!" << std::endl; + std::cout << HLINE << "Validation: SUCCESS!" 
<< std::endl; } } diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp index b6378840..194c8920 100644 --- a/shared/tests/hpcc_base_benchmark_test.cpp +++ b/shared/tests/hpcc_base_benchmark_test.cpp @@ -40,7 +40,10 @@ class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark Date: Sat, 10 Dec 2022 11:56:48 +0100 Subject: [PATCH 222/318] update documentation and READMEs --- FFT/README.md | 157 +++++- GEMM/Readme.md | 181 +++++-- LINPACK/Readme.md | 190 ++++++- PTRANS/README.md | 248 +++++++-- RandomAccess/README.md | 136 ++++- STREAM/README.md | 486 +++++++++++++++++- b_eff/README.md | 293 +++++++++-- docs/source/FFT/index.rst | 1 + docs/source/conf.py | 1 + docs/source/index.rst | 7 + .../json_output/available_keys.csv | 58 --- .../technical_support/json_output/index.rst | 72 +-- 12 files changed, 1518 insertions(+), 312 deletions(-) delete mode 100644 docs/source/technical_support/json_output/available_keys.csv diff --git a/FFT/README.md b/FFT/README.md index 1d14663d..2926a5ac 100644 --- a/FFT/README.md +++ b/FFT/README.md @@ -59,31 +59,36 @@ For execution of the benchmark run: For more information on available input parameters run - $./FFT_intel -h + ./FFT_intel -h Implementation of the FFT benchmark proposed in the HPCC benchmark suite for FPGA. - Version: 1.2 + Version: 1.4 Usage: ./FFT_intel [OPTION...] - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will - speed up execution and helps when working with special - data types. - --device arg Index of the device that has to be used. If not - given you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there - are multiple platforms available. (default: -1) - -h, --help Print this help - -b, arg Number of batched FFT calculations (iterations) - (default: 100) - --inverse If set, the inverse FFT is calculated instead - -r, arg Number of kernel replications used for calculation - (default: 1) + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 1) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -b, arg Number of batched FFT calculations (iterations) + (default: 100) + --inverse If set, the inverse FFT is calculated instead To execute the unit and integration tests run @@ -96,12 +101,13 @@ It will run an emulation of the kernel and execute some functionality tests. The benchmark will print the following two tables to standard output after execution: - res. error mach. 
eps - 2.67000e-01 1.19209e-07 - - avg best - Time in s: 7.56801e-03 7.07241e-03 - GFLOPS: 3.24735e-02 3.47491e-02 + res. error mach. eps + 2.63523e-01 1.19209e-07 + + avg best + Time in s: 8.93261e-04 s 8.73572e-04 s + GFLOPS: 2.75127e-01 GFLOP/s 2.81328e-01 GFLOP/s + The first table contains the maximum residual error of the calculation and the machine epsilon that was used to calculate the residual error. @@ -118,3 +124,102 @@ In the second table the measured execution times and calculated FLOPs are given. It gives the average and bast for both. The time gives the averaged execution time for a single FFT in case of a batched execution (an execution with more than one iteration). They are also used to calculate the FLOPs. + +The json output looks like the following. + +```json + +{ + "config_time": "Thu Dec 08 10:39:10 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "epsilon": { + "unit": "", + "value": 1.1920928955078125e-07 + }, + "residual": { + "unit": "", + "value": 0.2635231415430705 + } + }, + "git_commit": "86e0064-dirty", + "name": "FFT", + "results": { + "gflops_avg": { + "unit": "GFLOP/s", + "value": 0.2751268094908118 + }, + "gflops_min": { + "unit": "GFLOP/s", + "value": 0.2813275822966743 + }, + "t_avg": { + "unit": "s", + "value": 0.0008932608220000002 + }, + "t_min": { + "unit": "s", + "value": 0.0008735723600000001 + } + }, + "settings": { + "Batch Size": 100, + "Communication Type": "UNSUPPORTED", + "FFT Size": 4096, + "Kernel File": 
"./bin/fft1d_float_8_emulate.aocx", + "Kernel Replications": 1, + "MPI Ranks": "None", + "Repetitions": 10, + "Test Mode": "No" + }, + "timings": { + "calculation": [ + { + "unit": "s", + "value": 0.090378907 + }, + { + "unit": "s", + "value": 0.089294969 + }, + { + "unit": "s", + "value": 0.08941156 + }, + { + "unit": "s", + "value": 0.089993811 + }, + { + "unit": "s", + "value": 0.087884474 + }, + { + "unit": "s", + "value": 0.087357236 + }, + { + "unit": "s", + "value": 0.089228888 + }, + { + "unit": "s", + "value": 0.089401591 + }, + { + "unit": "s", + "value": 0.089537203 + }, + { + "unit": "s", + "value": 0.090772183 + } + ] + }, + "version": "1.4" +} + +``` diff --git a/GEMM/Readme.md b/GEMM/Readme.md index 831194bd..33f0419b 100755 --- a/GEMM/Readme.md +++ b/GEMM/Readme.md @@ -75,36 +75,43 @@ For execution of the benchmark run: For more information on available input parameters run ./GEMM_intel -h - + Implementation of the GEMM benchmark proposed in the HPCC benchmark adapted for FPGA + Version: 1.3 + + MPI Version: 3.1 + Config. Time: Thu Dec 08 10:39:51 UTC 2022 + Git Commit: 86e0064-dirty + Usage: - ./GEMM_intel [OPTION...] - -Implementation of the GEMM benchmark proposed in the HPCC benchmark adapted for FPGA -Version: 1.0 - -Usage: - bin/GEMM_intel [OPTION...] - - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will - speed up execution and helps when working with special - data types. - --device arg Index of the device that has to be used. If not - given you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there - are multiple platforms available. (default: -1) - -h, --help Print this help - -m, arg Matrix size in number of blocks in a single - dimension (default: 8) - -b, arg Block size in number of values in one dimension - (default: 256) - -r, arg Number of used kernel replications (default: 4) - + ./bin/GEMM_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 4) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -m, arg Matrix size in number of blocks in a single + dimension (default: 8) + -b, arg Block size in number of values in one dimension + (default: 32) + --replicate-inputs Also replicates the input buffer for each kernel + To execute the unit and integration tests run ./GEMM_test_intel -f KERNEL_FILE_NAME @@ -116,16 +123,17 @@ It will run an emulation of the kernel and execute some functionality tests. 
An example output from an emulation is given below: - norm. resid resid machep - 1.45417e-05 4.76837e-05 1.19209e-07 - best mean GFLOPS - 6.89168e-03 6.89168e-03 1.03868e+02 + norm. residual res. error mach. eps + 8.08345e-05 7.62939e-06 1.19209e-07 + + best mean GFLOPS + 6.50672e-03 s 1.06789e-02 s 5.15689e+00 GFLOP/s The first two rows give information about the calculation error. -- `norm. resid`: The normalized residual error based on the used matrix size and used values -- `resid`: The maximum residual error of the calculation -- `machep`: The machine epsilon +- `norm. residual`: The normalized residual error based on the used matrix size and used values +- `res. error`: The maximum residual error of the calculation +- `mach. epsilon`: The machine epsilon The last two columns contain the time measurements and based on that the achieved FLOPS of the calculation. @@ -133,3 +141,106 @@ of the calculation. - `best`: The shortest execution time in all runs - `mean`: Arithmetic mean of all execution times - `GFLOPS`: GFLOPS calculated from the shortest execution time + +The json output looks like the following. + +```json + +{ + "config_time": "Thu Dec 08 10:39:51 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "epsilon": { + "unit": "", + "value": 1.1920928955078125e-07 + }, + "residual": { + "unit": "", + "value": 7.62939453125e-06 + }, + "residual_norm": { + "unit": "", + "value": 8.08345175162664e-05 + } + }, + "git_commit": "86e0064-dirty", + "mpi": { + "subversion": 1, + 
"version": 3 + }, + "name": "GEMM", + "results": { + "gflops": { + "unit": "GFLOP/s", + "value": 5.347517549652832 + }, + "t_mean": { + "unit": "s", + "value": 0.009541589199999999 + }, + "t_min": { + "unit": "s", + "value": 0.006274768 + } + }, + "settings": { + "Communication Type": "UNSUPPORTED", + "Kernel File": "./bin/gemm_base_emulate.aocx", + "Kernel Replications": 4, + "MPI Ranks": 1, + "Matrix Size": 256, + "Repetitions": 10, + "Replicate Inputs": false, + "Test Mode": "No" + }, + "timings": { + "execution": [ + { + "unit": "s", + "value": 0.012631986 + }, + { + "unit": "s", + "value": 0.012796959 + }, + { + "unit": "s", + "value": 0.012527344 + }, + { + "unit": "s", + "value": 0.012579805 + }, + { + "unit": "s", + "value": 0.0064457 + }, + { + "unit": "s", + "value": 0.006274768 + }, + { + "unit": "s", + "value": 0.00642924 + }, + { + "unit": "s", + "value": 0.012808459 + }, + { + "unit": "s", + "value": 0.006587663 + }, + { + "unit": "s", + "value": 0.006333968 + } + ] + }, + "version": "1.3" +} + +``` diff --git a/LINPACK/Readme.md b/LINPACK/Readme.md index a8a07566..62162c43 100644 --- a/LINPACK/Readme.md +++ b/LINPACK/Readme.md @@ -127,14 +127,13 @@ It will run an emulation of the kernel and execute some functionality tests. The host code will print the results of the execution to the standard output. The result summary looks similar to this: - norm. resid resid machep - 3.25054e-08 5.88298e-05 1.19209e-07 - Validation Time: 4.55059e+01 s - Method best mean GFLOPS - total 5.87510e+01 5.87510e+01 2.10546e+04 - GEFA 5.87510e+01 5.87510e+01 2.10541e+04 - GESL 4.70000e-08 4.70000e-08 6.42532e+08 - Validation: SUCCESS! + norm. residual res. error mach. eps + 4.35451e-03 5.96046e-07 1.19209e-07 + + Method best mean GFLOPS + total 1.12152e-01 s 1.16113e-01 s 2.13045e-04 GFLOP/s + GEFA 1.12152e-01 s 1.16113e-01 s 1.94784e-04 GFLOP/s + GESL 2.00000e-08 s 3.97000e-08 s 1.02400e+02 GFLOP/s The first row contains data from the correctness check that is done once when executing the benchmark: @@ -155,3 +154,178 @@ The columns of the table contain the following information: The last row of the output will always contain `Validation: SUCCESS!`, if the norm. residual is below 1. This will be interpreted as successful validation. In this case, the executable will return 0 as exit code, 1 otherwise. + +The json output looks like the following. 
+ +```json + +{ + "config_time": "Thu Dec 08 10:41:13 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "epsilon": { + "unit": "", + "value": 1.1920928955078125e-07 + }, + "residual": { + "unit": "", + "value": 5.960464477539062e-07 + }, + "residual_norm": { + "unit": "", + "value": 0.004354506590071576 + } + }, + "git_commit": "86e0064-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "LINPACK", + "results": { + "gflops": { + "unit": "GFLOP/s", + "value": 0.000213044786995575 + }, + "gflops_lu": { + "unit": "GFLOP/s", + "value": 0.00019478383998887983 + }, + "gflops_sl": { + "unit": "GFLOP/s", + "value": 102.4 + }, + "t_mean": { + "unit": "s", + "value": 0.1161132923 + }, + "t_min": { + "unit": "s", + "value": 0.112151692 + }, + "tlu_mean": { + "unit": "s", + "value": 0.11611325259999998 + }, + "tlu_min": { + "unit": "s", + "value": 0.112151672 + }, + "tsl_mean": { + "unit": "s", + "value": 3.97e-08 + }, + "tsl_min": { + "unit": "s", + "value": 2e-08 + } + }, + "settings": { + "Block Size": 16, + "Communication Type": "IEC", + "Data Type": "cl_float", + "Emulate": false, + "FPGA Torus": { + "P": 1, + "Q": 1 + }, + "Kernel File": "./bin/hpl_torus_IEC_emulate.aocx", + "Kernel Replications": 3, + "MPI Ranks": 1, + "Matrix Size": 32, + "Repetitions": 10, + "Test Mode": "No" + }, + "timings": { + "gefa": [ + { + "unit": "s", + "value": 0.112151672 + }, + { + "unit": "s", + "value": 0.112186842 + }, + { + "unit": "s", + "value": 0.114559183 + }, + { + "unit": "s", 
+ "value": 0.114920089 + }, + { + "unit": "s", + "value": 0.113395783 + }, + { + "unit": "s", + "value": 0.113512676 + }, + { + "unit": "s", + "value": 0.118974459 + }, + { + "unit": "s", + "value": 0.11378015 + }, + { + "unit": "s", + "value": 0.131815478 + }, + { + "unit": "s", + "value": 0.115836194 + } + ], + "gesl": [ + { + "unit": "s", + "value": 2e-08 + }, + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 2.9e-08 + }, + { + "unit": "s", + "value": 1.5e-07 + }, + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 2e-08 + }, + { + "unit": "s", + "value": 2.9e-08 + }, + { + "unit": "s", + "value": 2.9e-08 + }, + { + "unit": "s", + "value": 3e-08 + } + ] + }, + "version": "2.6" +} + +``` diff --git a/PTRANS/README.md b/PTRANS/README.md index 55dfd8c4..9350e1de 100644 --- a/PTRANS/README.md +++ b/PTRANS/README.md @@ -69,58 +69,58 @@ For the execution of the benchmark run: For more information on available input parameters run - $./Transpose_xilinx -h - ------------------------------------------------------------- - General setup: - C++ high resolution clock is used. - The clock precision seems to be 1.00000e+01ns - ------------------------------------------------------------- + ./Transpose_xilinx -h + Implementation of the matrix transposition benchmark proposed in the HPCC benchmark suite for FPGA. Version: 1.7 MPI Version: 3.1 - Config. Time: Fri Mar 04 10:31:13 UTC 2022 - Git Commit: caebda4-dirty + Config. Time: Thu Dec 08 10:41:51 UTC 2022 + Git Commit: 86e0064-dirty Usage: - bin/Transpose_intel [OPTION...] + ./bin/Transpose_intel [OPTION...] - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will speed up execution and helps when working with special data types. - --device arg Index of the device that has to be used. If not + --device arg Index of the device that has to be used. If not given you will be asked which device to use if - there are multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not + there are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not given you will be asked which platform to use if there are multiple platforms available. 
(default: - -1) - -r, arg Number of used kernel replications (default: 2) - --comm-type arg Used communication type for inter-FPGA + 0) + --platform_str arg Name of the platform that has to be used + (default: ) + -r, arg Number of used kernel replications (default: 2) + --comm-type arg Used communication type for inter-FPGA communication (default: AUTO) - --test Only test given configuration and skip execution + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution and validation - -h, --help Print this help - -m, arg Matrix size in number of blocks in one dimension - (default: 8) - -b, arg Block size in number of values in one dimension + -h, --help Print this help + -m, arg Matrix size in number of blocks in one dimension (default: 8) - -p, arg Value of P that equals the width of the PQ grid + -b, arg Block size in number of values in one dimension + (default: 512) + -p, arg Value of P that equals the width of the PQ grid of FPGAs. Q is determined by the world size. (default: 1) - --distribute-buffers Distribute buffers over memory banks. This will + --distribute-buffers Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs. - --handler arg Specify the used data handler that distributes + --handler arg Specify the used data handler that distributes the data over devices and memory banks (default: AUTO) - + Available options for `--comm-type`: - `CPU`: CPU only execution. MKL required. @@ -142,16 +142,12 @@ It will run an emulation of the kernel and execute some functionality tests. An example output from an emulation is given below: - ------------------------------------------------------------- - Validate output... - ------------------------------------------------------------- - Maximum error: 7.62939e-06 < 1.19209e-05 + Maximum error: 1.19209e-07 < 1.19209e-05 Mach. Epsilon: 1.19209e-07 - Validation Time: 4.66312e+00 s - total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s] - avg: 1.15886e+00 1.04112e+00 1.17743e-01 9.11940e+09 1.09433e+11 1.23760e+10 - best: 1.13323e+00 1.02481e+00 1.08424e-01 9.90319e+09 1.18838e+11 1.25730e+10 - Validation: SUCCESS! + + total time transfer time calc time calc FLOPS Memory Bandwidth PCIe Bandwidth + avg: 6.05723e-02 s 1.30980e-02 s 4.74743e-02 s 3.53396e-01 GFLOP/s 4.24075e+00 GB/s 1.53708e+01 GB/s + best: 4.69977e-02 s 1.05343e-02 s 3.64633e-02 s 4.60112e-01 GFLOP/s 5.52134e+00 GB/s 1.91115e+01 GB/s The output gives the average and best calculation time for the transposition and important derived metrics based on these times. For the average and best timings, we have the following columns: @@ -171,3 +167,181 @@ The machine epsilon is given in the row below with `Mach. Epsilon`. Moreover, the total time that was needed for the validation of the result is given, which is just a debug information. The very last column summarizes the result: The last row will show `Validation: SUCCESS!` if the validation succeeded and the error is below the tolerated threshold. +The json output looks like the following. 
+ +```json + +{ + "config_time": "Thu Dec 08 10:41:51 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "epsilon": { + "unit": "", + "value": 1.1920928955078125e-07 + }, + "max_error": { + "unit": "", + "value": 7.62939453125e-06 + } + }, + "git_commit": "86e0064-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "matrix transposition", + "results": { + "avg_calc_flops": { + "unit": "GFLOP/s", + "value": 0.36102157111728794 + }, + "avg_calc_t": { + "unit": "s", + "value": 0.0464715057 + }, + "avg_mem_bandwidth": { + "unit": "GB/s", + "value": 4.332258853407454 + }, + "avg_t": { + "unit": "s", + "value": 0.061001096899999996 + }, + "avg_transfer_bandwidth": { + "unit": "GB/s", + "value": 13.856314966383914 + }, + "avg_transfer_t": { + "unit": "s", + "value": 0.0145295912 + }, + "max_calc_flops": { + "unit": "GFLOP/s", + "value": 0.4431353845559759 + }, + "max_mem_bandwidth": { + "unit": "GB/s", + "value": 5.31762461467171 + }, + "max_transfer_bandwidth": { + "unit": "GB/s", + "value": 17.8236830498358 + }, + "min_calc_t": { + "unit": "s", + "value": 0.037860249 + }, + "min_t": { + "unit": "s", + "value": 0.049155702999999995 + }, + "min_transfer_t": { + "unit": "s", + "value": 0.011295454 + } + }, + "settings": { + "Block Size": 512, + "Communication Type": "PCIE", + "Data Handler": "PQ", + "Dist. 
Buffers": "No", + "Kernel File": "./bin/transpose_PQ_PCIE_emulate.aocx", + "Kernel Replications": 2, + "MPI Ranks": 1, + "Matrix Size": 4096, + "Repetitions": 10, + "Test Mode": "No" + }, + "timings": { + "calculation": [ + { + "unit": "s", + "value": 0.054139988 + }, + { + "unit": "s", + "value": 0.05014593 + }, + { + "unit": "s", + "value": 0.037867809 + }, + { + "unit": "s", + "value": 0.037973641 + }, + { + "unit": "s", + "value": 0.046004999 + }, + { + "unit": "s", + "value": 0.037860249 + }, + { + "unit": "s", + "value": 0.056381497 + }, + { + "unit": "s", + "value": 0.050036547 + }, + { + "unit": "s", + "value": 0.048048414 + }, + { + "unit": "s", + "value": 0.046255983 + } + ], + "transfer": [ + { + "unit": "s", + "value": 0.025985196 + }, + { + "unit": "s", + "value": 0.012733798000000001 + }, + { + "unit": "s", + "value": 0.012989071999999999 + }, + { + "unit": "s", + "value": 0.011295454 + }, + { + "unit": "s", + "value": 0.013326449 + }, + { + "unit": "s", + "value": 0.012952722 + }, + { + "unit": "s", + "value": 0.014228134 + }, + { + "unit": "s", + "value": 0.013149265 + }, + { + "unit": "s", + "value": 0.014597321 + }, + { + "unit": "s", + "value": 0.014038500999999998 + } + ] + }, + "version": "1.7" +} + +``` diff --git a/RandomAccess/README.md b/RandomAccess/README.md index 12e665d7..a852b630 100644 --- a/RandomAccess/README.md +++ b/RandomAccess/README.md @@ -76,6 +76,40 @@ For more information on available input parameters run ./RandomAccess_intel -h + Implementation of the random access benchmark proposed in the HPCC benchmark suite for FPGA. + Version: 2.5 + + MPI Version: 3.1 + Config. Time: Thu Dec 08 10:42:40 UTC 2022 + Git Commit: 86e0064-dirty + + Usage: + ./bin/RandomAccess_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 4) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -d, arg Log2 of the size of the data array (default: 29) + -g, arg Log2 of the number of random number generators + (default: 5) + To execute the unit and integration tests for Intel devices run CL_CONTEXT_EMULATOR_DEVICE=1 ./RandomAccess_test_intel -f KERNEL_FILE_NAME @@ -88,9 +122,10 @@ It will run an emulation of the kernel and execute some functionality tests. The host code will print the results of the execution to the standard output. The result summary looks similar to this: - Error: 9.87137e-03% - best mean GUPS - 1.73506e+01 1.73507e+01 2.47540e-01 + Error: 3.90625e-03 + + best mean GUOPS + 5.04258e-04 s 7.85656e-04 s 2.03071e-03 GUOP/s - `best` and `mean` are the fastest and the mean kernel execution time. 
The pure kernel execution time is measured without transferring the buffer @@ -105,3 +140,98 @@ The result summary looks similar to this: Benchmark results can be found in the `results` folder in this repository. + +The json output looks like the following. + +```json + +{ + "config_time": "Thu Dec 08 10:42:40 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "ratio": { + "unit": "", + "value": 0.00390625 + } + }, + "git_commit": "86e0064-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "random access", + "results": { + "guops": { + "unit": "GUOP/s", + "value": 0.0022880227372259515 + }, + "t_mean": { + "unit": "s", + "value": 0.0005729401999999999 + }, + "t_min": { + "unit": "s", + "value": 0.000447548 + } + }, + "settings": { + "#RNGs": 32, + "Array Size": 256, + "Communication Type": "UNSUPPORTED", + "Kernel File": "./bin/random_access_kernels_single_emulate.aocx", + "Kernel Replications": 4, + "MPI Ranks": 1, + "Repetitions": 10, + "Test Mode": "No" + }, + "timings": { + "execution": [ + { + "unit": "s", + "value": 0.000672612 + }, + { + "unit": "s", + "value": 0.00058854 + }, + { + "unit": "s", + "value": 0.00058064 + }, + { + "unit": "s", + "value": 0.00057064 + }, + { + "unit": "s", + "value": 0.00053845 + }, + { + "unit": "s", + "value": 0.00055827 + }, + { + "unit": "s", + "value": 0.00056768 + }, + { + "unit": "s", + "value": 0.000649792 + }, + { + "unit": "s", + "value": 0.00055523 + }, + { + "unit": "s", + "value": 0.000447548 + } + ] + }, + 
"version": "2.5" +} + +``` diff --git a/STREAM/README.md b/STREAM/README.md index 4c5fa5ff..10980aad 100644 --- a/STREAM/README.md +++ b/STREAM/README.md @@ -73,24 +73,40 @@ For execution of the benchmark run: For more information on available input parameters run $./STREAM_FPGA_intel -h + Implementation of the STREAM benchmark proposed in the HPCC benchmark suite for FPGA. + Version: 2.6 + + MPI Version: 3.1 + Config. Time: Thu Dec 08 10:43:26 UTC 2022 + Git Commit: 86e0064-dirty + Usage: - ./STREAM_FPGA_xilinx [OPTION...] - - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -s, arg Size of the data arrays (default: 134217728) - -r, arg Number of kernel replications used (default: 1) - --multi-kernel Use the legacy multi-kernel implementation - --device arg Index of the device that has to be used. If not given - you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there are - multiple platforms available. (default: -1) - -h, --help Print this help + ./bin/STREAM_FPGA_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 4) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -s, arg Size of the data arrays (default: 134217728) + --multi-kernel Use the legacy multi kernel implementation - To execute the unit and integration tests for Intel devices run CL_CONTEXT_EMULATOR_DEVICE=1 ./STREAM_FPGA_test_intel -f KERNEL_FILE_NAME @@ -102,13 +118,13 @@ It will run an emulation of the kernel and execute some functionality tests. The output of the host application is similar to the original STREAM benchmark: - Function Best Rate MB/s Avg time Min time Max time - Copy: 30875.9 0.025914 0.025910 0.025919 - Scale: 30885.6 0.025905 0.025902 0.025911 - Add: 46289.2 0.025928 0.025924 0.025935 - Triad: 45613.4 0.026310 0.026308 0.026312 - PCI Write: 6324.0 0.189800 0.189753 0.189862 - PCI Read: 5587.3 0.214869 0.214773 0.214943 + Function Best Rate Avg time Min time Max time + PCI_write 2.68152e+04 MB/s 6.36535e-02 s 6.00633e-02 s 8.45139e-02 s + PCI_read 2.47220e+04 MB/s 6.72553e-02 s 6.51490e-02 s 6.82519e-02 s + Copy 4.75583e+04 MB/s 2.32275e-02 s 2.25774e-02 s 2.55071e-02 s + Scale 5.35745e+04 MB/s 2.13423e-02 s 2.00420e-02 s 2.42722e-02 s + Add 5.36221e+04 MB/s 3.33479e-02 s 3.00364e-02 s 3.68116e-02 s + Triad 4.84564e+04 MB/s 3.46477e-02 s 3.32384e-02 s 3.70085e-02 s In addition it also measures the bandwidth of the connection between host and device. 
It is distinguished between writing to and reading from the devices @@ -143,4 +159,428 @@ The raw data of these runs can be found in the folder `csv_result_export`. ![Single precision results](csv_result_export/sp_global_ring_plot.jpeg) ##### Double Precision -![Double precision results](csv_result_export/dp_global_ring_plot.jpeg) \ No newline at end of file +![Double precision results](csv_result_export/dp_global_ring_plot.jpeg) + +```json + +{ + "config_time": "Thu Dec 08 10:43:26 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "a_average_error": { + "unit": "", + "value": 0 + }, + "a_average_relative_error": { + "unit": "", + "value": 0 + }, + "a_expected_value": { + "unit": "", + "value": 1153300692992 + }, + "b_average_error": { + "unit": "", + "value": 0 + }, + "b_average_relative_error": { + "unit": "", + "value": 0 + }, + "b_expected_value": { + "unit": "", + "value": 230660145152 + }, + "c_average_error": { + "unit": "", + "value": 0 + }, + "c_average_relative_error": { + "unit": "", + "value": 0 + }, + "c_expected_value": { + "unit": "", + "value": 307546849280 + }, + "epsilon": { + "unit": "", + "value": 1.1920928955078125e-07 + } + }, + "git_commit": "86e0064-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "STREAM", + "results": { + "Add_avg_t": { + "unit": "s", + "value": 0.033347886300000004 + }, + "Add_best_rate": { + "unit": "MB/s", + "value": 53622.07621998581 + }, + "Add_max_t": { + "unit": "s", + "value": 0.03681156 + }, + "Add_min_t": { 
+ "unit": "s", + "value": 0.030036374 + }, + "Copy_avg_t": { + "unit": "s", + "value": 0.0232275248 + }, + "Copy_best_rate": { + "unit": "MB/s", + "value": 47558.26475478994 + }, + "Copy_max_t": { + "unit": "s", + "value": 0.025507117 + }, + "Copy_min_t": { + "unit": "s", + "value": 0.022577397 + }, + "PCI_read_avg_t": { + "unit": "s", + "value": 0.0672552576 + }, + "PCI_read_best_rate": { + "unit": "MB/s", + "value": 24721.98479896992 + }, + "PCI_read_max_t": { + "unit": "s", + "value": 0.06825187 + }, + "PCI_read_min_t": { + "unit": "s", + "value": 0.065149006 + }, + "PCI_write_avg_t": { + "unit": "s", + "value": 0.0636534559 + }, + "PCI_write_best_rate": { + "unit": "MB/s", + "value": 26815.238093906166 + }, + "PCI_write_max_t": { + "unit": "s", + "value": 0.084513938 + }, + "PCI_write_min_t": { + "unit": "s", + "value": 0.060063339 + }, + "Scale_avg_t": { + "unit": "s", + "value": 0.021342261699999997 + }, + "Scale_best_rate": { + "unit": "MB/s", + "value": 53574.52309080775 + }, + "Scale_max_t": { + "unit": "s", + "value": 0.024272246 + }, + "Scale_min_t": { + "unit": "s", + "value": 0.020042023 + }, + "Triad_avg_t": { + "unit": "s", + "value": 0.0346477169 + }, + "Triad_best_rate": { + "unit": "MB/s", + "value": 48456.4004453886 + }, + "Triad_max_t": { + "unit": "s", + "value": 0.037008534 + }, + "Triad_min_t": { + "unit": "s", + "value": 0.03323839 + } + }, + "settings": { + "Array Size": 134217728, + "Communication Type": "UNSUPPORTED", + "Data Type": "cl_float", + "Kernel File": "./bin/stream_kernels_single_emulate.aocx", + "Kernel Replications": 4, + "Kernel Type": "Single", + "MPI Ranks": 1, + "Repetitions": 10, + "Test Mode": "No" + }, + "timings": { + "Add": [ + { + "unit": "s", + "value": 0.03681156 + }, + { + "unit": "s", + "value": 0.030148826 + }, + { + "unit": "s", + "value": 0.034179315 + }, + { + "unit": "s", + "value": 0.03443528 + }, + { + "unit": "s", + "value": 0.030036374 + }, + { + "unit": "s", + "value": 0.03498338 + }, + { + "unit": "s", + "value": 0.033383682 + }, + { + "unit": "s", + "value": 0.03149675 + }, + { + "unit": "s", + "value": 0.035128302 + }, + { + "unit": "s", + "value": 0.032875394 + } + ], + "Copy": [ + { + "unit": "s", + "value": 0.023277928 + }, + { + "unit": "s", + "value": 0.023061445 + }, + { + "unit": "s", + "value": 0.022577397 + }, + { + "unit": "s", + "value": 0.025507117 + }, + { + "unit": "s", + "value": 0.022904103 + }, + { + "unit": "s", + "value": 0.023076385 + }, + { + "unit": "s", + "value": 0.022585516 + }, + { + "unit": "s", + "value": 0.023018084 + }, + { + "unit": "s", + "value": 0.023126956 + }, + { + "unit": "s", + "value": 0.023140317 + } + ], + "PCI_read": [ + { + "unit": "s", + "value": 0.066263925 + }, + { + "unit": "s", + "value": 0.065149006 + }, + { + "unit": "s", + "value": 0.06823823 + }, + { + "unit": "s", + "value": 0.067614649 + }, + { + "unit": "s", + "value": 0.068157828 + }, + { + "unit": "s", + "value": 0.06825187 + }, + { + "unit": "s", + "value": 0.068159038 + }, + { + "unit": "s", + "value": 0.066694763 + }, + { + "unit": "s", + "value": 0.067605659 + }, + { + "unit": "s", + "value": 0.066417608 + } + ], + "PCI_write": [ + { + "unit": "s", + "value": 0.084513938 + }, + { + "unit": "s", + "value": 0.060253183 + }, + { + "unit": "s", + "value": 0.060325944 + }, + { + "unit": "s", + "value": 0.064254031 + }, + { + "unit": "s", + "value": 0.060529077 + }, + { + "unit": "s", + "value": 0.063792623 + }, + { + "unit": "s", + "value": 0.060357565 + }, + { + "unit": "s", + "value": 0.060063339 + }, + { + "unit": 
"s", + "value": 0.060287283 + }, + { + "unit": "s", + "value": 0.062157576 + } + ], + "Scale": [ + { + "unit": "s", + "value": 0.021235864 + }, + { + "unit": "s", + "value": 0.020608554 + }, + { + "unit": "s", + "value": 0.020822067 + }, + { + "unit": "s", + "value": 0.020042023 + }, + { + "unit": "s", + "value": 0.021288745 + }, + { + "unit": "s", + "value": 0.020088374 + }, + { + "unit": "s", + "value": 0.021096531 + }, + { + "unit": "s", + "value": 0.021525769 + }, + { + "unit": "s", + "value": 0.024272246 + }, + { + "unit": "s", + "value": 0.022442444 + } + ], + "Triad": [ + { + "unit": "s", + "value": 0.037008534 + }, + { + "unit": "s", + "value": 0.036020228 + }, + { + "unit": "s", + "value": 0.033424273 + }, + { + "unit": "s", + "value": 0.033462613 + }, + { + "unit": "s", + "value": 0.033843901 + }, + { + "unit": "s", + "value": 0.033447893 + }, + { + "unit": "s", + "value": 0.03323839 + }, + { + "unit": "s", + "value": 0.036342203 + }, + { + "unit": "s", + "value": 0.03446487 + }, + { + "unit": "s", + "value": 0.035224264 + } + ] + }, + "version": "2.6" +} + +``` diff --git a/b_eff/README.md b/b_eff/README.md index ad2a9c27..157b0a67 100644 --- a/b_eff/README.md +++ b/b_eff/README.md @@ -71,38 +71,51 @@ For execution of the benchmark run: For more information on available input parameters run - $./Network_intel -h + ./Network_intel -h Implementation of the effective bandwidth benchmark proposed in the HPCC benchmark suite for FPGA. Version: 1.3 + MPI Version: 3.1 + Config. Time: Thu Dec 08 10:38:28 UTC 2022 + Git Commit: 86e0064-dirty + Usage: - bin/Network_intel [OPTION...] - - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will - speed up execution and helps when working with special - data types. - --device arg Index of the device that has to be used. If not - given you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there - are multiple platforms available. (default: -1) - -h, --help Print this help - -u, --upper arg Maximum number of repetitions per data size - (default: 32768) - -l, --lower arg Minimum number of repetitions per data size - (default: 1) - --min-size arg Minimum Message Size (default: 0) - -m, arg Maximum message size (default: 20) - -o, arg Offset used before reducing repetitions (default: 1) - -d, arg Number os steps the repetitions are decreased to its - minimum (default: 5) + ./bin/Network_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. 
(default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 2) + --comm-type arg Used communication type for inter-FPGA + communication (default: AUTO) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -u, --upper arg Maximum number of repetitions per data size + (default: 65536) + -l, --lower arg Minimum number of repetitions per data size + (default: 256) + --min-size arg Minimum Message Size (default: 0) + -m, arg Maximum message size (default: 20) + -o, arg Offset used before reducing repetitions (default: + 11) + -d, arg Number of steps the repetitions are decreased to + its minimum (default: 7) - To execute the unit and integration tests run ./Network_test_intel -f KERNEL_FILE_NAME @@ -140,30 +153,12 @@ This might still lead to inaccuracies in the time measurements depending on the The benchmark will output a result table to the standard output after execution. This is an example output using a single rank in emulation: - MSize looplength time B/s - 1 16384 5.46779e-02 5.99292e+05 - 2 8192 5.19651e-02 6.30578e+05 - 4 4096 2.58565e-02 1.26730e+06 - 8 2048 7.51376e-03 4.36107e+06 - 16 1024 3.01288e-03 1.08760e+07 - 32 512 1.66958e-03 1.96265e+07 - 64 256 4.60622e-03 7.11386e+06 - 128 128 1.86568e-03 1.75636e+07 - 256 64 3.75094e-03 8.73594e+06 - 512 32 3.81549e-03 8.58814e+06 - 1024 16 3.44074e-03 9.52354e+06 - 2048 8 3.83420e-03 8.54624e+06 - 4096 4 3.34786e-03 9.78775e+06 - 16384 2 7.84717e-03 8.35154e+06 - 32768 1 7.42386e-03 8.82775e+06 - 65536 1 1.40822e-02 9.30761e+06 - 131072 1 1.28135e-02 2.04585e+07 - 262144 1 5.52680e-02 9.48628e+06 - 524288 1 9.99676e-02 1.04892e+07 - 1048576 1 1.21861e-01 1.72094e+07 - 2097152 1 4.20120e-01 9.98360e+06 - - b_eff = 9.58731e+06 B/s + MSize looplength transfer B/s + 64 5 4.38310e-05 1.46015e+07 + 128 5 7.07010e-05 1.81044e+07 + 256 5 7.73410e-05 3.31002e+07 + + b_eff = 2.19354e+07 B/s The table contains the measurements for all tested message sizes. It is split into the following four columns: @@ -177,4 +172,200 @@ It is possible to set the number of repetitions of the experiment. In this case, the best measured time will be used to calculate the bandwidth. Under the table the calculated effective bandwidth is printed. -It is the mean of the achieved bandwidths for all used message sizes. \ No newline at end of file +It is the mean of the achieved bandwidths for all used message sizes.
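As a worked check of that calculation, the following minimal C++ sketch reproduces the `b_eff` line from the example table above. It is reconstructed from the printed numbers, not taken from the benchmark sources; in particular the factor of 2 is an assumption inferred from the B/s column (each loop iteration apparently counts the message once in each direction).

```cpp
// Sketch: recompute b_eff from the example table (MSize, looplength, transfer).
#include <iostream>
#include <vector>

struct Row { double msize; double looplength; double transfer_s; };

int main() {
    std::vector<Row> rows = {{64, 5, 4.38310e-05},
                             {128, 5, 7.07010e-05},
                             {256, 5, 7.73410e-05}};
    double sum_bw = 0.0;
    for (const auto &r : rows) {
        // B/s column: assumed 2 * MSize * looplength bytes per measured transfer
        sum_bw += 2.0 * r.msize * r.looplength / r.transfer_s;
    }
    // b_eff is the arithmetic mean over all message sizes: ~2.19354e+07 B/s
    std::cout << "b_eff = " << sum_bw / rows.size() << " B/s" << std::endl;
    return 0;
}
```

Averaging the three per-size rates reproduces the `b_eff = 2.19354e+07 B/s` reported above.

+ +The json output looks like the following.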
+ +```json + +{ + "config_time": "Thu Dec 08 10:38:28 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": {}, + "git_commit": "86e0064-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "effective bandwidth", + "results": { + "b_eff": { + "unit": "B/s", + "value": 21935372.01805185 + } + }, + "settings": { + "Communication Type": "IEC", + "Kernel File": "./bin/communication_bw520n_IEC_emulate.aocx", + "Kernel Replications": 2, + "Loop Length": 5, + "MPI Ranks": 1, + "Message Sizes": 2, + "Repetitions": 10, + "Test Mode": "No" + }, + "timings": { + "6": { + "maxCalcBW": 14601537.724441605, + "maxMinCalculationTime": 4.3831e-05, + "timings": [ + { + "looplength": 5, + "messageSize": 6, + "timings": [ + { + "unit": "s", + "value": 0.013389739 + }, + { + "unit": "s", + "value": 6.2761e-05 + }, + { + "unit": "s", + "value": 4.9321e-05 + }, + { + "unit": "s", + "value": 4.3831e-05 + }, + { + "unit": "s", + "value": 4.951e-05 + }, + { + "unit": "s", + "value": 4.7561e-05 + }, + { + "unit": "s", + "value": 5.2311e-05 + }, + { + "unit": "s", + "value": 5.0441e-05 + }, + { + "unit": "s", + "value": 4.6901e-05 + }, + { + "unit": "s", + "value": 5.4401e-05 + } + ] + } + ] + }, + "7": { + "maxCalcBW": 18104411.535904724, + "maxMinCalculationTime": 7.0701e-05, + "timings": [ + { + "looplength": 5, + "messageSize": 7, + "timings": [ + { + "unit": "s", + "value": 0.000104852 + }, + { + "unit": "s", + "value": 0.000125222 + }, + { + "unit": "s", + "value": 7.9731e-05 + }, + { + 
"unit": "s", + "value": 0.000151442 + }, + { + "unit": "s", + "value": 9.3052e-05 + }, + { + "unit": "s", + "value": 0.000193763 + }, + { + "unit": "s", + "value": 8.4472e-05 + }, + { + "unit": "s", + "value": 0.000116562 + }, + { + "unit": "s", + "value": 8.2471e-05 + }, + { + "unit": "s", + "value": 7.0701e-05 + } + ] + } + ] + }, + "8": { + "maxCalcBW": 33100166.79380923, + "maxMinCalculationTime": 7.7341e-05, + "timings": [ + { + "looplength": 5, + "messageSize": 8, + "timings": [ + { + "unit": "s", + "value": 0.000711343 + }, + { + "unit": "s", + "value": 0.000378606 + }, + { + "unit": "s", + "value": 0.000280195 + }, + { + "unit": "s", + "value": 0.000107392 + }, + { + "unit": "s", + "value": 0.000203963 + }, + { + "unit": "s", + "value": 0.000122193 + }, + { + "unit": "s", + "value": 8.2151e-05 + }, + { + "unit": "s", + "value": 8.6861e-05 + }, + { + "unit": "s", + "value": 0.000167473 + }, + { + "unit": "s", + "value": 7.7341e-05 + } + ] + } + ] + } + }, + "version": "1.3" +} + +``` diff --git a/docs/source/FFT/index.rst b/docs/source/FFT/index.rst index 2fda355a..4f54398b 100644 --- a/docs/source/FFT/index.rst +++ b/docs/source/FFT/index.rst @@ -13,6 +13,7 @@ It is possible to specify the size of the FFT and the number of kernel replicati :glob: */index + ../../../FFT/README.md ------------------------ Configuration Parameters diff --git a/docs/source/conf.py b/docs/source/conf.py index 73c3c248..99328fa6 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,6 +43,7 @@ 'sphinx.ext.githubpages', # 'breathe', 'sphinx_rtd_theme', + 'myst_parser' ] # Enable Figure numbering and referencing diff --git a/docs/source/index.rst b/docs/source/index.rst index 13f1de5c..8139915b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -64,6 +64,13 @@ Further optimized implementations that use such device-specific communication ap :glob: */results/index + +.. toctree:: + :maxdepth: 1 + :caption: Benchmark Results: + :glob: + + ../../../*/README.md ---------- diff --git a/docs/source/technical_support/json_output/available_keys.csv b/docs/source/technical_support/json_output/available_keys.csv deleted file mode 100644 index 070a21da..00000000 --- a/docs/source/technical_support/json_output/available_keys.csv +++ /dev/null @@ -1,58 +0,0 @@ -Benchmark,timings,results,settings -:ref:`FFT `,calculation,gflops_avg,FFT Size -,,gflops_min,Batch Size -,,t_avg, -,,t_min, -:ref:`GEMM `,execution,gflops,Matrix Size -,,t_mean,Replicate Inputs -,,t_min, -:ref:`LINPACK `,gefa,gflops,Matrix Size -,gesl,gflops_lu,Block Size -,,gflops_sl,Emulate -,,t_mean,Data Type -,,t_min,FPGA Torus -,,tlu_mean, -,,tlu_min, -,,tsl_mean, -,,tsl_min, -:ref:`PTRANS `,calculation,avg_calc_flops,Matrix Size -,transfer,avg_calc_t,Block Size -,,avg_mem_bandwidth,Dist. 
Buffers -,,avg_t,Data Handler -,,avg_transfer_bandwidth, -,,avg_transfer_t, -,,max_calc_flops, -,,max_mem_bandwidth, -,,max_transfer_bandwidth, -,,min_calc_t, -,,min_t, -,,min_transfer_t, -:ref:`RandomAccess `,execution,guops,Array Size -,,t_mean,#RNGs -,,t_min, -:ref:`STREAM `,Add,Add_avg_t,Data Type -,,Add_best_rate,Array Size -,,Add_max_t,Kernel Type -,,Add_min_t, -,,Copy_avg_t, -,,Copy_best_rate, -,,Copy_max_t, -,,Copy_min_t, -,PCI_read,PCI_read_avg_t, -,,PCI_read_best_rate, -,,PCI_read_max_t, -,,PCI_read_min_t, -,PCI_write,PCI_write_avg_t, -,,PCI_write_best_rate, -,,PCI_write_max_t, -,,PCI_write_min_t, -,Scale,Scale_avg_t, -,,Scale_best_rate, -,,Scale_max_t, -,,Scale_min_t, -,Triad,Triad_avg_t, -,,Triad_best_rate, -,,Triad_max_t, -,,Triad_min_t, -:ref:`b_eff `,**special syntax - see below**,b_eff,Loop Length -,,,Message Sizes diff --git a/docs/source/technical_support/json_output/index.rst b/docs/source/technical_support/json_output/index.rst index 37aa1f68..08ca9ab7 100644 --- a/docs/source/technical_support/json_output/index.rst +++ b/docs/source/technical_support/json_output/index.rst @@ -35,13 +35,7 @@ When enabled, this creates a json file which will have some information for all "version": "1.3" } -If a benchmark has more settings, they will be added to the settings-key. Every benchmark can track different categories of timings and different results. The following table shows which keys are available for which benchmark. - -.. csv-table:: Available keys - :file: available_keys.csv - :header-rows: 1 - :class: longtable - :widths: 1 1 1 1 +If a benchmark has more settings, they will be added to the settings-key. Every benchmark can track different categories of timings, different results and errors. To see a full example and which keys are available, have a look at the README.md of the individual benchmarks in the [git repository](https://git.uni-paderborn.de/pc2/HPCC_FPGA). The results and timings are in a special format, which consists of the value and the unit. @@ -218,67 +212,3 @@ The timings are a vector of all the timings which were measured, expect for b_ef ] } } - -A full example for FFT looks like this. - -.. 
code-block:: javascript - - { - "config_time": "Mon Dec 05 17:39:57 UTC 2022", - "device": "Intel(R) FPGA Emulation Device", - "environment": { - "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" - }, - "git_commit": "c7f3890-dirty", - "name": "FFT", - "results": { - "gflops_avg": { - "unit": "GFLOP/s", - "value": 0.27772734580591407 - }, - "gflops_min": { - "unit": "GFLOP/s", - "value": 0.28466663597913383 - }, - "t_avg": { - "unit": "s", - "value": 0.0008848966575 - }, - "t_min": { - "unit": "s", - "value": 0.00086332562 - } - }, - "settings": { - "Batch Size": 100, - "Communication Type": "UNSUPPORTED", - "FFT Size": 4096, - "Kernel File": "fft1d_float_8_emulate.aocx", - "Kernel Replications": 1, - "MPI Ranks": "None", - "Repetitions": 4, - "Test Mode": "No" - }, - "timings": { - "calculation": [ - { - "unit": "s", - "value": 0.090789326 - }, - { - "unit": "s", - "value": 0.086332562 - }, - { - "unit": "s", - "value": 0.090089428 - }, - { - "unit": "s", - "value": 0.086747347 - } - ] - }, - "version": "1.4" - } - From b2769e51412d8ba6dff20f9bfd50c49586919cfd Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 12 Dec 2022 10:33:31 +0100 Subject: [PATCH 223/318] only print on rank0 --- FFT/src/host/fft_benchmark.cpp | 12 ++- GEMM/src/host/gemm_benchmark.cpp | 20 +++-- LINPACK/src/host/linpack_benchmark.cpp | 38 +++++---- PTRANS/src/host/transpose_benchmark.cpp | 57 +++++++------- .../src/host/random_access_benchmark.cpp | 6 +- RandomAccess/tests/test_host_code.cpp | 2 +- STREAM/src/host/stream_benchmark.cpp | 78 ++++++++++--------- 7 files changed, 118 insertions(+), 95 
deletions(-) diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index fde7c01c..c7042add 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -118,10 +118,12 @@ fft::FFTBenchmark::collectResults() { void fft::FFTBenchmark::printResults() { + if (mpi_comm_rank == 0) { std::cout << std::setw(ENTRY_SPACE) << " " << std::left << std::setw(ENTRY_SPACE) << " avg" << std::setw(ENTRY_SPACE) << " best" << std::right << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << results.at("t_avg") << results.at("t_min") << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << results.at("gflops_avg") << results.at("gflops_min") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "Time in s: " << results.at("t_avg") << results.at("t_min") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "GFLOPS: " << results.at("gflops_avg") << results.at("gflops_min") << std::endl; + } } std::unique_ptr @@ -168,8 +170,10 @@ fft::FFTBenchmark::validateOutput(fft::FFTData &data) { } void fft::FFTBenchmark::printError() { - std::cout << std::left << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; - std::cout << errors.at("residual") << errors.at("epsilon") << std::endl << std::endl; + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; + std::cout << errors.at("residual") << errors.at("epsilon") << std::endl << std::endl; + } } diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp index 21b8fd99..5f6dfec6 100644 --- a/GEMM/src/host/gemm_benchmark.cpp +++ b/GEMM/src/host/gemm_benchmark.cpp @@ -140,13 +140,15 @@ gemm::GEMMBenchmark::collectResults() { void gemm::GEMMBenchmark::printResults() { - std::cout << std::left << std::setw(ENTRY_SPACE) - << " best" << std::setw(ENTRY_SPACE) << " mean" - << std::setw(ENTRY_SPACE) << " GFLOPS" << std::right << std::endl; + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) + << " best" << std::setw(ENTRY_SPACE) << " mean" + << std::setw(ENTRY_SPACE) << " GFLOPS" << std::right << std::endl; - std::cout << std::setw(ENTRY_SPACE) - << results.at("t_min") << results.at("t_mean") << results.at("gflops") - << std::endl; + std::cout << std::setw(ENTRY_SPACE) + << results.at("t_min") << results.at("t_mean") << results.at("gflops") + << std::endl; + } } std::unique_ptr @@ -206,8 +208,10 @@ gemm::GEMMBenchmark::validateOutput(gemm::GEMMData &data) { void gemm::GEMMBenchmark::printError() { - std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; - std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. 
eps" << std::right << std::endl; + std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; + } } void diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index ec3d900e..66265a04 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -185,23 +185,25 @@ linpack::LinpackBenchmark::collectResults() { void linpack::LinpackBenchmark::printResults() { - std::cout << std::left << std::setw(ENTRY_SPACE) << " Method" - << std::setw(ENTRY_SPACE) << " best" - << std::setw(ENTRY_SPACE) << " mean" - << std::setw(ENTRY_SPACE) << " GFLOPS" - << std::endl; - - std::cout << std::left << std::setw(ENTRY_SPACE) << " total" - << results.at("t_min") << results.at("t_mean") << results.at("gflops") - << std::endl; - - std::cout << std::left << std::setw(ENTRY_SPACE) << " GEFA" - << results.at("tlu_min") << results.at("tlu_mean") << results.at("gflops_lu") + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) << " Method" + << std::setw(ENTRY_SPACE) << " best" + << std::setw(ENTRY_SPACE) << " mean" + << std::setw(ENTRY_SPACE) << " GFLOPS" << std::endl; - std::cout << std::left << std::setw(ENTRY_SPACE) << " GESL" - << results.at("tsl_min") << results.at("tsl_mean") << results.at("gflops_sl") - << std::right << std::endl; + std::cout << std::left << std::setw(ENTRY_SPACE) << " total" + << results.at("t_min") << results.at("t_mean") << results.at("gflops") + << std::endl; + + std::cout << std::left << std::setw(ENTRY_SPACE) << " GEFA" + << results.at("tlu_min") << results.at("tlu_mean") << results.at("gflops_lu") + << std::endl; + + std::cout << std::left << std::setw(ENTRY_SPACE) << " GESL" + << results.at("tsl_min") << results.at("tsl_mean") << results.at("gflops_sl") + << std::right << std::endl; + } } std::unique_ptr @@ -431,8 +433,10 @@ linpack::LinpackBenchmark::validateOutput(linpack::LinpackData &data) { void linpack::LinpackBenchmark::printError() { - std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; - std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. 
eps" << std::right << std::endl; + std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; + } } void diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index e0e45c11..782b4680 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -134,30 +134,32 @@ transpose::TransposeBenchmark::collectResults() { void transpose::TransposeBenchmark::printResults() { - std::cout << std::setw(ENTRY_SPACE) << " " - << std::left << std::setw(ENTRY_SPACE) << "total time" - << std::setw(ENTRY_SPACE) << "transfer time" - << std::setw(ENTRY_SPACE) << "calc time" - << std::setw(ENTRY_SPACE) << "calc FLOPS" - << std::setw(ENTRY_SPACE) << "Memory Bandwidth" - << std::setw(ENTRY_SPACE) << "PCIe Bandwidth" - << std::right << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "avg: " - << results.at("avg_t") - << results.at("avg_transfer_t") - << results.at("avg_calc_t") - << results.at("avg_calc_flops") - << results.at("avg_mem_bandwidth") - << results.at("avg_transfer_bandwidth") - << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "best: " - << results.at("min_t") - << results.at("min_transfer_t") - << results.at("min_calc_t") - << results.at("max_calc_flops") - << results.at("max_mem_bandwidth") - << results.at("max_transfer_bandwidth") - << std::endl; + if (mpi_comm_rank == 0) { + std::cout << std::setw(ENTRY_SPACE) << " " + << std::left << std::setw(ENTRY_SPACE) << "total time" + << std::setw(ENTRY_SPACE) << "transfer time" + << std::setw(ENTRY_SPACE) << "calc time" + << std::setw(ENTRY_SPACE) << "calc FLOPS" + << std::setw(ENTRY_SPACE) << "Memory Bandwidth" + << std::setw(ENTRY_SPACE) << "PCIe Bandwidth" + << std::right << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "avg: " + << results.at("avg_t") + << results.at("avg_transfer_t") + << results.at("avg_calc_t") + << results.at("avg_calc_flops") + << results.at("avg_mem_bandwidth") + << results.at("avg_transfer_bandwidth") + << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "best: " + << results.at("min_t") + << results.at("min_transfer_t") + << results.at("min_calc_t") + << results.at("max_calc_flops") + << results.at("max_mem_bandwidth") + << results.at("max_transfer_bandwidth") + << std::endl; + } } std::unique_ptr @@ -188,9 +190,10 @@ transpose::TransposeBenchmark::validateOutput(transpose::TransposeData &data) { void transpose::TransposeBenchmark::printError() { - std::cout << "Maximum error: " << errors.at("epsilon") << " < " << 100 * errors.at("epsilon").value << std::endl; - std::cout << "Mach. Epsilon: " << errors.at("epsilon") << std::endl; - + if (mpi_comm_rank == 0) { + std::cout << "Maximum error: " << errors.at("epsilon") << " < " << 100 * errors.at("epsilon").value << std::endl; + std::cout << "Mach. 
Epsilon: " << errors.at("epsilon") << std::endl; + } } void diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp index 2c5c449b..512ab354 100644 --- a/RandomAccess/src/host/random_access_benchmark.cpp +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -122,6 +122,7 @@ random_access::RandomAccessBenchmark::collectResults() { } void random_access::RandomAccessBenchmark::printResults() { + if (mpi_comm_rank == 0) { std::cout << std::left << std::setw(ENTRY_SPACE) << "best" << std::setw(ENTRY_SPACE) << "mean" << std::setw(ENTRY_SPACE) << "GUOPS" << std::right << std::endl; @@ -130,6 +131,7 @@ void random_access::RandomAccessBenchmark::printResults() { << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean") << std::setw(ENTRY_SPACE) << results.at("guops") << std::endl; + } } bool @@ -218,5 +220,7 @@ random_access::RandomAccessBenchmark::validateOutput(random_access::RandomAccess void random_access::RandomAccessBenchmark::printError() { - std::cout << "Error: " << errors.at("ratio") << std::endl; + if (mpi_comm_rank == 0) { + std::cout << "Error: " << errors.at("ratio") << std::endl; + } } diff --git a/RandomAccess/tests/test_host_code.cpp b/RandomAccess/tests/test_host_code.cpp index 675c6979..59d1a27c 100644 --- a/RandomAccess/tests/test_host_code.cpp +++ b/RandomAccess/tests/test_host_code.cpp @@ -53,6 +53,6 @@ TEST_F(RandomAccessHostCodeTest, ValidDataSizeAreDetected) { TEST_F(RandomAccessHostCodeTest, ResultValidationWorksForWrongUpdates) { auto data = bm->generateInputData(); // check correctness of random accesses - EXPECT_TRUE(bm->validateOutput(*data)); + EXPECT_FALSE(bm->validateOutput(*data)); bm->printError(); } diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index bdba29e8..e8328dc8 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -143,19 +143,21 @@ stream::StreamBenchmark::collectResults() { void stream::StreamBenchmark::printResults() { - std::cout << std::left << std::setw(ENTRY_SPACE) << "Function"; - std::cout << std::setw(ENTRY_SPACE) << "Best Rate"; - std::cout << std::setw(ENTRY_SPACE) << "Avg time"; - std::cout << std::setw(ENTRY_SPACE) << "Min time" ; - std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::right << std::endl; - - for (auto key : keys) { - std::cout << std::left << std::setw(ENTRY_SPACE) << key - << results.at(key + "_best_rate") - << results.at(key + "_avg_t") - << results.at(key + "_min_t") - << results.at(key + "_max_t") - << std::right << std::endl; + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) << "Function"; + std::cout << std::setw(ENTRY_SPACE) << "Best Rate"; + std::cout << std::setw(ENTRY_SPACE) << "Avg time"; + std::cout << std::setw(ENTRY_SPACE) << "Min time" ; + std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::right << std::endl; + + for (auto key : keys) { + std::cout << std::left << std::setw(ENTRY_SPACE) << key + << results.at(key + "_best_rate") + << results.at(key + "_avg_t") + << results.at(key + "_min_t") + << results.at(key + "_max_t") + << std::right << std::endl; + } } } @@ -275,30 +277,32 @@ stream::StreamBenchmark::validateOutput(stream::StreamData &data) { void stream::StreamBenchmark::printError() { - int err = 0; - double epsilon = errors.at("epsilon").value; - if (errors.at("a_average_relative_error").value > epsilon) { - err++; - printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", 
errors.at("epsilon").value); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("a_expected_value").value, errors.at("a_average_error").value, errors.at("a_average_relative_error").value); - printf(" For array a[], %d errors were found.\n", errors.at("a_error_count")); - } + if (mpi_comm_rank == 0) { + int err = 0; + double epsilon = errors.at("epsilon").value; + if (errors.at("a_average_relative_error").value > epsilon) { + err++; + printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("a_expected_value").value, errors.at("a_average_error").value, errors.at("a_average_relative_error").value); + printf(" For array a[], %d errors were found.\n", errors.at("a_error_count")); + } - if (errors.at("b_average_relative_error").value > epsilon) { - err++; - printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("b_expected_value").value, errors.at("b_average_error").value, errors.at("b_average_relative_error").value); - printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); - printf(" For array b[], %d errors were found.\n", errors.at("b_error_count").value); - } - if (errors.at("c_average_relative_error").value > epsilon) { - err++; - printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("c_expected_value").value, errors.at("c_average_error").value, errors.at("c_average_relative_error").value); - printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); - printf(" For array c[], %d errors were found.\n", errors.at("c_error_count").value); - } - if (err == 0) { - printf ("Solution Validates: avg error less than %e on all three arrays\n", errors.at("epsilon").value); + if (errors.at("b_average_relative_error").value > epsilon) { + err++; + printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("b_expected_value").value, errors.at("b_average_error").value, errors.at("b_average_relative_error").value); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); + printf(" For array b[], %d errors were found.\n", errors.at("b_error_count").value); + } + if (errors.at("c_average_relative_error").value > epsilon) { + err++; + printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("c_expected_value").value, errors.at("c_average_error").value, errors.at("c_average_relative_error").value); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); + printf(" For array c[], %d errors were found.\n", errors.at("c_error_count").value); + } + if (err == 0) { + printf ("Solution Validates: avg error less than %e on all three arrays\n", errors.at("epsilon").value); + } } } From c3dcd9b80605df5d621f743503b19aa44099c8b1 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 12 Dec 2022 17:04:40 +0100 Subject: [PATCH 224/318] add more settings to SettingsMap --- FFT/src/host/fft_benchmark.cpp | 2 +- FFT/src/host/fft_benchmark.hpp | 6 ---- GEMM/src/host/gemm_benchmark.cpp | 4 +-- GEMM/src/host/gemm_benchmark.hpp | 6 ---- 
LINPACK/src/host/linpack_benchmark.cpp | 1 + PTRANS/src/host/transpose_data.cpp | 11 +++--- .../src/host/random_access_benchmark.cpp | 2 -- .../src/host/random_access_benchmark.hpp | 6 ---- STREAM/src/host/stream_benchmark.cpp | 2 -- STREAM/src/host/stream_benchmark.hpp | 6 ---- shared/include/hpcc_benchmark.hpp | 35 +++++++++++++++++-- 11 files changed, 43 insertions(+), 38 deletions(-) diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index c7042add..3e783685 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -44,7 +44,7 @@ fft::FFTProgramSettings::getSettingsMap() { auto map = hpcc_base::BaseSettings::getSettingsMap(); map["FFT Size"] = std::to_string(1 << LOG_FFT_SIZE); map["Batch Size"] = std::to_string(iterations); - map["Kernel Replications"] = std::to_string(kernelReplications); + map["Inverse"] = inverse ? "Yes" : "No"; return map; } diff --git a/FFT/src/host/fft_benchmark.hpp b/FFT/src/host/fft_benchmark.hpp index 33ee832a..6307275a 100644 --- a/FFT/src/host/fft_benchmark.hpp +++ b/FFT/src/host/fft_benchmark.hpp @@ -55,12 +55,6 @@ class FFTProgramSettings : public hpcc_base::BaseSettings { */ bool inverse; - /** - * @brief The number of used kernel replications - * - */ - uint kernelReplications; - /** * @brief Construct a new FFT Program Settings object * diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp index 5f6dfec6..21896b4d 100644 --- a/GEMM/src/host/gemm_benchmark.cpp +++ b/GEMM/src/host/gemm_benchmark.cpp @@ -35,7 +35,7 @@ SOFTWARE. #include "parameters.h" gemm::GEMMProgramSettings::GEMMProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), - matrixSize(results["b"].as() * results["m"].as()), blockSize(results["b"].as()), kernelReplications(results["r"].as()), + matrixSize(results["b"].as() * results["m"].as()), blockSize(results["b"].as()), replicateInputBuffers(results["replicate-inputs"].count() > 0) { } @@ -44,7 +44,7 @@ std::map gemm::GEMMProgramSettings::getSettingsMap() { auto map = hpcc_base::BaseSettings::getSettingsMap(); map["Matrix Size"] = std::to_string(matrixSize); - map["Kernel Replications"] = std::to_string(kernelReplications); + map["Block Size"] = std::to_string(blockSize); map["Replicate Inputs"] = replicateInputBuffers ? "Yes" : "No"; return map; } diff --git a/GEMM/src/host/gemm_benchmark.hpp b/GEMM/src/host/gemm_benchmark.hpp index c77a212f..a17a29f7 100644 --- a/GEMM/src/host/gemm_benchmark.hpp +++ b/GEMM/src/host/gemm_benchmark.hpp @@ -71,12 +71,6 @@ class GEMMProgramSettings : public hpcc_base::BaseSettings { */ uint blockSize; - /** - * @brief Number of times the kernel is replicated - * - */ - uint kernelReplications; - /** * @brief If True, replicate input buffers for each kernel replication */ diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index 66265a04..ce4fba22 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -58,6 +58,7 @@ linpack::LinpackProgramSettings::getSettingsMap() { map["Matrix Size"] = std::to_string(matrixSize); map["Block Size"] = std::to_string(blockSize); map["Emulate"] = (isEmulationKernel) ? "Yes" : "No"; + map["Diagonally Dominant"] = isDiagonallyDominant ? 
"Yes" : "No"; map["Data Type"] = STR(HOST_DATA_TYPE); map["FPGA Torus"] = "P=" + std::to_string(torus_width) + ", Q=" + std::to_string(torus_height); return map; diff --git a/PTRANS/src/host/transpose_data.cpp b/PTRANS/src/host/transpose_data.cpp index af794f30..3e8e2ba4 100644 --- a/PTRANS/src/host/transpose_data.cpp +++ b/PTRANS/src/host/transpose_data.cpp @@ -26,14 +26,17 @@ transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResu std::map transpose::TransposeProgramSettings::getSettingsMap() { auto map = hpcc_base::BaseSettings::getSettingsMap(); - int mpi_size; -#ifdef _USE_MPI_ - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); -#endif + int mpi_comm_size; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size); + // calculate the row and column of the MPI rank in the torus + if (mpi_comm_size % p != 0) { + throw std::runtime_error("MPI Comm size not dividable by P=" + std::to_string(p) + "!"); + } map["Matrix Size"] = std::to_string(matrixSize); map["Block Size"] = std::to_string(blockSize); map["Dist. Buffers"] = distributeBuffers ? "Yes" : "No"; map["Data Handler"] = transpose::data_handler::handlerToString(dataHandlerIdentifier); + map["FPGA Torus"] = "P=" + std::to_string(p) + " ,Q=" + std::to_string(mpi_comm_size / p); return map; } diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp index 512ab354..1fe1142a 100644 --- a/RandomAccess/src/host/random_access_benchmark.cpp +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -36,7 +36,6 @@ SOFTWARE. random_access::RandomAccessProgramSettings::RandomAccessProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), dataSize((1UL << results["d"].as())), - kernelReplications(results["r"].as()), numRngs((1UL << results["g"].as())) { } @@ -51,7 +50,6 @@ random_access::RandomAccessProgramSettings::getSettingsMap() { std::stringstream ss; ss << dataSize << " (" << static_cast(dataSize * sizeof(HOST_DATA_TYPE) * mpi_size) << " Byte )"; map["Array Size"] = ss.str(); - map["Kernel Replications"] = std::to_string(kernelReplications); map["#RNGs"] = std::to_string(numRngs); return map; } diff --git a/RandomAccess/src/host/random_access_benchmark.hpp b/RandomAccess/src/host/random_access_benchmark.hpp index 3a1eebaa..0bbd02e2 100644 --- a/RandomAccess/src/host/random_access_benchmark.hpp +++ b/RandomAccess/src/host/random_access_benchmark.hpp @@ -50,12 +50,6 @@ class RandomAccessProgramSettings : public hpcc_base::BaseSettings { */ size_t dataSize; - /** - * @brief The number of used kernel replications - * - */ - uint kernelReplications; - /** * @brief Number of random number generators that are used per kernel replication * diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index e8328dc8..a47ab743 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -36,7 +36,6 @@ SOFTWARE. stream::StreamProgramSettings::StreamProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), streamArraySize(results["s"].as()), - kernelReplications(results["r"].as()), useSingleKernel(!static_cast(results.count("multi-kernel"))) { } @@ -48,7 +47,6 @@ stream::StreamProgramSettings::getSettingsMap() { std::stringstream ss; ss << streamArraySize << " (" << static_cast(streamArraySize * sizeof(HOST_DATA_TYPE)) << " Byte )"; map["Array Size"] = ss.str(); - map["Kernel Replications"] = std::to_string(kernelReplications); map["Kernel Type"] = (useSingleKernel ? 
"Single" : "Separate"); return map; } diff --git a/STREAM/src/host/stream_benchmark.hpp b/STREAM/src/host/stream_benchmark.hpp index 50f24b88..638868da 100644 --- a/STREAM/src/host/stream_benchmark.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -52,12 +52,6 @@ class StreamProgramSettings : public hpcc_base::BaseSettings { */ uint streamArraySize; - /** - * @brief The number of used kernel replications - * - */ - uint kernelReplications; - /** * @brief Indicator if the single kernel or the legacy kernel are used for execution * diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index a2f7de95..766fbc13 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -200,7 +200,11 @@ class BaseSettings { } return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)}, {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? "Yes" : "No"}, - {"Communication Type", commToString(communicationType)}}; + {"Communication Type", commToString(communicationType)} +#ifdef INTEL_FPGA + ,{"Memory Interleaving", useMemoryInterleaving ? "Yes" : "No"} +#endif + }; } }; @@ -275,6 +279,19 @@ class ExecutionSettings { } return device_name; } + + /* + std::string + getPlatformName() const { + std::string platform_name; + if (!programSettings->testOnly) { + platform->getInfo(CL_PLATFORM_NAME, &platform_name); + } else { + platform_name = "TEST RUN: Not selected!"; + } + return platform_name; + } + */ }; @@ -585,6 +602,15 @@ class HpccFpgaBenchmark { j["Q"] = stoi(q_str.substr(q_str.find("=") + 1, q_str.size())); return j; } + + std::string + getCurrentTime() { + time_t time = std::time(0); + const tm *local_time = std::localtime(&time); + std::ostringstream oss; + oss << std::put_time(local_time, "%a %b %d %T %Z %Y"); + return oss.str(); + } std::map jsonifySettingsMap(std::map settings_map) { @@ -598,7 +624,7 @@ class HpccFpgaBenchmark { } catch (std::invalid_argument const &ex) { if (key == "FPGA Torus") { j[key] = parseFPGATorusString(value); - } else if (key == "Emulate" || key == "Replicate Inputs") { + } else if (key == "Emulate" || key == "Test Mode" || key == "Memory Interleaving" || key == "Replicate Inputs" || key == "Inverse" || key == "Diagonally Dominant" || "Dist. 
Buffers") { j[key] = value == "Yes"; } else { j[key] = value; @@ -621,6 +647,7 @@ class HpccFpgaBenchmark { dump["mpi"] ={{"version", MPI_VERSION}, {"subversion", MPI_SUBVERSION}}; #endif dump["config_time"] = CONFIG_TIME; + dump["execution_time"] = getCurrentTime(); dump["git_commit"] = GIT_COMMIT_HASH; dump["version"] = VERSION; dump["device"] = executionSettings->getDeviceName(); @@ -765,7 +792,9 @@ class HpccFpgaBenchmark { if (!executionSettings->programSettings->skipValidation) { auto eval_start = std::chrono::high_resolution_clock::now(); validateSuccess = validateOutput(*data); - printError(); + if (mpi_comm_rank == 0) { + printError(); + } std::chrono::duration eval_time = std::chrono::high_resolution_clock::now() - eval_start; if (mpi_comm_rank == 0) { From 911bc0569c2fad4b4e0ceba79c30150686dec233 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 14 Dec 2022 10:12:56 +0100 Subject: [PATCH 225/318] add validate flag to json-dump --- shared/include/hpcc_benchmark.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 766fbc13..081b7a72 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -373,6 +373,13 @@ class HpccFpgaBenchmark { */ std::map errors; + /** + * @brief This flag indicates whether the validation was successful + * + */ + bool validated = false; + + public: /** @@ -655,6 +662,7 @@ class HpccFpgaBenchmark { dump["timings"] = getTimingsJson(); dump["results"] = getResultsJson(); dump["errors"] = getErrorsJson(); + dump["validated"] = validated; dump["environment"] = getEnvironmentMap(); fs << dump; @@ -773,7 +781,6 @@ class HpccFpgaBenchmark { << HLINE; } - bool validateSuccess = false; auto exe_start = std::chrono::high_resolution_clock::now(); executeKernel(*data); @@ -791,7 +798,7 @@ class HpccFpgaBenchmark { if (!executionSettings->programSettings->skipValidation) { auto eval_start = std::chrono::high_resolution_clock::now(); - validateSuccess = validateOutput(*data); + validated = validateOutput(*data); if (mpi_comm_rank == 0) { printError(); } @@ -811,7 +818,7 @@ class HpccFpgaBenchmark { printResults(); - if (!validateSuccess) { + if (!validated) { std::cerr << HLINE << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" 
<< std::endl; } else { @@ -819,7 +826,7 @@ class HpccFpgaBenchmark { } } - return validateSuccess; + return validated; } catch (const std::exception& e) { std::cerr << "An error occured while executing the benchmark: " << std::endl; From 778f729f3a364ac9aa4d4a40e19d7bd95bb28800 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 14 Dec 2022 10:13:31 +0100 Subject: [PATCH 226/318] set both config and execution time to UTC --- shared/include/hpcc_benchmark.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 081b7a72..8262fa80 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -613,9 +613,9 @@ class HpccFpgaBenchmark { std::string getCurrentTime() { time_t time = std::time(0); - const tm *local_time = std::localtime(&time); + const tm *utc_time = std::gmtime(&time); std::ostringstream oss; - oss << std::put_time(local_time, "%a %b %d %T %Z %Y"); + oss << std::put_time(utc_time, "%a %b %d %T UTC %Y"); return oss.str(); } From bdfdf4ee0532fac79ea3e2b564487ed57aef3589 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 14 Dec 2022 11:01:52 +0100 Subject: [PATCH 227/318] remove empty unit from errors --- FFT/src/host/fft_benchmark.cpp | 8 +-- GEMM/src/host/gemm_benchmark.cpp | 10 ++-- LINPACK/src/host/linpack_benchmark.cpp | 10 ++-- PTRANS/src/host/transpose_benchmark.cpp | 6 +- .../src/host/random_access_benchmark.cpp | 4 +- STREAM/src/host/stream_benchmark.cpp | 56 +++++++++---------- b_eff/src/host/network_benchmark.cpp | 4 +- shared/include/hpcc_benchmark.hpp | 15 +---- 8 files changed, 51 insertions(+), 62 deletions(-) diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index 3e783685..8610b121 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -163,16 +163,16 @@ fft::FFTBenchmark::validateOutput(fft::FFTData &data) { double error = residual_max / (std::numeric_limits::epsilon() * LOG_FFT_SIZE); - errors.emplace("residual", hpcc_base::HpccResult(error, "")); - errors.emplace("epsilon", hpcc_base::HpccResult(std::numeric_limits::epsilon(), "")); + errors.emplace("residual", error); + errors.emplace("epsilon", std::numeric_limits::epsilon()); return error < 1.0; } void fft::FFTBenchmark::printError() { if (mpi_comm_rank == 0) { - std::cout << std::left << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; - std::cout << errors.at("residual") << errors.at("epsilon") << std::endl << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "res. error" << std::setw(ENTRY_SPACE) << "mach. 
eps" << std::endl; + std::cout << std::setw(ENTRY_SPACE) << errors.at("residual") << std::setw(ENTRY_SPACE) << errors.at("epsilon") << std::endl << std::endl; } } diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp index 21896b4d..f624ea57 100644 --- a/GEMM/src/host/gemm_benchmark.cpp +++ b/GEMM/src/host/gemm_benchmark.cpp @@ -196,9 +196,9 @@ gemm::GEMMBenchmark::validateOutput(gemm::GEMMData &data) { double eps = std::numeric_limits::epsilon(); double residn = resid / (executionSettings->programSettings->matrixSize*executionSettings->programSettings->matrixSize*ref_data->normtotal*normx*eps); - errors.emplace("epsilon", hpcc_base::HpccResult(eps, "")); - errors.emplace("residual", hpcc_base::HpccResult(resid, "")); - errors.emplace("residual_norm", hpcc_base::HpccResult(residn, "")); + errors.emplace("epsilon", eps); + errors.emplace("residual", resid); + errors.emplace("residual_norm", residn); return residn < 1.0; } @@ -209,8 +209,8 @@ gemm::GEMMBenchmark::validateOutput(gemm::GEMMData &data) { void gemm::GEMMBenchmark::printError() { if (mpi_comm_rank == 0) { - std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; - std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::endl; + std::cout << std::setw(ENTRY_SPACE) << errors.at("residual_norm") << std::setw(ENTRY_SPACE) << errors.at("residual") << std::setw(ENTRY_SPACE) << errors.at("epsilon") << std::endl; } } diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp index ce4fba22..16d35e02 100644 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ b/LINPACK/src/host/linpack_benchmark.cpp @@ -421,9 +421,9 @@ linpack::LinpackBenchmark::validateOutput(linpack::LinpackData &data) { } #endif - errors.emplace("epsilon", hpcc_base::HpccResult(eps, "")); - errors.emplace("residual", hpcc_base::HpccResult(resid, "")); - errors.emplace("residual_norm", hpcc_base::HpccResult(residn, "")); + errors.emplace("epsilon", eps); + errors.emplace("residual", resid); + errors.emplace("residual_norm", residn); if (mpi_comm_rank == 0) { return residn < 1; @@ -435,8 +435,8 @@ linpack::LinpackBenchmark::validateOutput(linpack::LinpackData &data) { void linpack::LinpackBenchmark::printError() { if (mpi_comm_rank == 0) { - std::cout << std::left << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::right << std::endl; - std::cout << errors.at("residual_norm") << errors.at("residual") << errors.at("epsilon") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. 
eps" << std::endl; + std::cout << std::setw(ENTRY_SPACE) << errors.at("residual_norm") << std::setw(ENTRY_SPACE) << errors.at("residual") << std::setw(ENTRY_SPACE) << errors.at("epsilon") << std::endl; } } diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 782b4680..213f6c7e 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -182,8 +182,8 @@ transpose::TransposeBenchmark::validateOutput(transpose::TransposeData &data) { double global_max_error = 0; MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - errors.emplace("epsilon", hpcc_base::HpccResult(std::numeric_limits::epsilon(), "")); - errors.emplace("max_error", hpcc_base::HpccResult(global_max_error, "")); + errors.emplace("epsilon", std::numeric_limits::epsilon()); + errors.emplace("max_error", global_max_error); return static_cast(global_max_error) < 100 * std::numeric_limits::epsilon(); } @@ -191,7 +191,7 @@ transpose::TransposeBenchmark::validateOutput(transpose::TransposeData &data) { void transpose::TransposeBenchmark::printError() { if (mpi_comm_rank == 0) { - std::cout << "Maximum error: " << errors.at("epsilon") << " < " << 100 * errors.at("epsilon").value << std::endl; + std::cout << "Maximum error: " << errors.at("epsilon") << " < " << 100 * errors.at("epsilon") << std::endl; std::cout << "Mach. Epsilon: " << errors.at("epsilon") << std::endl; } } diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp index 1fe1142a..94c63d0a 100644 --- a/RandomAccess/src/host/random_access_benchmark.cpp +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -201,7 +201,7 @@ random_access::RandomAccessBenchmark::validateOutput(random_access::RandomAccess // The overall error is calculated in percent of the overall array size double error_ratio = static_cast(error_count) / (executionSettings->programSettings->dataSize * mpi_comm_size); - errors.emplace("ratio", hpcc_base::HpccResult(error_ratio, "")); + errors.emplace("ratio", error_ratio); #ifdef _USE_MPI_ if (mpi_comm_rank == 0 && mpi_comm_size > 1) { @@ -219,6 +219,6 @@ random_access::RandomAccessBenchmark::validateOutput(random_access::RandomAccess void random_access::RandomAccessBenchmark::printError() { if (mpi_comm_rank == 0) { - std::cout << "Error: " << errors.at("ratio") << std::endl; + std::cout << "Error: " << errors.at("ratio") * 100 << " %" << std::endl; } } diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index a47ab743..f0cc01f3 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -223,20 +223,20 @@ stream::StreamBenchmark::validateOutput(stream::StreamData &data) { bool success = true; if (mpi_comm_rank == 0) { - errors.emplace("a_expected_value", hpcc_base::HpccResult(aj, "")); - errors.emplace("a_average_error", hpcc_base::HpccResult(aAvgErr, "")); - errors.emplace("a_average_relative_error", hpcc_base::HpccResult(abs(aAvgErr)/aj, "")); + errors.emplace("a_expected", aj); + errors.emplace("a_average_error", aAvgErr); + errors.emplace("a_average_relative_error", abs(aAvgErr)/aj); - errors.emplace("b_expected_value", hpcc_base::HpccResult(bj, "")); - errors.emplace("b_average_error", hpcc_base::HpccResult(bAvgErr, "")); - errors.emplace("b_average_relative_error", hpcc_base::HpccResult(abs(bAvgErr)/bj, "")); + errors.emplace("b_expected", bj); + errors.emplace("b_average_error", bAvgErr); + 
errors.emplace("b_average_relative_error", abs(bAvgErr)/bj); - errors.emplace("c_expected_value", hpcc_base::HpccResult(cj, "")); - errors.emplace("c_average_error", hpcc_base::HpccResult(cAvgErr, "")); - errors.emplace("c_average_relative_error", hpcc_base::HpccResult(abs(cAvgErr)/cj, "")); + errors.emplace("c_expected", cj); + errors.emplace("c_average_error", cAvgErr); + errors.emplace("c_average_relative_error", abs(cAvgErr)/cj); epsilon = std::numeric_limits::epsilon(); - errors.emplace("epsilon", hpcc_base::HpccResult(epsilon, "")); + errors.emplace("epsilon", epsilon); if (abs(aAvgErr/aj) > epsilon) { success = false; @@ -246,7 +246,7 @@ stream::StreamBenchmark::validateOutput(stream::StreamData &data) { ierr++; } } - errors.emplace("a_error_count", hpcc_base::HpccResult(ierr, "")); + errors.emplace("a_error_count", ierr); ierr = 0; } if (abs(bAvgErr/bj) > epsilon) { @@ -257,7 +257,7 @@ stream::StreamBenchmark::validateOutput(stream::StreamData &data) { ierr++; } } - errors.emplace("b_error_count", hpcc_base::HpccResult(ierr, "")); + errors.emplace("b_error_count", ierr); } if (abs(cAvgErr/cj) > epsilon) { success = false; @@ -267,7 +267,7 @@ stream::StreamBenchmark::validateOutput(stream::StreamData &data) { ierr++; } } - errors.emplace("b_error_count", hpcc_base::HpccResult(ierr, "")); + errors.emplace("b_error_count", ierr); } } return success; @@ -277,30 +277,30 @@ void stream::StreamBenchmark::printError() { if (mpi_comm_rank == 0) { int err = 0; - double epsilon = errors.at("epsilon").value; - if (errors.at("a_average_relative_error").value > epsilon) { + double epsilon = errors.at("epsilon"); + if (errors.at("a_average_relative_error") > epsilon) { err++; - printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("a_expected_value").value, errors.at("a_average_error").value, errors.at("a_average_relative_error").value); + printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon")); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("a_expected"), errors.at("a_average_error"), errors.at("a_average_relative_error")); printf(" For array a[], %d errors were found.\n", errors.at("a_error_count")); } - if (errors.at("b_average_relative_error").value > epsilon) { + if (errors.at("b_average_relative_error") > epsilon) { err++; - printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("b_expected_value").value, errors.at("b_average_error").value, errors.at("b_average_relative_error").value); - printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); - printf(" For array b[], %d errors were found.\n", errors.at("b_error_count").value); + printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon")); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("b_expected"), errors.at("b_average_error"), errors.at("b_average_relative_error")); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon")); + printf(" For array b[], %d errors were found.\n", errors.at("b_error_count")); } - if (errors.at("c_average_relative_error").value > epsilon) { + if (errors.at("c_average_relative_error") > epsilon) { err++; - printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon").value); - printf(" 
Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("c_expected_value").value, errors.at("c_average_error").value, errors.at("c_average_relative_error").value); - printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon").value); - printf(" For array c[], %d errors were found.\n", errors.at("c_error_count").value); + printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon")); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("c_expected"), errors.at("c_average_error"), errors.at("c_average_relative_error")); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon")); + printf(" For array c[], %d errors were found.\n", errors.at("c_error_count")); } if (err == 0) { - printf ("Solution Validates: avg error less than %e on all three arrays\n", errors.at("epsilon").value); + printf ("Solution Validates: avg error less than %e on all three arrays\n", errors.at("epsilon")); } } } diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 2986465d..dbe1f610 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -250,7 +250,7 @@ network::NetworkBenchmark::validateOutput(network::NetworkData &data) { } } if (error_count > 0) { - errors.emplace(std::to_string(item.messageSize), hpcc_base::HpccResult(error_count, "")); + errors.emplace(std::to_string(item.messageSize), error_count); } total_error += error_count; } @@ -262,7 +262,7 @@ network::NetworkBenchmark::validateOutput(network::NetworkData &data) { void network::NetworkBenchmark::printError() { for (const auto& error: errors) { - std::cerr << "Validation data invalid for message size " << (1 << stoi(error.first)) << " in " << int(error.second.value) << " cases!" << std::endl; + std::cerr << "Validation data invalid for message size " << (1 << stoi(error.first)) << " in " << int(error.second) << " cases!" 
<< std::endl; } } diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 8262fa80..69da2bfe 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -371,7 +371,7 @@ class HpccFpgaBenchmark { * @brief map containing the errors of the benchmark * */ - std::map errors; + std::map errors; /** * @brief This flag indicates whether the validation was successful @@ -581,17 +581,6 @@ class HpccFpgaBenchmark { return results_string; } - std::map getErrorsJson() { - std::map errors_string; - for (auto const &error: errors) { - json j; - j["unit"] = error.second.unit; - j["value"] = error.second.value; - errors_string[error.first] = j; - } - return errors_string; - } - std::map getEnvironmentMap() { std::map env; @@ -661,7 +650,7 @@ class HpccFpgaBenchmark { dump["settings"] = jsonifySettingsMap(executionSettings->programSettings->getSettingsMap()); dump["timings"] = getTimingsJson(); dump["results"] = getResultsJson(); - dump["errors"] = getErrorsJson(); + dump["errors"] = errors; dump["validated"] = validated; dump["environment"] = getEnvironmentMap(); From 86b9642e89cb46bab084bdbb075f7817953c178e Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 14 Dec 2022 11:02:38 +0100 Subject: [PATCH 228/318] update READMEs --- FFT/README.md | 55 ++++---- GEMM/{Readme.md => README.md} | 54 ++++--- LINPACK/{Readme.md => README.md} | 80 +++++------ PTRANS/README.md | 136 +++++------------- RandomAccess/README.md | 43 +++--- STREAM/README.md | 234 ++++++++++++++----------------- b_eff/README.md | 86 ++++++------ 7 files changed, 287 insertions(+), 401 deletions(-) rename GEMM/{Readme.md => README.md} (92%) rename LINPACK/{Readme.md => README.md} (90%) diff --git a/FFT/README.md b/FFT/README.md index 2926a5ac..52269e94 100644 --- a/FFT/README.md +++ b/FFT/README.md @@ -130,95 +130,92 @@ The json output looks like the following. 
```json { - "config_time": "Thu Dec 08 10:39:10 UTC 2022", + "config_time": "Wed Dec 14 08:40:17 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": { - "epsilon": { - "unit": "", - "value": 1.1920928955078125e-07 - }, - "residual": { - "unit": "", - "value": 0.2635231415430705 - } + "epsilon": 1.1920928955078125e-07, + "residual": 0.2635231415430705 }, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 08:55:51 GMT 2022", + "git_commit": "be1a4e9-dirty", "name": "FFT", "results": { "gflops_avg": { "unit": "GFLOP/s", - "value": 0.2751268094908118 + "value": 0.2573525536079919 }, "gflops_min": { "unit": "GFLOP/s", - "value": 0.2813275822966743 + "value": 0.2842073122577159 }, "t_avg": { "unit": "s", - "value": 0.0008932608220000002 + "value": 0.0009549545810000001 }, "t_min": { "unit": "s", - "value": 0.0008735723600000001 + "value": 0.00086472089 } }, "settings": { "Batch Size": 100, - "Communication Type": "UNSUPPORTED", + "Communication Type": false, "FFT Size": 4096, - "Kernel File": "./bin/fft1d_float_8_emulate.aocx", + "Inverse": false, + "Kernel File": false, "Kernel Replications": 1, - "MPI Ranks": "None", + "MPI Ranks": false, "Repetitions": 10, - "Test Mode": "No" + "Test Mode": false }, "timings": { - "calculation": [ + "execution": [ { "unit": "s", - "value": 0.090378907 + "value": 0.151814849 }, { "unit": "s", - "value": 0.089294969 + "value": 0.086472089 }, { "unit": "s", - "value": 0.08941156 + "value": 0.089654183 }, { "unit": "s", - "value": 0.089993811 + "value": 
0.09003793 }, { "unit": "s", - "value": 0.087884474 + "value": 0.089870966 }, { "unit": "s", - "value": 0.087357236 + "value": 0.089802216 }, { "unit": "s", - "value": 0.089228888 + "value": 0.089816195 }, { "unit": "s", - "value": 0.089401591 + "value": 0.089979618 }, { "unit": "s", - "value": 0.089537203 + "value": 0.090762352 }, { "unit": "s", - "value": 0.090772183 + "value": 0.086744183 } ] }, + "validated": true, "version": "1.4" } diff --git a/GEMM/Readme.md b/GEMM/README.md similarity index 92% rename from GEMM/Readme.md rename to GEMM/README.md index 33f0419b..8ac117df 100755 --- a/GEMM/Readme.md +++ b/GEMM/README.md @@ -147,26 +147,18 @@ The json output looks like the following. ```json { - "config_time": "Thu Dec 08 10:39:51 UTC 2022", + "config_time": "Wed Dec 14 08:40:52 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": { - "epsilon": { - "unit": "", - "value": 1.1920928955078125e-07 - }, - "residual": { - "unit": "", - "value": 7.62939453125e-06 - }, - "residual_norm": { - "unit": "", - "value": 8.08345175162664e-05 - } + "epsilon": 1.1920928955078125e-07, + "residual": 7.62939453125e-06, + "residual_norm": 8.08345175162664e-05 }, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 09:14:09 UTC 2022", + "git_commit": "be1a4e9-dirty", "mpi": { "subversion": 1, "version": 3 @@ -175,71 +167,73 @@ The json output looks like the following. 
"results": { "gflops": { "unit": "GFLOP/s", - "value": 5.347517549652832 + "value": 5.297554068962992 }, "t_mean": { "unit": "s", - "value": 0.009541589199999999 + "value": 0.010202154299999999 }, "t_min": { "unit": "s", - "value": 0.006274768 + "value": 0.006333948 } }, "settings": { - "Communication Type": "UNSUPPORTED", - "Kernel File": "./bin/gemm_base_emulate.aocx", + "Block Size": 32, + "Communication Type": false, + "Kernel File": false, "Kernel Replications": 4, "MPI Ranks": 1, "Matrix Size": 256, "Repetitions": 10, "Replicate Inputs": false, - "Test Mode": "No" + "Test Mode": false }, "timings": { "execution": [ { "unit": "s", - "value": 0.012631986 + "value": 0.012732567 }, { "unit": "s", - "value": 0.012796959 + "value": 0.006511861 }, { "unit": "s", - "value": 0.012527344 + "value": 0.006333948 }, { "unit": "s", - "value": 0.012579805 + "value": 0.012710817 }, { "unit": "s", - "value": 0.0064457 + "value": 0.006552662 }, { "unit": "s", - "value": 0.006274768 + "value": 0.006600733 }, { "unit": "s", - "value": 0.00642924 + "value": 0.012673167 }, { "unit": "s", - "value": 0.012808459 + "value": 0.012720237 }, { "unit": "s", - "value": 0.006587663 + "value": 0.012608296 }, { "unit": "s", - "value": 0.006333968 + "value": 0.012577255 } ] }, + "validated": true, "version": "1.3" } diff --git a/LINPACK/Readme.md b/LINPACK/README.md similarity index 90% rename from LINPACK/Readme.md rename to LINPACK/README.md index 62162c43..7135b511 100644 --- a/LINPACK/Readme.md +++ b/LINPACK/README.md @@ -160,26 +160,18 @@ The json output looks like the following. ```json { - "config_time": "Thu Dec 08 10:41:13 UTC 2022", + "config_time": "Wed Dec 14 08:41:58 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/
10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": { - "epsilon": { - "unit": "", - "value": 1.1920928955078125e-07 - }, - "residual": { - "unit": "", - "value": 5.960464477539062e-07 - }, - "residual_norm": { - "unit": "", - "value": 0.004354506590071576 - } + "epsilon": 1.1920928955078125e-07, + "residual": 5.960464477539062e-07, + "residual_norm": 0.004354506590071576 }, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 09:20:49 UTC 2022", + "git_commit": "be1a4e9-dirty", "mpi": { "subversion": 1, "version": 3 @@ -188,104 +180,105 @@ The json output looks like the following. "results": { "gflops": { "unit": "GFLOP/s", - "value": 0.000213044786995575 + "value": 0.0006047108051562395 }, "gflops_lu": { "unit": "GFLOP/s", - "value": 0.00019478383998887983 + "value": 0.0005528788702090362 }, "gflops_sl": { "unit": "GFLOP/s", - "value": 102.4 + "value": 68.26666666666668 }, "t_mean": { "unit": "s", - "value": 0.1161132923 + "value": 0.041533081799999996 }, "t_min": { "unit": "s", - "value": 0.112151692 + "value": 0.039512 }, "tlu_mean": { "unit": "s", - "value": 0.11611325259999998 + "value": 0.041533051599999996 }, "tlu_min": { "unit": "s", - "value": 0.112151672 + "value": 0.03951197 }, "tsl_mean": { "unit": "s", - "value": 3.97e-08 + "value": 3.019999999999999e-08 }, "tsl_min": { "unit": "s", - "value": 2e-08 + "value": 3e-08 } }, "settings": { "Block Size": 16, - "Communication Type": "IEC", - "Data Type": "cl_float", + "Communication Type": false, + "Data Type": false, + "Diagonally Dominant": true, "Emulate": false, "FPGA Torus": { "P": 1, "Q": 1 }, - "Kernel File": "./bin/hpl_torus_IEC_emulate.aocx", + "Kernel File": false, "Kernel Replications": 3, "MPI Ranks": 1, "Matrix Size": 32, "Repetitions": 10, - "Test Mode": "No" + "Test Mode": false }, "timings": { "gefa": [ { "unit": "s", - "value": 0.112151672 + "value": 0.040978706 }, { "unit": "s", - "value": 0.112186842 + "value": 0.041104108 }, { "unit": "s", - "value": 0.114559183 + "value": 0.040878394 }, { "unit": "s", - "value": 0.114920089 + "value": 0.040391036 }, { "unit": "s", - "value": 0.113395783 + "value": 0.044723132 }, { "unit": "s", - "value": 0.113512676 + "value": 0.03951197 }, { "unit": "s", - "value": 0.118974459 + "value": 0.043374308 }, { "unit": "s", - "value": 0.11378015 + "value": 0.04179909 }, { "unit": "s", - "value": 0.131815478 + "value": 0.041162129 }, { "unit": "s", - "value": 0.115836194 + "value": 0.041407643 } ], "gesl": [ { "unit": "s", - "value": 2e-08 + "value": 3e-08 }, { "unit": "s", @@ -297,11 +290,11 @@ The json output looks like the following. }, { "unit": "s", - "value": 2.9e-08 + "value": 3e-08 }, { "unit": "s", - "value": 1.5e-07 + "value": 3e-08 }, { "unit": "s", @@ -309,15 +302,15 @@ The json output looks like the following. }, { "unit": "s", - "value": 2e-08 + "value": 3.1e-08 }, { "unit": "s", - "value": 2.9e-08 + "value": 3.1e-08 }, { "unit": "s", - "value": 2.9e-08 + "value": 3e-08 }, { "unit": "s", @@ -325,6 +318,7 @@ The json output looks like the following. } ] }, + "validated": true, "version": "2.6" } diff --git a/PTRANS/README.md b/PTRANS/README.md index 9350e1de..521389a0 100644 --- a/PTRANS/README.md +++ b/PTRANS/README.md @@ -172,22 +172,17 @@ The json output looks like the following. 
```json { - "config_time": "Thu Dec 08 10:41:51 UTC 2022", + "config_time": "Wed Dec 14 08:42:29 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { - "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + "LD_LIBRARY_PATH": 
"/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": { - "epsilon": { - "unit": "", - "value": 1.1920928955078125e-07 - }, - "max_error": { - "unit": "", - "value": 7.62939453125e-06 - } + "epsilon": 1.1920928955078125e-07, + "max_error": 199.96849060058594 }, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 09:57:30 UTC 2022", + "git_commit": "be1a4e9-dirty", "mpi": { "subversion": 1, "version": 3 @@ -196,151 +191,84 @@ The json output looks like the following. 
"results": { "avg_calc_flops": { "unit": "GFLOP/s", - "value": 0.36102157111728794 + "value": 0.011002914958427963 }, "avg_calc_t": { "unit": "s", - "value": 0.0464715057 + "value": 1.524797389 }, "avg_mem_bandwidth": { "unit": "GB/s", - "value": 4.332258853407454 + "value": 0.13203497950113555 }, "avg_t": { "unit": "s", - "value": 0.061001096899999996 + "value": 1.5332141689999998 }, "avg_transfer_bandwidth": { "unit": "GB/s", - "value": 13.856314966383914 + "value": 23.919669042080226 }, "avg_transfer_t": { "unit": "s", - "value": 0.0145295912 + "value": 0.00841678 }, "max_calc_flops": { "unit": "GFLOP/s", - "value": 0.4431353845559759 + "value": 0.011002914958427963 }, "max_mem_bandwidth": { "unit": "GB/s", - "value": 5.31762461467171 + "value": 0.13203497950113555 }, "max_transfer_bandwidth": { "unit": "GB/s", - "value": 17.8236830498358 + "value": 23.919669042080226 }, "min_calc_t": { "unit": "s", - "value": 0.037860249 + "value": 1.524797389 }, "min_t": { "unit": "s", - "value": 0.049155702999999995 + "value": 1.5332141689999998 }, "min_transfer_t": { "unit": "s", - "value": 0.011295454 + "value": 0.00841678 } }, "settings": { "Block Size": 512, - "Communication Type": "PCIE", - "Data Handler": "PQ", - "Dist. Buffers": "No", - "Kernel File": "./bin/transpose_PQ_PCIE_emulate.aocx", + "Communication Type": false, + "Data Handler": false, + "Dist. Buffers": false, + "FPGA Torus": { + "P": 1, + "Q": 3 + }, + "Kernel File": false, "Kernel Replications": 2, - "MPI Ranks": 1, + "MPI Ranks": 3, "Matrix Size": 4096, - "Repetitions": 10, - "Test Mode": "No" + "Repetitions": 1, + "Test Mode": false }, "timings": { "calculation": [ { "unit": "s", - "value": 0.054139988 - }, - { - "unit": "s", - "value": 0.05014593 - }, - { - "unit": "s", - "value": 0.037867809 - }, - { - "unit": "s", - "value": 0.037973641 - }, - { - "unit": "s", - "value": 0.046004999 - }, - { - "unit": "s", - "value": 0.037860249 - }, - { - "unit": "s", - "value": 0.056381497 - }, - { - "unit": "s", - "value": 0.050036547 - }, - { - "unit": "s", - "value": 0.048048414 - }, - { - "unit": "s", - "value": 0.046255983 + "value": 1.523696949 } ], "transfer": [ { "unit": "s", - "value": 0.025985196 - }, - { - "unit": "s", - "value": 0.012733798000000001 - }, - { - "unit": "s", - "value": 0.012989071999999999 - }, - { - "unit": "s", - "value": 0.011295454 - }, - { - "unit": "s", - "value": 0.013326449 - }, - { - "unit": "s", - "value": 0.012952722 - }, - { - "unit": "s", - "value": 0.014228134 - }, - { - "unit": "s", - "value": 0.013149265 - }, - { - "unit": "s", - "value": 0.014597321 - }, - { - "unit": "s", - "value": 0.014038500999999998 + "value": 0.008189295 } ] }, + "validated": false, "version": "1.7" } diff --git a/RandomAccess/README.md b/RandomAccess/README.md index a852b630..ede6a47d 100644 --- a/RandomAccess/README.md +++ b/RandomAccess/README.md @@ -146,18 +146,16 @@ The json output looks like the following. 
```json { - "config_time": "Thu Dec 08 10:42:40 UTC 2022", + "config_time": "Wed Dec 14 08:43:07 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": { - "ratio": { - "unit": "", - "value": 0.00390625 - } + "ratio": 0.00390625 }, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 09:54:47 UTC 2022", + "git_commit": "be1a4e9-dirty", "mpi": { "subversion": 1, "version": 3 @@ -166,71 +164,72 @@ The json output looks like the following. 
"results": { "guops": { "unit": "GUOP/s", - "value": 0.0022880227372259515 + "value": 0.0021329867229908477 }, "t_mean": { "unit": "s", - "value": 0.0005729401999999999 + "value": 0.0005428726000000001 }, "t_min": { "unit": "s", - "value": 0.000447548 + "value": 0.000480078 } }, "settings": { "#RNGs": 32, "Array Size": 256, - "Communication Type": "UNSUPPORTED", - "Kernel File": "./bin/random_access_kernels_single_emulate.aocx", + "Communication Type": false, + "Kernel File": false, "Kernel Replications": 4, "MPI Ranks": 1, "Repetitions": 10, - "Test Mode": "No" + "Test Mode": false }, "timings": { "execution": [ { "unit": "s", - "value": 0.000672612 + "value": 0.000643471 }, { "unit": "s", - "value": 0.00058854 + "value": 0.000516849 }, { "unit": "s", - "value": 0.00058064 + "value": 0.000606361 }, { "unit": "s", - "value": 0.00057064 + "value": 0.00058182 }, { "unit": "s", - "value": 0.00053845 + "value": 0.00060401 }, { "unit": "s", - "value": 0.00055827 + "value": 0.000485259 }, { "unit": "s", - "value": 0.00056768 + "value": 0.000484699 }, { "unit": "s", - "value": 0.000649792 + "value": 0.00053713 }, { "unit": "s", - "value": 0.00055523 + "value": 0.000489049 }, { "unit": "s", - "value": 0.000447548 + "value": 0.000480078 } ] }, + "validated": true, "version": "2.5" } diff --git a/STREAM/README.md b/STREAM/README.md index 10980aad..298777b3 100644 --- a/STREAM/README.md +++ b/STREAM/README.md @@ -164,54 +164,25 @@ The raw data of these runs can be found in the folder `csv_result_export`. ```json { - "config_time": "Thu Dec 08 10:43:26 UTC 2022", + "config_time": "Wed Dec 14 08:43:42 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/I
ntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": { - "a_average_error": { - "unit": "", - "value": 0 - }, - "a_average_relative_error": { - "unit": "", - "value": 0 - }, - "a_expected_value": { - "unit": "", - "value": 1153300692992 - }, - "b_average_error": { - "unit": "", - "value": 0 - }, - "b_average_relative_error": { - "unit": "", - "value": 0 - }, - "b_expected_value": { - "unit": "", - "value": 230660145152 - }, - "c_average_error": { - "unit": "", - "value": 0 - }, - "c_average_relative_error": { - "unit": "", - "value": 0 - }, - "c_expected_value": { - "unit": "", - "value": 307546849280 - }, - "epsilon": { - "unit": "", - "value": 1.1920928955078125e-07 - } + "a_average_error": 0, + "a_average_relative_error": 0, + "a_expected": 1153300692992, + "b_average_error": 0, + "b_average_relative_error": 0, + "b_expected": 230660145152, + "c_average_error": 0, + "c_average_relative_error": 0, + "c_expected": 307546849280, + "epsilon": 1.1920928955078125e-07 }, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 09:29:17 UTC 2022", + "git_commit": "be1a4e9-dirty", "mpi": { "subversion": 1, "version": 3 @@ -220,366 +191,367 @@ The raw data of these runs can be found in the folder `csv_result_export`. "results": { "Add_avg_t": { "unit": "s", - "value": 0.033347886300000004 + "value": 0.0530118015 }, "Add_best_rate": { "unit": "MB/s", - "value": 53622.07621998581 + "value": 30506.44534004568 }, "Add_max_t": { "unit": "s", - "value": 0.03681156 + "value": 0.053374228 }, "Add_min_t": { "unit": "s", - "value": 0.030036374 + "value": 0.052795818 }, "Copy_avg_t": { "unit": "s", - "value": 0.0232275248 + "value": 0.0389517071 }, "Copy_best_rate": { "unit": "MB/s", - "value": 47558.26475478994 + "value": 27731.67753145461 }, "Copy_max_t": { "unit": "s", - "value": 0.025507117 + "value": 0.040187928 }, "Copy_min_t": { "unit": "s", - "value": 0.022577397 + "value": 0.038718964 }, "PCI_read_avg_t": { "unit": "s", - "value": 0.0672552576 + "value": 0.0597715322 }, "PCI_read_best_rate": { "unit": "MB/s", - "value": 24721.98479896992 + "value": 27479.82304062059 }, "PCI_read_max_t": { "unit": "s", - "value": 0.06825187 + "value": 0.063351971 }, "PCI_read_min_t": { "unit": "s", - "value": 0.065149006 + "value": 0.058610739 }, "PCI_write_avg_t": { "unit": "s", - "value": 0.0636534559 + "value": 0.0685080558 }, "PCI_write_best_rate": { "unit": "MB/s", - "value": 26815.238093906166 + "value": 25765.843668891466 }, "PCI_write_max_t": { "unit": "s", - "value": 0.084513938 + "value": 0.120777629 }, "PCI_write_min_t": { "unit": "s", - "value": 0.060063339 + "value": 0.062509606 }, "Scale_avg_t": { "unit": "s", - "value": 0.021342261699999997 + "value": 0.03978323250000001 }, "Scale_best_rate": { "unit": "MB/s", - "value": 53574.52309080775 + "value": 27084.469403573872 }, "Scale_max_t": { "unit": "s", - "value": 0.024272246 + "value": 0.039983335 }, "Scale_min_t": { "unit": "s", - "value": 0.020042023 + "value": 0.039644189 }, "Triad_avg_t": { "unit": "s", - "value": 0.0346477169 + "value": 0.052600337100000005 }, "Triad_best_rate": { "unit": "MB/s", - "value": 48456.4004453886 + "value": 30701.997665172144 }, "Triad_max_t": { "unit": "s", - "value": 0.037008534 + "value": 0.052735936 }, "Triad_min_t": { "unit": "s", - "value": 0.03323839 + "value": 0.052459542 } }, "settings": { "Array Size": 134217728, - "Communication Type": "UNSUPPORTED", - "Data Type": "cl_float", - "Kernel File": 
"./bin/stream_kernels_single_emulate.aocx", + "Communication Type": false, + "Data Type": false, + "Kernel File": false, "Kernel Replications": 4, - "Kernel Type": "Single", + "Kernel Type": false, "MPI Ranks": 1, "Repetitions": 10, - "Test Mode": "No" + "Test Mode": false }, "timings": { "Add": [ { "unit": "s", - "value": 0.03681156 + "value": 0.052848008 }, { "unit": "s", - "value": 0.030148826 + "value": 0.052795818 }, { "unit": "s", - "value": 0.034179315 + "value": 0.053294617 }, { "unit": "s", - "value": 0.03443528 + "value": 0.053374228 }, { "unit": "s", - "value": 0.030036374 + "value": 0.052812528 }, { "unit": "s", - "value": 0.03498338 + "value": 0.053091652 }, { "unit": "s", - "value": 0.033383682 + "value": 0.052962381 }, { "unit": "s", - "value": 0.03149675 + "value": 0.052992892 }, { "unit": "s", - "value": 0.035128302 + "value": 0.052880469 }, { "unit": "s", - "value": 0.032875394 + "value": 0.053065422 } ], "Copy": [ { "unit": "s", - "value": 0.023277928 + "value": 0.040187928 }, { "unit": "s", - "value": 0.023061445 + "value": 0.038718964 }, { "unit": "s", - "value": 0.022577397 + "value": 0.038728084 }, { "unit": "s", - "value": 0.025507117 + "value": 0.038760534 }, { "unit": "s", - "value": 0.022904103 + "value": 0.038793734 }, { "unit": "s", - "value": 0.023076385 + "value": 0.039005018 }, { "unit": "s", - "value": 0.022585516 + "value": 0.038862845 }, { "unit": "s", - "value": 0.023018084 + "value": 0.038731043 }, { "unit": "s", - "value": 0.023126956 + "value": 0.038891176 }, { "unit": "s", - "value": 0.023140317 + "value": 0.038837745 } ], "PCI_read": [ { "unit": "s", - "value": 0.066263925 + "value": 0.058610739 }, { "unit": "s", - "value": 0.065149006 + "value": 0.059211539 }, { "unit": "s", - "value": 0.06823823 + "value": 0.059094178 }, { "unit": "s", - "value": 0.067614649 + "value": 0.063351971 }, { "unit": "s", - "value": 0.068157828 + "value": 0.059738369 }, { "unit": "s", - "value": 0.06825187 + "value": 0.059645487 }, { "unit": "s", - "value": 0.068159038 + "value": 0.059697218 }, { "unit": "s", - "value": 0.066694763 + "value": 0.059381852 }, { "unit": "s", - "value": 0.067605659 + "value": 0.059468254 }, { "unit": "s", - "value": 0.066417608 + "value": 0.059515715 } ], "PCI_write": [ { "unit": "s", - "value": 0.084513938 + "value": 0.120777629 }, { "unit": "s", - "value": 0.060253183 + "value": 0.062600188 }, { "unit": "s", - "value": 0.060325944 + "value": 0.062606179 }, { "unit": "s", - "value": 0.064254031 + "value": 0.062711891 }, { "unit": "s", - "value": 0.060529077 + "value": 0.062509606 }, { "unit": "s", - "value": 0.063792623 + "value": 0.062803592 }, { "unit": "s", - "value": 0.060357565 + "value": 0.062787151 }, { "unit": "s", - "value": 0.060063339 + "value": 0.062679419 }, { "unit": "s", - "value": 0.060287283 + "value": 0.06271488 }, { "unit": "s", - "value": 0.062157576 + "value": 0.062890023 } ], "Scale": [ { "unit": "s", - "value": 0.021235864 + "value": 0.039983335 }, { "unit": "s", - "value": 0.020608554 + "value": 0.039644189 }, { "unit": "s", - "value": 0.020822067 + "value": 0.039831532 }, { "unit": "s", - "value": 0.020042023 + "value": 0.039766591 }, { "unit": "s", - "value": 0.021288745 + "value": 0.039660679 }, { "unit": "s", - "value": 0.020088374 + "value": 0.039933614 }, { "unit": "s", - "value": 0.021096531 + "value": 0.039789862 }, { "unit": "s", - "value": 0.021525769 + "value": 0.03967413 }, { "unit": "s", - "value": 0.024272246 + "value": 0.039722601 }, { "unit": "s", - "value": 0.022442444 + "value": 0.039825792 } ], 
"Triad": [ { "unit": "s", - "value": 0.037008534 + "value": 0.052583184 }, { "unit": "s", - "value": 0.036020228 + "value": 0.052564403 }, { "unit": "s", - "value": 0.033424273 + "value": 0.052735936 }, { "unit": "s", - "value": 0.033462613 + "value": 0.052644865 }, { "unit": "s", - "value": 0.033843901 + "value": 0.052699956 }, { "unit": "s", - "value": 0.033447893 + "value": 0.052459542 }, { "unit": "s", - "value": 0.03323839 + "value": 0.052657585 }, { "unit": "s", - "value": 0.036342203 + "value": 0.052493212 }, { "unit": "s", - "value": 0.03446487 + "value": 0.052600984 }, { "unit": "s", - "value": 0.035224264 + "value": 0.052563704 } ] }, + "validated": true, "version": "2.6" } diff --git a/b_eff/README.md b/b_eff/README.md index 157b0a67..cdbb8c92 100644 --- a/b_eff/README.md +++ b/b_eff/README.md @@ -179,13 +179,14 @@ The json output looks like the following. ```json { - "config_time": "Thu Dec 08 10:38:28 UTC 2022", + "config_time": "Wed Dec 14 08:39:42 UTC 2022", "device": "Intel(R) FPGA Emulation Device", "environment": { "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" }, "errors": {}, - "git_commit": "86e0064-dirty", + "execution_time": "Wed Dec 14 09:56:29 UTC 2022", + "git_commit": "be1a4e9-dirty", "mpi": { "subversion": 1, "version": 3 @@ -194,23 +195,23 @@ The json output looks like the following. 
"results": { "b_eff": { "unit": "B/s", - "value": 21935372.01805185 + "value": 22061624.19637537 } }, "settings": { - "Communication Type": "IEC", - "Kernel File": "./bin/communication_bw520n_IEC_emulate.aocx", + "Communication Type": false, + "Kernel File": false, "Kernel Replications": 2, "Loop Length": 5, "MPI Ranks": 1, "Message Sizes": 2, "Repetitions": 10, - "Test Mode": "No" + "Test Mode": false }, "timings": { "6": { - "maxCalcBW": 14601537.724441605, - "maxMinCalculationTime": 4.3831e-05, + "maxCalcBW": 9880812.696844315, + "maxMinCalculationTime": 6.4772e-05, "timings": [ { "looplength": 5, @@ -218,51 +219,51 @@ The json output looks like the following. "timings": [ { "unit": "s", - "value": 0.013389739 + "value": 0.010991125 }, { "unit": "s", - "value": 6.2761e-05 + "value": 8.8202e-05 }, { "unit": "s", - "value": 4.9321e-05 + "value": 0.000133323 }, { "unit": "s", - "value": 4.3831e-05 + "value": 8.5442e-05 }, { "unit": "s", - "value": 4.951e-05 + "value": 0.000272905 }, { "unit": "s", - "value": 4.7561e-05 + "value": 0.000168784 }, { "unit": "s", - "value": 5.2311e-05 + "value": 6.4772e-05 }, { "unit": "s", - "value": 5.0441e-05 + "value": 0.000171733 }, { "unit": "s", - "value": 4.6901e-05 + "value": 0.000163393 }, { "unit": "s", - "value": 5.4401e-05 + "value": 8.0391e-05 } ] } ] }, "7": { - "maxCalcBW": 18104411.535904724, - "maxMinCalculationTime": 7.0701e-05, + "maxCalcBW": 19143908.348538782, + "maxMinCalculationTime": 6.6862e-05, "timings": [ { "looplength": 5, @@ -270,51 +271,51 @@ The json output looks like the following. "timings": [ { "unit": "s", - "value": 0.000104852 + "value": 0.000135662 }, { "unit": "s", - "value": 0.000125222 + "value": 0.000119343 }, { "unit": "s", - "value": 7.9731e-05 + "value": 0.000178914 }, { "unit": "s", - "value": 0.000151442 + "value": 7.7691e-05 }, { "unit": "s", - "value": 9.3052e-05 + "value": 9.1922e-05 }, { "unit": "s", - "value": 0.000193763 + "value": 0.000259545 }, { "unit": "s", - "value": 8.4472e-05 + "value": 0.000143233 }, { "unit": "s", - "value": 0.000116562 + "value": 0.000149763 }, { "unit": "s", - "value": 8.2471e-05 + "value": 6.6862e-05 }, { "unit": "s", - "value": 7.0701e-05 + "value": 7.2351e-05 } ] } ] }, "8": { - "maxCalcBW": 33100166.79380923, - "maxMinCalculationTime": 7.7341e-05, + "maxCalcBW": 37160151.543743014, + "maxMinCalculationTime": 6.8891e-05, "timings": [ { "looplength": 5, @@ -322,49 +323,50 @@ The json output looks like the following. 
"timings": [ { "unit": "s", - "value": 0.000711343 + "value": 0.000159723 }, { "unit": "s", - "value": 0.000378606 + "value": 0.000104432 }, { "unit": "s", - "value": 0.000280195 + "value": 0.000166953 }, { "unit": "s", - "value": 0.000107392 + "value": 7.7492e-05 }, { "unit": "s", - "value": 0.000203963 + "value": 7.8241e-05 }, { "unit": "s", - "value": 0.000122193 + "value": 9.5762e-05 }, { "unit": "s", - "value": 8.2151e-05 + "value": 0.000235084 }, { "unit": "s", - "value": 8.6861e-05 + "value": 0.000280265 }, { "unit": "s", - "value": 0.000167473 + "value": 0.000130013 }, { "unit": "s", - "value": 7.7341e-05 + "value": 6.8891e-05 } ] } ] } }, + "validated": true, "version": "1.3" } From 7bbdeeab19a4b638cf304c6c4c6fcd2abd0cc921 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Wed, 14 Dec 2022 17:30:00 +0100 Subject: [PATCH 229/318] fix FFT build --- FFT/src/host/fft_benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index 8610b121..e9b86e12 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -35,7 +35,7 @@ SOFTWARE. #include "parameters.h" fft::FFTProgramSettings::FFTProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), - iterations(results["b"].as()), inverse(results.count("inverse")), kernelReplications(results["r"].as()) { + iterations(results["b"].as()), inverse(results.count("inverse")) { } From a5d6c9eb5fb2a7acae9c8cf91a37e67a430e477c Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 16 Dec 2022 13:43:43 +0100 Subject: [PATCH 230/318] fix output parsing --- scripts/evaluation/parse_raw_to_csv.py | 64 +++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 6 deletions(-) diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index e5306dc7..743b5410 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -9,12 +9,64 @@ import sys # Regular expressions for the raw output of all -fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\n(.*\n)FFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" -gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" -ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" -trans_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+total\\s+time\\s+transfer\\s+time\\s+calc\\s+time\\s+calc\\s+FLOPS\\s+Memory\\s+Bandwidth\\s+PCIe\\s+Bandwidth\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e|inf)+)\\s+.+\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e|inf)+)" -stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\s+Avg\\stime\\s+Min\\stime\\s+Max\\stime\n\\s+PCI_write\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+PCI_read\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" -linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)(\\s+.+\\s*\n)\\s+GEFA\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)(\\s+.+\\s*\n)\\s+GESL\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)" +fft_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Batch\\sSize\\s+(?P\d+)\n" + "(.*\n)FFT\\sSize\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n" + "\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+\\s+avg\\s+best\\s+\n" + "\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n" + "\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\sGFLOP") + +gemm_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Matrix\\sSize\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+\\s+norm\.\\sresidual\\s+res\.\\serror\\s+mach\.\\seps\n" + "\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+\\s+best\\s+mean\\s+GFLOPS\\s+\n" + "(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+GFLOP") + +ra_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+Kernel\\sReplications\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+best\\s+mean\\s+GUOPS\\s+\n" + "(?P(\d|\.|\+|-|e)+)\\s.\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\sGUOP") + +#TODO +trans_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Matrix\\sSize\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + 
"(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+\\s+total\\stime\\s+transfer\\stime\\s+calc\\s+time\\s+calc\\sFLOPS\\s+Memory\\sBandwidth\\s+PCIe\\sBandwidth\\s+\n" + "\\s+avg:\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e|inf)+)\\s.+\\s+\n" + "\\s+best:\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e|inf)+)\\s.+\\s.\n") + +stream_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)" + "(.*\n)+Data\\sType\\s+(?P.+)\n" + "(.*\n)+Kernel\\sReplications\\s+(?P\d+)" + "(.*\n)+Kernel\\sType\\s+(?P.+)\n" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+Function\\s+Best\\sRate\\s+Avg\\stime\\s+Min\\stime\\s+Max\\stime\\s+\n" + "PCI_write\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "PCI_read\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+\n" + "Copy\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+\n" + "Scale\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "Add\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "Triad\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n") + +linpack_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Matrix\\sSize\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+\\s+norm\.\\sresidual\\s+res\.\\serror\\s+mach\.\\seps\n" + "\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)\n" + "(.*\n)+\\sMethod\\s+best\\s+mean\\s+GFLOPS\\s+\n" + "\\stotal\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "\\sGEFA\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "\\sGESL\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n") def parse_network(file_content): From 8652cbb5f6d0d09f19ec938109edd6116c1398d6 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 16 Dec 2022 14:16:32 +0100 Subject: [PATCH 231/318] fix PTRANS unit tests --- PTRANS/src/host/transpose_benchmark.cpp | 2 +- PTRANS/tests/test_host_functionality.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp index 213f6c7e..9b16e38d 100644 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ b/PTRANS/src/host/transpose_benchmark.cpp @@ -191,7 +191,7 @@ transpose::TransposeBenchmark::validateOutput(transpose::TransposeData &data) { void transpose::TransposeBenchmark::printError() { if (mpi_comm_rank == 0) { - std::cout << "Maximum error: " << errors.at("epsilon") << " < " << 100 * errors.at("epsilon") << std::endl; + std::cout << "Maximum error: " << errors.at("max_error") << " < " << 100 * errors.at("epsilon") << std::endl; std::cout << "Mach. 
Epsilon: " << errors.at("epsilon") << std::endl; } } diff --git a/PTRANS/tests/test_host_functionality.cpp b/PTRANS/tests/test_host_functionality.cpp index 4f7ebed6..486b178b 100644 --- a/PTRANS/tests/test_host_functionality.cpp +++ b/PTRANS/tests/test_host_functionality.cpp @@ -39,7 +39,7 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatHeader) { std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex("(\\s+)total\\stime(\\s+)transfer\\stime(\\s+)calc\\s+time(\\s+)calc\\sFLOPS(\\s+)Memory\\sBandwidth(\\s+)PCIe\\sBandwidth\n.*")); + ::testing::MatchesRegex("(\\s+)total\\stime(\\s+)transfer\\stime(\\s+)calc\\s+time(\\s+)calc\\sFLOPS(\\s+)Memory\\sBandwidth(\\s+)PCIe\\sBandwidth(\\s+)\n.*")); } /** @@ -66,7 +66,7 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatValues) { std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex(".*\navg:\\s+2\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s.*\n.*\n")); + ::testing::MatchesRegex(".*\n\\s+avg:\\s+2\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s.*\n.*\n")); } /** From 73b73cade692f8245f8d6fe8e0813b75a167534e Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 16 Dec 2022 14:33:00 +0100 Subject: [PATCH 232/318] remove myst_parser dependency from sphinx --- docs/source/conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 99328fa6..73c3c248 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -43,7 +43,6 @@ 'sphinx.ext.githubpages', # 'breathe', 'sphinx_rtd_theme', - 'myst_parser' ] # Enable Figure numbering and referencing From ea66d52433bb96327884390de5d66506ca296540 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 9 Dec 2022 16:01:09 +0100 Subject: [PATCH 233/318] Delete unused header file --- b_eff/src/host/execution.h | 52 -------------------------------------- 1 file changed, 52 deletions(-) delete mode 100644 b_eff/src/host/execution.h diff --git a/b_eff/src/host/execution.h b/b_eff/src/host/execution.h deleted file mode 100644 index f43c31de..00000000 --- a/b_eff/src/host/execution.h +++ /dev/null @@ -1,52 +0,0 @@ -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-*/ -#ifndef SRC_HOST_EXECUTION_H_ -#define SRC_HOST_EXECUTION_H_ - -/* C++ standard library headers */ -#include -#include -#include - -/* External library headers */ -#include "parameters.h" -#include "network_benchmark.hpp" - - -namespace bm_execution { - -/** -The actual execution of the benchmark. -This method can be implemented in multiple *.cpp files. This header enables -simple exchange of the different calculation methods. - -@param config struct that contains all necessary information to execute the kernel on the FPGA - - -@return The resulting matrix -*/ - network::ExecutionTimings - calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData); - -} // namespace bm_execution - -#endif // SRC_HOST_EXECUTION_H_ From a69c22b0cabf73ee1425a18fb9e9d9f218301d2c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 9 Dec 2022 16:41:32 +0100 Subject: [PATCH 234/318] Update hlslib --- extern/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index d54a37c5..ae08a768 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -28,7 +28,7 @@ FetchContent_Declare( # unfortunately they do not use releases, so the latest commit was used GIT_REPOSITORY https://github.com/definelicht/hlslib.git - GIT_TAG v1.2.1) + GIT_TAG v1.4.3) FetchContent_GetProperties(extern_hlslib) if(NOT extern_hlslib_POPULATED) From c0209c8588fa44d96f11fd3870392b74faf40d3e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 9 Dec 2022 17:36:30 +0100 Subject: [PATCH 235/318] Remove Intel FPGA limitation from beff --- b_eff/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt index b894bb48..b613e3d4 100755 --- a/b_eff/CMakeLists.txt +++ b/b_eff/CMakeLists.txt @@ -23,7 +23,3 @@ set(DATA_TYPE char) include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake) unset(DATA_TYPE CACHE) find_package(MPI REQUIRED) - -if (NOT INTELFPGAOPENCL_FOUND) - message(ERROR "Benchmark does only support the Intel OpenCL SDK") -endif() From cba6581a40663306233ea8851335118f83347202 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 9 Dec 2022 17:42:10 +0100 Subject: [PATCH 236/318] Add PCIE dummy kernel --- b_eff/src/device/CMakeLists.txt | 30 ++++++++++++++------- b_eff/src/device/communication_PCIE.cl | 37 ++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 9 deletions(-) create mode 100644 b_eff/src/device/communication_PCIE.cl diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index 8316a884..0a15211e 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -3,12 +3,24 @@ set(KERNEL_REPLICATION_ENABLED Yes CACHE INTERNAL "Enables kernel replication in set(NUM_REPLICATIONS 2) include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) -generate_kernel_targets_intel(communication_bw520n_IEC) -add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_emulation_pcie_intel COMMAND 
${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +if (INTELFPGAOPENCL_FOUND) + generate_kernel_targets_intel(communication_bw520n_IEC communication_PCIE) + add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +endif() + +if (Vitis_FOUND) + generate_kernel_targets_xilinx(communication_PCIE) + add_test(NAME test_emulation_pcie_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_cpu_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin --comm-type CPU -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 1 -m 20 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +endif() diff --git a/b_eff/src/device/communication_PCIE.cl b/b_eff/src/device/communication_PCIE.cl new file mode 100644 index 00000000..dfae7ca8 --- /dev/null +++ b/b_eff/src/device/communication_PCIE.cl @@ -0,0 +1,37 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "parameters.h"
+
+
+/**
+ * Minimal kernel only used to measure the startup latency of a kernel and to provide a
+ * memory buffer for Xilinx FPGAs to measure PCIe read and write performance
+ *
+ * @param input Dummy input
+ */
+__kernel
+__attribute__ ((max_global_work_dim(0)))
+void dummyKernel(__global char *input) {
+    // Minimal kernel only used to measure the startup latency of a kernel and to provide a
+    // memory buffer for Xilinx FPGAs to measure PCIe read and write performance
+}

From c640a141e954b35011246c612d690075fa681953 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Fri, 9 Dec 2022 17:43:31 +0100
Subject: [PATCH 237/318] Add Xilinx compatibility for host and add PCIe
 reverse execution

---
 b_eff/src/host/CMakeLists.txt                |  14 ++
 b_eff/src/host/execution_types/execution.hpp |   5 +-
 .../execution_pcie_reverse.hpp               | 142 ++++++++++++++++++
 b_eff/src/host/network_benchmark.cpp         |  23 ++-
 b_eff/src/host/network_benchmark.hpp         |  55 +++++++
 5 files changed, 234 insertions(+), 5 deletions(-)
 create mode 100644 b_eff/src/host/execution_types/execution_pcie_reverse.hpp

diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt
index fb08281f..deaf1aae 100755
--- a/b_eff/src/host/CMakeLists.txt
+++ b/b_eff/src/host/CMakeLists.txt
@@ -17,3 +17,17 @@ if (INTELFPGAOPENCL_FOUND)
     target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}")
     add_test(NAME test_intel_host_executable COMMAND $ -h)
 endif()
+
+if (Vitis_FOUND)
+    add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE})
+    target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS})
+    target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host)
+    add_executable(${HOST_EXE_NAME}_xilinx main.cpp)
+    target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}")
+    target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base)
+    target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx)
+    target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA)
+    target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA)
+    target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")
+    add_test(NAME test_xilinx_host_executable COMMAND $ -h)
+endif()

diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp
index df630838..f1e0600c 100644
--- a/b_eff/src/host/execution_types/execution.hpp
+++ b/b_eff/src/host/execution_types/execution.hpp
@@ -22,4 +22,7 @@ SOFTWARE.
#include "execution_types/execution_cpu.hpp" #include "execution_types/execution_pcie.hpp" -#include "execution_types/execution_iec.hpp" \ No newline at end of file +#include "execution_types/execution_pcie_reverse.hpp" +#if INTEL_FPGA +#include "execution_types/execution_iec.hpp" +#endif \ No newline at end of file diff --git a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp new file mode 100644 index 00000000..5f44522e --- /dev/null +++ b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp @@ -0,0 +1,142 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_PCIE_REVERSE_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_PCIE_REVERSE_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" + +/* Project's headers */ + +namespace network::execution_types::pcie_reverse { + + /* + Implementation for the single kernel. + @copydoc bm_execution::calculate() + */ + std::shared_ptr + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector sendQueues; + std::vector dummyBuffers; + std::vector> dummyBufferContents; + std::vector dummyKernels; + + cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + sendQueues.clear(); + dummyBuffers.clear(); + dummyBufferContents.clear(); + dummyKernels.clear(); + + // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); + ASSERT_CL(err) + + dummyKernels.push_back(cl::Kernel(*config.program, + "dummyKernel", &err)); + + err = dummyKernels[r].setArg(0, dummyBuffers[r]); + ASSERT_CL(err); + + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + + cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data()); + + sendQueues.push_back(sendQueue); + + } + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int l = 0; l < looplength; l++) { + if (config.programSettings->pcie_reverse_write_pcie) { + sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + sendQueues[i].finish(); + } + if (config.programSettings->pcie_reverse_execute_kernel) { + sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1)); + sendQueues[i].finish(); + } + if (config.programSettings->pcie_reverse_read_pcie) { + sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + sendQueues[i].finish(); + } + } + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + std::cout << validationData.size() << std::endl; + err = sendQueues[r].enqueueReadBuffer( + dummyBuffers[r], CL_TRUE, 0, + sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, + &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); + ASSERT_CL(err); + sendQueues[r].finish(); + } + std::shared_ptr result(new network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }); + return result; + } + +} // namespace bm_execution + +#endif diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index dbe1f610..5b9d4f2c 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -36,7 +36,11 @@ SOFTWARE. 
network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results),
     maxLoopLength(results["u"].as()), minLoopLength(results["l"].as()), maxMessageSize(results["m"].as()),
-    minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()) {
+    minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()),
+    pcie_reverse_write_pcie(results["pcie-read"].count()), pcie_reverse_read_pcie(results["pcie-write"].count()),
+    pcie_reverse_execute_kernel(results["kernel-latency"].count()) {
+
+    pcie_reverse = pcie_reverse_execute_kernel | pcie_reverse_read_pcie | pcie_reverse_write_pcie;
 
 }
 
@@ -49,7 +53,7 @@ network::NetworkProgramSettings::getSettingsMap() {
 }
 
 network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize, unsigned int _loopLength) : messageSize(_messageSize), loopLength(_loopLength),
-                                                                    validationBuffer(CHANNEL_WIDTH * 2 * 2, 0) {
+                                                                    validationBuffer((1 << _messageSize) * 2 * 2, 0) {
     // TODO: fix the validation buffer size to use the variable number of kernel replications and channels
     // Validation data buffer should be big enough to fit the data of two channels
     // for every repetition. The number of kernel replications is fixed to 2, which
@@ -86,7 +90,10 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
         ("o", "Offset used before reducing repetitions",
             cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_OFFSET)))
         ("d", "Number of steps the repetitions are decreased to its minimum",
-            cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE)));
+            cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE)))
+            ("pcie-read", "Use reverse PCIe experiment and measure PCIe read performance from device")
+            ("pcie-write", "Use reverse PCIe experiment and measure PCIe write performance from device")
+            ("kernel-latency", "Use reverse PCIe experiment and measure kernel execution latency");
 }
 
 void
@@ -108,8 +115,16 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) {
         network::ExecutionTimings timing;
         switch (executionSettings->programSettings->communicationType) {
             case hpcc_base::CommunicationType::cpu_only: timing = execution_types::cpu::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
-            case hpcc_base::CommunicationType::pcie_mpi: timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
+            case hpcc_base::CommunicationType::pcie_mpi:
+                if (executionSettings->programSettings->pcie_reverse) {
+                    timing = execution_types::pcie_reverse::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                } else {
+                    timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                }
+                break;
+#if INTEL_FPGA
             case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
+#endif
             default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType));
         }
         timing_results.push_back(timing);
diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index 4d47c392..1eb1825d 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++
b/b_eff/src/host/network_benchmark.hpp
@@ -31,6 +31,33 @@ SOFTWARE.
 #include "hpcc_benchmark.hpp"
 #include "parameters.h"
 
+#ifdef XILINX_FPGA
+template
+struct aligned_allocator {
+
+    // typedefs
+    typedef T value_type;
+    typedef value_type* pointer;
+    typedef const value_type* const_pointer;
+
+    pointer allocate(size_t pCount, const_pointer = 0){
+        T* mem = 0;
+        if (posix_memalign(reinterpret_cast(&mem), 4096, sizeof(T) * pCount) != 0) {
+            throw std::bad_alloc();
+        }
+        return mem;
+    }
+
+    void deallocate(pointer pPtr, size_t pCount) {
+        free(pPtr);
+    }
+};
+
+namespace cl {
+    template using vector = std::vector>;
+}
+#endif
+
 /**
  * @brief Contains all classes and methods needed by the Network benchmark
  *
@@ -128,6 +155,34 @@ class NetworkProgramSettings : public hpcc_base::BaseSettings {
      */
     uint llDecrease;
 
+    /**
+     * @brief This is automatically set to true if one of pcie_reverse_write_pcie, pcie_reverse_read_pcie,
+     * or pcie_reverse_execute_kernel is set to true. The reverse PCIe experiment will be executed in that case.
+     *
+     */
+    bool pcie_reverse;
+
+    /**
+     * @brief If true, the benchmark will execute the reverse PCIe benchmark instead. It will write data to the FPGA.
+     * The other pcie_reverse flags can be set to do additional operations within the measurement.
+     *
+     */
+    bool pcie_reverse_write_pcie;
+
+    /**
+     * @brief If true, the benchmark will execute the reverse PCIe benchmark instead. It will execute an empty kernel.
+     * The other pcie_reverse flags can be set to do additional operations within the measurement.
+     *
+     */
+    bool pcie_reverse_execute_kernel;
+
+    /**
+     * @brief If true, the benchmark will execute the reverse PCIe benchmark instead. It will read data from the FPGA.
+     * The other pcie_reverse flags can be set to do additional operations within the measurement.
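+     * For example, setting pcie_reverse_write_pcie together with this flag should time a full
+     * host-to-device-to-host round trip per loop iteration, since the write is enqueued before the read.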
+ *
+     */
+    bool pcie_reverse_read_pcie;
+
     /**
      * @brief Construct a new Network Program Settings object
      *

From 2df2fca858011bab50d73864094dfe44e57bdb47 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Fri, 9 Dec 2022 17:44:44 +0100
Subject: [PATCH 238/318] Fix test CMakeLists

---
 b_eff/tests/CMakeLists.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/b_eff/tests/CMakeLists.txt b/b_eff/tests/CMakeLists.txt
index 2a00ea83..6604c769 100755
--- a/b_eff/tests/CMakeLists.txt
+++ b/b_eff/tests/CMakeLists.txt
@@ -6,4 +6,9 @@ set(TEST_SOURCES test_kernel_functionality_and_host_integration.cpp)
 
 include(${CMAKE_SOURCE_DIR}/../cmake/unitTestTargets.cmake)
 
-target_link_libraries(${LIB_NAME}_intel ${MPI_LIBRARIES})
+if (INTELFPGAOPENCL_FOUND)
+    target_link_libraries(${LIB_NAME}_intel ${MPI_LIBRARIES})
+endif()
+if (Vitis_FOUND)
+    target_link_libraries(${LIB_NAME}_xilinx ${MPI_LIBRARIES})
+endif()
\ No newline at end of file

From a8c871c2783b07e67c60c5f231c1885019f44c38 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Fri, 9 Dec 2022 17:47:01 +0100
Subject: [PATCH 239/318] Add config for U280

---
 b_eff/configs/Xilinx_U280_DDR.cmake              | 14 ++++++++++++++
 b_eff/settings/settings.compile.xilinx.u280.ini  |  0
 b_eff/settings/settings.link.xilinx.u280.ddr.ini |  4 ++++
 3 files changed, 18 insertions(+)
 create mode 100644 b_eff/configs/Xilinx_U280_DDR.cmake
 create mode 100644 b_eff/settings/settings.compile.xilinx.u280.ini
 create mode 100644 b_eff/settings/settings.link.xilinx.u280.ddr.ini

diff --git a/b_eff/configs/Xilinx_U280_DDR.cmake b/b_eff/configs/Xilinx_U280_DDR.cmake
new file mode 100644
index 00000000..61d9003b
--- /dev/null
+++ b/b_eff/configs/Xilinx_U280_DDR.cmake
@@ -0,0 +1,14 @@
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE)
+set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.u280.ddr.ini CACHE FILEPATH "" FORCE)
+set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.u280.ini CACHE FILEPATH "" FORCE)
+
+# b_eff specific options
+# The channel width is not used by the PCIe dummy kernel
+set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE)
+
+set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE)
\ No newline at end of file
diff --git a/b_eff/settings/settings.compile.xilinx.u280.ini b/b_eff/settings/settings.compile.xilinx.u280.ini
new file mode 100644
index 00000000..e69de29b
diff --git a/b_eff/settings/settings.link.xilinx.u280.ddr.ini b/b_eff/settings/settings.link.xilinx.u280.ddr.ini
new file mode 100644
index 00000000..4d8fb9bd
--- /dev/null
+++ b/b_eff/settings/settings.link.xilinx.u280.ddr.ini
@@ -0,0 +1,4 @@
+[connectivity]
+nk=dummyKernel:1:dummyKernel
+
+sp=dummyKernel.m_axi_gmem:DDR[0]
\ No newline at end of file

From 6b48b26328042b1c1055fbc83c373fbf4fa62859 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Mon, 12 Dec 2022 08:36:32 +0100
Subject: [PATCH 240/318] Add Intel PCIE config

---
 b_eff/configs/Bittware_520N_PCIE.cmake | 17 +++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 b_eff/configs/Bittware_520N_PCIE.cmake

diff --git a/b_eff/configs/Bittware_520N_PCIE.cmake b/b_eff/configs/Bittware_520N_PCIE.cmake
new file mode 100644
index 00000000..b5fb6dad
--- /dev/null
+++
b/b_eff/configs/Bittware_520N_PCIE.cmake
@@ -0,0 +1,17 @@
+# This file contains the default configuration for the Nallatech 520N board
+# for use with the b_eff benchmark.
+# To use this configuration file, call cmake with the parameter
+#
+#     cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
+#
+
+
+set(USE_MPI Yes CACHE BOOL "" FORCE)
+set(USE_SVM No CACHE BOOL "" FORCE)
+set(USE_HBM No CACHE BOOL "" FORCE)
+set(FPGA_BOARD_NAME "p520_hpc_sg280l" CACHE STRING "" FORCE)
+set(AOC_FLAGS "-fpc -fp-relaxed -seed=7" CACHE STRING "" FORCE)
+
+# b_eff specific options
+set(CHANNEL_WIDTH 32 CACHE STRING "Width of a single external channel in Byte" FORCE)
+set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications" FORCE)
\ No newline at end of file

From fbebd06a3ae06d08e1953b91d0f1b993812eeed5 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Mon, 12 Dec 2022 16:41:00 +0100
Subject: [PATCH 241/318] Clean up CPU only code

---
 .../host/execution_types/execution_cpu.hpp | 33 ++++++-------------
 1 file changed, 10 insertions(+), 23 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp
index ec37dcb6..6fcf636c 100644
--- a/b_eff/src/host/execution_types/execution_cpu.hpp
+++ b/b_eff/src/host/execution_types/execution_cpu.hpp
@@ -43,9 +43,8 @@ namespace network::execution_types::cpu {
         cl::vector &validationData) {
 
         int err;
-        std::vector sendQueues;
-        std::vector dummyBuffers;
-        std::vector> dummyBufferContents;
+        std::vector> dummyBufferReadContents;
+        std::vector> dummyBufferWriteContents;
 
         cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize));
 
@@ -57,32 +56,20 @@ namespace network::execution_types::cpu {
 
         std::vector calculationTimings;
         for (uint r =0; r < config.programSettings->numRepetitions; r++) {
-            sendQueues.clear();
-            dummyBuffers.clear();
-            dummyBufferContents.clear();
+            dummyBufferReadContents.clear();
+            dummyBufferWriteContents.clear();
 
             // Create all kernels and buffers.
The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { - - dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); - ASSERT_CL(err) - - dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); - - cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); - ASSERT_CL(err) - - sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data()); - - sendQueues.push_back(sendQueue); - + dummyBufferReadContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + dummyBufferWriteContents.emplace_back(size_in_bytes, static_cast(0)); } double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); for (int l = 0; l < looplength; l++) { - MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, - dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Sendrecv(dummyBufferReadContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, + dummyBufferWriteContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); @@ -102,8 +89,8 @@ namespace network::execution_types::cpu { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! 
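        // Illustrative sketch (assuming 2 kernel replications and size_in_bytes == validationData.size() == 8):
        // r == 0 copies the first 4 bytes of dummyBufferWriteContents[0] to validationData[0..3],
        // r == 1 copies the first 4 bytes of dummyBufferWriteContents[1] to validationData[4..7].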
for (int r = 0; r < config.programSettings->kernelReplications; r++) {
-            err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]);
-            ASSERT_CL(err);
+            std::copy(dummyBufferWriteContents[r].begin(),dummyBufferWriteContents[r].begin() + dummyBufferWriteContents[r].size() / config.programSettings->kernelReplications,
+                        &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]);
         }
         return network::ExecutionTimings{
                 looplength,

From 3107f22f5fe8c7bed6fc1653abc6f82b605b70ba Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Mon, 12 Dec 2022 16:41:23 +0100
Subject: [PATCH 242/318] Fix message size bug (message size was four times too
 large)

---
 b_eff/src/host/network_benchmark.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp
index 5b9d4f2c..274f2006 100644
--- a/b_eff/src/host/network_benchmark.cpp
+++ b/b_eff/src/host/network_benchmark.cpp
@@ -53,7 +53,7 @@ network::NetworkProgramSettings::getSettingsMap() {
 }
 
 network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize, unsigned int _loopLength) : messageSize(_messageSize), loopLength(_loopLength),
-                                                                    validationBuffer((1 << _messageSize) * 2 * 2, 0) {
+                                                                    validationBuffer((1 << _messageSize), 0) {
     // TODO: fix the validation buffer size to use the variable number of kernel replications and channels
     // Validation data buffer should be big enough to fit the data of two channels
    // for every repetition. The number of kernel replications is fixed to 2, which

From a922f9bf30c901539a5845b54cc06ccd920ffd5a Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Tue, 13 Dec 2022 11:36:52 +0100
Subject: [PATCH 243/318] Improve validation scheme for b_eff

---
 b_eff/src/device/communication_PCIE.cl       | 11 +++--
 .../host/execution_types/execution_cpu.hpp   |  6 +--
 .../host/execution_types/execution_iec.hpp   |  2 +-
 .../host/execution_types/execution_pcie.hpp  |  4 +-
 .../execution_pcie_reverse.hpp               | 41 ++++++++++---------
 b_eff/src/host/network_benchmark.cpp         | 11 ++---
 b_eff/src/host/network_benchmark.hpp         |  7 +++-
 7 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/b_eff/src/device/communication_PCIE.cl b/b_eff/src/device/communication_PCIE.cl
index dfae7ca8..af4f4f81 100644
--- a/b_eff/src/device/communication_PCIE.cl
+++ b/b_eff/src/device/communication_PCIE.cl
@@ -27,11 +27,14 @@ SOFTWARE.
* Minimal kernel only used to measure the startup latency of a kernel and to provide a
  * memory buffer for Xilinx FPGAs to measure PCIe read and write performance
  *
- * @param input Dummy input
+ * @param output Output buffer that will be used to write the verification data into
+ * @param verification Verification value that will be written to the buffer
+ * @param messageSize Size of the output buffer
  */
 __kernel
 __attribute__ ((max_global_work_dim(0)))
-void dummyKernel(__global char *input) {
-    // Minimal kernel only used to measure the startup latency of a kernel and to provide a
-    // memory buffer for Xilinx FPGAs to measure PCIe read and write performance
+void dummyKernel(__global DEVICE_DATA_TYPE *output, DEVICE_DATA_TYPE verification, int messageSize) {
+    for (int m=0; m < messageSize; m++) {
+        output[m] = verification;
+    }
 }
diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp
index 6fcf636c..bb125b59 100644
--- a/b_eff/src/host/execution_types/execution_cpu.hpp
+++ b/b_eff/src/host/execution_types/execution_cpu.hpp
@@ -46,7 +46,7 @@ namespace network::execution_types::cpu {
for (int r = 0; r < config.programSettings->kernelReplications; r++) { - err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); + err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * (1 << messageSize), &validationData.data()[r * (1 << messageSize)]); ASSERT_CL(err); } return network::ExecutionTimings{ diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp index 50d357e6..cc3e5548 100644 --- a/b_eff/src/host/execution_types/execution_pcie.hpp +++ b/b_eff/src/host/execution_types/execution_pcie.hpp @@ -47,7 +47,7 @@ namespace network::execution_types::pcie { std::vector dummyBuffers; std::vector> dummyBufferContents; - cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + cl_uint size_in_bytes = (1 << messageSize); int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); @@ -108,7 +108,7 @@ namespace network::execution_types::pcie { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { - err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); + err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, &validationData.data()[r * size_in_bytes]); ASSERT_CL(err); } return network::ExecutionTimings{ diff --git a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp index 5f44522e..a606f891 100644 --- a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp +++ b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp @@ -48,7 +48,7 @@ namespace network::execution_types::pcie_reverse { std::vector> dummyBufferContents; std::vector dummyKernels; - cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + cl_uint size_in_bytes = (1 << messageSize); int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); @@ -74,6 +74,10 @@ namespace network::execution_types::pcie_reverse { err = dummyKernels[r].setArg(0, dummyBuffers[r]); ASSERT_CL(err); + err = dummyKernels[r].setArg(1, (HOST_DATA_TYPE)(messageSize & 255)); + ASSERT_CL(err); + err = dummyKernels[r].setArg(2, (1 << messageSize)); + ASSERT_CL(err); dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); @@ -90,19 +94,17 @@ namespace network::execution_types::pcie_reverse { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); for (int l = 0; l < looplength; l++) { - if (config.programSettings->pcie_reverse_write_pcie) { - sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); - sendQueues[i].finish(); - } - if (config.programSettings->pcie_reverse_execute_kernel) { - sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1)); - sendQueues[i].finish(); - } - if 
(config.programSettings->pcie_reverse_read_pcie) { - sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); - sendQueues[i].finish(); - } + if (config.programSettings->pcie_reverse_write_pcie) { + sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + } + if (config.programSettings->pcie_reverse_execute_kernel) { + sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1)); + } + if (config.programSettings->pcie_reverse_read_pcie) { + sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + } } + sendQueues[i].finish(); auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); #ifndef NDEBUG @@ -121,13 +123,12 @@ namespace network::execution_types::pcie_reverse { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { - std::cout << validationData.size() << std::endl; - err = sendQueues[r].enqueueReadBuffer( - dummyBuffers[r], CL_TRUE, 0, - sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, - &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); - ASSERT_CL(err); - sendQueues[r].finish(); + if (!config.programSettings->pcie_reverse_read_pcie) { + err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[r].data()); + err = sendQueues[r].finish(); + ASSERT_CL(err) + } + std::copy(dummyBufferContents[r].begin(), dummyBufferContents[r].end(), &validationData.data()[r * size_in_bytes]); } std::shared_ptr result(new network::ExecutionTimings{ looplength, diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 274f2006..86e1dd15 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -52,8 +52,8 @@ network::NetworkProgramSettings::getSettingsMap() { return map; } -network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize, unsigned int _loopLength) : messageSize(_messageSize), loopLength(_loopLength), - validationBuffer((1 << _messageSize), 0) { +network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize, unsigned int _loopLength, unsigned int replications) : messageSize(_messageSize), loopLength(_loopLength), + validationBuffer((1 << _messageSize) * replications, 0) { // TODO: fix the validation buffer size to use the variable number of kernel replications and channels // Validation data buffer should be big enough to fit the data of two channels // for every repetition. 
The number of kernel replications is fixed to 2, which @@ -61,13 +61,13 @@ network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize } network::NetworkData::NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize, - unsigned int offset, unsigned int decrease) { + unsigned int offset, unsigned int decrease, unsigned int replications) { uint decreasePerStep = (max_looplength - min_looplength) / decrease; for (uint i = min_messagesize; i <= max_messagesize; i++) { uint messageSizeDivOffset = (i > offset) ? i - offset : 0u; uint newLooplength = (max_looplength > messageSizeDivOffset * decreasePerStep) ? max_looplength - messageSizeDivOffset * decreasePerStep : 0u; uint looplength = std::max(newLooplength, min_looplength); - this->items.push_back(NetworkDataItem(i, looplength)); + this->items.push_back(NetworkDataItem(i, looplength, replications)); } } @@ -244,7 +244,8 @@ network::NetworkBenchmark::generateInputData() { executionSettings->programSettings->minMessageSize, executionSettings->programSettings->maxMessageSize, executionSettings->programSettings->llOffset, - executionSettings->programSettings->llDecrease)); + executionSettings->programSettings->llDecrease, + executionSettings->programSettings->kernelReplications)); return d; } diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 1eb1825d..27481194 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -239,8 +239,9 @@ class NetworkData { * * @param messageSize The message size in bytes * @param loopLength The number of repetitions in the kernel + * @param replications The number of kernel replications */ - NetworkDataItem(unsigned int messageSize, unsigned int loopLength); + NetworkDataItem(unsigned int messageSize, unsigned int loopLength, unsigned int replications); }; @@ -259,8 +260,10 @@ class NetworkData { * @param max_messagesize The maximum message size * @param offset The used offset to scale the loop length. 
The higher the offset, the later the loop length will be decreased
      * @param decrease Number of steps the looplength will be decreased to the minimum
+     * @param replications The number of kernel replications
      */
-    NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize, unsigned int offset, unsigned int decrease);
+    NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize,
+                unsigned int offset, unsigned int decrease, unsigned int replications);
 
 };

From 3802ee4d59315e6a51cb8e01124651e2625b42d1 Mon Sep 17 00:00:00 2001
From: Marius Meyer 
Date: Tue, 13 Dec 2022 13:08:02 +0100
Subject: [PATCH 244/318] Add PCIe reverse batch option

---
 .../host/execution_types/execution_pcie_reverse.hpp | 13 ++++++++++++-
 b_eff/src/host/network_benchmark.cpp                |  6 ++++--
 b_eff/src/host/network_benchmark.hpp                |  6 ++++++
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp
index a606f891..4146912f 100644
--- a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp
+++ b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp
@@ -96,15 +96,26 @@ namespace network::execution_types::pcie_reverse {
                 for (int l = 0; l < looplength; l++) {
                     if (config.programSettings->pcie_reverse_write_pcie) {
                         sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data());
+                        if (!config.programSettings->pcie_reverse_batch) {
+                            sendQueues[i].finish();
+                        }
                     }
                     if (config.programSettings->pcie_reverse_execute_kernel) {
                         sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
+                        if (!config.programSettings->pcie_reverse_batch) {
+                            sendQueues[i].finish();
+                        }
                     }
                     if (config.programSettings->pcie_reverse_read_pcie) {
                         sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data());
+                        if (!config.programSettings->pcie_reverse_batch) {
+                            sendQueues[i].finish();
+                        }
                     }
                 }
-            sendQueues[i].finish();
+            if (config.programSettings->pcie_reverse_batch) {
+                sendQueues[i].finish();
+            }
                 auto endCalculation = std::chrono::high_resolution_clock::now();
                 calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count();
 #ifndef NDEBUG
diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp
index 86e1dd15..a1f86b12 100644
--- a/b_eff/src/host/network_benchmark.cpp
+++ b/b_eff/src/host/network_benchmark.cpp
@@ -38,7 +38,8 @@ network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &re
     maxLoopLength(results["u"].as()), minLoopLength(results["l"].as()), maxMessageSize(results["m"].as()),
     minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()),
     pcie_reverse_write_pcie(results["pcie-read"].count()), pcie_reverse_read_pcie(results["pcie-write"].count()),
-    pcie_reverse_execute_kernel(results["kernel-latency"].count()) {
+    pcie_reverse_execute_kernel(results["kernel-latency"].count()),
+    pcie_reverse_batch(results["pcie-batch"].count()) {
 
     pcie_reverse = pcie_reverse_execute_kernel | pcie_reverse_read_pcie | pcie_reverse_write_pcie;
 
@@ -93,7 +94,8 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
             cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE)))
("pcie-read", "Use reverse PCIe experiment and measure PCIe read performance from device") ("pcie-write", "Use reverse PCIe experiment and measure PCIe write performance from device") - ("kernel-latency", "Use reverse PCIe experiment and measure kernel execution latency"); + ("kernel-latency", "Use reverse PCIe experiment and measure kernel execution latency") + ("pcie-batch", "Execute the reverse PCIe experiments in batch mode to make use of the queues of the schedulers"); } void diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 27481194..cb488686 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -183,6 +183,12 @@ class NetworkProgramSettings : public hpcc_base::BaseSettings { */ bool pcie_reverse_read_pcie; + /** + * @brief If true, the reverse experiments are executed in batch mode per looplength to make use of the scheduling queues + * + */ + bool pcie_reverse_batch; + /** * @brief Construct a new Network Program Settings object * From ea874c3e79d28ba60db6e43ef487de7d4281df1b Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 13 Dec 2022 16:10:14 +0100 Subject: [PATCH 245/318] Fix unit tests and add reverse tests --- b_eff/src/device/CMakeLists.txt | 4 + ...nel_functionality_and_host_integration.cpp | 113 ++++++++++-------- 2 files changed, 64 insertions(+), 53 deletions(-) diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index 0a15211e..146a4407 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -23,4 +23,8 @@ if (Vitis_FOUND) WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 1 -m 20 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_pcie_reverse_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1 --kernel-latency --pcie-write --pcie-read + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_pcie_reverse_batch_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1 --kernel-latency --pcie-write --pcie-read --pcie-batch + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) endif() diff --git a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp index 4e1cdb62..e7a51712 100644 --- a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp +++ b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp @@ -10,7 +10,7 @@ #include "test_program_settings.h" #include -struct NetworkKernelTest : testing::TestWithParam { +struct NetworkKernelTest : testing::Test { std::unique_ptr bm; std::unique_ptr data; unsigned numberOfChannels = 4; @@ -22,7 +22,6 @@ struct NetworkKernelTest : testing::TestWithParam void SetUp() override { bm = std::unique_ptr(new network::NetworkBenchmark(global_argc, global_argv)); bm->getExecutionSettings().programSettings->numRepetitions = 1; - bm->getExecutionSettings().programSettings->communicationType = GetParam(); data = bm->generateInputData(); createChannelFilesAndSymbolicLinks(); } @@ -48,9 +47,9 @@ struct NetworkKernelTest : testing::TestWithParam /** * Tests if calculate returns 
the correct execution results */ -TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { +TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(1,1)); + data->items.push_back(network::NetworkData::NetworkDataItem(1,1, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); EXPECT_NE(bm->collected_timings.end(), bm->collected_timings.find(1)); EXPECT_EQ(1, bm->collected_timings.find(1)->second.execution_timings.at(0).looplength); @@ -60,10 +59,10 @@ TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { /** * Tests if calculate returns the correct execution results for multiple repetitions */ -TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { +TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { bm->getExecutionSettings().programSettings->numRepetitions = 2; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(8,4)); + data->items.push_back(network::NetworkData::NetworkDataItem(8,4, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); EXPECT_NE(bm->collected_timings.end(), bm->collected_timings.find(8)); EXPECT_EQ(4, bm->collected_timings.find(8)->second.execution_timings.at(0).looplength); @@ -73,7 +72,7 @@ TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { /** * Tests if data is written to the channels for small message sizes */ -TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { +TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files GTEST_SKIP(); @@ -81,7 +80,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) const unsigned messageSize = std::log2(CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { @@ -101,7 +100,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) /** * Tests if data is written to the channels for small message sizes filling two channels */ -TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { +TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files GTEST_SKIP(); @@ -109,7 +108,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize, looplength)); + 
data->items.push_back(network::NetworkData::NetworkDataItem(messageSize, looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { @@ -126,7 +125,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels /** * Tests if data is written to the channels for message sizes filling more than two channels */ -TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { +TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files GTEST_SKIP(); @@ -134,7 +133,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwo const unsigned messageSize = std::log2(8 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { @@ -151,7 +150,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwo /** * Tests if correct data is written to the channels */ -TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { +TEST_F(NetworkKernelTest, CorrectDataIsWrittenToChannel) { if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files GTEST_SKIP(); @@ -159,7 +158,7 @@ TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[messageSize * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { @@ -175,11 +174,11 @@ TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { delete [] buffer; } -TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { +TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); HOST_DATA_TYPE cvalue = static_cast(messageSize & 255); EXPECT_EQ(cvalue, data->items[0].validationBuffer[0]); @@ -190,11 +189,11 @@ TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { 
EXPECT_TRUE(all_same); } -TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { +TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { const unsigned messageSize = 0; const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); HOST_DATA_TYPE cvalue = static_cast(messageSize & 255); EXPECT_EQ(cvalue, data->items[0].validationBuffer[0]); @@ -205,72 +204,86 @@ TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { EXPECT_TRUE(all_same); } -TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { +TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); + bm->getExecutionSettings().programSettings->kernelReplications = 1; const unsigned looplength = 4; + const unsigned replications = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - bm->executeKernel(*data); - EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); + auto result = bm->executeKernel(*data); + EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } -TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { +TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); + bm->getExecutionSettings().programSettings->kernelReplications = 1; const unsigned looplength = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - bm->executeKernel(*data); - EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); + auto result = bm->executeKernel(*data); + EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } -TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { +TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { const unsigned messageSize = 0; + bm->getExecutionSettings().programSettings->kernelReplications = 1; const unsigned looplength = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - bm->executeKernel(*data); - EXPECT_EQ(looplength * CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); + auto result = bm->executeKernel(*data); + EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } -TEST_P(NetworkKernelTest, ValidationDataSingleItemWrongCheckFails) { - const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); +TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForReplication2) { + const unsigned messageSize = 4; + const unsigned looplength = 2; + bm->getExecutionSettings().programSettings->kernelReplications = 2; + data->items.clear(); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 2)); + auto result = 
bm->executeKernel(*data); + EXPECT_EQ((1 << messageSize) * 2, data->items[0].validationBuffer.size()); +} + +TEST_F(NetworkKernelTest, ValidationDataSingleItemWrongCheckFails) { + const unsigned messageSize = 4; const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data;}); data->items[0].validationBuffer[looplength] = expected_data + 1; EXPECT_FALSE(bm->validateOutput(*data)); bm->printError(); } -TEST_P(NetworkKernelTest, ValidationDataWrongCheckFails) { +TEST_F(NetworkKernelTest, ValidationDataWrongCheckFails) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data - 1;}); EXPECT_FALSE(bm->validateOutput(*data)); bm->printError(); } -TEST_P(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { +TEST_F(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data;}); EXPECT_TRUE(bm->validateOutput(*data)); bm->printError(); } -TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { +TEST_F(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); EXPECT_TRUE(bm->validateOutput(*data)); bm->printError(); @@ -279,32 +292,32 @@ TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { // This test is disabled because it does not work with the current implementation of the // external channels in software emulation. The different kernel executions will read // the old data from the channel file, which will lead to a failing validation! 
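// A minimal sketch of the failure mode described above (an illustration only;
// it assumes the IEC software emulation backs each external channel with a
// plain file, and the file name used here is hypothetical):
//
//     std::ifstream ch("channel_file_0", std::ios::binary);      // leftovers from the previous run
//     ch.read(reinterpret_cast<char*>(buffer), size_in_bytes);   // stale bytes are consumed first
//
// so the validation of a second execution compares against old channel contents.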
-TEST_P(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExecution) { +TEST_F(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); EXPECT_TRUE(bm->validateOutput(*data)); bm->printError(); } -TEST_P(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { +TEST_F(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); data->items[1].validationBuffer[0] = static_cast(0); EXPECT_FALSE(bm->validateOutput(*data)); bm->printError(); } -TEST_P(NetworkKernelTest, JsonDump) { +TEST_F(NetworkKernelTest, JsonDump) { data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(8,4)); + data->items.push_back(network::NetworkData::NetworkDataItem(8,4, bm->getExecutionSettings().programSettings->kernelReplications)); bm->executeKernel(*data); bm->collectResults(); bm->dumpConfigurationAndResults("b_eff.json"); @@ -336,9 +349,3 @@ TEST_P(NetworkKernelTest, JsonDump) { } } } - - -INSTANTIATE_TEST_CASE_P( - NetworkKernelParametrizedTests, - NetworkKernelTest, - ::testing::Values(hpcc_base::CommunicationType::intel_external_channels,hpcc_base::CommunicationType::cpu_only, hpcc_base::CommunicationType::pcie_mpi)); From a13ef51210286ebb2ec727c057ea0c0efb177c38 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 16 Dec 2022 10:45:38 +0100 Subject: [PATCH 246/318] Additionally enqueue kernels to correctly measure for Xilinx FPGAs --- .../src/host/execution_types/execution_pcie.hpp | 17 +++++++++++++++-- .../execution_types/execution_pcie_reverse.hpp | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp index cc3e5548..de79379c 100644 --- a/b_eff/src/host/execution_types/execution_pcie.hpp +++ b/b_eff/src/host/execution_types/execution_pcie.hpp @@ -45,6 +45,7 @@ namespace network::execution_types::pcie { int err; std::vector sendQueues; std::vector dummyBuffers; + std::vector dummyKernels; std::vector> dummyBufferContents; cl_uint size_in_bytes = (1 << messageSize); @@ -66,6 +67,16 @@ namespace network::execution_types::pcie { dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * 
size_in_bytes,0,&err));
         ASSERT_CL(err)
+        dummyKernels.push_back(cl::Kernel(*config.program,
+                        "dummyKernel", &err));
+
+        err = dummyKernels[r].setArg(0, dummyBuffers[r]);
+        ASSERT_CL(err);
+        err = dummyKernels[r].setArg(1, (HOST_DATA_TYPE)(messageSize & 255));
+        ASSERT_CL(err);
+        err = dummyKernels[r].setArg(2, 1);
+        ASSERT_CL(err);
+
         dummyBufferContents.emplace_back(size_in_bytes, static_cast<HOST_DATA_TYPE>(messageSize & (255)));
         cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err);
@@ -81,14 +92,16 @@
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
             for (int l = 0; l < looplength; l++) {
-
+                sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
                 sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data());
+                sendQueues[i].finish();
                 MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0,
                              dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                 sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data());
-
+                sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
+                sendQueues[i].finish();
             }
             auto endCalculation = std::chrono::high_resolution_clock::now();
             calculationTime += std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation - startCalculation).count();
diff --git a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp
index 4146912f..434fb95a 100644
--- a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp
+++ b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp
@@ -76,7 +76,7 @@ namespace network::execution_types::pcie_reverse {
         ASSERT_CL(err);
         err = dummyKernels[r].setArg(1, (HOST_DATA_TYPE)(messageSize & 255));
         ASSERT_CL(err);
-        err = dummyKernels[r].setArg(2, (1 << messageSize));
+        err = dummyKernels[r].setArg(2, 1);
         ASSERT_CL(err);

         dummyBufferContents.emplace_back(size_in_bytes, static_cast<HOST_DATA_TYPE>(messageSize & (255)));

From ec02f72da07dcf11e702a4e69bda20084ed4782c Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 16 Dec 2022 14:32:45 +0100
Subject: [PATCH 247/318] Exclude too short messages for IEC test

---
 b_eff/src/device/CMakeLists.txt                         | 8 ++++----
 .../test_kernel_functionality_and_host_integration.cpp  | 7 +++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt
index 146a4407..4ee0f8a3 100644
--- a/b_eff/src/device/CMakeLists.txt
+++ b/b_eff/src/device/CMakeLists.txt
@@ -5,13 +5,13 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake)

 if (INTELFPGAOPENCL_FOUND)
     generate_kernel_targets_intel(communication_bw520n_IEC communication_PCIE)
-    add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1
+    add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 --min-size 6 -m 6 -n 1
             WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
-
add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1
+    add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_PCIE_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1
            WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
-    add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1
+    add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_PCIE_emulate.aocx -l 1 -u 10 -m 0 -n 1
            WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
-    add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1
+    add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_PCIE_emulate.aocx -l 1 -u 1 -m 20 -n 1
            WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH})
 endif()
diff --git a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp
index e7a51712..613f1b13 100644
--- a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp
+++ b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp
@@ -48,6 +48,10 @@ struct NetworkKernelTest : testing::Test {
  * Tests if calculate returns the correct execution results
  */
 TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) {
+    if (bm->getExecutionSettings().programSettings->communicationType == hpcc_base::CommunicationType::intel_external_channels) {
+        // Skip this test if IEC is used, because the IEC emulation cannot transfer messages smaller than the channel width
+        GTEST_SKIP() << "Intel external channel needs at least message size of 64 byte to fill channel!";
+    }
     data->items.clear();
     data->items.push_back(network::NetworkData::NetworkDataItem(1,1, bm->getExecutionSettings().programSettings->kernelReplications));
     bm->executeKernel(*data);
@@ -190,6 +194,9 @@ TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) {
 }

 TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) {
+    if (bm->getExecutionSettings().programSettings->communicationType == hpcc_base::CommunicationType::intel_external_channels) {
+        GTEST_SKIP() << "Intel external channel needs at least message size of 64 byte to fill channel!";
+    }
     const unsigned messageSize = 0;
     const unsigned looplength = 4;
     data->items.clear();

From 1cdcc0b8627efb0dfda8a81321cba1558dc3d9f6 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 16 Dec 2022 14:33:07 +0100
Subject: [PATCH 248/318] Update IEC implementation to store whole message

---
 b_eff/src/device/communication_bw520n_IEC.cl | 25 ++++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/b_eff/src/device/communication_bw520n_IEC.cl b/b_eff/src/device/communication_bw520n_IEC.cl
index 8f43756b..26379080 100644
--- a/b_eff/src/device/communication_bw520n_IEC.cl
+++ b/b_eff/src/device/communication_bw520n_IEC.cl
@@ -119,6 +119,21 @@ void recv{{ i }}(__global DEVICE_DATA_TYPE* validation_buffer,
     for
(unsigned k=0; k < send_iterations; k++) { recv_part1 = read_channel_intel(ch_in_{{ 2*i+1 }}); recv_part2 = read_channel_intel(ch_in_{{ 2*i+2 }}); + + DEVICE_DATA_TYPE mem_buffer[2 * ITEMS_PER_CHANNEL]; + // Store the last received data chunks in global memory for later validation + __attribute__((opencl_unroll_hint(ITEMS_PER_CHANNEL))) + for (DEVICE_DATA_TYPE d = 0; d < ITEMS_PER_CHANNEL; d++) { + mem_buffer[d] = recv_part1.values[d]; + } + __attribute__((opencl_unroll_hint(ITEMS_PER_CHANNEL))) + for (DEVICE_DATA_TYPE d = 0; d < ITEMS_PER_CHANNEL; d++) { + mem_buffer[ITEMS_PER_CHANNEL + d] = recv_part2.values[d]; + } + __attribute__((opencl_unroll_hint(2*ITEMS_PER_CHANNEL))) + for (DEVICE_DATA_TYPE d = 0; d < 2*ITEMS_PER_CHANNEL; d++) { + validation_buffer[k * (2 * ITEMS_PER_CHANNEL) + d] = mem_buffer[d]; + } } #ifndef EMULATE // Introduce data dependency between loop iterations to prevent coalescing of loop @@ -127,16 +142,6 @@ void recv{{ i }}(__global DEVICE_DATA_TYPE* validation_buffer, write_channel_intel(ch_exchange{{ 2*i+2 }}, recv_part2); #endif } - - // Store the last received data chunks in global memory for later validation - __attribute__((opencl_unroll_hint(ITEMS_PER_CHANNEL))) - for (DEVICE_DATA_TYPE d = 0; d < ITEMS_PER_CHANNEL; d++) { - validation_buffer[d] = recv_part1.values[d]; - } - __attribute__((opencl_unroll_hint(ITEMS_PER_CHANNEL))) - for (DEVICE_DATA_TYPE d = 0; d < ITEMS_PER_CHANNEL; d++) { - validation_buffer[ITEMS_PER_CHANNEL + d] = recv_part2.values[d]; - } } {% endfor %} From 512c4c68b31f209d85d00096e341761496b403a8 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 16 Dec 2022 15:18:26 +0100 Subject: [PATCH 249/318] Fix tests for updated IEC --- .../test_kernel_functionality_and_host_integration.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp index 613f1b13..4cc30e25 100644 --- a/b_eff/tests/test_kernel_functionality_and_host_integration.cpp +++ b/b_eff/tests/test_kernel_functionality_and_host_integration.cpp @@ -218,7 +218,6 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { const unsigned replications = 1; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); - auto result = bm->executeKernel(*data); EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } @@ -228,7 +227,6 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { const unsigned looplength = 1; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); - auto result = bm->executeKernel(*data); EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } @@ -237,8 +235,7 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { bm->getExecutionSettings().programSettings->kernelReplications = 1; const unsigned looplength = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); - auto result = bm->executeKernel(*data); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } @@ -248,7 +245,6 @@ TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForReplication2) { 
bm->getExecutionSettings().programSettings->kernelReplications = 2; data->items.clear(); data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 2)); - auto result = bm->executeKernel(*data); EXPECT_EQ((1 << messageSize) * 2, data->items[0].validationBuffer.size()); } @@ -299,7 +295,7 @@ TEST_F(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { // This test is disabled because it does not work with the current implementation of the // external channels in software emulation. The different kernel executions will read // the old data from the channel file, which will lead to a failing validation! -TEST_F(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExecution) { +TEST_F(NetworkKernelTest, ValidationDataCorrectTwoMessageSizesAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); From 6df822b0e18914987aa72d24f777ef268102bafb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 16 Dec 2022 15:37:55 +0100 Subject: [PATCH 250/318] Fix PCIe reverse signature --- b_eff/src/host/execution_types/execution_pcie_reverse.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp index 434fb95a..2395b7bd 100644 --- a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp +++ b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp @@ -38,7 +38,7 @@ namespace network::execution_types::pcie_reverse { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { @@ -141,12 +141,11 @@ namespace network::execution_types::pcie_reverse { } std::copy(dummyBufferContents[r].begin(), dummyBufferContents[r].end(), &validationData.data()[r * size_in_bytes]); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution From 25134b6820164dae8e45cef83cea32cb602e5a14 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 16 Dec 2022 17:06:05 +0100 Subject: [PATCH 251/318] add comments to json dump helper functions --- shared/include/hpcc_benchmark.hpp | 64 +++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 69da2bfe..9328a251 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -544,17 +544,33 @@ class HpccFpgaBenchmark { std::cout << *executionSettings << std::endl; } + /* + * @brief Returns the map of the timings + * + * @return The timings map + */ std::map> getTimingsMap() { return timings; } + /* + * @brief adds a timing to the timings map + * + * @param key The key + */ void addTimings(std::string key, std::vector value) { timings.emplace(key, value); } - // override for special benchmarks like b_eff + /* + * @brief Returns the timings map as json + * + * @return The json object + * + * It should be overwritten for benchmarks with special timings format, like b_eff + */ virtual json getTimingsJson() { json j; for (auto const &key: timings) { @@ -570,6 +586,12 @@ class HpccFpgaBenchmark { return j; } + /** + * @brief Returns the results map 
as json
+     *
+     * @return The results map
+     *
+     */
     std::map getResultsJson() {
         std::map results_string;
         for (auto const &result: results) {
@@ -581,13 +603,27 @@ class HpccFpgaBenchmark {
         return results_string;
     }

+    /**
+     * @brief Returns the map of the dumped environment variables
+     *
+     * @return The environment map
+     *
+     * Can be extended as needed
+     */
     std::map<std::string, std::string> getEnvironmentMap() {
         std::map<std::string, std::string> env;
         env["LD_LIBRARY_PATH"] = std::string(std::getenv("LD_LIBRARY_PATH"));
         return env;
     }
-
+    /**
+     * @brief Format the FPGA Torus setting string
+     *
+     * @param str The setting string
+     *
+     * @return The parsed json object
+     *
+     */
     json parseFPGATorusString(std::string str) {
         json j;
@@ -599,6 +635,13 @@ class HpccFpgaBenchmark {
         return j;
     }

+    /**
+     * @brief Get current time as string
+     *
+     * @return The time string
+     *
+     * Has the same format as CONFIG_TIME
+     */
     std::string getCurrentTime() {
         time_t time = std::time(0);
@@ -608,6 +651,15 @@ class HpccFpgaBenchmark {
         return oss.str();
     }

+    /**
+     * @brief Convert the settings map to json
+     *
+     * @param settings_map The settings map
+     *
+     * @return The json object
+     *
+     * This function checks for settings which are not strings and converts them
+     */
     std::map jsonifySettingsMap(std::map settings_map) {
         json j;
@@ -629,7 +681,13 @@ class HpccFpgaBenchmark {
         }
         return j;
     }
-
+
+    /**
+     * @brief Dumps the benchmark configuration and results to a json file
+     *
+     * @param file_path Path where the json will be saved
+     *
+     */
     void dumpConfigurationAndResults(std::string file_path) {
         std::fstream fs;

From a0ccd3a45a4385af17c0c28aad1253dbf7327459 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Fri, 6 Jan 2023 16:14:45 +0100
Subject: [PATCH 252/318] Re-add emulation kernels as unit test build dependency

---
 cmake/unitTestTargets.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/unitTestTargets.cmake b/cmake/unitTestTargets.cmake
index 263d4033..4e949a9d 100644
--- a/cmake/unitTestTargets.cmake
+++ b/cmake/unitTestTargets.cmake
@@ -29,6 +29,7 @@ if (Vitis_FOUND)
         add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES})
         target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}")
         target_link_libraries(${HOST_EXE_NAME}_test_xilinx hpcc_fpga_base_test)
+        add_dependencies(${HOST_EXE_NAME}_test_xilinx ${kernel_emulation_targets_xilinx})
         target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA)
         target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}")
         if (USE_ACCL)

From de85402d74afdeec8e1338821699db85719a8b8e Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 19 Jan 2023 10:11:32 +0100
Subject: [PATCH 253/318] Update references in README

---
 README.md | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 1814c5a0..fefd250b 100755
--- a/README.md
+++ b/README.md
@@ -296,14 +296,31 @@ If you are using one of the benchmarks contained in the HPCC FPGA benchmark suit
         doi={10.1109/H2RC51942.2020.00007}
     }

-If the focus is on multi-FPGA execution and inter-FPGA communication, you may rather want to cite
-    @misc{hpcc_multi_fpga,
-    doi = {10.48550/ARXIV.2202.13995},
-    url = {https://arxiv.org/abs/2202.13995},
-    author = {Meyer, Marius and Kenter, Tobias and Plessl, Christian},
-    title = {Multi-FPGA Designs and Scaling of HPC Challenge Benchmarks via MPI and Circuit-Switched Inter-FPGA Networks},
-    publisher = {arXiv},
+    @article{hpcc_fpga_in_depth,
+    author = {Marius
Meyer and Tobias Kenter and Christian Plessl}, + doi = {https://doi.org/10.1016/j.jpdc.2021.10.007}, + issn = {0743-7315}, + journal = {Journal of Parallel and Distributed Computing}, + keywords = {FPGA, OpenCL, High level synthesis, HPC benchmarking}, + pages = {79-89}, + title = {In-depth FPGA accelerator performance evaluation with single node benchmarks from the HPC challenge benchmark suite for Intel and Xilinx FPGAs using OpenCL}, + url = {https://www.sciencedirect.com/science/article/pii/S0743731521002057}, + volume = {160}, year = {2022} } + +If the focus is on multi-FPGA execution and inter-FPGA communication, you may rather want to cite + + @article{hpcc_multi_fpga, + author = {Meyer, Marius and Kenter, Tobias and Plessl, Christian}, + title = {Multi-FPGA Designs and Scaling of HPC Challenge Benchmarks via MPI and Circuit-Switched Inter-FPGA Networks}, + year = {2023}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + issn = {1936-7406}, + url = {https://doi.org/10.1145/3576200}, + doi = {10.1145/3576200} + } + From a0477d02d6a9621951686773f599a7dbd3de1f23 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 16 Jan 2023 17:50:30 +0100 Subject: [PATCH 254/318] Fix IEC execution code --- b_eff/src/host/execution_types/execution_iec.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/b_eff/src/host/execution_types/execution_iec.hpp b/b_eff/src/host/execution_types/execution_iec.hpp index 2d0cec0e..471a3547 100644 --- a/b_eff/src/host/execution_types/execution_iec.hpp +++ b/b_eff/src/host/execution_types/execution_iec.hpp @@ -39,8 +39,9 @@ namespace network::execution_types::iec { Implementation for the single kernel. @copydoc bm_execution::calculate() */ + template network::ExecutionTimings - calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { int err; From 2fc4c98d078225e4cf228039fc6a4e7c0b0dbb96 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 16 Jan 2023 17:51:15 +0100 Subject: [PATCH 255/318] Add step size and PCIe reverse flag --- b_eff/src/host/network_benchmark.cpp | 22 ++++++++++++---------- b_eff/src/host/network_benchmark.hpp | 9 ++++++++- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index a1f86b12..7b1cdde3 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -36,13 +36,11 @@ SOFTWARE. 
network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), maxLoopLength(results["u"].as()), minLoopLength(results["l"].as()), maxMessageSize(results["m"].as()), - minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()), - pcie_reverse_write_pcie(results["pcie-read"].count()), pcie_reverse_read_pcie(results["pcie-write"].count()), - pcie_reverse_execute_kernel(results["kernel-latency"].count()), - pcie_reverse_batch(results["pcie-batch"].count()) { - - pcie_reverse = pcie_reverse_execute_kernel | pcie_reverse_read_pcie | pcie_reverse_write_pcie; - + minMessageSize(results["min-size"].as()), stepSize(results["step-size"].as()), llOffset(results["o"].as()), + llDecrease(results["d"].as()), pcie_reverse_write_pcie(results["pcie-read"].count()), + pcie_reverse_read_pcie(results["pcie-write"].count()), pcie_reverse_execute_kernel(results["kernel-latency"].count()), + pcie_reverse_batch(results["pcie-batch"].count()), pcie_reverse(results["pcie-reverse"].count()) +{ } std::map @@ -62,9 +60,9 @@ network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize } network::NetworkData::NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize, - unsigned int offset, unsigned int decrease, unsigned int replications) { + unsigned int stepsize, unsigned int offset, unsigned int decrease, unsigned int replications) { uint decreasePerStep = (max_looplength - min_looplength) / decrease; - for (uint i = min_messagesize; i <= max_messagesize; i++) { + for (uint i = min_messagesize; i <= max_messagesize; i += stepsize) { uint messageSizeDivOffset = (i > offset) ? i - offset : 0u; uint newLooplength = (max_looplength > messageSizeDivOffset * decreasePerStep) ? 
max_looplength - messageSizeDivOffset * decreasePerStep : 0u;
         uint looplength = std::max(newLooplength, min_looplength);
@@ -88,6 +86,8 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
             ("min-size", "Minimum Message Size", cxxopts::value<uint>()->default_value(std::to_string(0)))
             ("m", "Maximum message size", cxxopts::value<uint>()->default_value(std::to_string(DEFAULT_MAX_MESSAGE_SIZE)))
+            ("step-size", "Step size to generate message sizes in the specified range",
+             cxxopts::value<uint>()->default_value(std::to_string(1)))
             ("o", "Offset used before reducing repetitions", cxxopts::value<uint>()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_OFFSET)))
             ("d", "Number of steps the repetitions are decreased to its minimum",
@@ -95,7 +95,8 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
             ("pcie-read", "Use reverse PCIe experiment and measure PCIe read performance from device")
             ("pcie-write", "Use reverse PCIe experiment and measure PCIe write performance from device")
             ("kernel-latency", "Use reverse PCIe experiment and measure kernel execution latency")
-            ("pcie-batch", "Execute the reverse PCIe experiments in batch mode to make use of the queues of the schedulers");
+            ("pcie-batch", "Execute the reverse PCIe experiments in batch mode to make use of the queues of the schedulers")
+            ("pcie-reverse", "Execute the reverse PCIe experiments");
 }

 void
@@ -245,6 +246,7 @@ network::NetworkBenchmark::generateInputData() {
                     executionSettings->programSettings->minLoopLength,
                     executionSettings->programSettings->minMessageSize,
                     executionSettings->programSettings->maxMessageSize,
+                    executionSettings->programSettings->stepSize,
                     executionSettings->programSettings->llOffset,
                     executionSettings->programSettings->llDecrease,
                     executionSettings->programSettings->kernelReplications));
diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index cb488686..814075c0 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -143,6 +143,12 @@ class NetworkProgramSettings : public hpcc_base::BaseSettings {
      */
     uint minMessageSize;

+    /**
+     * @brief Step size for tested message sizes
+     *
+     */
+    uint stepSize;
+
     /**
      * @brief Offset that is used before the loop length will be reduced for higher message sizes
      *
@@ -264,12 +270,13 @@ class NetworkData {
      * @param min_looplength The minimum number of iterations that should be done for a message size
      * @param min_messagesize The minimum message size
      * @param max_messagesize The maximum message size
+     * @param stepSize Step size used to generate tested message sizes
      * @param offset The used offset to scale the loop length.
The higher the offset, the later the loop length will be decreased
      * @param decrease Number of steps the looplength will be decreased to the minimum
      * @param replications The number of kernel replications
      */
     NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize,
-                unsigned int offset, unsigned int decrease, unsigned int replications);
+                unsigned int stepSize, unsigned int offset, unsigned int decrease, unsigned int replications);

 };

From 24e1f540e0145851c17657c5127484ed48f489a6 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 16 Jan 2023 09:53:08 +0100
Subject: [PATCH 256/318] Make kernel execution in base PCIE version optional

---
 b_eff/src/host/execution_types/execution_pcie.hpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp
index de79379c..b13f8e4f 100644
--- a/b_eff/src/host/execution_types/execution_pcie.hpp
+++ b/b_eff/src/host/execution_types/execution_pcie.hpp
@@ -92,7 +92,9 @@ namespace network::execution_types::pcie {
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
             for (int l = 0; l < looplength; l++) {
-                sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
+                if(config.programSettings->pcie_reverse_execute_kernel) {
+                    sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
+                }
                 sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data());
                 sendQueues[i].finish();
@@ -100,7 +102,9 @@ namespace network::execution_types::pcie {
                          dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                 sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data());
-                sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
+                if(config.programSettings->pcie_reverse_execute_kernel) {
+                    sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1));
+                }
                 sendQueues[i].finish();
             }
             auto endCalculation = std::chrono::high_resolution_clock::now();

From fadc681f151cc1ba0825d62aa3bfbac2ca895a2c Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 17 Jan 2023 16:12:08 +0100
Subject: [PATCH 257/318] Update HPL DDR link settings to jinja2

---
 ....link.xilinx.hpl_torus_pcie.ddr.generator.ini | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini
index e032e407..e419e22e 100644
--- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini
+++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini
@@ -9,9 +9,9 @@ nk=inner_update_mm0:$PY_CODE_GEN num_replications$
 slr=lu_1:SLR0
 slr=left_update_1:SLR0
 slr=top_update_1:SLR0
-# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)]
-slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +1) % 3$
-# PY_CODE_GEN block_end
+{% for i in range(num_replications) %}
+slr=inner_update_mm0_{{ i+1 }}:SLR{{ (i+1) % 3 }}
+{% endfor %}

 # matrix ports
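# For illustration only: with num_replications=3, the {% for %} block above is
# expected to render to the following assignments (assuming standard Jinja2
# semantics in the code generator):
#
#   slr=inner_update_mm0_1:SLR1
#   slr=inner_update_mm0_2:SLR2
#   slr=inner_update_mm0_3:SLR0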
sp=lu_1.m_axi_gmem0:DDR[0] @@ -26,9 +26,9 @@ sp=left_update_1.m_axi_gmem0:DDR[0] sp=left_update_1.m_axi_gmem1:DDR[1] sp=left_update_1.m_axi_gmem2:DDR[1] -# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] -sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:DDR[0] -sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:DDR[1] -sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:DDR[0] -# PY_CODE_GEN block_end +{% for i in range(num_replications) %} +sp=inner_update_mm0_{{ i+1 }}.m_axi_gmem0:DDR[0] +sp=inner_update_mm0_{{ i+1 }}.m_axi_gmem1:DDR[1] +sp=inner_update_mm0_{{ i+1 }}.m_axi_gmem2:DDR[0] +{% endfor %} From 22bb76eb68cf97d2095729b9d249a879168449ac Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 17 Jan 2023 16:14:29 +0100 Subject: [PATCH 258/318] Update Xilinx kernel build for jinja2 --- cmake/kernelTargets.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 1d7e667f..7542d4d4 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -57,7 +57,7 @@ function(generate_kernel_targets_xilinx) ) if (XILINX_GENERATE_LINK_SETTINGS) add_custom_command(OUTPUT ${xilinx_link_settings} - COMMAND ${Python3_EXECUTABLE} ${CODE_GENERATOR} -o ${xilinx_link_settings} -p num_replications=${NUM_REPLICATIONS} --comment "\"#\"" --comment-ml-start "\"$$\"" --comment-ml-end "\"$$\"" ${gen_xilinx_link_settings} + COMMAND ${Python3_EXECUTABLE} ${CODE_GENERATOR} -o ${xilinx_link_settings} -p num_replications=${NUM_REPLICATIONS} ${gen_xilinx_link_settings} MAIN_DEPENDENCY ${gen_xilinx_link_settings} ) else() From 5db53122c0997c3f4d3ec6a2c22e7402f6f9bbbc Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 20 Jan 2023 13:37:56 +0100 Subject: [PATCH 259/318] update to Sphinx 4.0.0 --- docs/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index c675a279..f705e859 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,2 +1,2 @@ -Sphinx==3.0.3 -sphinx-rtd-theme==0.5.0 +Sphinx==4.0.0 +sphinx-rtd-theme==1.1.1 From d91bee60527d3cb52601c40c2c7d7b8e0d3e22df Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 20 Jan 2023 13:38:13 +0100 Subject: [PATCH 260/318] remove some warnings from sphinx html build --- docs/source/FFT/index.rst | 6 +++--- docs/source/GEMM/index.rst | 1 + docs/source/LINPACK/index.rst | 1 + docs/source/PTRANS/index.rst | 1 + docs/source/RandomAccess/index.rst | 1 + docs/source/STREAM/index.rst | 1 + docs/source/b_eff/index.rst | 1 + docs/source/index.rst | 2 -- .../technical_support/Host Input Parameters/index.rst | 1 + 9 files changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/source/FFT/index.rst b/docs/source/FFT/index.rst index 4f54398b..353691bd 100644 --- a/docs/source/FFT/index.rst +++ b/docs/source/FFT/index.rst @@ -1,7 +1,8 @@ .. _fft: -====== + +====== FFT -====== +====== This section contains all information related to the FFT benchmark. The benchmark executes a batched calculation of 1d FFTs on a single FPGA. @@ -13,7 +14,6 @@ It is possible to specify the size of the FFT and the number of kernel replicati :glob: */index - ../../../FFT/README.md ------------------------ Configuration Parameters diff --git a/docs/source/GEMM/index.rst b/docs/source/GEMM/index.rst index 14f597ed..df3899ed 100644 --- a/docs/source/GEMM/index.rst +++ b/docs/source/GEMM/index.rst @@ -1,4 +1,5 @@ .. 
_gemm:
+
 ======
 GEMM
 ======
diff --git a/docs/source/LINPACK/index.rst b/docs/source/LINPACK/index.rst
index 7ce28dd4..440616bd 100644
--- a/docs/source/LINPACK/index.rst
+++ b/docs/source/LINPACK/index.rst
@@ -1,4 +1,5 @@
 .. _hpl:
+
 =======
 LINPACK
 =======
diff --git a/docs/source/PTRANS/index.rst b/docs/source/PTRANS/index.rst
index b5a9c93d..07bf00c2 100644
--- a/docs/source/PTRANS/index.rst
+++ b/docs/source/PTRANS/index.rst
@@ -1,4 +1,5 @@
 .. _ptrans:
+
 ======
 PTRANS
 ======
diff --git a/docs/source/RandomAccess/index.rst b/docs/source/RandomAccess/index.rst
index 607b311a..02b510d4 100644
--- a/docs/source/RandomAccess/index.rst
+++ b/docs/source/RandomAccess/index.rst
@@ -1,4 +1,5 @@
 .. _randomaccess:
+
 ============
 RandomAccess
 ============
diff --git a/docs/source/STREAM/index.rst b/docs/source/STREAM/index.rst
index 7b4f41ff..26dbc1c8 100644
--- a/docs/source/STREAM/index.rst
+++ b/docs/source/STREAM/index.rst
@@ -1,4 +1,5 @@
 .. _stream:
+
 =======
 STREAM
 =======
diff --git a/docs/source/b_eff/index.rst b/docs/source/b_eff/index.rst
index f8cc2f18..030bee78 100644
--- a/docs/source/b_eff/index.rst
+++ b/docs/source/b_eff/index.rst
@@ -1,4 +1,5 @@
 .. _beff:
+
 =======
 b_eff
 =======
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8139915b..8f3cd6bb 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -70,8 +70,6 @@ Further optimized implementations that use such device-specific communication ap
    :caption: Benchmark Results:
    :glob:

-   ../../../*/README.md
-
 ----------
 References
diff --git a/docs/source/technical_support/Host Input Parameters/index.rst b/docs/source/technical_support/Host Input Parameters/index.rst
index 550e8f19..b45e8a36 100644
--- a/docs/source/technical_support/Host Input Parameters/index.rst
+++ b/docs/source/technical_support/Host Input Parameters/index.rst
@@ -1,4 +1,5 @@
 .. _execution:
+
 ========================
 Execution of a Benchmark
 ========================

From 4e58959fc108c71fa580d874ad7ed08f7a39a62f Mon Sep 17 00:00:00 2001
From: Gerrit Pape
Date: Mon, 6 Feb 2023 09:18:26 +0100
Subject: [PATCH 261/318] update host input parameters page

---
 .../technical_support/Host Input Parameters/index.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/source/technical_support/Host Input Parameters/index.rst b/docs/source/technical_support/Host Input Parameters/index.rst
index b45e8a36..c201524e 100644
--- a/docs/source/technical_support/Host Input Parameters/index.rst
+++ b/docs/source/technical_support/Host Input Parameters/index.rst
@@ -28,10 +28,16 @@ Input parameters (or options) can be appended to the host execution call like th
     The number of repetitions can be given with this parameter as a positive integer. The benchmark experiment will be repeated the given number of times.
     The benchmark will show the aggregated results for all runs, but only validate the output of the last run.

+``-i``:
+    Use `Intel memory interleaving `_.
+
 ``--platform INT``:
     Also an integer. It can be used to specify the index of the OpenCL platform that should be used for execution. By default, it is set to -1. This will make the host code ask you to select a platform if multiple platforms are available. This option can come in handy if you want to automate the execution of your benchmark.

+``--platform_str arg``:
+    A string which can be used to specify the desired platform independently of the index. The exact platform name needs to be specified. When given, the value of the platform index specified by the flag above will be ignored.
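    A hypothetical invocation combining these options could look as follows (the
    bitstream file and the platform string below are illustrative placeholders,
    not taken from this patch)::

        ./Network_intel -f communication_PCIE_emulate.aocx -n 10 \
            --platform_str "Intel(R) FPGA Emulation Platform for OpenCL(TM)" --device 0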
+ ``--device INT``: Also an integer. It can be used to specify the index of the OpenCL device that should be used for execution. By default, it is set to -1. This will make the host code ask you to select a device if multiple devices are available. This option can become handy if you want to automize the execution of your benchmark. From 47ba61dc6eeffdc3fb3e0ed79084214275f7c101 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 6 Feb 2023 09:18:41 +0100 Subject: [PATCH 262/318] add noctua2 experiments to results --- docs/source/FFT/results/fft-1-1.csv | 43 +++++++-------- docs/source/FFT/results/index.rst | 2 +- docs/source/GEMM/results/gemm-1-0.csv | 48 ++++++++--------- .../RandomAccess/results/randomaccess-2-2.csv | 40 +++++++------- docs/source/STREAM/results/stream-2-3.csv | 54 +++++++++---------- 5 files changed, 94 insertions(+), 93 deletions(-) diff --git a/docs/source/FFT/results/fft-1-1.csv b/docs/source/FFT/results/fft-1-1.csv index c98312bc..7099394e 100644 --- a/docs/source/FFT/results/fft-1-1.csv +++ b/docs/source/FFT/results/fft-1-1.csv @@ -1,21 +1,22 @@ -FPGA board,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,HBM2,SVM -SDK,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua `_,,, -LOG_FFT_SIZE,17,9,5,17 -NUM_REPLICATIONS,2,1,15,1 -LUT,276676,83494,602125,192189 -LUT percent,36.0,7.39,54.13,22.0 -Register,724790,168150,941404,480285 -Register percent,36.0,7.19,42.18,22.0 -BRAM,4177,39,405,2147 -BRAM percent,36.0,2.28,22.35,18.0 -DSP,1414,672,5280,707 -DSP percent,25.0,7.46,58.58,12.0 -Frequency,413.34,248.00,254.00,348.00 -GFLOPs,349.45,78.26,576.00,119.66 -GBs,65.78,27.83,368.77,22.54 -Error,7.1e-1,3.9e-1,5.4e-1,7.1e-1 +Version,1.4,1.1,1.1,1.1,1.1 +FPGA board,BittWare 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,HBM2,SVM +SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,20.4.0,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +LOG_FFT_SIZE,12,17,9,5,17 +NUM_REPLICATIONS,2,2,1,15,1 +LUT,280105,276676,83494,602125,192189 +LUT percent,30,36.0,7.39,54.13,22.0 +Register,611446,724790,168150,941404,480285 +Register percent,,36.0,7.19,42.18,22.0 +BRAM,1811,4177,39,405,2147 +BRAM percent,15,36.0,2.28,22.35,18.0 +DSP,1560,1414,672,5280,707 +DSP percent,27,25.0,7.46,58.58,12.0 +Frequency,402.41,413.34,248.00,254.00,348.00 +GFLOPs,239.598,349.45,78.26,576.00,119.66 +GBs,,65.78,27.83,368.77,22.54 +Error,3.00463e-1,7.1e-1,3.9e-1,5.4e-1,7.1e-1 \ No newline at end of file diff --git a/docs/source/FFT/results/index.rst b/docs/source/FFT/results/index.rst index e2f705db..8672be27 100644 --- a/docs/source/FFT/results/index.rst +++ b/docs/source/FFT/results/index.rst @@ -9,7 +9,7 @@ The measurements were executed 10 times and the best result is published. The results and the used configuration is given in :numref:`tbl_fft_1_1_results` and are also available as :download:`CSV `. .. _tbl_fft_1_1_results: -.. csv-table:: FFT FPGA Benchmark Results for version 1.1 +.. 
csv-table:: FFT FPGA Benchmark Results :file: fft-1-1.csv :stub-columns: 1 diff --git a/docs/source/GEMM/results/gemm-1-0.csv b/docs/source/GEMM/results/gemm-1-0.csv index 6b36ebc3..c8142d6b 100644 --- a/docs/source/GEMM/results/gemm-1-0.csv +++ b/docs/source/GEMM/results/gemm-1-0.csv @@ -1,24 +1,24 @@ -FPGA board,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,HBM2,SVM -SDK,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua `_,,, -BLOCK_SIZE,512,256,256,512 -GEMM_SIZE,8,8,8,8 -GLOBAL_MEM_UNROLL,16,16,16,16 -DATA_TYPE,float,float,float,float -NUM_REPLICATIONS,5,3,3,5 -LUT,275754,568558,499002,299427 -LUT percent,36.0,51.87,42.64,33.0 -Register,861277,441602,920127,829802 -Register percent,36.0,19.43,38.7,33.0 -BRAM,8860,666,666,9041 -BRAM percent,76.0,43.11,36.71,77.0 -DSP,3398,7683,7683,3398 -DSP percent,59.0,85.23,85.18,59.0 -Frequency,160.42,100.00,236.00,225.00 -GFLOPs,708.95,266.91,603.86,739.59 -GFLOPs norm,88.39,85.29,88.97,65.74 -Error,6.0e-7,2.0e-6,2.0e-6,6.0e-7 +FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,HBM2,SVM +SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_, +BLOCK_SIZE,"512 ? ",512,256,256,512 +GEMM_SIZE,8,8,8,8,8 +GLOBAL_MEM_UNROLL,8,16,16,16,16 +DATA_TYPE,float,float,float,float,float +NUM_REPLICATIONS,5,5,3,3,5 +LUT,310564,275754,568558,499002,299427 +LUT percent,33,36.0,51.87,42.64,33.0 +Register,793535,861277,441602,920127,829802 +Register percent,,36.0,19.43,38.7,33.0 +BRAM,8321,8860,666,666,9041 +BRAM percent,71,76.0,43.11,36.71,77.0 +DSP,3318,3398,7683,7683,3398 +DSP percent,58,59.0,85.23,85.18,59.0 +Frequency,273.07,160.42,100.00,236.00,225.00 +GFLOPs,1232.50,708.95,266.91,603.86,739.59 +GFLOPs norm,90.27,88.39,85.29,88.97,65.74 +Error,9.15527e-5,6.0e-7,2.0e-6,2.0e-6,6.0e-7 \ No newline at end of file diff --git a/docs/source/RandomAccess/results/randomaccess-2-2.csv b/docs/source/RandomAccess/results/randomaccess-2-2.csv index 68969c49..698edc3a 100644 --- a/docs/source/RandomAccess/results/randomaccess-2-2.csv +++ b/docs/source/RandomAccess/results/randomaccess-2-2.csv @@ -1,20 +1,20 @@ -FPGA board,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,HBM2,SVM -SDK,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua `_,,, -DEVICE_BUFFER_SIZE,1,1024,1024,1024 -NUM_REPLICATIONS,4,2,32,1 -LUT,115743,7256,116096,103397 -LUT percent,18.0,0.65,10.68,12.0 -Register,253578,11716,187456,225293 -Register percent,18.0,0.5,8.76,12.0 -BRAM,489,38,608,535 -BRAM percent,4.0,2.23,33.55,5.0 -DSP,14,14,224,0 -DSP percent,1.0,0.16,2.48,0.0 -Frequency,329.17,446.0,450.0,322.0 -MUOPs,245.0,40.3,128.1,0.5 -Error,0.0099,0.0106,0.0106,0.0106 +FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx 
XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,HBM2,SVM +SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +DEVICE_BUFFER_SIZE,1,1,1024,1024,1024 +NUM_REPLICATIONS,4,4,2,32,1 +LUT,222405,115743,7256,116096,103397 +LUT percent,24,18.0,0.65,10.68,12.0 +Register,434090,253578,11716,187456,225293 +Register percent,24,18.0,0.5,8.76,12.0 +BRAM,602,489,38,608,535 +BRAM percent,5,4.0,2.23,33.55,5.0 +DSP,14,14,14,224,0 +DSP percent,< 1.0,< 1.0,0.16,2.48,0.0 +Frequency,326.05,329.17,446.0,450.0,322.0 +MUOPs,185.633,245.0,40.3,128.1,0.5 +Error,0.0689179,0.0099,0.0106,0.0106,0.0106 \ No newline at end of file diff --git a/docs/source/STREAM/results/stream-2-3.csv b/docs/source/STREAM/results/stream-2-3.csv index aa9a49ee..25f1e366 100644 --- a/docs/source/STREAM/results/stream-2-3.csv +++ b/docs/source/STREAM/results/stream-2-3.csv @@ -1,27 +1,27 @@ -FPGA board,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,HBM2,SVM -SDK,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua `_,,, -DATA_TYPE,float,float,float,float -VECTOR_COUNT,16,16,16,16 -GLOBAL_MEM_UNROLL,1,1,1,1 -DEVICE_BUFFER_SIZE,4096,16384,2048,1 -NUM_REPLICATIONS,4,2,32,1 -LUT,176396,20832,331904,103628 -LUT percent,25.0,1.9,20.69,12.0 -Register,449231,39002,574976,244354 -Register percent,25.0,1.39,27.24,12.0 -BRAM,4029,558,1408,548 -BRAM percent,34.0,34.19,77.7,5.0 -DSP,128,160,2560,32 -DSP percent,2.0,1.78,28.38,1.0 -Frequency,316.67,300.0,370.0,346.0 -Copy,67.01,33.94,377.42,20.15 -Scale,67.24,33.92,365.8,20.04 -Add,68.9,34.58,374.03,15.04 -Triad,68.9,34.57,378.88,15.12 -PCIe Read,6.41,5.68,6.66,inf -PCIe Write,6.32,5.47,6.03,inf +FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,HBM2,SVM +SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +DATA_TYPE,float,float,float,float,float +VECTOR_COUNT,16,16,16,16,16 +GLOBAL_MEM_UNROLL,1,1,1,1,1 +DEVICE_BUFFER_SIZE,65536,4096,16384,2048,1 +NUM_REPLICATIONS,4,4,2,32,1 +LUT,178268,176396,20832,331904,103628 +LUT percent,19,25.0,1.9,20.69,12.0 +Register,297342,449231,39002,574976,244354 +Register percent,,25.0,1.39,27.24,12.0 +BRAM,3926,4029,558,1408,548 +BRAM percent,33,34.0,34.19,77.7,5.0 +DSP,128,128,160,2560,32 +DSP percent,2,2.0,1.78,28.38,1.0 +Frequency,342.23,316.67,300.0,370.0,346.0 +Copy,65.63,67.01,33.94,377.42,20.15 +Scale,65.63,67.24,33.92,365.8,20.04 +Add,67.78,68.9,34.58,374.03,15.04 +Triad,67.80,68.9,34.57,378.88,15.12 +PCIe Read,6.28,6.41,5.68,6.66,inf +PCIe Write,5.87,6.32,5.47,6.03,inf \ No newline at end of file From b0d2fa79143b2a487a78779667c339a69e5ca0cc Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Mon, 6 Feb 2023 15:56:38 +0100 Subject: [PATCH 263/318] update results for intel boards --- docs/source/GEMM/results/gemm-1-0.csv | 3 
++- docs/source/GEMM/results/index.rst | 2 +- docs/source/RandomAccess/results/index.rst | 2 +- docs/source/RandomAccess/results/randomaccess-2-2.csv | 1 + docs/source/STREAM/results/index.rst | 2 +- docs/source/STREAM/results/stream-2-3.csv | 3 ++- 6 files changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/source/GEMM/results/gemm-1-0.csv b/docs/source/GEMM/results/gemm-1-0.csv index c8142d6b..211d4e9f 100644 --- a/docs/source/GEMM/results/gemm-1-0.csv +++ b/docs/source/GEMM/results/gemm-1-0.csv @@ -1,3 +1,4 @@ +Version,1.4,1.0,1.0,1.0,1.0 FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX Memory Type,DDR,DDR,DDR,HBM2,SVM @@ -5,7 +6,7 @@ SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_, -BLOCK_SIZE,"512 ? ",512,256,256,512 +BLOCK_SIZE,512,512,256,256,512 GEMM_SIZE,8,8,8,8,8 GLOBAL_MEM_UNROLL,8,16,16,16,16 DATA_TYPE,float,float,float,float,float diff --git a/docs/source/GEMM/results/index.rst b/docs/source/GEMM/results/index.rst index 923b78d2..7e08adb0 100644 --- a/docs/source/GEMM/results/index.rst +++ b/docs/source/GEMM/results/index.rst @@ -10,7 +10,7 @@ The measurements were executed 10 times and the best result is published. The results and the used configuration is given in :numref:`tbl_gemm_1_0_results` and are also available as :download:`CSV `. .. _tbl_gemm_1_0_results: -.. csv-table:: GEMM FPGA Benchmark Results for version 1.0 +.. csv-table:: GEMM FPGA Benchmark Results :file: gemm-1-0.csv :stub-columns: 1 diff --git a/docs/source/RandomAccess/results/index.rst b/docs/source/RandomAccess/results/index.rst index 52dd983d..a4330c56 100644 --- a/docs/source/RandomAccess/results/index.rst +++ b/docs/source/RandomAccess/results/index.rst @@ -9,7 +9,7 @@ The measurements were executed 10 times and the best result is published. The results and the used configuration is given in :numref:`tbl_randomaccess_2_2_results` and are also available as :download:`CSV `. .. _tbl_randomaccess_2_2_results: -.. csv-table:: RandomAccess FPGA Benchmark Results for version 2.2 +.. csv-table:: RandomAccess FPGA Benchmark Results :file: randomaccess-2-2.csv :stub-columns: 1 diff --git a/docs/source/RandomAccess/results/randomaccess-2-2.csv b/docs/source/RandomAccess/results/randomaccess-2-2.csv index 698edc3a..766a6287 100644 --- a/docs/source/RandomAccess/results/randomaccess-2-2.csv +++ b/docs/source/RandomAccess/results/randomaccess-2-2.csv @@ -1,3 +1,4 @@ +Version,2.5,2.2,2.2,2.2,2.2 FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX Memory Type,DDR,DDR,DDR,HBM2,SVM diff --git a/docs/source/STREAM/results/index.rst b/docs/source/STREAM/results/index.rst index 4b0d8d4a..b529fcee 100644 --- a/docs/source/STREAM/results/index.rst +++ b/docs/source/STREAM/results/index.rst @@ -18,7 +18,7 @@ The results and the used configuration is given in :numref:`tbl_stream_2_3_resul .. _tbl_stream_2_3_results: -.. csv-table:: STREAM FPGA Benchmark Results for version 2.3 +.. 
csv-table:: STREAM FPGA Benchmark Results :file: stream-2-3.csv :stub-columns: 1 diff --git a/docs/source/STREAM/results/stream-2-3.csv b/docs/source/STREAM/results/stream-2-3.csv index 25f1e366..7ee2c8d7 100644 --- a/docs/source/STREAM/results/stream-2-3.csv +++ b/docs/source/STREAM/results/stream-2-3.csv @@ -1,3 +1,4 @@ +Version,2.6,2.3,2.3,2.3,2.3 FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX Memory Type,DDR,DDR,DDR,HBM2,SVM @@ -8,7 +9,7 @@ System,`Noctua 2 Date: Sat, 11 Feb 2023 11:15:42 +0100 Subject: [PATCH 264/318] add u280 results for stream and ra --- .../RandomAccess/results/randomaccess-2-2.csv | 42 +++++++------- docs/source/STREAM/results/stream-2-3.csv | 56 +++++++++---------- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/docs/source/RandomAccess/results/randomaccess-2-2.csv b/docs/source/RandomAccess/results/randomaccess-2-2.csv index 766a6287..59685a9d 100644 --- a/docs/source/RandomAccess/results/randomaccess-2-2.csv +++ b/docs/source/RandomAccess/results/randomaccess-2-2.csv @@ -1,21 +1,21 @@ -Version,2.5,2.2,2.2,2.2,2.2 -FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,DDR,HBM2,SVM -SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ -DEVICE_BUFFER_SIZE,1,1,1024,1024,1024 -NUM_REPLICATIONS,4,4,2,32,1 -LUT,222405,115743,7256,116096,103397 -LUT percent,24,18.0,0.65,10.68,12.0 -Register,434090,253578,11716,187456,225293 -Register percent,24,18.0,0.5,8.76,12.0 -BRAM,602,489,38,608,535 -BRAM percent,5,4.0,2.23,33.55,5.0 -DSP,14,14,14,224,0 -DSP percent,< 1.0,< 1.0,0.16,2.48,0.0 -Frequency,326.05,329.17,446.0,450.0,322.0 -MUOPs,185.633,245.0,40.3,128.1,0.5 -Error,0.0689179,0.0099,0.0106,0.0106,0.0106 \ No newline at end of file +Version,2.5,2.5,2.2,2.2,2.2,2.2 +FPGA board,Alveo U280,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Xilinx XCU280,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,DDR,HBM2,SVM +SDK,2019.2,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,2019.2.3,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +DEVICE_BUFFER_SIZE,1024,1,1,1024,1024,1024 +NUM_REPLICATIONS,2,4,4,2,32,1 +LUT,,222405,115743,7256,116096,103397 +LUT percent,,24,18.0,0.65,10.68,12.0 +Register,,434090,253578,11716,187456,225293 +Register percent,,24,18.0,0.5,8.76,12.0 +BRAM,,602,489,38,608,535 +BRAM percent,,5,4.0,2.23,33.55,5.0 +DSP,,14,14,14,224,0 +DSP percent,,< 1.0,< 1.0,0.16,2.48,0.0 +Frequency,411.015198,326.05,329.17,446.0,450.0,322.0 +MUOPs,39.7888,185.633,245.0,40.3,128.1,0.5 +Error,0.00662282,0.0689179,0.0099,0.0106,0.0106,0.0106 \ No newline at end of file diff --git a/docs/source/STREAM/results/stream-2-3.csv b/docs/source/STREAM/results/stream-2-3.csv index 7ee2c8d7..643ae5ed 100644 --- a/docs/source/STREAM/results/stream-2-3.csv +++ 
b/docs/source/STREAM/results/stream-2-3.csv @@ -1,28 +1,28 @@ -Version,2.6,2.3,2.3,2.3,2.3 -FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,DDR,HBM2,SVM -SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ -DATA_TYPE,float,float,float,float,float -VECTOR_COUNT,16,16,16,16,16 -GLOBAL_MEM_UNROLL,1,1,1,1,1 -DEVICE_BUFFER_SIZE,32768,4096,16384,2048,1 -NUM_REPLICATIONS,4,4,2,32,1 -LUT,178268,176396,20832,331904,103628 -LUT percent,19,25.0,1.9,20.69,12.0 -Register,297342,449231,39002,574976,244354 -Register percent,,25.0,1.39,27.24,12.0 -BRAM,3926,4029,558,1408,548 -BRAM percent,33,34.0,34.19,77.7,5.0 -DSP,128,128,160,2560,32 -DSP percent,2,2.0,1.78,28.38,1.0 -Frequency,342.23,316.67,300.0,370.0,346.0 -Copy,65.63,67.01,33.94,377.42,20.15 -Scale,65.63,67.24,33.92,365.8,20.04 -Add,67.78,68.9,34.58,374.03,15.04 -Triad,67.80,68.9,34.57,378.88,15.12 -PCIe Read,6.28,6.41,5.68,6.66,inf -PCIe Write,5.87,6.32,5.47,6.03,inf \ No newline at end of file +Version,2.6,2.6,2.3,2.3,2.3,2.3 +FPGA board,Alveo U280,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Xilinx XCU280,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,DDR,HBM2,SVM +SDK,2019.2,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,2019.2.3,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +DATA_TYPE,,float,float,float,float,float +VECTOR_COUNT,,16,16,16,16,16 +GLOBAL_MEM_UNROLL,,1,1,1,1,1 +DEVICE_BUFFER_SIZE,,32768,4096,16384,2048,1 +NUM_REPLICATIONS,,4,4,2,32,1 +LUT,,178268,176396,20832,331904,103628 +LUT percent,,19,25.0,1.9,20.69,12.0 +Register,,297342,449231,39002,574976,244354 +Register percent,,,25.0,1.39,27.24,12.0 +BRAM,,3926,4029,558,1408,548 +BRAM percent,,33,34.0,34.19,77.7,5.0 +DSP,,128,128,160,2560,32 +DSP percent,,2,2.0,1.78,28.38,1.0 +Frequency,,342.23,316.67,300.0,370.0,346.0 +Copy (GB/s),32.98,65.63,67.01,33.94,377.42,20.15 +Scale (GB/s),32.98,65.63,67.24,33.92,365.8,20.04 +Add (GB/s),33.88,67.78,68.9,34.58,374.03,15.04 +Triad,33.89,67.80,68.9,34.57,378.88,15.12 +PCIe Read,6.35,6.28,6.41,5.68,6.66,inf +PCIe Write,4.00,5.87,6.32,5.47,6.03,inf \ No newline at end of file From 5bc22eb234f72c9aa690d4cb790b7a66b53bdc61 Mon Sep 17 00:00:00 2001 From: Gerrit Pape Date: Fri, 17 Feb 2023 13:45:41 +0100 Subject: [PATCH 265/318] add data for stream and ra --- .../RandomAccess/results/randomaccess-2-2.csv | 16 +++++------ docs/source/STREAM/results/stream-2-3.csv | 28 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/source/RandomAccess/results/randomaccess-2-2.csv b/docs/source/RandomAccess/results/randomaccess-2-2.csv index 59685a9d..b101cbc6 100644 --- a/docs/source/RandomAccess/results/randomaccess-2-2.csv +++ b/docs/source/RandomAccess/results/randomaccess-2-2.csv @@ -8,14 +8,14 @@ CPU,AMD EPYC Milan 7763,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold System,`Noctua 2 `_,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 
`_,`Noctua 1 `_ DEVICE_BUFFER_SIZE,1024,1,1,1024,1024,1024 NUM_REPLICATIONS,2,4,4,2,32,1 -LUT,,222405,115743,7256,116096,103397 -LUT percent,,24,18.0,0.65,10.68,12.0 -Register,,434090,253578,11716,187456,225293 -Register percent,,24,18.0,0.5,8.76,12.0 -BRAM,,602,489,38,608,535 -BRAM percent,,5,4.0,2.23,33.55,5.0 -DSP,,14,14,14,224,0 -DSP percent,,< 1.0,< 1.0,0.16,2.48,0.0 +LUT,184888,222405,115743,7256,116096,103397 +LUT percent,14.19,24,18.0,0.65,10.68,12.0 +Register,288566,434090,253578,11716,187456,225293 +Register percent,11.08,24,18.0,0.5,8.76,12.0 +BRAM,349.5,602,489,38,608,535 +BRAM percent,17.34,5,4.0,2.23,33.55,5.0 +DSP,24,14,14,14,224,0 +DSP percent,0.27,< 1.0,< 1.0,0.16,2.48,0.0 Frequency,411.015198,326.05,329.17,446.0,450.0,322.0 MUOPs,39.7888,185.633,245.0,40.3,128.1,0.5 Error,0.00662282,0.0689179,0.0099,0.0106,0.0106,0.0106 \ No newline at end of file diff --git a/docs/source/STREAM/results/stream-2-3.csv b/docs/source/STREAM/results/stream-2-3.csv index 643ae5ed..bf12dedf 100644 --- a/docs/source/STREAM/results/stream-2-3.csv +++ b/docs/source/STREAM/results/stream-2-3.csv @@ -6,20 +6,20 @@ SDK,2019.2,21.2.0,19.4.0,2019.2,2019.2,19.4.0 BSP/Shell,2019.2.3,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm CPU,AMD EPYC Milan 7763,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 System,`Noctua 2 `_,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ -DATA_TYPE,,float,float,float,float,float -VECTOR_COUNT,,16,16,16,16,16 -GLOBAL_MEM_UNROLL,,1,1,1,1,1 -DEVICE_BUFFER_SIZE,,32768,4096,16384,2048,1 -NUM_REPLICATIONS,,4,4,2,32,1 -LUT,,178268,176396,20832,331904,103628 -LUT percent,,19,25.0,1.9,20.69,12.0 -Register,,297342,449231,39002,574976,244354 -Register percent,,,25.0,1.39,27.24,12.0 -BRAM,,3926,4029,558,1408,548 -BRAM percent,,33,34.0,34.19,77.7,5.0 -DSP,,128,128,160,2560,32 -DSP percent,,2,2.0,1.78,28.38,1.0 -Frequency,,342.23,316.67,300.0,370.0,346.0 +DATA_TYPE,float,float,float,float,float,float +VECTOR_COUNT,16,16,16,16,16,16 +GLOBAL_MEM_UNROLL,1,1,1,1,1,1 +DEVICE_BUFFER_SIZE,16384,32768,4096,16384,2048,1 +NUM_REPLICATIONS,2,4,4,2,32,1 +LUT,188124,178268,176396,20832,331904,103628 +LUT percent,14.44,19,25.0,1.9,20.69,12.0 +Register,298365,297342,449231,39002,574976,244354 +Register percent,11.45,,25.0,1.39,27.24,12.0 +BRAM,853.5,3926,4029,558,1408,548 +BRAM percent,42.43,33,34.0,34.19,77.7,5.0 +DSP,170,128,128,160,2560,32 +DSP percent,1.88,2,2.0,1.78,28.38,1.0 +Frequency,411.015198,342.23,316.67,300.0,370.0,346.0 Copy (GB/s),32.98,65.63,67.01,33.94,377.42,20.15 Scale (GB/s),32.98,65.63,67.24,33.92,365.8,20.04 Add (GB/s),33.88,67.78,68.9,34.58,374.03,15.04 From d138c904b9b9524ea5b137e64aab2854305e0d33 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Apr 2023 18:42:27 +0200 Subject: [PATCH 266/318] Update hlslib to follow master to support XRT 2.14 --- extern/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 197fa734..0e8bed30 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -28,7 +28,7 @@ FetchContent_Declare( # unfortunately they do not use releases, so the latest commit was used GIT_REPOSITORY https://github.com/definelicht/hlslib.git - GIT_TAG v1.4.3) + GIT_TAG master) FetchContent_GetProperties(extern_hlslib) if(NOT extern_hlslib_POPULATED) From 074e9544495215e7dcd8cca917f2ab73ba35475c Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 21 Apr 2023 19:52:52 +0200 Subject: [PATCH 267/318] 
Disable TCP bypass in ACCL stack to prevent data loss --- cmake/accl.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index fd29f4ee..7a31a665 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -59,7 +59,7 @@ add_custom_command( COMMAND mkdir build && cd build && cmake .. -DFDEV_NAME=u280 -DVIVADO_HLS_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 -DVIVADO_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 - -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=1 -DTCP_STACK_WINDOW_SCALING_EN=0 && + -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=0 -DTCP_STACK_WINDOW_SCALING_EN=0 && make installip WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR}) From 6a68ca4bb1036fb27dd56341fd621ba81e5dc50e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 25 Apr 2023 09:05:57 +0200 Subject: [PATCH 268/318] Fix ACCL host signatures in b_eff --- b_eff/src/host/execution_types/execution_accl.hpp | 11 +++++------ b_eff/src/host/execution_types/execution_accl_pl.hpp | 7 +++---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp index 2ade570b..998e4d78 100644 --- a/b_eff/src/host/execution_types/execution_accl.hpp +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -40,7 +40,7 @@ namespace network::execution_types::accl { @copydoc bm_execution::calculate() */ template - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { @@ -111,14 +111,13 @@ namespace network::execution_types::accl { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! 
for (int r = 0; r < config.programSettings->kernelReplications; r++) { - std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); } - std::shared_ptr result(new network::ExecutionTimings{ - looplength, + return network::ExecutionTimings{ + looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index eecb552e..d5df937f 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -47,7 +47,7 @@ namespace network::execution_types::accl_pl { @copydoc bm_execution::calculate() */ template - std::shared_ptr + network::ExecutionTimings calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { @@ -137,12 +137,11 @@ namespace network::execution_types::accl_pl { } std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution From c989680d141f29aebbf99b8a7840cc1ab9e7ff74 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 25 Apr 2023 14:53:15 +0200 Subject: [PATCH 269/318] Fix ACCL PL kernel --- b_eff/src/device/communication_ACCL_pl.cpp | 31 +++++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl.cpp b/b_eff/src/device/communication_ACCL_pl.cpp index 97a21907..4d4548c5 100644 --- a/b_eff/src/device/communication_ACCL_pl.cpp +++ b/b_eff/src/device/communication_ACCL_pl.cpp @@ -22,12 +22,35 @@ SOFTWARE.
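PATCH 268 above replaces the heap-allocated std::shared_ptr result with a plain value return. A minimal sketch of the pattern, assuming a struct shaped like the network::ExecutionTimings aggregate from the diff (field values are illustrative):

    #include <vector>

    struct ExecutionTimings {
        unsigned int looplength;
        unsigned int messageSize;
        std::vector<double> calculationTimings;
    };

    ExecutionTimings calculate() {
        std::vector<double> timings{0.1, 0.2};
        // Aggregate initialization plus copy elision / move semantics make
        // the former shared_ptr indirection unnecessary.
        return ExecutionTimings{10u, 4u, timings};
    }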
#include "accl_hls.h" -void send_recv(const float *read_buffer,float *write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +void send_recv(ap_uint<64> read_buffer,ap_uint<64> write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts) { - accl_hls::ACCLCommand accl_cmd(cmd, sts, communicator_addr, datapath_cfg,0,0); +#pragma HLS INTERFACE s_axilite port=read_buffer +#pragma HLS INTERFACE s_axilite port=write_buffer +#pragma HLS INTERFACE s_axilite port=size +#pragma HLS INTERFACE s_axilite port=num_iterations +#pragma HLS INTERFACE s_axilite port=neighbor_rank +#pragma HLS INTERFACE s_axilite port=communicator_addr +#pragma HLS INTERFACE s_axilite port=datapath_cfg +#pragma HLS INTERFACE axis port=cmd +#pragma HLS INTERFACE axis port=sts +#pragma HLS INTERFACE s_axilite port=return + accl_hls::ACCLCommand accl(cmd, sts); for (int i = 0; i < num_iterations; i++) { - accl_cmd.send(size, 0, neighbor_rank, (ap_uint<64>)read_buffer); - accl_cmd.recv(size, 0, neighbor_rank, (ap_uint<64>)write_buffer); + #pragma HLS protocol fixed + accl.start_call( + ACCL_SEND, size, communicator_addr, neighbor_rank, 0, 0, + datapath_cfg, 0, 0, + read_buffer, 0, 0); + ap_wait(); + accl.finalize_call(); + ap_wait(); + accl.start_call( + ACCL_RECV, size, communicator_addr, neighbor_rank, 0, 0, + datapath_cfg, 0, 0, + 0, write_buffer, 0); + ap_wait(); + accl.finalize_call(); } } + From d81c17736478bfa3f98b7618fc31e081b8768c45 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 25 Apr 2023 14:54:06 +0200 Subject: [PATCH 270/318] Start device indexing at 0 for XRT --- shared/setup/fpga_setup_xrt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp index 1b41f9e0..f5d7ef32 100644 --- a/shared/setup/fpga_setup_xrt.cpp +++ b/shared/setup/fpga_setup_xrt.cpp @@ -42,7 +42,7 @@ namespace fpga_setup { } else { //TODO Use xrt::system::enumerate_devices() in "experimental/xrt_system.h" for future XRT versions // instead of hardcoded number of devices. - current_device = current_device + 1 % 3; + current_device = current_device % 3; } return std::unique_ptr(new xrt::device(current_device)); } From 7d84815146e2caee54e9edd9e3e82736f9e924d0 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 25 Apr 2023 14:55:59 +0200 Subject: [PATCH 271/318] Adjust copying of validation data --- b_eff/src/host/execution_types/execution_accl.hpp | 3 ++- b_eff/src/host/execution_types/execution_accl_pl.hpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp index 998e4d78..32e5a34e 100644 --- a/b_eff/src/host/execution_types/execution_accl.hpp +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -111,7 +111,8 @@ namespace network::execution_types::accl { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! 
for (int r = 0; r < config.programSettings->kernelReplications; r++) { - std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); + acclRecvBuffers[r]->sync_from_device(); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); } return network::ExecutionTimings{ looplength, diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index d5df937f..2fc79956 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -133,9 +133,9 @@ namespace network::execution_types::accl_pl { // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { if (!config.programSettings->useAcclEmulation) { - acclRecvBuffers.back()->sync_from_device(); + acclRecvBuffers[r]->sync_from_device(); } - std::copy(recvBufferContents[r].begin(), recvBufferContents[r].begin() + validationData.size() / config.programSettings->kernelReplications, validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); } return network::ExecutionTimings{ looplength, From 0c0ff70b2f717909cb47bc19389565825e52e916 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Apr 2023 14:56:10 +0200 Subject: [PATCH 272/318] Fix ACCL executor --- .../host/execution_types/execution_accl.hpp | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp index 32e5a34e..b678e6fb 100644 --- a/b_eff/src/host/execution_types/execution_accl.hpp +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -45,11 +45,11 @@ namespace network::execution_types::accl { cl::vector &validationData) { int err; - std::vector> dummyBufferContents; - std::vector> recvBufferContents; - std::vector>> acclSendBuffers; - std::vector>> acclRecvBuffers; - size_t size_in_bytes = std::max(static_cast(validationData.size()), static_cast(1 << messageSize)); + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + size_t size_in_bytes = std::max((1 << messageSize), 4); int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); @@ -66,12 +66,12 @@ namespace network::execution_types::accl { int size_in_values = (size_in_bytes + 3) / 4; // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { - dummyBufferContents.emplace_back(size_in_values, static_cast(messageSize & (255))); - recvBufferContents.emplace_back(size_in_values, static_cast(0)); - acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_values, ACCL::dataType::float32)); - acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_values, ACCL::dataType::float32)); - acclSendBuffers.back()->sync_to_device(); - acclRecvBuffers.back()->sync_to_device(); + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32)); + acclSendBuffers.back()->sync_to_device(); + acclRecvBuffers.back()->sync_to_device(); } double calculationTime = 0.0; @@ -80,15 +80,19 @@ namespace network::execution_types::accl { auto startCalculation = std::chrono::high_resolution_clock::now(); for (int l = 0; l < looplength; l++) { #ifndef NDEBUG - std::cout << "Send " << size_in_values << " bytes to " + std::cout << "Send " << size_in_bytes << " bytes to " << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; #endif - config.context->accl->send(*acclSendBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); + config.context->accl->send(*acclSendBuffers[i], size_in_values, + (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + 0, ACCL::GLOBAL_COMM, true); #ifndef NDEBUG - std::cout << "Recv " << size_in_values << " bytes from " + std::cout << "Recv " << size_in_bytes << " bytes from " << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; #endif - config.context->accl->recv(*acclRecvBuffers[i], size_in_values, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0); + config.context->accl->recv(*acclRecvBuffers[i], size_in_values, + (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + 0, ACCL::GLOBAL_COMM, true); #ifndef NDEBUG std::cout << "Done" << std::endl; #endif @@ -112,7 +116,7 @@ namespace network::execution_types::accl { // The data order should not matter, because every byte should have the same value! 
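The partner expression used in the send/recv calls of PATCH 272 pairs up the ranks of a ring so that each rank alternately exchanges with its lower and upper neighbor. A self-contained worked example for four ranks (the rank count is assumed for illustration):

    #include <cstdio>

    int main() {
        const int size = 4; // number of MPI ranks
        for (int i = 0; i < 2; i++) {
            for (int rank = 0; rank < size; rank++) {
                int partner = (rank - 1 + 2 * ((rank + i) % 2) + size) % size;
                std::printf("i=%d: rank %d <-> rank %d\n", i, rank, partner);
            }
        }
        // i=0 pairs (0,3) and (1,2); i=1 pairs (0,1) and (2,3), so both
        // ring directions are exercised across the kernel replications.
        return 0;
    }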
for (int r = 0; r < config.programSettings->kernelReplications; r++) { acclRecvBuffers[r]->sync_from_device(); - std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); } return network::ExecutionTimings{ looplength, From c2d022d34818c12c919fc5da0a9aa195bee9c0e5 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Apr 2023 14:58:04 +0200 Subject: [PATCH 273/318] Adjust ACCL PL executor for updated validation scheme --- .../execution_types/execution_accl_pl.hpp | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index 2fc79956..ab765de7 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -35,7 +35,7 @@ SOFTWARE. /* Project's headers */ -extern void send_recv(const float *read_buffer,float *write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +extern void send_recv(ap_uint<64> read_buffer,ap_uint<64> write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts); @@ -54,9 +54,9 @@ namespace network::execution_types::accl_pl { int err; std::vector> dummyBufferContents; std::vector> recvBufferContents; - std::vector>> acclSendBuffers; - std::vector>> acclRecvBuffers; - cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + cl_uint size_in_bytes = (1 << messageSize); int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); @@ -77,8 +77,6 @@ namespace network::execution_types::accl_pl { std::vector calculationTimings; for (uint r =0; r < config.programSettings->numRepetitions; r++) { - dummyBufferContents.clear(); - recvBufferContents.clear(); acclSendBuffers.clear(); acclRecvBuffers.clear(); int size_in_values = (size_in_bytes + 3) / 4; @@ -86,8 +84,8 @@ namespace network::execution_types::accl_pl { for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); - acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_values * 4, ACCL::dataType::float32)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32)); acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } @@ -102,12 +100,12 @@ namespace network::execution_types::accl_pl { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); if (!config.programSettings->useAcclEmulation) { - auto run = sendrecvKernel(*(acclSendBuffers[i]->bo()), *(acclRecvBuffers[i]->bo()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + 
current_size) % current_size, - config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32})); + auto run = sendrecvKernel(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->physical_address(), size_in_bytes, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); run.wait(); } else { - send_recv(reinterpret_cast(acclSendBuffers[i]->buffer()), reinterpret_cast(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, - config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::float32, ACCL::dataType::float32}), + send_recv(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->physical_address(), size_in_bytes, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), cmd, sts); } auto endCalculation = std::chrono::high_resolution_clock::now(); @@ -135,7 +133,11 @@ namespace network::execution_types::accl_pl { if (!config.programSettings->useAcclEmulation) { acclRecvBuffers[r]->sync_from_device(); } - std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), validationData.begin() + validationData.size() / config.programSettings->kernelReplications * r); + for (int c=0; c < size_in_bytes; c++) { + std::cout << int(recvBufferContents[r][c]) << ","; + } + std::cout << std::endl; + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); } return network::ExecutionTimings{ looplength, From 54ea54db9b0d532d0504f4b205e41b036a21eee3 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Apr 2023 14:59:58 +0200 Subject: [PATCH 274/318] Fix bandwidth calculation. Still 0s? --- b_eff/src/host/network_benchmark.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 66cbb2a4..d4412461 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -213,11 +213,11 @@ network::NetworkBenchmark::collectResults() { int messageSize = timing.first; int num_timings = timing.second.execution_timings.size(); // The total sent data in bytes will be: - // #Nodes * message_size * looplength * 2 - // the * 2 is because we have two kernels per bitstream that will send and receive simultaneously. + // #Nodes * message_size * looplength * kernel_replications + // the * kernel_replications is because we have multiple replications per bitstream that will send and receive simultaneously. // This will be divided by half of the maximum of the minimum measured runtime over all ranks. 
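Plugging assumed numbers into the formula this comment describes (the corrected assignment follows below): with timings collected from 2 ranks, 2 kernel replications, 1 MiB messages (messageSize = 20) and looplength = 100, a slowest-rank runtime of 0.5 s gives

    // 2 ranks * 2 replications * 2^20 B * 100 iterations / 0.5 s
    double maxCalcBW = static_cast<double>(2 * 2 * (1 << 20) * 100) / 0.5;
    // -> 838860800 B/s, i.e. roughly 0.84 GB/s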
- timing.second.maxCalcBW = static_cast(num_timings * 2 * (1 << messageSize) * looplength) - / timing.second.maxMinCalculationTime; + timing.second.maxCalcBW = static_cast( num_timings * executionSettings->programSettings->kernelReplications + * (1 << messageSize) * looplength) / timing.second.maxMinCalculationTime; maxBandwidths.push_back(timing.second.maxCalcBW); @@ -231,7 +231,7 @@ network::NetworkBenchmark::collectResults() { void network::NetworkBenchmark::printResults() { std::cout << std::setw(ENTRY_SPACE) << "MSize" << " " << std::setw(ENTRY_SPACE) << "looplength" << " " - << std::setw(ENTRY_SPACE) << "transfer" << " " + << std::setw(ENTRY_SPACE) << "time [s]" << " " << std::setw(ENTRY_SPACE) << "B/s" << std::endl; for (const auto& timing : collected_timings) { From 57ac05745b41e4c5c87d63fc475c99012a73cccb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Apr 2023 16:19:05 +0200 Subject: [PATCH 275/318] Fix b_eff ACCL PL kernel recv --- b_eff/src/device/communication_ACCL_pl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/b_eff/src/device/communication_ACCL_pl.cpp b/b_eff/src/device/communication_ACCL_pl.cpp index 4d4548c5..c32a3af5 100644 --- a/b_eff/src/device/communication_ACCL_pl.cpp +++ b/b_eff/src/device/communication_ACCL_pl.cpp @@ -48,7 +48,7 @@ void send_recv(ap_uint<64> read_buffer,ap_uint<64> write_buffer, ap_uint<32> si accl.start_call( ACCL_RECV, size, communicator_addr, neighbor_rank, 0, 0, datapath_cfg, 0, 0, - 0, write_buffer, 0); + 0, 0, write_buffer); ap_wait(); accl.finalize_call(); } From cbdc76e81c7c04a45c79c2dadd9f7a17d563d05b Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Apr 2023 16:19:49 +0200 Subject: [PATCH 276/318] Fix ACCL PL host code to pass validation --- b_eff/src/host/execution_types/execution_accl_pl.hpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index ab765de7..1e8eee04 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -65,7 +65,7 @@ namespace network::execution_types::accl_pl { MPI_Comm_size(MPI_COMM_WORLD, & current_size); hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); - hlslib::Stream cmd, sts; + hlslib::Stream cmd("cmd"), sts("sts"); std::vector dest = {0}; std::unique_ptr cclo; @@ -100,11 +100,11 @@ namespace network::execution_types::accl_pl { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); if (!config.programSettings->useAcclEmulation) { - auto run = sendrecvKernel(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->physical_address(), size_in_bytes, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + auto run = sendrecvKernel(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->physical_address(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); - run.wait(); + run.wait(); } else { - send_recv(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->physical_address(), size_in_bytes, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + send_recv(acclSendBuffers[i]->physical_address(), 
acclRecvBuffers[i]->physical_address(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), cmd, sts); } @@ -130,9 +130,7 @@ namespace network::execution_types::accl_pl { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { - if (!config.programSettings->useAcclEmulation) { - acclRecvBuffers[r]->sync_from_device(); - } + acclRecvBuffers[r]->sync_from_device(); for (int c=0; c < size_in_bytes; c++) { std::cout << int(recvBufferContents[r][c]) << ","; } From 77db40f6e478e505f4937a043ad7ae39c4409788 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 26 Apr 2023 16:24:32 +0200 Subject: [PATCH 277/318] Remove debug output --- b_eff/src/host/execution_types/execution_accl_pl.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index 1e8eee04..5df86d1e 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -131,10 +131,6 @@ namespace network::execution_types::accl_pl { // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { acclRecvBuffers[r]->sync_from_device(); - for (int c=0; c < size_in_bytes; c++) { - std::cout << int(recvBufferContents[r][c]) << ","; - } - std::cout << std::endl; std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); } return network::ExecutionTimings{ From 881fdc14b49056ca5d5a235368001bd343abccfb Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 27 Apr 2023 18:36:50 +0200 Subject: [PATCH 278/318] Change default rxbuf banks to 2,3 --- shared/setup/fpga_setup_accl.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp index 36561553..4e293910 100644 --- a/shared/setup/fpga_setup_accl.cpp +++ b/shared/setup/fpga_setup_accl.cpp @@ -122,10 +122,11 @@ ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program, 64 * 1024 * 1024, ACCL::dataType::int8, device, network_krnl.group_id(4))); configure_tcp(*accl.tx_buf_network, *accl.rx_buf_network, network_krnl, ranks, current_rank); } - std::vector mem(1, 0); + std::vector mem = {2, 3}; std::cout << "Create ACCL" << std::endl; accl.accl = std::unique_ptr( - new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, mem, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize)); + new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0, + mem, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize, programSettings.acclBufferSize)); } else { // TODO: Add start port here. Currenty hardcoded! 
accl.accl = std::unique_ptr( From 39924f967ae7a9bbdce204e427198e0e9e9d7530 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 27 Apr 2023 18:37:34 +0200 Subject: [PATCH 279/318] Set send and recv buffer to different banks ACCL --- b_eff/src/host/execution_types/execution_accl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp index b678e6fb..3d5f41e5 100644 --- a/b_eff/src/host/execution_types/execution_accl.hpp +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -68,8 +68,8 @@ namespace network::execution_types::accl { for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32)); - acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 1)); acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } From d027197d97b076c272eafd5f797567ee49700170 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 27 Apr 2023 18:37:42 +0200 Subject: [PATCH 280/318] Set send and recv buffer to different banks ACCL PL --- b_eff/src/host/execution_types/execution_accl_pl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp index 5df86d1e..9135ec84 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -84,8 +84,8 @@ namespace network::execution_types::accl_pl { for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32)); - acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 1)); acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } From b0ca5a8ced121c6d16aa3aa95a88e0c0d48c96fe Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 28 Apr 2023 16:10:40 +0200 Subject: [PATCH 281/318] Fix JSON dump feature --- shared/hpcc_settings.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/shared/hpcc_settings.cpp b/shared/hpcc_settings.cpp index 136534ff..8cbd2319 100644 --- a/shared/hpcc_settings.cpp +++ b/shared/hpcc_settings.cpp @@ -19,6 +19,7 @@ hpcc_base::BaseSettings::BaseSettings(cxxopts::ParseResult 
&results) : numRepeti defaultPlatform(results["platform"].as()), defaultDevice(results["device"].as()), kernelFileName(results["f"].as()), + dumpfilePath(results["dump-json"].as()), #ifdef NUM_REPLICATIONS kernelReplications(results.count("r") > 0 ? results["r"].as() : NUM_REPLICATIONS), #else @@ -57,4 +58,4 @@ hpcc_base::BaseSettings::getSettingsMap() { return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)}, {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? "Yes" : "No"}, {"Communication Type", commToString(communicationType)}}; -} \ No newline at end of file +} From a7d417b24eb311376c3e291e736ed1b781b2dc8e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 5 May 2023 11:40:04 +0200 Subject: [PATCH 282/318] Allow compilation iwth new cl header with Xilinx --- b_eff/src/host/network_benchmark.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 12b1f51f..a017aa2c 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -31,7 +31,7 @@ SOFTWARE. #include "hpcc_benchmark.hpp" #include "parameters.h" -#ifdef XILINX_FPGA +#ifdef USE_DEPRECATED_HPP_HEADER template struct aligned_allocator { From 71b9b575d89b08940951c7ef3aef517e38136704 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 15 May 2023 20:29:21 +0200 Subject: [PATCH 283/318] Convert replication to Jinja2 --- PTRANS/src/device/transpose_PQ_ACCL_stream.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp index 739792e0..15a16edf 100644 --- a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp @@ -15,6 +15,7 @@ const unsigned int block_size = BLOCK_SIZE; const unsigned int channel_width = CHANNEL_WIDTH; // PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** * Read blocks of matrix A and transpose them in memory. 
@@ -34,7 +35,7 @@ const unsigned int channel_width = CHANNEL_WIDTH; * @param width_in_blocks The with of matrix A in blocks * @param height_in_blocks The height of matix A in blocks */ -void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, +void transpose_read{{ i }}( const DEVICE_DATA_TYPE *A, const unsigned int offset_a, const unsigned int number_of_blocks, const unsigned int width_in_blocks, @@ -145,7 +146,7 @@ void transpose_read/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, * @param width_in_blocks The with of matrix A in blocks * @param height_in_blocks The height of matix A in blocks */ -void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, +void transpose_write{{ i }}(const DEVICE_DATA_TYPE *B, DEVICE_DATA_TYPE *A_out, const unsigned int offset_b, const unsigned int number_of_blocks, @@ -194,5 +195,5 @@ void transpose_write/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE *B, } } -// PY_CODE_GEN block_end +{% endfor %} From c3bfb08b6072724ce34182e3b886ebd6baab3716 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 23 May 2023 18:53:21 +0200 Subject: [PATCH 284/318] Send data to stream for full-duplex --- .../device/communication_ACCL_pl_stream.cpp | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 b_eff/src/device/communication_ACCL_pl_stream.cpp diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp new file mode 100644 index 00000000..f22913ef --- /dev/null +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -0,0 +1,63 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#include "accl_hls.h" + + +void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &data_in, STREAM &data_out, + STREAM &cmd, STREAM &sts) { +#pragma HLS INTERFACE s_axilite port=read_buffer +#pragma HLS INTERFACE m_axi port=write_buffer +#pragma HLS INTERFACE s_axilite port=size +#pragma HLS INTERFACE s_axilite port=num_iterations +#pragma HLS INTERFACE s_axilite port=neighbor_rank +#pragma HLS INTERFACE s_axilite port=communicator_addr +#pragma HLS INTERFACE s_axilite port=datapath_cfg +#pragma HLS INTERFACE axis port=data_in +#pragma HLS INTERFACE axis port=data_out +#pragma HLS INTERFACE axis port=cmd +#pragma HLS INTERFACE axis port=sts +#pragma HLS INTERFACE s_axilite port=return + accl_hls::ACCLCommand accl(cmd, sts); + for (int i = 0; i < num_iterations; i++) { + #pragma HLS protocol fixed + // Send data from global memory to the remote FPGA. + // Remote FPGA will immediatly move data to stream. + // This will allow overlapping of send and recv. + accl.start_call( + ACCL_SEND, size, communicator_addr, neighbor_rank, 0, 0, + datapath_cfg, 0, 2, + read_buffer, 0, 0); + ap_wait(); + // receive the incoming data while send may still be in progress + for (int chunk = 0; chunk < (size + 15) / 16 ; chunk++) { + #pragma HLS pipeline II=1 + stream_word word = data_in.read(); + write_buffer[chunk] = word.data; + } + // Wait to complete send + accl.finalize_call(); + ap_wait(); + } +} + From 7168b67a97ddb6e3f9d624c98de1bbfec54ad3e6 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 23 May 2023 18:54:18 +0200 Subject: [PATCH 285/318] Use streaming ACCL PL as default --- b_eff/src/host/CMakeLists.txt | 2 +- b_eff/src/host/execution_types/execution.hpp | 2 +- .../execution_accl_pl_stream.hpp | 145 ++++++++++++++++++ 3 files changed, 147 insertions(+), 2 deletions(-) create mode 100644 b_eff/src/host/execution_types/execution_accl_pl_stream.hpp diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index e5e09aed..ac11320e 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -23,7 +23,7 @@ if (USE_ACCL) set(CMAKE_SKIP_BUILD_RPATH No) set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) - list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl.cpp) + list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl_stream.cpp) endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp index 17c3241c..86aec21c 100644 --- a/b_eff/src/host/execution_types/execution.hpp +++ b/b_eff/src/host/execution_types/execution.hpp @@ -29,5 +29,5 @@ SOFTWARE. 
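The chunk loop in the PATCH 284 kernel moves one 512-bit stream_word per cycle; since a word carries 16 32-bit values, (size + 15) / 16 is the ceiling division of the message length in values to full stream words. For example:

    int size = 20;                // message length in 32-bit values
    int words = (size + 15) / 16; // -> 2 stream words (the second word
                                  //    is only partially filled)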
#endif #else #include "execution_types/execution_accl.hpp" -#include "execution_types/execution_accl_pl.hpp" +#include "execution_types/execution_accl_pl_stream.hpp" #endif diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp new file mode 100644 index 00000000..934f0d68 --- /dev/null +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -0,0 +1,145 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_PL_STREAM_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_PL_STREAM_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" +#include "accl.hpp" +#include "cclo_bfm.h" +#include "accl_hls.h" + +/* Project's headers */ + +extern void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &data_in, STREAM &data_out, STREAM &cmd, STREAM &sts); + +namespace network::execution_types::accl_pl { + + + /* + Implementation for the single kernel. + @copydoc bm_execution::calculate() + */ + template + network::ExecutionTimings + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + cl_uint size_in_bytes = (1 << messageSize); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); + hlslib::Stream cmd("cmd"), sts("sts"); + + std::vector dest = {0}; + std::unique_ptr cclo; + if (config.programSettings->useAcclEmulation) { + cclo = std::make_unique(6000, current_rank, current_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + cclo->run(); + } + MPI_Barrier(MPI_COMM_WORLD); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + acclSendBuffers.clear(); + acclRecvBuffers.clear(); + int size_in_values = (size_in_bytes + 3) / 4; + // Create all kernels and buffers. 
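The emulation path above wires the kernel against a CCLO bus-functional model instead of real hardware. A condensed sketch of that harness, with the stream element types assumed (command_word for command/status, stream_word for data; the dest vector type is also an assumption) and port 6000 taken from the host code:

    hlslib::Stream<command_word> cmd("cmd"), sts("sts");
    hlslib::Stream<stream_word> cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo");
    std::vector<unsigned int> dest = {0};
    auto cclo = std::make_unique<CCLO_BFM>(6000, current_rank, current_size,
                                           dest, cmd, sts, cclo2krnl, krnl2cclo);
    cclo->run();
    // ... call send_recv_stream(...) against these streams ...
    cclo->stop();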
The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 1)); + acclSendBuffers.back()->sync_to_device(); + acclRecvBuffers.back()->sync_to_device(); + } + + xrt::kernel sendrecvKernel; + if (!config.programSettings->useAcclEmulation) { + sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv_stream"); + } + + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + if (!config.programSettings->useAcclEmulation) { + auto run = sendrecvKernel(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); + run.wait(); + } else { + send_recv_stream(acclSendBuffers[i]->physical_address(), reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), + cclo2krnl, krnl2cclo, cmd, sts); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + + if (config.programSettings->useAcclEmulation) { + cclo->stop(); + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! 
+ for (int r = 0; r < config.programSettings->kernelReplications; r++) { + acclRecvBuffers[r]->sync_from_device(); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); + } + return network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }; + } + +} // namespace bm_execution + +#endif From b536c3a03050886e4fccddd593bc50e5f2660872 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 23 May 2023 19:09:06 +0200 Subject: [PATCH 286/318] Use stream design for kernel build --- b_eff/src/device/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index e878e9c5..865cb249 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -18,7 +18,8 @@ endif() if (Vitis_FOUND) generate_kernel_targets_xilinx(communication_PCIE) if (USE_ACCL) - generate_kernel_targets_xilinx(communication_ACCL communication_ACCL_pl) + generate_kernel_targets_xilinx(communication_ACCL + communication_ACCL_pl_stream) endif() add_test(NAME test_emulation_pcie_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) From 7ba851d9e25e21e565de59b04beea03d09c08ea1 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 24 May 2023 10:38:50 +0200 Subject: [PATCH 287/318] Force data_out to be master --- b_eff/src/device/communication_ACCL_pl_stream.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index f22913ef..2e3a6089 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -38,6 +38,17 @@ void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_ui #pragma HLS INTERFACE axis port=cmd #pragma HLS INTERFACE axis port=sts #pragma HLS INTERFACE s_axilite port=return + + // This is just dummycode to define data_out as + // master AXI stream. There seems to be no interface pragma to do this + // and if it isn't done, the stream is implemented as slave and throw an + // error during synthesis. 
+ if (false) { + stream_word tmp; + data_out.write(tmp); + } + + accl_hls::ACCLCommand accl(cmd, sts); From b0ca5a8ced121c6d16aa3aa95a88e0c0d48c96fe Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 24 May 2023 13:54:04 +0200 Subject: [PATCH 288/318] Add loopback for reduce kernel --- .../src/device/communication_ACCL_pl_stream.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index 2e3a6089..5db824f5 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -72,3 +72,19 @@ void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_ui } } +void loopback_reduce(STREAM & in0, STREAM & in1, STREAM & out) { +#pragma HLS INTERFACE axis register both port=in0 +#pragma HLS INTERFACE axis register both port=in1 +#pragma HLS INTERFACE axis register both port=out +#pragma HLS INTERFACE ap_ctrl_none port=return + +stream_word tmp; + +do{ +#pragma HLS PIPELINE II=1 + tmp = in0.read(); + tmp = in1.read(); + out.write(tmp); +} while(tmp.last == 0); + +} From d58a6b6e2c37c12f465a0c8c3d5fb1dcec6742a7 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 24 May 2023 15:58:51 +0200 Subject: [PATCH 289/318] Attempt to not optimize away write --- b_eff/src/device/communication_ACCL_pl_stream.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index 5db824f5..d88f10d0 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -43,7 +43,7 @@ void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_ui // master AXI stream. There seems to be no interface pragma to do this // and if it isn't done, the stream is implemented as slave and throw an // error during synthesis. - if (false) { + if (num_iterations == 0) { stream_word tmp; data_out.write(tmp); } From b0ff73fb2b39c47fbf041c88c151cf5bf1e1cf6a Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 25 May 2023 18:22:55 +0200 Subject: [PATCH 290/318] Modify kernel to read and write over stream --- .../device/communication_ACCL_pl_stream.cpp | 78 +++++++++++-------- .../execution_accl_pl_stream.hpp | 10 ++- 2 files changed, 52 insertions(+), 36 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index d88f10d0..060efbba 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -22,12 +22,50 @@ SOFTWARE. 
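A distilled view of the port-direction trick that PATCH 287 introduces and PATCH 289 repairs, assuming the types from accl_hls.h: HLS infers whether an AXI-Stream port is a master from the presence of a write, and a write guarded by if (false) is constant-folded away before that inference happens.

    void direction_demo(ap_uint<32> num_iterations, STREAM<stream_word> &data_out) {
    #pragma HLS INTERFACE axis port=data_out
        // A runtime condition the compiler cannot fold keeps the write in
        // the IR, so data_out is inferred as a master port even though this
        // branch never executes for real workloads (num_iterations >= 1).
        if (num_iterations == 0) {
            stream_word tmp;
            data_out.write(tmp);
        }
    }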
#include "accl_hls.h" -void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +void +write_data(ap_uint<512>* read_buffer, ap_uint<32> size, STREAM &data_out) { + // receive the incoming data while send may still be in progress + for (int chunk = 0; chunk < (size + 15) / 16; chunk++) { + #pragma HLS pipeline II=1 + stream_word word; + word.last = 1; + word.keep = -1; + word.dest = 0; + word.data = read_buffer[chunk]; + data_out.write(word); + } +} + +void +read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &data_in, ap_uint<32> neighbor_rank, + ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts) { + #pragma HLS protocol fixed + // Send data from stream to the remote FPGA. + // Remote FPGA will immediatly move data to stream. + // This will allow overlapping of send and recv. + accl_hls::ACCLCommand accl(cmd, sts); + accl.start_call( + ACCL_SEND, size, communicator_addr, neighbor_rank, 0, 0, + datapath_cfg, 0, 3, + 0, 0, 0); + ap_wait(); + // receive the incoming data while send may still be in progress + for (int chunk = 0; chunk < (size + 15) / 16 ; chunk++) { + #pragma HLS pipeline II=1 + stream_word word = data_in.read(); + write_buffer[chunk] = word.data; + } + ap_wait(); + accl.finalize_call(); +} + + +void send_recv_stream(ap_uint<512>* read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &data_in, STREAM &data_out, STREAM &cmd, STREAM &sts) { -#pragma HLS INTERFACE s_axilite port=read_buffer -#pragma HLS INTERFACE m_axi port=write_buffer +#pragma HLS INTERFACE m_axi port=read_buffer bundle=read +#pragma HLS INTERFACE m_axi port=write_buffer bundle=write #pragma HLS INTERFACE s_axilite port=size #pragma HLS INTERFACE s_axilite port=num_iterations #pragma HLS INTERFACE s_axilite port=neighbor_rank @@ -38,37 +76,13 @@ void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_ui #pragma HLS INTERFACE axis port=cmd #pragma HLS INTERFACE axis port=sts #pragma HLS INTERFACE s_axilite port=return - - // This is just dummycode to define data_out as - // master AXI stream. There seems to be no interface pragma to do this - // and if it isn't done, the stream is implemented as slave and throw an - // error during synthesis. - if (num_iterations == 0) { - stream_word tmp; - data_out.write(tmp); - } - - accl_hls::ACCLCommand accl(cmd, sts); for (int i = 0; i < num_iterations; i++) { - #pragma HLS protocol fixed - // Send data from global memory to the remote FPGA. - // Remote FPGA will immediatly move data to stream. - // This will allow overlapping of send and recv. 
- accl.start_call( - ACCL_SEND, size, communicator_addr, neighbor_rank, 0, 0, - datapath_cfg, 0, 2, - read_buffer, 0, 0); - ap_wait(); - // receive the incoming data while send may still be in progress - for (int chunk = 0; chunk < (size + 15) / 16 ; chunk++) { - #pragma HLS pipeline II=1 - stream_word word = data_in.read(); - write_buffer[chunk] = word.data; - } - // Wait to complete send - accl.finalize_call(); - ap_wait(); + #pragma HLS dataflow + + write_data(read_buffer, size, data_out); + + read_data(write_buffer, size, data_in, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts); } } diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index 934f0d68..0801a23c 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -35,7 +35,7 @@ SOFTWARE. /* Project's headers */ -extern void send_recv_stream(ap_uint<64> read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +extern void send_recv_stream(ap_uint<512>* read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &data_in, STREAM &data_out, STREAM &cmd, STREAM &sts); @@ -100,11 +100,11 @@ namespace network::execution_types::accl_pl { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); if (!config.programSettings->useAcclEmulation) { - auto run = sendrecvKernel(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + auto run = sendrecvKernel(*acclSendBuffers[i]->bo(), *acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); run.wait(); } else { - send_recv_stream(acclSendBuffers[i]->physical_address(), reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + send_recv_stream(reinterpret_cast*>(acclSendBuffers[i]->buffer()), reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), cclo2krnl, krnl2cclo, cmd, sts); } @@ -130,7 +130,9 @@ namespace network::execution_types::accl_pl { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! 
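// [Editor's note] The neighbor expression used in the kernel invocation above,
//     (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
// pairs the ranks for replication i: if (current_rank + i) is even the partner
// is current_rank - 1, if it is odd the partner is current_rank + 1, both taken
// modulo the world size. Worked example with current_size = 4 and i = 0:
//     rank 0 -> 3,  rank 1 -> 2,  rank 2 -> 1,  rank 3 -> 0
// so each rank sends to exactly the rank it receives from.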
for (int r = 0; r < config.programSettings->kernelReplications; r++) { - acclRecvBuffers[r]->sync_from_device(); + if (!config.programSettings->useAcclEmulation) { + acclRecvBuffers[r]->sync_from_device(); + } std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); } return network::ExecutionTimings{ From fe2e3aed07b9b517a2ecca72115780accf88faf8 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 26 May 2023 14:48:57 +0200 Subject: [PATCH 291/318] Fix gmem port names --- b_eff/src/device/communication_ACCL_pl_stream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index 060efbba..44e05826 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -64,8 +64,8 @@ void send_recv_stream(ap_uint<512>* read_buffer,ap_uint<512>* write_buffer, ap_ ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &data_in, STREAM &data_out, STREAM &cmd, STREAM &sts) { -#pragma HLS INTERFACE m_axi port=read_buffer bundle=read -#pragma HLS INTERFACE m_axi port=write_buffer bundle=write +#pragma HLS INTERFACE m_axi port=read_buffer bundle=gmem_in +#pragma HLS INTERFACE m_axi port=write_buffer bundle=gmem_out #pragma HLS INTERFACE s_axilite port=size #pragma HLS INTERFACE s_axilite port=num_iterations #pragma HLS INTERFACE s_axilite port=neighbor_rank From da59d220f91319417343459b0a9d452114a4ff23 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 30 May 2023 19:27:51 +0200 Subject: [PATCH 292/318] Set mem bank by group id --- .../device/communication_ACCL_pl_stream.cpp | 10 ++++++---- .../execution_accl_pl_stream.hpp | 20 ++++++++++++------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index 44e05826..bf728f61 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -39,6 +39,7 @@ write_data(ap_uint<512>* read_buffer, ap_uint<32> size, STREAM &dat void read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &data_in, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts) { + issue_and_recv: { #pragma HLS protocol fixed // Send data from stream to the remote FPGA. // Remote FPGA will immediatly move data to stream. 
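// [Editor's note] PATCH 292 above introduces the named block
// "issue_and_recv: { ... }" because Vitis HLS applies "#pragma HLS protocol
// fixed" to its enclosing scope; a labelled sub-scope confines the fixed
// protocol region to the command/receive sequence instead of the whole
// function. Schematically (hypothetical region name, not part of the patch):
//
//     my_region: {
//     #pragma HLS protocol fixed
//         accl.start_call(...);    // issue the ACCL command
//         ap_wait();               // pin ordering before the stream reads
//         // ... drain data_in into write_buffer ...
//         ap_wait();
//         accl.finalize_call();    // retire the command afterwards
//     }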
@@ -57,6 +58,7 @@ read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &dat } ap_wait(); accl.finalize_call(); + } } @@ -71,10 +73,10 @@ void send_recv_stream(ap_uint<512>* read_buffer,ap_uint<512>* write_buffer, ap_ #pragma HLS INTERFACE s_axilite port=neighbor_rank #pragma HLS INTERFACE s_axilite port=communicator_addr #pragma HLS INTERFACE s_axilite port=datapath_cfg -#pragma HLS INTERFACE axis port=data_in -#pragma HLS INTERFACE axis port=data_out -#pragma HLS INTERFACE axis port=cmd -#pragma HLS INTERFACE axis port=sts +#pragma HLS INTERFACE axis register both port=data_in +#pragma HLS INTERFACE axis register both port=data_out +#pragma HLS INTERFACE axis register both port=cmd +#pragma HLS INTERFACE axis register both port=sts #pragma HLS INTERFACE s_axilite port=return for (int i = 0; i < num_iterations; i++) { diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index 0801a23c..c888e6b7 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -80,21 +80,27 @@ namespace network::execution_types::accl_pl { acclSendBuffers.clear(); acclRecvBuffers.clear(); int size_in_values = (size_in_bytes + 3) / 4; + + xrt::kernel sendrecvKernel; + if (!config.programSettings->useAcclEmulation) { + sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv_stream"); + } // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 0)); - acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 1)); + if (!config.programSettings->useAcclEmulation) { + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 1)); + } + else { + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, sendrecvKernel.group_id(0))); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, sendrecvKernel.group_id(1))); + } acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } - xrt::kernel sendrecvKernel; - if (!config.programSettings->useAcclEmulation) { - sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv_stream"); - } - double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { MPI_Barrier(MPI_COMM_WORLD); From b08f31836eaf9d19fd11c9f41a8c438fbc7ffd40 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 31 May 2023 18:40:37 +0200 Subject: [PATCH 293/318] Separate logic into three kernels --- .../device/communication_ACCL_pl_stream.cpp | 81 ++++++++++++++----- .../execution_accl_pl_stream.hpp | 43 +++++++--- 2 files changed, 91 insertions(+), 33 deletions(-) diff --git 
a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index bf728f61..2b9850b0 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -21,6 +21,7 @@ SOFTWARE. */ #include "accl_hls.h" +typedef ap_axiu<1, 0, 0, 0> notify_word; void write_data(ap_uint<512>* read_buffer, ap_uint<32> size, STREAM &data_out) { @@ -37,9 +38,20 @@ write_data(ap_uint<512>* read_buffer, ap_uint<32> size, STREAM &dat } void -read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &data_in, ap_uint<32> neighbor_rank, - ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts) { - issue_and_recv: { +read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &data_in) { + // receive the incoming data while send may still be in progress + for (int chunk = 0; chunk < (size + 15) / 16 ; chunk++) { + #pragma HLS pipeline II=1 + stream_word word = data_in.read(); + write_buffer[chunk] = word.data; + } +} + +void +schedule_send(ap_uint<32> size, ap_uint<32> neighbor_rank, + ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &cmd, STREAM &sts) { + send_fixed: { #pragma HLS protocol fixed // Send data from stream to the remote FPGA. // Remote FPGA will immediatly move data to stream. @@ -49,45 +61,70 @@ read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &dat ACCL_SEND, size, communicator_addr, neighbor_rank, 0, 0, datapath_cfg, 0, 3, 0, 0, 0); - ap_wait(); - // receive the incoming data while send may still be in progress - for (int chunk = 0; chunk < (size + 15) / 16 ; chunk++) { - #pragma HLS pipeline II=1 - stream_word word = data_in.read(); - write_buffer[chunk] = word.data; - } ap_wait(); accl.finalize_call(); } } +void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + STREAM &data_in, + STREAM ¬ify) { +#pragma HLS INTERFACE m_axi port=write_buffer bundle=gmem_out +#pragma HLS INTERFACE s_axilite port=size +#pragma HLS INTERFACE s_axilite port=num_iterations +#pragma HLS INTERFACE s_axilite port=neighbor_rank +#pragma HLS INTERFACE s_axilite port=communicator_addr +#pragma HLS INTERFACE s_axilite port=datapath_cfg +#pragma HLS INTERFACE axis port=data_in +#pragma HLS INTERFACE axis port=cmd +#pragma HLS INTERFACE axis port=sts +#pragma HLS INTERFACE axis port=notify +#pragma HLS INTERFACE s_axilite port=return + + notify_word w; + for (int i = 0; i < num_iterations; i++) { + read_data(write_buffer, size, data_in); + notify.write(w); + } +} -void send_recv_stream(ap_uint<512>* read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, - STREAM &data_in, STREAM &data_out, - STREAM &cmd, STREAM &sts) { -#pragma HLS INTERFACE m_axi port=read_buffer bundle=gmem_in -#pragma HLS INTERFACE m_axi port=write_buffer bundle=gmem_out + STREAM &cmd, STREAM &sts, + STREAM ¬ify) { #pragma HLS INTERFACE s_axilite port=size #pragma HLS INTERFACE s_axilite port=num_iterations #pragma HLS INTERFACE s_axilite port=neighbor_rank #pragma HLS INTERFACE s_axilite port=communicator_addr #pragma HLS INTERFACE s_axilite port=datapath_cfg -#pragma HLS INTERFACE axis register both port=data_in -#pragma HLS INTERFACE axis register both port=data_out -#pragma HLS INTERFACE axis register both port=cmd -#pragma HLS INTERFACE axis register both port=sts +#pragma HLS 
INTERFACE axis port=cmd +#pragma HLS INTERFACE axis port=sts +#pragma HLS INTERFACE axis port=notify #pragma HLS INTERFACE s_axilite port=return for (int i = 0; i < num_iterations; i++) { - #pragma HLS dataflow + schedule_send(size, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts); + notify_word w = notify.read(); + } +} - write_data(read_buffer, size, data_out); +void send_stream(ap_uint<512>* read_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + STREAM &data_out) { +#pragma HLS INTERFACE m_axi port=read_buffer bundle=gmem_in +#pragma HLS INTERFACE s_axilite port=size +#pragma HLS INTERFACE s_axilite port=num_iterations +#pragma HLS INTERFACE s_axilite port=neighbor_rank +#pragma HLS INTERFACE s_axilite port=communicator_addr +#pragma HLS INTERFACE s_axilite port=datapath_cfg +#pragma HLS INTERFACE axis port=data_out +#pragma HLS INTERFACE s_axilite port=return - read_data(write_buffer, size, data_in, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts); + for (int i = 0; i < num_iterations; i++) { + write_data(read_buffer, size, data_out); } } + void loopback_reduce(STREAM & in0, STREAM & in1, STREAM & out) { #pragma HLS INTERFACE axis register both port=in0 #pragma HLS INTERFACE axis register both port=in1 diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index c888e6b7..2b0e9039 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -34,10 +34,17 @@ SOFTWARE. #include "accl_hls.h" /* Project's headers */ +typedef ap_axiu<1, 0, 0, 0> notify_word; -extern void send_recv_stream(ap_uint<512>* read_buffer,ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, +extern void send_stream(ap_uint<512>* read_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + STREAM &data_out); + +extern void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + STREAM &data_in, STREAM ¬ify); + +extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, - STREAM &data_in, STREAM &data_out, STREAM &cmd, STREAM &sts); + STREAM &cmd, STREAM &sts, STREAM ¬ify); namespace network::execution_types::accl_pl { @@ -66,6 +73,7 @@ namespace network::execution_types::accl_pl { hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); hlslib::Stream cmd("cmd"), sts("sts"); + hlslib::Stream notify("notify"); std::vector dest = {0}; std::unique_ptr cclo; @@ -81,21 +89,25 @@ namespace network::execution_types::accl_pl { acclRecvBuffers.clear(); int size_in_values = (size_in_bytes + 3) / 4; - xrt::kernel sendrecvKernel; + xrt::kernel sendKernel; + xrt::kernel recvKernel; + xrt::kernel scheduleKernel; if (!config.programSettings->useAcclEmulation) { - sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv_stream"); + sendKernel = xrt::kernel(*config.device, *config.program, "send_stream"); + recvKernel = xrt::kernel(*config.device, *config.program, "recv_stream"); + scheduleKernel = xrt::kernel(*config.device, *config.program, "schedule_stream"); } // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); - if (!config.programSettings->useAcclEmulation) { + if (config.programSettings->useAcclEmulation) { acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 0)); acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 1)); } else { - acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, sendrecvKernel.group_id(0))); - acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, sendrecvKernel.group_id(1))); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, sendKernel.group_id(0))); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, recvKernel.group_id(0))); } acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); @@ -106,13 +118,22 @@ namespace network::execution_types::accl_pl { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); if (!config.programSettings->useAcclEmulation) { - auto run = sendrecvKernel(*acclSendBuffers[i]->bo(), *acclRecvBuffers[i]->bo(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength); + auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength); + MPI_Barrier(MPI_COMM_WORLD); + auto run_schedule = scheduleKernel(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); - run.wait(); + run_send.wait(); + run_recv.wait(); + run_schedule.wait(); } else { - send_recv_stream(reinterpret_cast*>(acclSendBuffers[i]->buffer()), reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + send_stream(reinterpret_cast*>(acclSendBuffers[i]->buffer()), size_in_values, looplength, + krnl2cclo); + schedule_stream(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), - cclo2krnl, krnl2cclo, cmd, sts); + cmd, sts, notify); + recv_stream(reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, + cclo2krnl, notify); } auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); From cddae806168fbe399dcdd177c73364b08593d444 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 31 May 2023 22:20:22 +0200 Subject: [PATCH 294/318] Remove unused interface pragmas --- 
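[Editor's note] PATCH 293 above splits the former send_recv_stream kernel into
send_stream, recv_stream and schedule_stream, synchronized through a one-bit
AXI-Stream token (typedef ap_axiu<1, 0, 0, 0> notify_word). Reduced to its
core, the handshake looks like the following sketch (hypothetical names,
assuming ap_axi_sdata.h and hls_stream.h):

    #include <ap_axi_sdata.h>
    #include <hls_stream.h>

    typedef ap_axiu<1, 0, 0, 0> token_t;

    // receiver: emit one token after a complete message has been drained
    void signal_done(hls::stream<token_t> &notify) {
        token_t t;
        notify.write(t);
    }

    // scheduler: block on the token before issuing the next ACCL_SEND;
    // the token's value is irrelevant, its arrival is the event
    void wait_for_recv(hls::stream<token_t> &notify) {
        token_t t = notify.read();
        (void)t;
    }

The token keeps schedule_stream from queueing a new send command before
recv_stream has emptied the cclo2krnl stream for the previous iteration.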
b_eff/src/device/communication_ACCL_pl_stream.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index 2b9850b0..fb1e2ee1 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -72,12 +72,7 @@ void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_ #pragma HLS INTERFACE m_axi port=write_buffer bundle=gmem_out #pragma HLS INTERFACE s_axilite port=size #pragma HLS INTERFACE s_axilite port=num_iterations -#pragma HLS INTERFACE s_axilite port=neighbor_rank -#pragma HLS INTERFACE s_axilite port=communicator_addr -#pragma HLS INTERFACE s_axilite port=datapath_cfg #pragma HLS INTERFACE axis port=data_in -#pragma HLS INTERFACE axis port=cmd -#pragma HLS INTERFACE axis port=sts #pragma HLS INTERFACE axis port=notify #pragma HLS INTERFACE s_axilite port=return @@ -113,9 +108,6 @@ void send_stream(ap_uint<512>* read_buffer, ap_uint<32> size, ap_uint<32> num_i #pragma HLS INTERFACE m_axi port=read_buffer bundle=gmem_in #pragma HLS INTERFACE s_axilite port=size #pragma HLS INTERFACE s_axilite port=num_iterations -#pragma HLS INTERFACE s_axilite port=neighbor_rank -#pragma HLS INTERFACE s_axilite port=communicator_addr -#pragma HLS INTERFACE s_axilite port=datapath_cfg #pragma HLS INTERFACE axis port=data_out #pragma HLS INTERFACE s_axilite port=return From ae34f5c04f18896570b14c477b1550117f2f5ecf Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 1 Jun 2023 11:43:01 +0200 Subject: [PATCH 295/318] Thread emulation --- .../execution_types/execution_accl_pl_stream.hpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index 2b0e9039..f6805bad 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -26,6 +26,7 @@ SOFTWARE. 
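// [Editor's note] PATCH 295 below replaces the sequential emulation calls
// with host threads, passing the hlslib streams via std::ref because
// std::thread copies its arguments by value and the stream objects are not
// copyable. The essential pattern (sketch with the names used in this file):
//
//     std::thread run_send(send_stream, src, size, iters, std::ref(krnl2cclo));
//     std::thread run_recv(recv_stream, dst, size, iters,
//                          std::ref(cclo2krnl), std::ref(notify));
//     run_send.join();
//     run_recv.join();
//
// Run sequentially instead, schedule_stream would block forever on
// notify.read() before recv_stream ever gets the chance to produce a token.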
#include #include #include +#include /* External library headers */ #include "mpi.h" @@ -127,13 +128,16 @@ namespace network::execution_types::accl_pl { run_recv.wait(); run_schedule.wait(); } else { - send_stream(reinterpret_cast*>(acclSendBuffers[i]->buffer()), size_in_values, looplength, - krnl2cclo); - schedule_stream(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + std::thread run_send(send_stream, reinterpret_cast*>(acclSendBuffers[i]->buffer()), size_in_values, looplength, + std::ref(krnl2cclo)); + std::thread run_recv(recv_stream, reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, + std::ref(cclo2krnl), std::ref(notify)); + std::thread run_schedule(schedule_stream,size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), - cmd, sts, notify); - recv_stream(reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, - cclo2krnl, notify); + std::ref(cmd), std::ref(sts), std::ref(notify)); + run_send.join(); + run_recv.join(); + run_schedule.join(); } auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); From 060af8d044478d4ad9781967450dbcc004a1cc22 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 1 Jun 2023 14:25:19 +0200 Subject: [PATCH 296/318] Fix protocol for schedule and recv sync --- b_eff/src/device/communication_ACCL_pl_stream.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index fb1e2ee1..aa2697d0 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -78,7 +78,9 @@ void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_ notify_word w; for (int i = 0; i < num_iterations; i++) { + #pragma HLS protocol fixed read_data(write_buffer, size, data_in); + ap_wait(); notify.write(w); } } @@ -98,7 +100,9 @@ void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, #pragma HLS INTERFACE s_axilite port=return for (int i = 0; i < num_iterations; i++) { + #pragma HLS protocol fixed schedule_send(size, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts); + ap_wait(); notify_word w = notify.read(); } } From 682bff4a327cdeae2e6720b28d85f59b3999a411 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 1 Jun 2023 19:46:59 +0200 Subject: [PATCH 297/318] Fix signatures --- .../execution_types/execution_xrt_accl_pq.hpp | 11 +++++------ .../execution_xrt_accl_stream_pq.hpp | 19 +++++++++---------- .../execution_xrt_accl_stream_pq_sendrecv.hpp | 11 +++++------ .../execution_types/execution_xrt_pcie_pq.hpp | 10 +++++----- 4 files changed, 24 insertions(+), 27 deletions(-) diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp index 3a2111f3..13c7c263 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -366,7 +366,7 @@ void accl_exchangeData( * @return std::unique_ptr The measured * execution times */ -static std::unique_ptr calculate( +static std::map> calculate( const hpcc_base::ExecutionSettings &config, 
transpose::TransposeData &data, @@ -586,11 +586,10 @@ static std::unique_ptr calculate( transferTimings.push_back(transferTime.count()); } - std::unique_ptr result( - new transpose::TransposeExecutionTimings{transferTimings, - calculationTimings}); - - return result; + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; } } // namespace accl_pq diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp index 27e240e6..84121480 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -35,7 +35,7 @@ SOFTWARE. #include "Simulation.h" #include "accl.hpp" -extern void transpose_write(const DEVICE_DATA_TYPE *B, +extern void transpose_write0(const DEVICE_DATA_TYPE *B, DEVICE_DATA_TYPE *A_out, const unsigned int offset_b, const unsigned int number_of_blocks, @@ -43,7 +43,7 @@ extern void transpose_write(const DEVICE_DATA_TYPE *B, const unsigned int height_in_blocks, hlslib::Stream &cclo2krnl); -extern void transpose_read( const DEVICE_DATA_TYPE *A, +extern void transpose_read0( const DEVICE_DATA_TYPE *A, const unsigned int offset_a, const unsigned int number_of_blocks, const unsigned int width_in_blocks, @@ -66,7 +66,7 @@ namespace accl_stream_pq { * @return std::unique_ptr The measured * execution times */ -static std::unique_ptr calculate( +static std::map> calculate( const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, @@ -248,7 +248,7 @@ static std::unique_ptr calculate( (bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)))); } else { - HLSLIB_DATAFLOW_FUNCTION(transpose_read, + HLSLIB_DATAFLOW_FUNCTION(transpose_read0, (config.programSettings->copyA ? 
data.A : data.A), static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), @@ -257,7 +257,7 @@ static std::unique_ptr calculate( (bufferSizeList[r]) / (local_matrix_width * data.blockSize * data.blockSize)), krnl2cclo); - HLSLIB_DATAFLOW_FUNCTION(transpose_write, + HLSLIB_DATAFLOW_FUNCTION(transpose_write0, data.B, data.result, static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), @@ -348,11 +348,10 @@ static std::unique_ptr calculate( transferTimings.push_back(transferTime.count()); } - std::unique_ptr result( - new transpose::TransposeExecutionTimings{transferTimings, - calculationTimings}); - - return result; + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; } } // namespace accl_pq diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp index 5282b5da..20c9f596 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp @@ -70,7 +70,7 @@ namespace accl_stream_sendrecv_pq { * @return std::unique_ptr The measured * execution times */ -static std::unique_ptr calculate( +static std::map> calculate( const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, @@ -455,11 +455,10 @@ static std::unique_ptr calculate( transferTimings.push_back(transferTime.count()); } - std::unique_ptr result( - new transpose::TransposeExecutionTimings{transferTimings, - calculationTimings}); - - return result; + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; } } // namespace accl_pq diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp index 0fa0f9c2..f621394a 100644 --- a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -49,7 +49,7 @@ namespace pcie_pq { * execution times */ template -static std::unique_ptr calculate( +static std::map> calculate( const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, @@ -270,11 +270,11 @@ static std::unique_ptr calculate( transferTimings.push_back(transferTime.count()); } - std::unique_ptr result( - new transpose::TransposeExecutionTimings{transferTimings, - calculationTimings}); + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; - return result; } } // namespace pcie_pq From a512815d6699557c471a4e75983dba3efd1f072f Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 1 Jun 2023 19:47:56 +0200 Subject: [PATCH 298/318] Kernel replication for host side --- PTRANS/src/host/CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index b9e0541b..554b4a3e 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -37,7 +37,15 @@ if (Vitis_FOUND) set(CMAKE_SKIP_BUILD_RPATH No) set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) - list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp ${CMAKE_SOURCE_DIR}/src/device/transpose_PQ_ACCL_stream.cpp) + list(APPEND kernel_files transpose_PQ_ACCL_stream_sendrecv.cpp 
 transpose_PQ_ACCL_stream.cpp)
+    foreach (files ${kernel_files})
+        set(source_f "${CMAKE_BINARY_DIR}/src/device/${files}")
+        set(base_file "${CMAKE_SOURCE_DIR}/src/device/${files}")
+        add_custom_command(OUTPUT ${source_f}
+            COMMAND ${Python3_EXECUTABLE} ${CODE_GENERATOR} -o ${source_f} -p num_replications=1 -p num_total_replications=1 ${base_file}
+            MAIN_DEPENDENCY ${base_file})
+        list(APPEND HOST_SOURCE ${source_f})
+    endforeach()
 endif()
 add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE})
 target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS})

From e3c7266870d96fd437716ae0cf59d98696bb2f05 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 1 Jun 2023 19:48:21 +0200
Subject: [PATCH 299/318] Use custom ACCL branch

---
 extern/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt
index 0e8bed30..ebeabdf3 100644
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -62,8 +62,8 @@ message(STATUS "ACCL was selected. Fetch ACCL dependencies")
 FetchContent_Declare(
   extern_accl
 
-  GIT_REPOSITORY https://github.com/Xilinx/ACCL.git
-  GIT_TAG dev)
+  GIT_REPOSITORY https://github.com/Mellich/ACCL.git
+  GIT_TAG modded)
 
 FetchContent_GetProperties(extern_accl)
 if(NOT extern_accl_POPULATED)

From 77894e7bccbb6843ca0e07e9254b07fcf7aaaf4d Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 5 Jun 2023 17:57:26 +0200
Subject: [PATCH 300/318] Fix validation for multiple repetitions

---
 b_eff/src/host/execution_types/execution_accl_pl_stream.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
index f6805bad..12d95d5c 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
@@ -88,6 +88,8 @@ namespace network::execution_types::accl_pl {
         for (uint r =0; r < config.programSettings->numRepetitions; r++) {
             acclSendBuffers.clear();
             acclRecvBuffers.clear();
+            dummyBufferContents.clear();
+            recvBufferContents.clear();
             int size_in_values = (size_in_bytes + 3) / 4;
 
             xrt::kernel sendKernel;

From 480c4e19b1496468be373fc6529c6ba4987f6ef8 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 5 Jun 2023 18:59:36 +0200
Subject: [PATCH 301/318] Add default mem bank option for ACCL

---
 cmake/accl.cmake                    |  2 ++
 shared/hpcc_settings.cpp            |  2 ++
 shared/include/base_parameters.h.in |  2 ++
 shared/include/hpcc_settings.hpp    | 10 ++++++++
 shared/setup/fpga_setup_accl.cpp    |  5 ++---
 5 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/cmake/accl.cmake b/cmake/accl.cmake
index 7a31a665..01335805 100644
--- a/cmake/accl.cmake
+++ b/cmake/accl.cmake
@@ -5,6 +5,8 @@ set(ACCL_UDP_ETH_IFS 1 CACHE STRING "Number of Ethernet interfaces to synthesize
 set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform")
 set(DEFAULT_ACCL_BUFFER_SIZE 8192 CACHE STRING "Size of ACCL buffers in KB")
 set(DEFAULT_ACCL_BUFFER_COUNT 16 CACHE STRING "Number of ACCL buffers")
+set(DEFAULT_ACCL_BUFFER_BANK 0 CACHE STRING "Default memory bank for ACCL buffers")
+set(DEFAULT_ACCL_RECV_BUFFER_BANKS 1 CACHE STRING "Memory banks to allocate receive buffers (can be comma-separated list)")
 set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware)
 set(ACCL_CCLO_ADDITIONAL_BUILD_ARGS "" CACHE STRING "Add additional build arguments that will be passed to the CCLO makefile")
 set(ACCL_CCLO_BUILD_ARGS ${ACCL_CCLO_ADDITIONAL_BUILD_ARGS})
diff --git a/shared/hpcc_settings.cpp b/shared/hpcc_settings.cpp
index 8cbd2319..b0f096b8 100644
--- a/shared/hpcc_settings.cpp
+++ b/shared/hpcc_settings.cpp
@@ -30,6 +30,8 @@ hpcc_base::BaseSettings::BaseSettings(cxxopts::ParseResult &results) : numRepeti
     acclProtocol(fpga_setup::acclProtocolStringToEnum(results["accl-protocol"].as())),
     acclBufferSize(results["accl-buffer-size"].as() * 1024),
     acclBufferCount(results["accl-buffer-count"].as()),
+    acclRecvBufferMemBanks(results["accl-recv-banks"].as>()),
+    acclDefaultBank(results["accl-default-bank"].as()),
 #endif
 #ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED
     communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())),
diff --git a/shared/include/base_parameters.h.in b/shared/include/base_parameters.h.in
index 45a1100b..2946e7cb 100644
--- a/shared/include/base_parameters.h.in
+++ b/shared/include/base_parameters.h.in
@@ -13,6 +13,8 @@
 #cmakedefine DEFAULT_ACCL_BUFFER_SIZE @DEFAULT_ACCL_BUFFER_SIZE@
 #cmakedefine DEFAULT_ACCL_BUFFER_COUNT @DEFAULT_ACCL_BUFFER_COUNT@
 #cmakedefine ACCL_STACK_TYPE "@ACCL_STACK_TYPE@"
+#cmakedefine DEFAULT_ACCL_RECV_BUFFER_BANKS @DEFAULT_ACCL_RECV_BUFFER_BANKS@
+#cmakedefine DEFAULT_ACCL_BUFFER_BANK @DEFAULT_ACCL_BUFFER_BANK@
 
 /**
  Output separator
diff --git a/shared/include/hpcc_settings.hpp b/shared/include/hpcc_settings.hpp
index 7597b68d..39836045 100644
--- a/shared/include/hpcc_settings.hpp
+++ b/shared/include/hpcc_settings.hpp
@@ -120,6 +120,16 @@ class BaseSettings {
      *
      */
     uint acclBufferCount;
+
+    /**
+     * @brief Memory banks used to create ACCL receive buffers
+     */
+    std::vector acclRecvBufferMemBanks;
+
+    /**
+     * @brief Default bank for memory buffers created with the ACCL driver
+     */
+    int acclDefaultBank;
 #endif
 
     /**
diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp
index 4e293910..fdaeaf7f 100644
--- a/shared/setup/fpga_setup_accl.cpp
+++ b/shared/setup/fpga_setup_accl.cpp
@@ -122,11 +122,10 @@ ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program,
                           64 * 1024 * 1024, ACCL::dataType::int8, device, network_krnl.group_id(4)));
         configure_tcp(*accl.tx_buf_network, *accl.rx_buf_network, network_krnl, ranks, current_rank);
     }
-    std::vector mem = {2, 3};
     std::cout << "Create ACCL" << std::endl;
     accl.accl = std::unique_ptr(
-        new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, 0,
-                       mem, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize, programSettings.acclBufferSize));
+        new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, programSettings.acclDefaultBank,
+                       programSettings.acclRecvBufferMemBanks, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize, programSettings.acclBufferSize));
     } else {
         // TODO: Add start port here. Currently hardcoded!
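// [Editor's note] With PATCH 301/302 applied, the bank list {2, 3} removed
// above is no longer hard-coded. A hypothetical invocation overriding the
// build-time defaults (flag names from PATCH 302, binary name as used in the
// b_eff tests):
//
//     ./Network_xilinx -f communication_ACCL_hw.xclbin \
//         --accl-default-bank 0 --accl-recv-banks 2,3
//
// --accl-recv-banks takes a comma-separated list that cxxopts parses into
// acclRecvBufferMemBanks; --accl-default-bank fills acclDefaultBank.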
         accl.accl = std::unique_ptr(

From e94de6cfd87e63a85a6d2e0ee678b9b679926254 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 5 Jun 2023 19:02:40 +0200
Subject: [PATCH 302/318] Add accl buffer options to parser

---
 shared/include/hpcc_benchmark.hpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 3b96a4a6..3ffa11b8 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -272,6 +272,10 @@ class HpccFpgaBenchmark {
                 cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_SIZE)))
             ("accl-buffer-count", "Specify the number of ACCL buffers used within the benchmark",
                 cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_COUNT)))
+            ("accl-default-bank", "Default memory bank used by ACCL to create new FPGA buffers",
+                cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_BANK)))
+            ("accl-recv-banks", "Memory banks used by ACCL for receive buffers",
+                cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_RECV_BUFFER_BANKS)))
 #endif
             ("skip-validation", "Skip the validation of the output data. This will speed up execution and helps when working with special data types.")
             ("device", "Index of the device that has to be used. If not given you "\

From b24953224fe36b4132183b75c0c793f8d2a0a1d6 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 12 Jun 2023 11:45:13 +0200
Subject: [PATCH 303/318] Fix ACCL buffer size parameters

---
 shared/hpcc_settings.cpp                 | 17 ++++++++++++++++-
 shared/include/base_parameters.h.in      |  2 +-
 shared/include/hpcc_benchmark.hpp        |  4 ++--
 shared/include/setup/fpga_setup_accl.hpp | 10 ++++++++++
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/shared/hpcc_settings.cpp b/shared/hpcc_settings.cpp
index b0f096b8..e621ffe6 100644
--- a/shared/hpcc_settings.cpp
+++ b/shared/hpcc_settings.cpp
@@ -57,7 +57,22 @@ hpcc_base::BaseSettings::getSettingsMap() {
     if (mpi_size > 0) {
         str_mpi_ranks = std::to_string(mpi_size);
     }
+#ifdef USE_ACCL
+    std::stringstream accl_recv_banks;
+    for (auto& b: acclRecvBufferMemBanks) {
+        accl_recv_banks << b << ",";
+    }
+#endif
     return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)},
             {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? "Yes" : "No"},
-            {"Communication Type", commToString(communicationType)}};
+            {"Communication Type", commToString(communicationType)}
+#ifdef USE_ACCL
+            ,{"ACCL Protocol", fpga_setup::acclEnumToProtocolString(acclProtocol)},
+            {"ACCL Recv. Banks", accl_recv_banks.str()},
+            {"ACCL Default Bank", std::to_string(acclDefaultBank)},
+            {"ACCL Buffer Size", std::to_string(acclBufferSize) + "KB"},
+            {"ACCL Buffer Count", std::to_string(acclBufferCount)},
+            {"ACCL Emulation", useAcclEmulation ? "Yes" : "No"}
+#endif
+            };
 }
diff --git a/shared/include/base_parameters.h.in b/shared/include/base_parameters.h.in
index 2946e7cb..6915a14c 100644
--- a/shared/include/base_parameters.h.in
+++ b/shared/include/base_parameters.h.in
@@ -13,7 +13,7 @@
 #cmakedefine DEFAULT_ACCL_BUFFER_SIZE @DEFAULT_ACCL_BUFFER_SIZE@
 #cmakedefine DEFAULT_ACCL_BUFFER_COUNT @DEFAULT_ACCL_BUFFER_COUNT@
 #cmakedefine ACCL_STACK_TYPE "@ACCL_STACK_TYPE@"
-#cmakedefine DEFAULT_ACCL_RECV_BUFFER_BANKS @DEFAULT_ACCL_RECV_BUFFER_BANKS@
+#cmakedefine DEFAULT_ACCL_RECV_BUFFER_BANKS "@DEFAULT_ACCL_RECV_BUFFER_BANKS@"
 #cmakedefine DEFAULT_ACCL_BUFFER_BANK @DEFAULT_ACCL_BUFFER_BANK@
 
 /**
diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index 3ffa11b8..f49edd9b 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -273,9 +273,9 @@ class HpccFpgaBenchmark {
             ("accl-buffer-count", "Specify the number of ACCL buffers used within the benchmark",
                 cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_COUNT)))
             ("accl-default-bank", "Default memory bank used by ACCL to create new FPGA buffers",
-                cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_BANK)))
+                cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_BANK)))
             ("accl-recv-banks", "Memory banks used by ACCL for receive buffers",
-                cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_RECV_BUFFER_BANKS)))
+                cxxopts::value>()->default_value(DEFAULT_ACCL_RECV_BUFFER_BANKS))
 #endif
diff --git a/shared/include/setup/fpga_setup_accl.hpp b/shared/include/setup/fpga_setup_accl.hpp
index 0f451ced..fb7d85b3 100644
--- a/shared/include/setup/fpga_setup_accl.hpp
+++ b/shared/include/setup/fpga_setup_accl.hpp
@@ -50,6 +50,16 @@ static const std::map acclProtocolMap = {
     {"TCP", ACCL::networkProtocol::TCP}
 };
 
+static std::string acclEnumToProtocolString(ACCL::networkProtocol p) {
+    for (const auto& entry: acclProtocolMap) {
+        if (entry.second == p) {
+            return entry.first;
+        }
+    }
+    throw std::runtime_error("ACCL network protocol could not be parsed to string!");
+    return "";
+}
+
 static ACCL::networkProtocol acclProtocolStringToEnum(std::string string_representation) {
     if (acclProtocolMap.count(string_representation)) {
         return acclProtocolMap.at(string_representation);

From 23303389210bd1101516d88c9525cd796aa77f2f Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 29 Jun 2023 18:06:10 +0200
Subject: [PATCH 304/318] Sleep before actual send

---
 b_eff/src/host/execution_types/execution_accl_pl_stream.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
index 12d95d5c..bf7b36fd 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
@@ -122,8 +122,10 @@ namespace network::execution_types::accl_pl {
             auto startCalculation = std::chrono::high_resolution_clock::now();
             if (!config.programSettings->useAcclEmulation) {
                 auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength);
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
                 auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength);
                 MPI_Barrier(MPI_COMM_WORLD);
+                startCalculation = std::chrono::high_resolution_clock::now();
                 auto run_schedule = scheduleKernel(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
                     config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}));
                 run_send.wait();

From d88c5cccaa3ffe70c8055d535fcf5615f5bca004 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 3 Jul 2023 17:26:35 +0200
Subject: [PATCH 305/318] Allow both streaming and buffered

---
 b_eff/src/device/CMakeLists.txt                        |  2 +-
 b_eff/src/host/execution_types/execution.hpp           |  1 +
 .../host/execution_types/execution_accl_pl_stream.hpp  |  2 +-
 b_eff/src/host/network_benchmark.cpp                   | 11 +++++++++--
 b_eff/src/host/network_benchmark.hpp                   |  5 +++++
 5 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt
index 865cb249..e1b372ea 100644
--- a/b_eff/src/device/CMakeLists.txt
+++ b/b_eff/src/device/CMakeLists.txt
@@ -18,7 +18,7 @@ endif()
 if (Vitis_FOUND)
     generate_kernel_targets_xilinx(communication_PCIE)
     if (USE_ACCL)
-        generate_kernel_targets_xilinx(communication_ACCL
+        generate_kernel_targets_xilinx(communication_ACCL communication_ACCL_pl
                                        communication_ACCL_pl_stream)
     endif()
     add_test(NAME test_emulation_pcie_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1
diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp
index 86aec21c..133282ea 100644
--- a/b_eff/src/host/execution_types/execution.hpp
+++ b/b_eff/src/host/execution_types/execution.hpp
@@ -29,5 +29,6 @@ SOFTWARE.
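// [Editor's note] After PATCH 305 the b_eff host dispatches among the ACCL
// executors via two flags (see the network_benchmark.cpp hunk further below):
//
//     --accl-pl unset                    -> execution_types::accl
//     --accl-pl set, --accl-stream unset -> execution_types::accl_pl_stream
//     --accl-pl set, --accl-stream set   -> execution_types::accl_pl
//
// Note that the two PL branches are inverted relative to the flag name;
// PATCH 310 later swaps them and additionally routes --accl-stream without
// --accl-pl to the new execution_types::accl_stream executor.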
 #endif
 #else
 #include "execution_types/execution_accl.hpp"
+#include "execution_types/execution_accl_pl.hpp"
 #include "execution_types/execution_accl_pl_stream.hpp"
 #endif
diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
index bf7b36fd..fad46d4d 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
@@ -47,7 +47,7 @@ extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations,
                 ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg,
                 STREAM &cmd, STREAM &sts, STREAM &notify);
 
-namespace network::execution_types::accl_pl {
+namespace network::execution_types::accl_pl_stream {
 
 
     /*
diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp
index d4412461..a7e07d3a 100644
--- a/b_eff/src/host/network_benchmark.cpp
+++ b/b_eff/src/host/network_benchmark.cpp
@@ -41,7 +41,7 @@ network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &re
     pcie_reverse_read_pcie(results["pcie-write"].count()), pcie_reverse_execute_kernel(results["kernel-latency"].count()),
     pcie_reverse_batch(results["pcie-batch"].count()), pcie_reverse(results["pcie-reverse"].count())
 #ifdef USE_ACCL
-    , accl_from_programable_logic(results["accl-pl"].count())
+    , accl_from_programable_logic(results["accl-pl"].count()), accl_axi_stream(results["accl-stream"].count())
 #endif
     {
 
@@ -100,6 +100,7 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
             cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE)))
 #ifdef USE_ACCL
             ("accl-pl", "Use second ACCL command kernel to schedule sends and receives from PL")
+            ("accl-stream", "Send and receive data to AXI streams instead of global memory")
 #endif
             ("pcie-read", "Use reverse PCIe experiment and measure PCIe read performance from device")
             ("pcie-write", "Use reverse PCIe experiment and measure PCIe write performance from device")
@@ -140,7 +141,13 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) {
 #endif
 #else
         case hpcc_base::CommunicationType::accl: if (!executionSettings->programSettings->accl_from_programable_logic) {
                 timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
-            } else { timing = execution_types::accl_pl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);} break;
+            } else {
+                if (!executionSettings->programSettings->accl_axi_stream) {
+                    timing = execution_types::accl_pl_stream::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                }
+                else {
+                    timing = execution_types::accl_pl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                }} break;
 #endif
         default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType));
     }
diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp
index a017aa2c..52e2a479 100644
--- a/b_eff/src/host/network_benchmark.hpp
+++ b/b_eff/src/host/network_benchmark.hpp
@@ -167,6 +167,11 @@ class NetworkProgramSettings : public hpcc_base::BaseSettings {
      */
     bool accl_from_programable_logic;
 
+    /**
+     * @brief Forward data to AXI stream instead of global memory to further reduce latency
+     */
+    bool accl_axi_stream;
+
     /**
      * @brief This is automatically set to true if
one of pcie_reverse_write_pcie, pcie_reverse_read_pcie, * or pcie_reverse_execute_kernel is set to true. The reverse PCIe experiment will be executed in that case. From e301a3c4cc4485eb578d77321dd9602bb8974f2d Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 6 Jul 2023 16:13:57 +0200 Subject: [PATCH 306/318] Move barrier behind sleep --- b_eff/src/host/execution_types/execution_accl_pl_stream.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index fad46d4d..f5ccd1a8 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -123,8 +123,8 @@ namespace network::execution_types::accl_pl_stream { if (!config.programSettings->useAcclEmulation) { auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength); std::this_thread::sleep_for(std::chrono::milliseconds(100)); - auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength); MPI_Barrier(MPI_COMM_WORLD); + auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength); startCalculation = std::chrono::high_resolution_clock::now(); auto run_schedule = scheduleKernel(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); From c23e36a9bb9b7f7b2556294c0f0672bb62a7e082 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 6 Jul 2023 16:14:18 +0200 Subject: [PATCH 307/318] Enable TCP bypassing --- cmake/accl.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/accl.cmake b/cmake/accl.cmake index 01335805..dd00a8b4 100644 --- a/cmake/accl.cmake +++ b/cmake/accl.cmake @@ -61,7 +61,7 @@ add_custom_command( COMMAND mkdir build && cd build && cmake .. 
-DFDEV_NAME=u280 -DVIVADO_HLS_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 -DVIVADO_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 - -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=0 -DTCP_STACK_WINDOW_SCALING_EN=0 && + -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=1 -DTCP_STACK_WINDOW_SCALING_EN=0 && make installip WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR}) From 0412449fd431ccc76e89ca5e6f77317517161531 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 7 Jul 2023 14:53:26 +0200 Subject: [PATCH 308/318] Fix host linking --- b_eff/src/host/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index ac11320e..28e92c94 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -23,7 +23,8 @@ if (USE_ACCL) set(CMAKE_SKIP_BUILD_RPATH No) set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) - list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl_stream.cpp) + list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl_stream.cpp + ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl.cpp) endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) From a514628286836d213cfdce451b96dd05c46be4b8 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Fri, 11 Aug 2023 16:49:01 +0200 Subject: [PATCH 309/318] Switch back to ACCL dev --- extern/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index ebeabdf3..0e8bed30 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -62,8 +62,8 @@ message(STATUS "ACCL was selected. Fetch ACCL dependencies") FetchContent_Declare( extern_accl - GIT_REPOSITORY https://github.com/Mellich/ACCL.git - GIT_TAG modded) + GIT_REPOSITORY https://github.com/Xilinx/ACCL.git + GIT_TAG dev) FetchContent_GetProperties(extern_accl) if(NOT extern_accl_POPULATED) From f144324481fb42592401a65657cf5f8df0d4fec5 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 14 Aug 2023 19:19:15 +0200 Subject: [PATCH 310/318] Add ACCL stream executor and fix accl-stream flag --- b_eff/src/host/execution_types/execution.hpp | 1 + .../execution_types/execution_accl_stream.hpp | 123 ++++++++++++++++++ b_eff/src/host/network_benchmark.cpp | 11 +- 3 files changed, 132 insertions(+), 3 deletions(-) create mode 100644 b_eff/src/host/execution_types/execution_accl_stream.hpp diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp index 133282ea..0cd828bc 100644 --- a/b_eff/src/host/execution_types/execution.hpp +++ b/b_eff/src/host/execution_types/execution.hpp @@ -29,6 +29,7 @@ SOFTWARE. 
#endif #else #include "execution_types/execution_accl.hpp" +#include "execution_types/execution_accl_stream.hpp" #include "execution_types/execution_accl_pl.hpp" #include "execution_types/execution_accl_pl_stream.hpp" #endif diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp new file mode 100644 index 00000000..d59afeb4 --- /dev/null +++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp @@ -0,0 +1,123 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_STREAM_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_STREAM_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" +#include "accl.hpp" + +/* Project's headers */ + +namespace network::execution_types::accl_stream { + + /* + Implementation for the single kernel. + @copydoc bm_execution::calculate() + */ + template + network::ExecutionTimings + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + size_t size_in_bytes = std::max((1 << messageSize), 4); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + dummyBufferContents.clear(); + recvBufferContents.clear(); + acclSendBuffers.clear(); + acclRecvBuffers.clear(); + int size_in_values = (size_in_bytes + 3) / 4; + // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 1)); + acclSendBuffers.back()->sync_to_device(); + acclRecvBuffers.back()->sync_to_device(); + } + + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int l = 0; l < looplength; l++) { +#ifndef NDEBUG + std::cout << "Stream " << size_in_bytes << " bytes to " + << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; +#endif + config.context->accl->stream_put(*acclSendBuffers[i], size_in_values, + (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + 0, ACCL::GLOBAL_COMM, true); +#ifndef NDEBUG + std::cout << "Done" << std::endl; +#endif + } + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! 
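// [Editor's note] Validation can ignore ordering here because every send
// buffer is filled with one repeated byte, messageSize & 255 (see
// dummyBufferContents above): e.g. messageSize = 10 puts 0x0A into every
// byte, so any permutation of the received words yields the same pattern
// and the host only has to compare values.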
+        for (int r = 0; r < config.programSettings->kernelReplications; r++) {
+            acclRecvBuffers[r]->sync_from_device();
+            std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]);
+        }
+        return network::ExecutionTimings{
+            looplength,
+            messageSize,
+            calculationTimings
+        };
+    }
+
+} // namespace network::execution_types::accl_stream
+
+#endif
diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp
index a7e07d3a..4058c527 100644
--- a/b_eff/src/host/network_benchmark.cpp
+++ b/b_eff/src/host/network_benchmark.cpp
@@ -140,13 +140,18 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) {
         case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
 #endif
 #else
-        case hpcc_base::CommunicationType::accl: if (!executionSettings->programSettings->accl_from_programable_logic) { timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+        case hpcc_base::CommunicationType::accl: if (!executionSettings->programSettings->accl_from_programable_logic) {
+            if (!executionSettings->programSettings->accl_axi_stream) {
+                timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+            } else {
+                timing = execution_types::accl_stream::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+            }
         } else {
             if (!executionSettings->programSettings->accl_axi_stream) {
-                timing = execution_types::accl_pl_stream::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                timing = execution_types::accl_pl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
             } else {
-                timing = execution_types::accl_pl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                timing = execution_types::accl_pl_stream::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
             }} break;
 #endif
         default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType));

From 5d7447c11e94b5bf6a65eccf89922ee3418a0cfc Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Tue, 15 Aug 2023 11:28:01 +0200
Subject: [PATCH 311/318] Fix non-PL stream

---
 b_eff/src/device/communication_ACCL_pl_stream.cpp         | 6 ++++--
 .../host/execution_types/execution_accl_pl_stream.hpp     | 6 +++---
 b_eff/src/host/execution_types/execution_accl_stream.hpp  | 9 +++++++++
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp
index aa2697d0..cdf8d3d6 100644
--- a/b_eff/src/device/communication_ACCL_pl_stream.cpp
+++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp
@@ -85,7 +85,7 @@ void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_
     }
 }
 
-void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations,
+void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, uint enable,
                 ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg,
                 STREAM<command_word> &cmd, STREAM<command_word> &sts, STREAM<notify_word> &notify) {
@@ -101,7 +101,9 @@ void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations,
     for (int i = 0; i < num_iterations; i++) {
 #pragma HLS protocol fixed
communicator_addr, datapath_cfg, cmd, sts); + if (enable) { + schedule_send(size, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts); + } ap_wait(); notify_word w = notify.read(); } diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index f5ccd1a8..72732615 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -43,7 +43,7 @@ extern void send_stream(ap_uint<512>* read_buffer, ap_uint<32> size, ap_uint<32> extern void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, STREAM &data_in, STREAM ¬ify); -extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, +extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, uint enable, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts, STREAM ¬ify); @@ -126,7 +126,7 @@ namespace network::execution_types::accl_pl_stream { MPI_Barrier(MPI_COMM_WORLD); auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength); startCalculation = std::chrono::high_resolution_clock::now(); - auto run_schedule = scheduleKernel(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + auto run_schedule = scheduleKernel(size_in_values, looplength, 1, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); run_send.wait(); run_recv.wait(); @@ -136,7 +136,7 @@ namespace network::execution_types::accl_pl_stream { std::ref(krnl2cclo)); std::thread run_recv(recv_stream, reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, std::ref(cclo2krnl), std::ref(notify)); - std::thread run_schedule(schedule_stream,size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + std::thread run_schedule(schedule_stream,size_in_values, looplength, 1, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), std::ref(cmd), std::ref(sts), std::ref(notify)); run_send.join(); diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp index d59afeb4..2bae5d9f 100644 --- a/b_eff/src/host/execution_types/execution_accl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp @@ -77,7 +77,13 @@ namespace network::execution_types::accl_stream { double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { MPI_Barrier(MPI_COMM_WORLD); + auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + MPI_Barrier(MPI_COMM_WORLD); + auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength); auto startCalculation = std::chrono::high_resolution_clock::now(); + auto run_schedule = scheduleKernel(size_in_values, looplength, 0, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), 
config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); for (int l = 0; l < looplength; l++) { #ifndef NDEBUG std::cout << "Stream " << size_in_bytes << " bytes to " @@ -90,6 +96,9 @@ namespace network::execution_types::accl_stream { std::cout << "Done" << std::endl; #endif } + run_send.wait(); + run_recv.wait(); + run_schedule.wait(); auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); #ifndef NDEBUG From f3b56c3afbbe79367ead8e284f74c9db55be3ad2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Tue, 15 Aug 2023 16:26:54 +0200 Subject: [PATCH 312/318] Change enable data type --- b_eff/src/device/communication_ACCL_pl_stream.cpp | 2 +- b_eff/src/host/execution_types/execution_accl_pl_stream.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp index cdf8d3d6..f57818bf 100644 --- a/b_eff/src/device/communication_ACCL_pl_stream.cpp +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -85,7 +85,7 @@ void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_ } } -void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, uint enable, +void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, int enable, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts, STREAM ¬ify) { diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp index 72732615..c4027988 100644 --- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -43,7 +43,7 @@ extern void send_stream(ap_uint<512>* read_buffer, ap_uint<32> size, ap_uint<32> extern void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, STREAM &data_in, STREAM ¬ify); -extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, uint enable, +extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, int enable, ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, STREAM &cmd, STREAM &sts, STREAM ¬ify); From f6aa8192a00036a3fd67de16b3ec17c1eb32a8d2 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Wed, 16 Aug 2023 11:45:07 +0200 Subject: [PATCH 313/318] Fix ACCL stream executor --- b_eff/src/host/execution_types/execution_accl_stream.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp index 2bae5d9f..8803217c 100644 --- a/b_eff/src/host/execution_types/execution_accl_stream.hpp +++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp @@ -73,7 +73,12 @@ namespace network::execution_types::accl_stream { acclSendBuffers.back()->sync_to_device(); acclRecvBuffers.back()->sync_to_device(); } - + xrt::kernel sendKernel; + xrt::kernel recvKernel; + xrt::kernel scheduleKernel; + sendKernel = xrt::kernel(*config.device, *config.program, "send_stream"); + recvKernel = xrt::kernel(*config.device, *config.program, "recv_stream"); + scheduleKernel = xrt::kernel(*config.device, *config.program, "schedule_stream"); double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { 
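                 // One timed run per kernel replication; the barrier below aligns all ranks before the kernels start.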
                 MPI_Barrier(MPI_COMM_WORLD);

From 618f5ef6ad7ff9eeb466485fd0e3766c4f68b5dc Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 16 Aug 2023 12:41:19 +0200
Subject: [PATCH 314/318] Set correct memory banks

---
 .../execution_types/execution_accl_stream.hpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp
index 8803217c..32dd58fb 100644
--- a/b_eff/src/host/execution_types/execution_accl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp
@@ -64,21 +64,21 @@ namespace network::execution_types::accl_stream {
             acclSendBuffers.clear();
             acclRecvBuffers.clear();
             int size_in_values = (size_in_bytes + 3) / 4;
+            xrt::kernel sendKernel;
+            xrt::kernel recvKernel;
+            xrt::kernel scheduleKernel;
+            sendKernel = xrt::kernel(*config.device, *config.program, "send_stream");
+            recvKernel = xrt::kernel(*config.device, *config.program, "recv_stream");
+            scheduleKernel = xrt::kernel(*config.device, *config.program, "schedule_stream");
             // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels
             for (int r = 0; r < config.programSettings->kernelReplications; r++) {
                 dummyBufferContents.emplace_back(size_in_bytes, static_cast<char>(messageSize & (255)));
                 recvBufferContents.emplace_back(size_in_bytes, static_cast<char>(0));
-                acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 0));
-                acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 1));
+                acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, sendKernel.group_id(0)));
+                acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, recvKernel.group_id(0)));
                 acclSendBuffers.back()->sync_to_device();
                 acclRecvBuffers.back()->sync_to_device();
             }
-            xrt::kernel sendKernel;
-            xrt::kernel recvKernel;
-            xrt::kernel scheduleKernel;
-            sendKernel = xrt::kernel(*config.device, *config.program, "send_stream");
-            recvKernel = xrt::kernel(*config.device, *config.program, "recv_stream");
-            scheduleKernel = xrt::kernel(*config.device, *config.program, "schedule_stream");
             double calculationTime = 0.0;
             for (int i = 0; i < config.programSettings->kernelReplications; i++) {
                 MPI_Barrier(MPI_COMM_WORLD);

From 34dc2871133059b75a237f6b152d66b8a847ba8a Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Wed, 16 Aug 2023 12:59:23 +0200
Subject: [PATCH 315/318] Change call to stream-to-stream

---
 b_eff/src/host/execution_types/execution_accl_stream.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp
index 32dd58fb..a12436c6 100644
--- a/b_eff/src/host/execution_types/execution_accl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp
@@ -94,9 +94,9 @@ namespace network::execution_types::accl_stream {
                 std::cout << "Stream " << size_in_bytes << " bytes to "
                     << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl;
 #endif
-                config.context->accl->stream_put(*acclSendBuffers[i], size_in_values,
+                config.context->accl->stream_put(ACCL::dataType::float32, size_in_values,
                     (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
-                    0, ACCL::GLOBAL_COMM, true);
+                    0);
 #ifndef NDEBUG
                 std::cout << "Done" << std::endl;
 #endif

From 3cdc7321c3bb83f998623c8ba60eca731b8bb9fb Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 24 Aug 2023 17:45:20 +0200
Subject: [PATCH 316/318] Fix some includes for base build

---
 shared/include/hpcc_benchmark.hpp | 4 ++++
 shared/include/hpcc_settings.hpp  | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp
index f49edd9b..4a1c79e0 100644
--- a/shared/include/hpcc_benchmark.hpp
+++ b/shared/include/hpcc_benchmark.hpp
@@ -561,9 +561,13 @@ class HpccFpgaBenchmark {
 #ifndef USE_ACCL
         context = std::unique_ptr<bool>(new bool(false));
 #endif
+#ifdef USE_ACCL
         if (!programSettings->useAcclEmulation) {
+#endif
             program = fpga_setup::fpgaSetup(*usedDevice, programSettings->kernelFileName);
+#ifdef USE_ACCL
         }
+#endif
 #endif
 #ifdef USE_OCL_HOST
         usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform,
diff --git a/shared/include/hpcc_settings.hpp b/shared/include/hpcc_settings.hpp
index 39836045..71c7a290 100644
--- a/shared/include/hpcc_settings.hpp
+++ b/shared/include/hpcc_settings.hpp
@@ -1,11 +1,15 @@
 #ifndef HPCC_BASE_SETTINGS_H_
 #define HPCC_BASE_SETTINGS_H_
 
+#ifdef USE_OCL_HOST
 #ifdef USE_DEPRECATED_HPP_HEADER
 #include "CL/cl.hpp"
 #else
 #include OPENCL_HPP_HEADER
 #endif
+#else
+#include "xrt/xrt_device.h"
+#endif
 #include "cxxopts.hpp"
 #include "parameters.h"
 #include "communication_types.hpp"

From f8b3ce02c377f07e4fb4dfa8c12345e2fcfb2793 Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Mon, 28 Aug 2023 19:39:05 +0200
Subject: [PATCH 317/318] Change PL scheduling notification

---
 b_eff/src/device/communication_ACCL_pl_stream.cpp      | 12 +++++++-----
 .../execution_types/execution_accl_pl_stream.hpp       |  8 ++++----
 .../host/execution_types/execution_accl_stream.hpp     |  5 +----
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp
index f57818bf..eb68fe8e 100644
--- a/b_eff/src/device/communication_ACCL_pl_stream.cpp
+++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp
@@ -67,11 +67,13 @@ schedule_send(ap_uint<32> size, ap_uint<32> neighbor_rank,
 }
 
 void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations,
+                ap_uint<32> notify_enabled,
                 STREAM<stream_word> &data_in, STREAM<notify_word> &notify) {
 #pragma HLS INTERFACE m_axi port=write_buffer bundle=gmem_out
 #pragma HLS INTERFACE s_axilite port=size
 #pragma HLS INTERFACE s_axilite port=num_iterations
+#pragma HLS INTERFACE s_axilite port=notify_enabled
 #pragma HLS INTERFACE axis port=data_in
 #pragma HLS INTERFACE axis port=notify
 #pragma HLS INTERFACE s_axilite port=return
@@ -81,11 +83,13 @@ void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_
 #pragma HLS protocol fixed
         read_data(write_buffer, size, data_in);
         ap_wait();
-        notify.write(w);
+        if (notify_enabled != 0) {
+            notify.write(w);
+        }
     }
 }
 
-void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, int enable,
+void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations,
                 ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg,
                 STREAM<command_word> &cmd, STREAM<command_word> &sts, STREAM<notify_word> &notify) {
@@ -101,9 +105,7 @@ void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, int enable,
     for (int i = 0; i < num_iterations; i++) {
 #pragma HLS protocol fixed
-        if (enable) {
-            schedule_send(size, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts);
-        }
+        schedule_send(size, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts);
         ap_wait();
         notify_word w = notify.read();
     }
diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
index c4027988..2b12b6d1 100644
--- a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp
@@ -121,12 +121,12 @@ namespace network::execution_types::accl_pl_stream {
             MPI_Barrier(MPI_COMM_WORLD);
             auto startCalculation = std::chrono::high_resolution_clock::now();
             if (!config.programSettings->useAcclEmulation) {
-                auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength);
+                auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength, 1);
                 std::this_thread::sleep_for(std::chrono::milliseconds(100));
                 MPI_Barrier(MPI_COMM_WORLD);
                 auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength);
                 startCalculation = std::chrono::high_resolution_clock::now();
-                auto run_schedule = scheduleKernel(size_in_values, looplength, 1, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
+                auto run_schedule = scheduleKernel(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
                     config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}));
                 run_send.wait();
                 run_recv.wait();
@@ -134,9 +134,9 @@
             } else {
                 std::thread run_send(send_stream, reinterpret_cast<ap_uint<512>*>(acclSendBuffers[i]->buffer()), size_in_values, looplength,
                     std::ref(krnl2cclo));
-                std::thread run_recv(recv_stream, reinterpret_cast<ap_uint<512>*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength,
+                std::thread run_recv(recv_stream, reinterpret_cast<ap_uint<512>*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, 1,
                     std::ref(cclo2krnl), std::ref(notify));
-                std::thread run_schedule(schedule_stream,size_in_values, looplength, 1, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
+                std::thread run_schedule(schedule_stream,size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
                     config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}),
                     std::ref(cmd), std::ref(sts), std::ref(notify));
             run_send.join();
diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp
index a12436c6..797b8ca7 100644
--- a/b_eff/src/host/execution_types/execution_accl_stream.hpp
+++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp
@@ -82,13 +82,11 @@ namespace network::execution_types::accl_stream {
             double calculationTime = 0.0;
             for (int i = 0; i < config.programSettings->kernelReplications; i++) {
                 MPI_Barrier(MPI_COMM_WORLD);
-                auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength);
+                auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength, 0);
                 std::this_thread::sleep_for(std::chrono::milliseconds(100));
                 MPI_Barrier(MPI_COMM_WORLD);
                 auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength);
                 auto startCalculation = std::chrono::high_resolution_clock::now();
-                auto run_schedule = scheduleKernel(size_in_values, looplength, 0, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size,
-                    config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}));
                 for (int l = 0; l < looplength; l++) {
 #ifndef NDEBUG
                 std::cout << "Stream " << size_in_bytes << " bytes to "
@@ -103,7 +101,6 @@
             }
             run_send.wait();
             run_recv.wait();
-            run_schedule.wait();
             auto endCalculation = std::chrono::high_resolution_clock::now();
             calculationTime += std::chrono::duration_cast<std::chrono::duration<double>>(endCalculation - startCalculation).count();
 #ifndef NDEBUG

From 052ad9e1d4a5e9659480c729a2ddf309133745fd Mon Sep 17 00:00:00 2001
From: Marius Meyer
Date: Thu, 23 Nov 2023 11:41:19 +0100
Subject: [PATCH 318/318] Fix HPL XRT baseline hostcode signature

---
 .../host/execution_types/execution_xrt_pcie.hpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
index 77885103..a4de60ad 100644
--- a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
+++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp
@@ -48,7 +48,7 @@ namespace xrt_pcie {
 @copydoc bm_execution::calculate()
 */
 template <class TDevice, class TContext, class TProgram>
-std::unique_ptr<linpack::LinpackExecutionTimings> inline calculate(
+std::map<std::string, std::vector<double>> inline calculate(
     const hpcc_base::ExecutionSettings<linpack::LinpackProgramSettings, TDevice, TContext, TProgram> &config,
     linpack::LinpackData<TContext> &data) {
@@ -459,13 +459,14 @@ std::unique_ptr<linpack::LinpackExecutionTimings> inline calculate(
     Buffer_pivot.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
   }
 
-  std::unique_ptr<linpack::LinpackExecutionTimings> results(
-      new linpack::LinpackExecutionTimings{gefaExecutionTimes,
-                                           geslExecutionTimes});
-
-  MPI_Barrier(MPI_COMM_WORLD);
+  std::map<std::string, std::vector<double>> timings;
+
+  timings["gefa"] = gefaExecutionTimes;
+  timings["gesl"] = geslExecutionTimes;
+
+  MPI_Barrier(MPI_COMM_WORLD);
 
-  return results;
+  return timings;
 }
 
 } // namespace xrt_pcie
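
All stream executors in this series derive the communication partner from the same expression, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size: depending on the parity of current_rank + i, a rank talks to either its left or its right ring neighbor. The following standalone sketch is illustrative only (the helper name pairedNeighbor is hypothetical and not part of these patches); it checks that the pairing is symmetric for an even number of ranks, which is what lets the blocking send and receive sides match up:

#include <cassert>
#include <iostream>

// Mirrors the neighbor expression used by the stream executors above:
// (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size
int pairedNeighbor(int rank, int iteration, int size) {
    return (rank - 1 + 2 * ((rank + iteration) % 2) + size) % size;
}

int main() {
    const int size = 4; // assumes an even number of MPI ranks
    for (int i = 0; i < 2; i++) {
        for (int rank = 0; rank < size; rank++) {
            int partner = pairedNeighbor(rank, i, size);
            // Symmetry check: the partner must pick this rank in the same
            // iteration, otherwise one side would wait on a transfer that
            // is never issued.
            assert(pairedNeighbor(partner, i, size) == rank);
            std::cout << "iteration " << i << ": rank " << rank
                      << " exchanges with rank " << partner << "\n";
        }
    }
    return 0;
}

For an odd rank count the parity trick breaks at the ring wrap-around, so the pairing only matches up when the benchmark runs with an even number of ranks.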