diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..5b1cfaa3 --- /dev/null +++ b/.clang-format @@ -0,0 +1,125 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +# https://releases.llvm.org/12.0.1/tools/clang/docs/ClangFormatStyleOptions.html +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: true +BinPackParameters: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: true +BreakBeforeBraces: Linux +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 120 +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Preserve +IndentCaseLabels: false +IndentCaseBlocks: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +IndentRequires: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyIndentedWhitespace: 0 +PointerAlignment: Right +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceAroundPointerQualifiers: Default +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInConditionalStatement: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: Both +Standard: Latest +UseCRLF: false +UseTab: Never +CommentPragmas: '^ IWYU pragma:' +ForEachMacros: + - foreach +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' 
+StatementMacros: [] +StatementAttributeLikeMacros: + - Q_EMIT +WhitespaceSensitiveMacros: [] +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false diff --git a/.gitignore b/.gitignore index 17305538..e6b8e632 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ # Python virtual environments .venv +#YCM config +.ycm_extra_conf.py + # CMake build directories should be created in the following folder *._* build/* diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 40ca7a1f..ba21fdbd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,7 @@ stages: - - build - - test + - check + - build + - test variables: SCHEDULER_PARAMETERS: "-A pc2-mitarbeiter -p normal -q cont -t 00:30:00 -n 2 -N 1" @@ -10,294 +11,229 @@ default: - jacamar before_script: - module load fpga/intel/opencl_sdk/21.2.0 fpga/bittware/520n/20.4.0_max toolchain/foss/2021a devel/CMake/3.20.1-GCCcore-10.3.0 lang/Python/3.9.5-GCCcore-10.3.0 - - python -m pip install pandas + - python -m pip install -r scripts/evaluation/requirements.txt + - python -m pip install -r scripts/code_generator/requirements.txt ### # -# Build all benchmarks +# Build documentation # ### -build:STREAM: +build:docs: stage: build script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../STREAM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make -j 40 all - artifacts: - paths: - - build/bin/stream_kernels_single_emulate.aocx - - build/bin/stream_kernels_emulate.aocx - - build/bin/STREAM_FPGA_intel - - build/bin/STREAM_FPGA_test_intel + - python -m pip install -r docs/requirements.txt + - module load devel/Doxygen/1.9.1-GCCcore-10.3.0 + - cd docs + - make html + - doxygen doxy.config only: changes: - - STREAM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* + - docs/**/* - .gitlab-ci.yml - -build:STREAM_HP: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make -j 40 all artifacts: paths: - - build/bin/stream_kernels_single_emulate.aocx - - build/bin/stream_kernels_emulate.aocx - - build/bin/STREAM_FPGA_intel - - build/bin/STREAM_FPGA_test_intel - only: - changes: - - STREAM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + - docs/build + - docs/xml -build:STREAM_DP: - stage: build +### +# +# Check formatting of all benchmarks +# +### + +.check: &check + stage: check script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make -j 40 all - artifacts: - paths: - - build/bin/stream_kernels_single_emulate.aocx - - build/bin/stream_kernels_emulate.aocx - - build/bin/STREAM_FPGA_intel - - build/bin/STREAM_FPGA_test_intel + - module load compiler/Clang/13.0.1-GCCcore-11.2.0 + - find $BENCHMARK_FOLDER -regex '.*\.\(cpp\|hpp\|cc\|cxx\|h\)' -exec clang-format -style=file -i {} \; + - git diff | cat + ## do not test for real yet + #- test -z "$(git status --porcelain)" + only: changes: - - STREAM/**/* + - $BENCHMARK_FOLDER/**/* - shared/**/* - scripts/**/* - cmake/**/* - .gitlab-ci.yml - -build:RandomAccess: + +check:STREAM: + <<: *check + variables: + BENCHMARK_FOLDER: STREAM + +check:RandomAccess: + <<: *check + variables: + 
BENCHMARK_FOLDER: RandomAccess + +check:PTRANS: + <<: *check + variables: + BENCHMARK_FOLDER: PTRANS + +check:LINPACK: + <<: *check + variables: + BENCHMARK_FOLDER: LINPACK + +check:GEMM: + <<: *check + variables: + BENCHMARK_FOLDER: GEMM + +check:FFT: + <<: *check + variables: + BENCHMARK_FOLDER: FFT + +check:b_eff: + <<: *check + variables: + BENCHMARK_FOLDER: b_eff + +### +# +# Build all benchmarks +# +### + +.build: &build stage: build script: - rm -rf build - mkdir -p build - cd build - - cmake ../RandomAccess -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - cmake ../$BENCHMARK_FOLDER -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 $BENCHMARK_OPTIONS - make -j 40 all artifacts: paths: - - build/bin/random_access_kernels_single_emulate.aocx - - build/bin/RandomAccess_intel - - build/bin/RandomAccess_test_intel + - build/bin/* only: changes: - - RandomAccess/**/* + - $BENCHMARK_FOLDER/**/* - shared/**/* - scripts/**/* - cmake/**/* - .gitlab-ci.yml +build:STREAM: + <<: *build + variables: + BENCHMARK_FOLDER: STREAM + dependencies: + - check:STREAM + needs: ["check:STREAM"] + +build:STREAM_HP: + <<: *build + variables: + BENCHMARK_FOLDER: STREAM + BENCHMARK_OPTIONS: -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 + dependencies: + - check:STREAM + needs: ["check:STREAM"] + +build:STREAM_DP: + <<: *build + variables: + BENCHMARK_FOLDER: STREAM + BENCHMARK_OPTIONS: -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 + dependencies: + - check:STREAM + needs: ["check:STREAM"] + +build:RandomAccess: + <<: *build + variables: + BENCHMARK_FOLDER: RandomAccess + dependencies: + - check:RandomAccess + needs: ["check:RandomAccess"] build:PTRANS: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../PTRANS -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DHOST_EMULATION_REORDER=Yes - - make -j 40 all - artifacts: - paths: - - build/bin/transpose_DIAG_IEC_emulate.aocx - - build/bin/transpose_PQ_IEC_emulate.aocx - - build/bin/transpose_PQ_PCIE_emulate.aocx - - build/bin/transpose_DIAG_PCIE_emulate.aocx - - build/bin/transpose_c2_DIAG_IEC_emulate.aocx - - build/bin/Transpose_intel - - build/bin/Transpose_test_intel - only: - changes: - - PTRANS/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: PTRANS + BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes + dependencies: + - check:PTRANS + needs: ["check:PTRANS"] build:LINPACK: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DUSE_PCIE_MPI_COMMUNICATION=Yes - - make -j 40 all - artifacts: - paths: - - build/bin/hpl_torus_PCIE_emulate.aocx - - build/bin/hpl_torus_IEC_emulate.aocx - - build/bin/Linpack_intel - - build/bin/Linpack_test_intel - only: - changes: - - LINPACK/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml - + <<: *build + variables: + BENCHMARK_FOLDER: LINPACK + BENCHMARK_OPTIONS: -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 + dependencies: + - check:LINPACK + needs: ["check:LINPACK"] build:LINPACK_DP: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double - - make -j 40 all - artifacts: - paths: - - build/bin/hpl_torus_PCIE_emulate.aocx - - build/bin/hpl_torus_IEC_emulate.aocx - - 
build/bin/Linpack_intel - - build/bin/Linpack_test_intel - only: - changes: - - LINPACK/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* + <<: *build + variables: + BENCHMARK_FOLDER: LINPACK + BENCHMARK_OPTIONS: -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double + dependencies: + - check:LINPACK + needs: ["check:LINPACK"] build:GEMM: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../GEMM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make -j 40 all - artifacts: - paths: - - build/bin/gemm_base_emulate.aocx - - build/bin/GEMM_intel - - build/bin/GEMM_test_intel - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DBLOCK_SIZE=32 + dependencies: + - check:GEMM + needs: ["check:GEMM"] build:GEMM_HP_REP2: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../GEMM -DDATA_TYPE=half -DNUM_REPLICATIONS=2 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make -j 40 all - artifacts: - paths: - - build/bin/gemm_base_emulate.aocx - - build/bin/GEMM_intel - - build/bin/GEMM_test_intel - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DDATA_TYPE=half -DNUM_REPLICATIONS=2 -DBLOCK_SIZE=32 + dependencies: + - check:GEMM + needs: ["check:GEMM"] + allow_failure: true build:GEMM_DP_REP2: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../GEMM -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make -j 40 all - artifacts: - paths: - - build/bin/gemm_base_emulate.aocx - - build/bin/GEMM_intel - - build/bin/GEMM_test_intel - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DBLOCK_SIZE=32 + dependencies: + - check:GEMM + needs: ["check:GEMM"] build:FFT: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make -j 40 all - artifacts: - paths: - - build/bin/fft1d_float_8_emulate.aocx - - build/bin/FFT_intel - - build/bin/FFT_test_intel - only: - changes: - - FFT/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: FFT + dependencies: + - check:FFT + needs: ["check:FFT"] build:FFT_small: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 - - make -j 40 all - artifacts: - paths: - - build/bin/fft1d_float_8_emulate.aocx - - build/bin/FFT_intel - - build/bin/FFT_test_intel - only: - changes: - - FFT/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: FFT + BENCHMARK_OPTIONS: -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 + dependencies: + - check:FFT + needs: ["check:FFT"] build:b_eff: - stage: build - script: - - rm -rf build - - mkdir -p build - - cd build - - cmake ../b_eff -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DHOST_EMULATION_REORDER=Yes - - make -j 40 all - artifacts: - paths: - - build/bin/* - only: - changes: - - b_eff/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - 
.gitlab-ci.yml + <<: *build + variables: + BENCHMARK_FOLDER: b_eff + BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes + dependencies: + - check:b_eff + needs: ["check:b_eff"] ### # @@ -305,300 +241,141 @@ build:b_eff: # ### -test:STREAM: +.test: &test stage: test script: + - mkdir -p build - cd build - - cmake ../STREAM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - cmake ../$BENCHMARK_FOLDER $BENCHMARK_OPTIONS -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 + - make all -j41 + - $PREPARE_SCRIPT - make CTEST_OUTPUT_ON_FAILURE=1 test - dependencies: - - build:STREAM artifacts: when: on_failure paths: - build/Testing/Temporary/LastTest.log only: changes: - - STREAM/**/* + - $BENCHMARK_FOLDER/**/* - shared/**/* - scripts/**/* - cmake/**/* - .gitlab-ci.yml + +test:STREAM: + <<: *test + variables: + BENCHMARK_FOLDER: STREAM + dependencies: + - build:STREAM needs: ["build:STREAM"] test:STREAM_HP: - stage: test - script: - - cd build - - cmake ../STREAM -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: STREAM + BENCHMARK_OPTIONS: -DDATA_TYPE=half -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=32 dependencies: - build:STREAM_HP - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - STREAM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:STREAM_HP"] - # Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE) allow_failure: true test:STREAM_DP: - stage: test - script: - - cd build - - cmake ../STREAM -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: STREAM + BENCHMARK_OPTIONS: -DDATA_TYPE=double -DVECTOR_COUNT=1 -DGLOBAL_MEM_UNROLL=8 dependencies: - build:STREAM_DP - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - STREAM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:STREAM_DP"] - + test:RandomAccess: - stage: test - script: - - cd build - - cmake ../RandomAccess -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: RandomAccess dependencies: - build:RandomAccess - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - RandomAccess/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:RandomAccess"] test:PTRANS: - stage: test - script: - - cd build - - cmake ../PTRANS -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DPTRANS_HOST_EMULATION_REORDER=Yes - - cd bin - - touch kernel_output_ch0 - - touch kernel_output_ch1 - - touch kernel_output_ch2 - - touch kernel_output_ch3 - - ln -s kernel_output_ch0 kernel_input_ch1 - - ln -s kernel_output_ch2 kernel_input_ch3 - - ln -s kernel_output_ch1 kernel_input_ch0 - - ln -s kernel_output_ch3 kernel_input_ch2 - - cd .. 
- - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: PTRANS + BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes + PREPARE_SCRIPT: ../$BENCHMARK_FOLDER/scripts/prepare_tests.sh ./bin dependencies: - build:PTRANS - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - PTRANS/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:PTRANS"] test:LINPACK: - stage: test - script: - - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: LINPACK + BENCHMARK_OPTIONS: -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 dependencies: - build:LINPACK - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - LINPACK/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:LINPACK"] test:LINPACK_DP: - stage: test - script: - - cd build - - cmake ../LINPACK -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double - - make CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=1 CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: LINPACK + BENCHMARK_OPTIONS: -DLOCAL_MEM_BLOCK_LOG=4 -DREGISTER_BLOCK_LOG=3 -DNUM_REPLICATIONS=3 -DDATA_TYPE=double dependencies: - build:LINPACK_DP - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - LINPACK/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* needs: ["build:LINPACK_DP"] test:GEMM: - stage: test - script: - - cd build - - cmake ../GEMM -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DBLOCK_SIZE=32 dependencies: - build:GEMM - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:GEMM"] test:GEMM_HP_REP2: - stage: test - script: - - cd build - - cmake ../GEMM -DNUM_REPLICATIONS=2 -DATA_TYPE=half -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DDATA_TYPE=half -DNUM_REPLICATIONS=2 -DBLOCK_SIZE=32 dependencies: - build:GEMM_HP_REP2 - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:GEMM_HP_REP2"] - # Allow failure because: The intel emulator does not seem to support half precision kernel arguments (CL_INVALID_ARG_SIZE) allow_failure: true test:GEMM_DP_REP2: - stage: test - script: - - cd build - - cmake ../GEMM -DNUM_REPLICATIONS=2 -DATA_TYPE=double -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DBLOCK_SIZE=32 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: GEMM + BENCHMARK_OPTIONS: -DDATA_TYPE=double -DNUM_REPLICATIONS=2 -DBLOCK_SIZE=32 dependencies: - build:GEMM_DP_REP2 - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - GEMM/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:GEMM_DP_REP2"] test:FFT: - stage: test - script: - - cd build 
- - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: FFT dependencies: - build:FFT - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - FFT/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:FFT"] test:FFT_small: - stage: test - script: - - cd build - - cmake ../FFT -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: FFT + BENCHMARK_OPTIONS: -DLOG_FFT_SIZE=4 -DNUM_REPLICATIONS=2 dependencies: - build:FFT_small - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - FFT/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:FFT_small"] test:b_eff: - stage: test - script: - - cd build - - cmake ../b_eff -DDEFAULT_PLATFORM=0 -DDEFAULT_DEVICE=0 -DHOST_EMULATION_REORDER=Yes - - cd bin - - touch kernel_output_ch0 - - touch kernel_output_ch1 - - touch kernel_output_ch2 - - touch kernel_output_ch3 - - ln -s kernel_output_ch0 kernel_input_ch1 - - ln -s kernel_output_ch2 kernel_input_ch3 - - ln -s kernel_output_ch1 kernel_input_ch0 - - ln -s kernel_output_ch3 kernel_input_ch2 - - cd .. - - make CTEST_OUTPUT_ON_FAILURE=1 test + <<: *test + variables: + BENCHMARK_FOLDER: b_eff + BENCHMARK_OPTIONS: -DHOST_EMULATION_REORDER=Yes + PREPARE_SCRIPT: ../$BENCHMARK_FOLDER/scripts/prepare_tests.sh ./bin dependencies: - build:b_eff - artifacts: - when: on_failure - paths: - - build/Testing/Temporary/LastTest.log - only: - changes: - - b_eff/**/* - - shared/**/* - - scripts/**/* - - cmake/**/* - - .gitlab-ci.yml needs: ["build:b_eff"] - - diff --git a/FFT/README.md b/FFT/README.md index 1d14663d..52269e94 100644 --- a/FFT/README.md +++ b/FFT/README.md @@ -59,31 +59,36 @@ For execution of the benchmark run: For more information on available input parameters run - $./FFT_intel -h + ./FFT_intel -h Implementation of the FFT benchmark proposed in the HPCC benchmark suite for FPGA. - Version: 1.2 + Version: 1.4 Usage: ./FFT_intel [OPTION...] - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will - speed up execution and helps when working with special - data types. - --device arg Index of the device that has to be used. If not - given you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there - are multiple platforms available. (default: -1) - -h, --help Print this help - -b, arg Number of batched FFT calculations (iterations) - (default: 100) - --inverse If set, the inverse FFT is calculated instead - -r, arg Number of kernel replications used for calculation - (default: 1) + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. 
If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 1) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -b, arg Number of batched FFT calculations (iterations) + (default: 100) + --inverse If set, the inverse FFT is calculated instead To execute the unit and integration tests run @@ -96,12 +101,13 @@ It will run an emulation of the kernel and execute some functionality tests. The benchmark will print the following two tables to standard output after execution: - res. error mach. eps - 2.67000e-01 1.19209e-07 - - avg best - Time in s: 7.56801e-03 7.07241e-03 - GFLOPS: 3.24735e-02 3.47491e-02 + res. error mach. eps + 2.63523e-01 1.19209e-07 + + avg best + Time in s: 8.93261e-04 s 8.73572e-04 s + GFLOPS: 2.75127e-01 GFLOP/s 2.81328e-01 GFLOP/s + The first table contains the maximum residual error of the calculation and the machine epsilon that was used to calculate the residual error. @@ -118,3 +124,99 @@ In the second table the measured execution times and calculated FLOPs are given. It gives the average and bast for both. The time gives the averaged execution time for a single FFT in case of a batched execution (an execution with more than one iteration). They are also used to calculate the FLOPs. + +The json output looks like the following. + +```json + +{ + "config_time": "Wed Dec 14 08:40:17 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib
64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "epsilon": 1.1920928955078125e-07, + "residual": 0.2635231415430705 + }, + "execution_time": "Wed Dec 14 08:55:51 GMT 2022", + "git_commit": "be1a4e9-dirty", + "name": "FFT", + "results": { + "gflops_avg": { + "unit": "GFLOP/s", + "value": 0.2573525536079919 + }, + "gflops_min": { + "unit": "GFLOP/s", + "value": 0.2842073122577159 + }, + "t_avg": { + "unit": "s", + "value": 0.0009549545810000001 + }, + "t_min": { + "unit": "s", + "value": 0.00086472089 + } + }, + "settings": { + "Batch Size": 100, + "Communication Type": false, + "FFT Size": 4096, + "Inverse": false, + "Kernel File": false, + "Kernel Replications": 1, + "MPI Ranks": false, + "Repetitions": 10, + "Test Mode": false + }, + "timings": { + "execution": [ + { + "unit": "s", + "value": 0.151814849 + }, + { + "unit": "s", + "value": 0.086472089 + }, + { + "unit": "s", + "value": 0.089654183 + }, + { + "unit": "s", + "value": 0.09003793 + }, + { + "unit": "s", + "value": 0.089870966 + }, + { + "unit": "s", + "value": 0.089802216 + }, + { + "unit": "s", + "value": 0.089816195 + }, + { + "unit": "s", + "value": 0.089979618 + }, + { + "unit": "s", + "value": 0.090762352 + }, + { + "unit": "s", + "value": 0.086744183 + } + ] + }, + "validated": true, + "version": "1.4" +} + +``` diff --git a/FFT/src/common/parameters.h.in b/FFT/src/common/parameters.h.in index 52a87a98..57c85c61 100644 --- a/FFT/src/common/parameters.h.in +++ b/FFT/src/common/parameters.h.in @@ -27,7 +27,8 @@ Short description of the program. Moreover the version and build time is also compiled into the description. */ -#define PROGRAM_DESCRIPTION "Implementation of the FFT benchmark"\ +#define PROGRAM_NAME "FFT" +#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\ " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" diff --git a/FFT/src/device/fft1d_float_8.cl b/FFT/src/device/fft1d_float_8.cl index 69da1432..ba451fa8 100644 --- a/FFT/src/device/fft1d_float_8.cl +++ b/FFT/src/device/fft1d_float_8.cl @@ -51,13 +51,11 @@ // code generation expects an array of maps of size num_replications with the keys "in" and "out". 
// The value of the keys have to be strings containing the attributes that // have to be assigned to input and output buffers in global memory -/* PY_CODE_GEN -try: - kernel_param_attributes = generate_attributes(num_replications) -except: - kernel_param_attributes = [{"in": "", "out": ""} for i in range(num_replications)] -*/ - +{% if generate_attributes is defined %} + {% set kernel_param_attributes = generate_attributes(num_replications) %} +{% else %} + {% set kernel_param_attributes = create_list({"in": "", "out": ""}, num_replications) %} +{% endif %} #define min(a,b) (a> LOGPOINTS)][(current_index + shift) & (POINTS - 1)]; } #ifdef XILINX_FPGA - buf2x8.i0 = write_chunk[0]; - buf2x8.i1 = write_chunk[1]; - buf2x8.i2 = write_chunk[2]; - buf2x8.i3 = write_chunk[3]; - buf2x8.i4 = write_chunk[4]; + buf2x8.i0 = write_chunk[0]; + buf2x8.i1 = write_chunk[1]; + buf2x8.i2 = write_chunk[2]; + buf2x8.i3 = write_chunk[3]; + buf2x8.i4 = write_chunk[4]; buf2x8.i5 = write_chunk[5]; buf2x8.i6 = write_chunk[6]; buf2x8.i7 = write_chunk[7]; // Start in the second iteration to forward the buffered data over the pipe - write_pipe_block(chanin/*PY_CODE_GEN i*/, &buf2x8); + write_pipe_block(chanin{{ i }}, &buf2x8); #endif #ifdef INTEL_FPGA - write_channel_intel(chanin/*PY_CODE_GEN i*/[0], write_chunk[0]); - write_channel_intel(chanin/*PY_CODE_GEN i*/[1], write_chunk[1]); - write_channel_intel(chanin/*PY_CODE_GEN i*/[2], write_chunk[2]); - write_channel_intel(chanin/*PY_CODE_GEN i*/[3], write_chunk[3]); - write_channel_intel(chanin/*PY_CODE_GEN i*/[4], write_chunk[4]); - write_channel_intel(chanin/*PY_CODE_GEN i*/[5], write_chunk[5]); - write_channel_intel(chanin/*PY_CODE_GEN i*/[6], write_chunk[6]); - write_channel_intel(chanin/*PY_CODE_GEN i*/[7], write_chunk[7]); + write_channel_intel(chanin{{ i }}[0], write_chunk[0]); + write_channel_intel(chanin{{ i }}[1], write_chunk[1]); + write_channel_intel(chanin{{ i }}[2], write_chunk[2]); + write_channel_intel(chanin{{ i }}[3], write_chunk[3]); + write_channel_intel(chanin{{ i }}[4], write_chunk[4]); + write_channel_intel(chanin{{ i }}[5], write_chunk[5]); + write_channel_intel(chanin{{ i }}[6], write_chunk[6]); + write_channel_intel(chanin{{ i }}[7], write_chunk[7]); #endif } } @@ -193,10 +191,10 @@ void fetch/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["i __attribute__ ((max_global_work_dim(0))) __attribute__((reqd_work_group_size(1,1,1))) -kernel void fft1d/*PY_CODE_GEN i*/( +kernel void fft1d{{ i }}( #ifdef INTEL_FPGA // Intel does not need a store kernel and directly writes back the result to global memory - __global /*PY_CODE_GEN kernel_param_attributes[i]["out"]*/ float2 * restrict dest, + __global {{ kernel_param_attributes[i]["out"] }} float2 * restrict dest, #endif int count, int inverse) { @@ -235,17 +233,17 @@ kernel void fft1d/*PY_CODE_GEN i*/( // Perform memory transfers only when reading data in range if (i < count * (N / POINTS)) { #ifdef INTEL_FPGA - data.i0 = read_channel_intel(chanin/*PY_CODE_GEN i*/[0]); - data.i1 = read_channel_intel(chanin/*PY_CODE_GEN i*/[1]); - data.i2 = read_channel_intel(chanin/*PY_CODE_GEN i*/[2]); - data.i3 = read_channel_intel(chanin/*PY_CODE_GEN i*/[3]); - data.i4 = read_channel_intel(chanin/*PY_CODE_GEN i*/[4]); - data.i5 = read_channel_intel(chanin/*PY_CODE_GEN i*/[5]); - data.i6 = read_channel_intel(chanin/*PY_CODE_GEN i*/[6]); - data.i7 = read_channel_intel(chanin/*PY_CODE_GEN i*/[7]); + data.i0 = read_channel_intel(chanin{{ i }}[0]); + data.i1 = read_channel_intel(chanin{{ i }}[1]); + 
data.i2 = read_channel_intel(chanin{{ i }}[2]); + data.i3 = read_channel_intel(chanin{{ i }}[3]); + data.i4 = read_channel_intel(chanin{{ i }}[4]); + data.i5 = read_channel_intel(chanin{{ i }}[5]); + data.i6 = read_channel_intel(chanin{{ i }}[6]); + data.i7 = read_channel_intel(chanin{{ i }}[7]); #endif #ifdef XILINX_FPGA - read_pipe_block(chanin/*PY_CODE_GEN i*/, &data); + read_pipe_block(chanin{{ i }}, &data); #endif } else { data.i0 = data.i1 = data.i2 = data.i3 = @@ -274,7 +272,7 @@ kernel void fft1d/*PY_CODE_GEN i*/( #endif #ifdef XILINX_FPGA // For Xilinx send the data to the store kernel to enable memory bursts - write_pipe_block(chanout/*PY_CODE_GEN i*/, &data); + write_pipe_block(chanout{{ i }}, &data); #endif } } @@ -287,14 +285,14 @@ This kernel works without conditional branches which enables memory bursts. */ __kernel __attribute__ ((max_global_work_dim(0), reqd_work_group_size(1,1,1))) -void store/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["out"]*/ float2 * restrict dest, int iter) { +void store{{ i }}(__global {{ kernel_param_attributes[i]["out"] }} float2 * restrict dest, int iter) { const int N = (1 << LOGN); // write the data back to global memory using memory bursts for(unsigned k = 0; k < iter * (N / POINTS); k++){ float2x8 buf2x8; - read_pipe_block(chanout/*PY_CODE_GEN i*/, &buf2x8); + read_pipe_block(chanout{{ i }}, &buf2x8); dest[(k << LOGPOINTS)] = buf2x8.i0; dest[(k << LOGPOINTS) + 1] = buf2x8.i1; @@ -308,4 +306,4 @@ void store/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]["o } #endif -//PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/FFT/src/host/execution.h b/FFT/src/host/execution.h index 2d588ded..fa44dc38 100644 --- a/FFT/src/host/execution.h +++ b/FFT/src/host/execution.h @@ -45,7 +45,7 @@ simple exchange of the different calculation methods. @return The resulting matrix */ - std::unique_ptr + std::map> calculate(hpcc_base::ExecutionSettings const& config, std::complex* data, std::complex* data_out, unsigned iterations, bool inverse); } // namespace bm_execution diff --git a/FFT/src/host/execution_default.cpp b/FFT/src/host/execution_default.cpp index 59a81f87..a1ae245a 100644 --- a/FFT/src/host/execution_default.cpp +++ b/FFT/src/host/execution_default.cpp @@ -44,7 +44,7 @@ namespace bm_execution { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::unique_ptr + std::map> calculate(hpcc_base::ExecutionSettings const& config, std::complex* data, std::complex* data_out, @@ -210,10 +210,11 @@ namespace bm_execution { ASSERT_CL(err) #endif } - std::unique_ptr result(new fft::FFTExecutionTimings{ - calculationTimings - }); - return result; + std::map> timings; + + timings["execution"] = calculationTimings; + + return timings; } } // namespace bm_execution diff --git a/FFT/src/host/fft_benchmark.cpp b/FFT/src/host/fft_benchmark.cpp index cf7ad994..e9b86e12 100644 --- a/FFT/src/host/fft_benchmark.cpp +++ b/FFT/src/host/fft_benchmark.cpp @@ -35,7 +35,7 @@ SOFTWARE. 
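The `/*PY_CODE_GEN ...*/` markers in `fft1d_float_8.cl` above are replaced by Jinja2 syntax (`{{ i }}`, `{% for %}`/`{% endfor %}`, and helpers such as `generate_attributes` and `create_list`). The diff only shows the template side; the sketch below illustrates, under assumptions, how a Python generator could render such a template. The helper names are taken from the template itself, but the rendering code and its interface are hypothetical and not the project's actual `scripts/code_generator` implementation.

```python
# Hypothetical sketch of rendering a Jinja2 kernel template like fft1d_float_8.cl.
# create_list mirrors the fallback branch of the template; generate_attributes
# would normally come from a board-specific configuration (assumption).
from jinja2 import Template

KERNEL_TEMPLATE = """\
{% if generate_attributes is defined %}
  {% set kernel_param_attributes = generate_attributes(num_replications) %}
{% else %}
  {% set kernel_param_attributes = create_list({"in": "", "out": ""}, num_replications) %}
{% endif %}
{% for i in range(num_replications) %}
kernel void fft1d{{ i }}(__global {{ kernel_param_attributes[i]["out"] }} float2 *restrict dest,
                         int count, int inverse);
{% endfor %}
"""

def create_list(default, n):
    # Fallback: one copy of the default attribute map per kernel replication.
    return [dict(default) for _ in range(n)]

def render_kernel(num_replications, generate_attributes=None):
    context = {"num_replications": num_replications, "create_list": create_list}
    if generate_attributes is not None:
        context["generate_attributes"] = generate_attributes
    return Template(KERNEL_TEMPLATE).render(**context)

if __name__ == "__main__":
    # Without a board-specific generate_attributes, the template falls back to
    # empty attribute strings, one map per replication.
    print(render_kernel(2))
```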
#include "parameters.h" fft::FFTProgramSettings::FFTProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), - iterations(results["b"].as()), inverse(results.count("inverse")), kernelReplications(results["r"].as()) { + iterations(results["b"].as()), inverse(results.count("inverse")) { } @@ -44,7 +44,7 @@ fft::FFTProgramSettings::getSettingsMap() { auto map = hpcc_base::BaseSettings::getSettingsMap(); map["FFT Size"] = std::to_string(1 << LOG_FFT_SIZE); map["Batch Size"] = std::to_string(iterations); - map["Kernel Replications"] = std::to_string(kernelReplications); + map["Inverse"] = inverse ? "Yes" : "No"; return map; } @@ -86,36 +86,43 @@ fft::FFTBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { ("inverse", "If set, the inverse FFT is calculated instead"); } -std::unique_ptr +void fft::FFTBenchmark::executeKernel(FFTData &data) { - return bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations, + timings = bm_execution::calculate(*executionSettings, data.data, data.data_out, executionSettings->programSettings->iterations, executionSettings->programSettings->inverse); } void -fft::FFTBenchmark::collectAndPrintResults(const fft::FFTExecutionTimings &output) { +fft::FFTBenchmark::collectResults() { double gflop = static_cast(5 * (1 << LOG_FFT_SIZE) * LOG_FFT_SIZE) * executionSettings->programSettings->iterations * 1.0e-9 * mpi_comm_size; - uint number_measurements = output.timings.size(); + uint number_measurements = timings["execution"].size(); std::vector avg_measures(number_measurements); #ifdef _USE_MPI_ // Copy the object variable to a local variable to make it accessible to the lambda function int mpi_size = mpi_comm_size; - MPI_Reduce(output.timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + MPI_Reduce(timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); std::for_each(avg_measures.begin(),avg_measures.end(), [mpi_size](double& x) {x /= mpi_size;}); #else - std::copy(output.timings.begin(), output.timings.end(), avg_measures.begin()); + std::copy(timings["execution"].begin(), timings["execution"].end(), avg_measures.begin()); #endif if (mpi_comm_rank == 0) { double minTime = *min_element(avg_measures.begin(), avg_measures.end()); double avgTime = accumulate(avg_measures.begin(), avg_measures.end(), 0.0) / avg_measures.size(); + results.emplace("t_min", hpcc_base::HpccResult(minTime / (executionSettings->programSettings->iterations * executionSettings->programSettings->kernelReplications), "s")); + results.emplace("t_avg", hpcc_base::HpccResult(avgTime / (executionSettings->programSettings->iterations * executionSettings->programSettings->kernelReplications), "s")); + results.emplace("gflops_min", hpcc_base::HpccResult(gflop / minTime, "GFLOP/s")); + results.emplace("gflops_avg", hpcc_base::HpccResult(gflop / avgTime, "GFLOP/s")); + } +} - std::cout << std::setw(ENTRY_SPACE) << " " << std::setw(ENTRY_SPACE) << "avg" - << std::setw(ENTRY_SPACE) << "best" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "Time in s:" << std::setw(ENTRY_SPACE) << avgTime / (executionSettings->programSettings->iterations * executionSettings->programSettings->kernelReplications) - << std::setw(ENTRY_SPACE) << minTime / (executionSettings->programSettings->iterations * executionSettings->programSettings->kernelReplications) << std::endl; - std::cout << std::setw(ENTRY_SPACE) << "GFLOPS:" << 
std::setw(ENTRY_SPACE) << gflop / avgTime - << std::setw(ENTRY_SPACE) << gflop / minTime << std::endl; +void +fft::FFTBenchmark::printResults() { + if (mpi_comm_rank == 0) { + std::cout << std::setw(ENTRY_SPACE) << " " << std::left << std::setw(ENTRY_SPACE) << " avg" + << std::setw(ENTRY_SPACE) << " best" << std::right << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "Time in s: " << results.at("t_avg") << results.at("t_min") << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "GFLOPS: " << results.at("gflops_avg") << results.at("gflops_min") << std::endl; } } @@ -134,7 +141,7 @@ fft::FFTBenchmark::generateInputData() { } bool -fft::FFTBenchmark::validateOutputAndPrintError(fft::FFTData &data) { +fft::FFTBenchmark::validateOutput(fft::FFTData &data) { double residual_max = 0; for (int i = 0; i < executionSettings->programSettings->iterations; i++) { // we have to bit reverse the output data of the FPGA kernel, since it will be provided in bit-reversed order. @@ -152,17 +159,24 @@ fft::FFTBenchmark::validateOutputAndPrintError(fft::FFTData &data) { residual_max = residual_max > tmp_error ? residual_max : tmp_error; } } + // Calculate residual according to paper considering also the used iterations double error = residual_max / (std::numeric_limits::epsilon() * LOG_FFT_SIZE); + + errors.emplace("residual", error); + errors.emplace("epsilon", std::numeric_limits::epsilon()); - std::cout << std::setw(ENTRY_SPACE) << "res. error" << std::setw(ENTRY_SPACE) << "mach. eps" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << error << std::setw(ENTRY_SPACE) - << std::numeric_limits::epsilon() << std::endl << std::endl; - - // Calculate residual according to paper considering also the used iterations return error < 1.0; } +void fft::FFTBenchmark::printError() { + if (mpi_comm_rank == 0) { + std::cout << std::setw(ENTRY_SPACE) << "res. error" << std::setw(ENTRY_SPACE) << "mach. 
eps" << std::endl; + std::cout << std::setw(ENTRY_SPACE) << errors.at("residual") << std::setw(ENTRY_SPACE) << errors.at("epsilon") << std::endl << std::endl; + } + +} + void fft::bit_reverse(std::complex *data, unsigned iterations) { auto *tmp = new std::complex[(1 << LOG_FFT_SIZE)]; diff --git a/FFT/src/host/fft_benchmark.hpp b/FFT/src/host/fft_benchmark.hpp index 4ee82f12..f7905a83 100644 --- a/FFT/src/host/fft_benchmark.hpp +++ b/FFT/src/host/fft_benchmark.hpp @@ -55,12 +55,6 @@ class FFTProgramSettings : public hpcc_base::BaseSettings { */ bool inverse; - /** - * @brief The number of used kernel replications - * - */ - uint kernelReplications; - /** * @brief Construct a new FFT Program Settings object * @@ -137,7 +131,7 @@ class FFTExecutionTimings { * @brief Implementation of the FFT benchmark * */ -class FFTBenchmark : public hpcc_base::HpccFpgaBenchmark { +class FFTBenchmark : public hpcc_base::HpccFpgaBenchmark { protected: @@ -165,7 +159,7 @@ class FFTBenchmark : public hpcc_base::HpccFpgaBenchmark Measured runtimes of the kernel execution */ - std::unique_ptr + void executeKernel(FFTData &data) override; /** @@ -176,7 +170,14 @@ class FFTBenchmark : public hpcc_base::HpccFpgaBenchmarkgetExecutionSettings().programSettings->numRepetitions = 1; data = bm->generateInputData(); - auto result = bm->executeKernel(*data); - EXPECT_EQ(1, result->timings.size()); + bm->executeKernel(*data); + EXPECT_EQ(1, bm->getTimingsMap().at("execution").size()); } /** @@ -44,8 +44,8 @@ TEST_F(FFTKernelTest, CalculateReturnsCorrectExecutionResultFor11False) { TEST_F(FFTKernelTest, CalculateReturnsCorrectExecutionResultFor24True) { bm->getExecutionSettings().programSettings->numRepetitions = 2; data = bm->generateInputData(); - auto result = bm->executeKernel(*data); - EXPECT_EQ(2, result->timings.size()); + bm->executeKernel(*data); + EXPECT_EQ(2, bm->getTimingsMap().at("execution").size()); } /** @@ -56,7 +56,7 @@ TEST_F(FFTKernelTest, FFTReturnsZero) { data->data[i].real(0.0); data->data[i].imag(0.0); } - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { EXPECT_FLOAT_EQ(std::abs(data->data_out[i]), 0.0); } @@ -71,7 +71,7 @@ TEST_F(FFTKernelTest, FFTCloseToZeroForAll1And1) { data->data[i].real(1.0); data->data[i].imag(1.0); } - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); EXPECT_NEAR(data->data_out[0].real(), (1 << LOG_FFT_SIZE), 0.00001); EXPECT_NEAR(data->data_out[0].imag(), (1 << LOG_FFT_SIZE), 0.00001); for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { @@ -88,7 +88,7 @@ TEST_F(FFTKernelTest, FFTCloseToZeroForAll0And0) { data->data[i].real(0.0); data->data[i].imag(0.0); } - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i=0; i < (1 << LOG_FFT_SIZE); i++) { EXPECT_NEAR(data->data[i].real(), 0.0, 0.00001); EXPECT_NEAR(data->data[i].imag(), 0.0, 0.00001); @@ -104,7 +104,7 @@ TEST_F(FFTKernelTest, IFFTCloseToZeroForAll1And1) { data->data[i].real(1.0); data->data[i].imag(0.0); } - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); EXPECT_NEAR(data->data_out[0].real(), static_cast(1 << LOG_FFT_SIZE), 0.00001); EXPECT_NEAR(data->data_out[0].imag(), 0.0, 0.00001); for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { @@ -119,7 +119,7 @@ TEST_F(FFTKernelTest, IFFTCloseToZeroForAll1And1) { TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) { auto verify_data = bm->generateInputData(); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); // Normalize iFFT result 
for (int i=0; i<(1 << LOG_FFT_SIZE); i++) { @@ -135,7 +135,7 @@ TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) { } bm->getExecutionSettings().programSettings->inverse = true; - auto result2 = bm->executeKernel(*data); + bm->executeKernel(*data); // Since data was already sorted by iFFT the bit reversal of the kernel has t be undone fft::bit_reverse(data->data_out, 1); @@ -150,7 +150,7 @@ TEST_F(FFTKernelTest, FFTandiFFTProduceResultCloseToSource) { TEST_F(FFTKernelTest, FPGAFFTAndCPUFFTGiveSameResults) { auto verify_data = bm->generateInputData(); - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); fft::fourier_transform_gold(false,LOG_FFT_SIZE,verify_data->data); fft::bit_reverse(verify_data->data, 1); @@ -171,7 +171,7 @@ TEST_F(FFTKernelTest, FPGAiFFTAndCPUiFFTGiveSameResults) { auto verify_data = bm->generateInputData(); bm->getExecutionSettings().programSettings->inverse = true; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); fft::fourier_transform_gold(true,LOG_FFT_SIZE,verify_data->data); fft::bit_reverse(verify_data->data, 1); diff --git a/FFT/tests/test_fft_functionality.cpp b/FFT/tests/test_fft_functionality.cpp index f5818814..4453a695 100644 --- a/FFT/tests/test_fft_functionality.cpp +++ b/FFT/tests/test_fft_functionality.cpp @@ -6,6 +6,7 @@ #include "fft_benchmark.hpp" #include "parameters.h" #include "test_program_settings.h" +#include "nlohmann/json.hpp" struct FFTHostTest : testing::Test { @@ -119,4 +120,28 @@ TEST_F(FFTHostTest, FFTandiFFTProduceResultCloseToSource) { for (int i=1; i < (1 << LOG_FFT_SIZE); i++) { EXPECT_NEAR(std::abs(data->data[i]), std::abs(verify_data->data[i]), 0.001); } -} \ No newline at end of file +} + +using json = nlohmann::json; + +TEST_F(FFTHostTest, JsonDump) { + bm->executeKernel(*data); + bm->collectResults(); + bm->dumpConfigurationAndResults("fft.json"); + std::FILE *f = std::fopen("fft.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + json j = json::parse(f); + EXPECT_TRUE(j.contains("timings")); + if (j.contains("timings")) { + EXPECT_TRUE(j["timings"].contains("execution")); + } + EXPECT_TRUE(j.contains("results")); + if (j.contains("results")) { + EXPECT_TRUE(j["results"].contains("gflops_avg")); + EXPECT_TRUE(j["results"].contains("gflops_min")); + EXPECT_TRUE(j["results"].contains("t_avg")); + EXPECT_TRUE(j["results"].contains("t_min")); + } + } +} diff --git a/GEMM/README.md b/GEMM/README.md new file mode 100755 index 00000000..8ac117df --- /dev/null +++ b/GEMM/README.md @@ -0,0 +1,240 @@ +# GEMM Benchmark for FPGA + +This repository contains the GEMM Benchmark for FPGA and its OpenCL kernels. +Currently only the Intel FPGA SDK for OpenCL utility is supported. + +It is a modified implementation of the +[GEMM Benchmark](http://www.netlib.org/parkbench/html/matrix-kernels.html) +provided in the [HPC Challenge Benchmark](https://icl.utk.edu/hpcc/) suite. +The implementation follows the Python reference implementation given in +_Introduction to the HPCChallenge Benchmark Suite_ available +[here](http://icl.cs.utk.edu/news_pub/submissions/hpcc-challenge-intro.pdf). + +## Additional Dependencies + +The benchmark *optionally* depends on a library implementing the BLAS linear-algebra interface like: + +- OpenBLAS +- Intel MKL + +If available, the benchmark will use `sgemm_` to validate the calculation instead of a slow reference implementation. +For matrix sizes above 1000x1000 we recommend using such a library to speed up the benchmark execution. 
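When a BLAS library is found, `sgemm_` is only used for this validation step. As a rough illustration of the idea (the benchmark's host code is C++ and calls `sgemm_` directly; the residual normalization shown here is an assumption and may differ from the exact formula the benchmark applies), a CPU reference result can be computed and compared against the FPGA output:

```python
# Rough Python illustration of the validation idea only; not the benchmark's
# actual C++ validation code. The normalization below is an assumption.
import numpy as np

def gemm_residual(a, b, c_in, c_fpga, alpha=1.0, beta=1.0):
    c_ref = alpha * (a @ b) + beta * c_in            # CPU reference result
    eps = np.finfo(np.float32).eps                   # machine epsilon
    resid = float(np.max(np.abs(c_fpga - c_ref)))    # maximum residual error
    norm_resid = resid / (eps * a.shape[0] * float(np.max(np.abs(c_ref))))
    return resid, norm_resid

if __name__ == "__main__":
    n = 256
    rng = np.random.default_rng(0)
    a, b, c = (rng.standard_normal((n, n), dtype=np.float32) for _ in range(3))
    c_fpga = a @ b + c    # stand-in for the result copied back from the FPGA
    print(gemm_residual(a, b, c, c_fpga))
```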
+Using such a library will not change the performance result of the benchmark but might affect the reported error of the calculation. + +For half precision support, the IEEE 754-based half-precision floating-point library by Christian Rau is used and a copy is provided with this code. + +## Build + +CMake is used as the build system. +The targets below can be used to build the benchmark and its kernels, where `VENDOR` can be +`intel` or `xilinx`: + + | Target | Description | + | -------- | ---------------------------------------------- | + | GEMM_`VENDOR` | Builds the host application | + | GEMM_test_`VENDOR` | Compile the tests and its dependencies | + + More over the are additional targets to generate kernel reports and bitstreams. + They are generated for every kernel code in the `src/device` folder: + + | Target | Description | + | -------- | ---------------------------------------------- | + | gemm_cannon_`VENDOR` | Synthesizes the kernel (takes several hours!) | + | gemm_cannon_report_`VENDOR` | Just compile kernel and create reports | + | gemm_cannon_emulate_`VENDOR` | Create a n emulation kernel | + + You can build for example the host application by running + + mkdir build && cd build + cmake .. + make GEMM_intel + +You will find all executables and kernel files in the `bin` +folder of your build directory. +Next to the common configuration options given in the [README](../README.md) of the benchmark suite you might want to specify the following additional options before build: + +Name | Default | Description | +---------------- |-------------|--------------------------------------| + `DATA_TYPE` | float (also supported: half, double) | Data type used for calculation. *Note: Currently, half-precision does not work on Intel FPGAs because they can not be passed as kernel argument per value.* | +`DEFAULT_MATRIX_SIZE` | 8 | The default size of the quadratic matrices in blocks | +`BLOCK_SIZE` | 512 | Block size used by the kernel for calculation | +`GEMM_SIZE` | 8 | Block size of the fully unrolled matrix multiplication in registers | +`GLOBAL_MEM_UNROLL`| 16 | Unrolling factor for the global memory access | +`INTEL_MUL_SHIFT_REG`| 0 | Size of the shift register that can be optionally used by the Intel implementation to relax data dependencies (defaults to 0, which means that no shift register is used) | +`NUM_REPLICATIONS` | 4 | Number of kernel replications. Every kernel will calculate a part of the output matrix | + +Moreover the environment variable `INTELFPGAOCLSDKROOT` has to be set to the root +of the Intel FPGA SDK installation. + +## Execution + +For execution of the benchmark run: + + ./GEMM_intel -f path_to_kernel.aocx + +For more information on available input parameters run + + ./GEMM_intel -h + + Implementation of the GEMM benchmark proposed in the HPCC benchmark adapted for FPGA + Version: 1.3 + + MPI Version: 3.1 + Config. Time: Thu Dec 08 10:39:51 UTC 2022 + Git Commit: 86e0064-dirty + + Usage: + ./bin/GEMM_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. 
If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 4) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -m, arg Matrix size in number of blocks in a single + dimension (default: 8) + -b, arg Block size in number of values in one dimension + (default: 32) + --replicate-inputs Also replicates the input buffer for each kernel + +To execute the unit and integration tests run + + ./GEMM_test_intel -f KERNEL_FILE_NAME + +in the `bin` folder within the build directory. +It will run an emulation of the kernel and execute some functionality tests. + +## Output Interpretation + +An example output from an emulation is given below: + + norm. residual res. error mach. eps + 8.08345e-05 7.62939e-06 1.19209e-07 + + best mean GFLOPS + 6.50672e-03 s 1.06789e-02 s 5.15689e+00 GFLOP/s + +The first two rows give information about the calculation error. + +- `norm. residual`: The normalized residual error based on the used matrix size and used values +- `res. error`: The maximum residual error of the calculation +- `mach. epsilon`: The machine epsilon + +The last two columns contain the time measurements and based on that the achieved FLOPS +of the calculation. + +- `best`: The shortest execution time in all runs +- `mean`: Arithmetic mean of all execution times +- `GFLOPS`: GFLOPS calculated from the shortest execution time + +The json output looks like the following. + +```json + +{ + "config_time": "Wed Dec 14 08:40:52 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": 
"/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "epsilon": 1.1920928955078125e-07, + "residual": 7.62939453125e-06, + "residual_norm": 8.08345175162664e-05 + }, + "execution_time": "Wed Dec 14 09:14:09 UTC 2022", + "git_commit": "be1a4e9-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "GEMM", + "results": { + "gflops": { + "unit": "GFLOP/s", + "value": 5.297554068962992 + }, + "t_mean": { + "unit": "s", + "value": 0.010202154299999999 + }, + "t_min": { + "unit": "s", + "value": 0.006333948 + } + }, + "settings": { + "Block Size": 32, + "Communication Type": false, + "Kernel File": false, + "Kernel Replications": 4, + "MPI Ranks": 1, + "Matrix Size": 256, + "Repetitions": 10, + "Replicate Inputs": false, + "Test Mode": false + }, + "timings": { + "execution": [ + { + "unit": "s", + "value": 0.012732567 + }, + { + "unit": "s", + "value": 0.006511861 + }, + { + "unit": "s", + "value": 0.006333948 + }, + { + "unit": "s", + "value": 0.012710817 + }, + { + "unit": "s", + "value": 0.006552662 + }, + { + "unit": "s", + "value": 0.006600733 + }, + { + "unit": "s", + "value": 0.012673167 + }, + { + "unit": "s", + "value": 0.012720237 + }, + { + "unit": "s", + "value": 0.012608296 + }, + { + "unit": "s", + "value": 0.012577255 + } + ] + }, + "validated": true, + "version": "1.3" +} + +``` diff --git a/GEMM/Readme.md b/GEMM/Readme.md deleted file mode 100755 index 831194bd..00000000 --- a/GEMM/Readme.md +++ /dev/null @@ -1,135 +0,0 @@ -# GEMM Benchmark for FPGA - -This repository contains the GEMM Benchmark for FPGA and its OpenCL kernels. 
-Currently only the Intel FPGA SDK for OpenCL utility is supported. - -It is a modified implementation of the -[GEMM Benchmark](http://www.netlib.org/parkbench/html/matrix-kernels.html) -provided in the [HPC Challenge Benchmark](https://icl.utk.edu/hpcc/) suite. -The implementation follows the Python reference implementation given in -_Introduction to the HPCChallenge Benchmark Suite_ available -[here](http://icl.cs.utk.edu/news_pub/submissions/hpcc-challenge-intro.pdf). - -## Additional Dependencies - -The benchmark *optionally* depends on a library implementing the BLAS linear-algebra interface like: - -- OpenBLAS -- Intel MKL - -If available, the benchmark will use `sgemm_` to validate the calculation instead of a slow reference implementation. -For matrix sizes above 1000x1000 we recommend using such a library to speed up the benchmark execution. -Using such a library will not change the performance result of the benchmark but might affect the reported error of the calculation. - -For half precision support, the IEEE 754-based half-precision floating-point library by Christian Rau is used and a copy is provided with this code. - -## Build - -CMake is used as the build system. -The targets below can be used to build the benchmark and its kernels, where `VENDOR` can be -`intel` or `xilinx`: - - | Target | Description | - | -------- | ---------------------------------------------- | - | GEMM_`VENDOR` | Builds the host application | - | GEMM_test_`VENDOR` | Compile the tests and its dependencies | - - More over the are additional targets to generate kernel reports and bitstreams. - They are generated for every kernel code in the `src/device` folder: - - | Target | Description | - | -------- | ---------------------------------------------- | - | gemm_cannon_`VENDOR` | Synthesizes the kernel (takes several hours!) | - | gemm_cannon_report_`VENDOR` | Just compile kernel and create reports | - | gemm_cannon_emulate_`VENDOR` | Create a n emulation kernel | - - You can build for example the host application by running - - mkdir build && cd build - cmake .. - make GEMM_intel - -You will find all executables and kernel files in the `bin` -folder of your build directory. -Next to the common configuration options given in the [README](../README.md) of the benchmark suite you might want to specify the following additional options before build: - -Name | Default | Description | ----------------- |-------------|--------------------------------------| - `DATA_TYPE` | float (also supported: half, double) | Data type used for calculation. *Note: Currently, half-precision does not work on Intel FPGAs because they can not be passed as kernel argument per value.* | -`DEFAULT_MATRIX_SIZE` | 8 | The default size of the quadratic matrices in blocks | -`BLOCK_SIZE` | 512 | Block size used by the kernel for calculation | -`GEMM_SIZE` | 8 | Block size of the fully unrolled matrix multiplication in registers | -`GLOBAL_MEM_UNROLL`| 16 | Unrolling factor for the global memory access | -`INTEL_MUL_SHIFT_REG`| 0 | Size of the shift register that can be optionally used by the Intel implementation to relax data dependencies (defaults to 0, which means that no shift register is used) | -`NUM_REPLICATIONS` | 4 | Number of kernel replications. Every kernel will calculate a part of the output matrix | - -Moreover the environment variable `INTELFPGAOCLSDKROOT` has to be set to the root -of the Intel FPGA SDK installation. 
- -## Execution - -For execution of the benchmark run: - - ./GEMM_intel -f path_to_kernel.aocx - -For more information on available input parameters run - - ./GEMM_intel -h - - Implementation of the GEMM benchmark proposed in the HPCC benchmark adapted for FPGA - Usage: - ./GEMM_intel [OPTION...] - -Implementation of the GEMM benchmark proposed in the HPCC benchmark adapted for FPGA -Version: 1.0 - -Usage: - bin/GEMM_intel [OPTION...] - - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will - speed up execution and helps when working with special - data types. - --device arg Index of the device that has to be used. If not - given you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there - are multiple platforms available. (default: -1) - -h, --help Print this help - -m, arg Matrix size in number of blocks in a single - dimension (default: 8) - -b, arg Block size in number of values in one dimension - (default: 256) - -r, arg Number of used kernel replications (default: 4) - -To execute the unit and integration tests run - - ./GEMM_test_intel -f KERNEL_FILE_NAME - -in the `bin` folder within the build directory. -It will run an emulation of the kernel and execute some functionality tests. - -## Output Interpretation - -An example output from an emulation is given below: - - norm. resid resid machep - 1.45417e-05 4.76837e-05 1.19209e-07 - best mean GFLOPS - 6.89168e-03 6.89168e-03 1.03868e+02 - -The first two rows give information about the calculation error. - -- `norm. resid`: The normalized residual error based on the used matrix size and used values -- `resid`: The maximum residual error of the calculation -- `machep`: The machine epsilon - -The last two columns contain the time measurements and based on that the achieved FLOPS -of the calculation. - -- `best`: The shortest execution time in all runs -- `mean`: Arithmetic mean of all execution times -- `GFLOPS`: GFLOPS calculated from the shortest execution time diff --git a/GEMM/src/common/parameters.h.in b/GEMM/src/common/parameters.h.in index 3e35bf01..82ca5a25 100644 --- a/GEMM/src/common/parameters.h.in +++ b/GEMM/src/common/parameters.h.in @@ -29,7 +29,9 @@ /* Short description of the program */ -#define PROGRAM_DESCRIPTION "Implementation of the GEMM benchmark"\ +#define PROGRAM_NAME "GEMM" + +#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\ " proposed in the HPCC benchmark adapted for FPGA\n"\ "Version: " VERSION "\n" @@ -49,4 +51,4 @@ Output separator #endif #endif -#endif // SRC_COMMON_PARAMETERS_H_ \ No newline at end of file +#endif // SRC_COMMON_PARAMETERS_H_ diff --git a/GEMM/src/device/gemm_base.cl b/GEMM/src/device/gemm_base.cl index 3599e6cd..4392b3c0 100644 --- a/GEMM/src/device/gemm_base.cl +++ b/GEMM/src/device/gemm_base.cl @@ -33,12 +33,11 @@ SOFTWARE. // code generation expects an array of maps of size num_replications with the keys a,b,c,out. 
// The value of the keys have to be strings containing the attributes that // have to be assigned to input and output buffers in global memory -/* PY_CODE_GEN -try: - kernel_param_attributes = generate_attributes(num_replications) -except: - kernel_param_attributes = [{"a": "", "b": "", "c": "", "out": ""} for i in range(num_replications)] -*/ +{% if generate_attributes is defined %} + {% set kernel_param_attributes = generate_attributes(num_replications) %} +{% else %} + {% set kernel_param_attributes = create_list({"a": "", "b": "", "c": "", "out": ""}, num_replications) %} +{% endif %} /** Calculate for the Level 2 block: @@ -260,7 +259,7 @@ to BRAM. // Here we use the total replications. This will also create three kernels for the Xilinx compiler because they all // use different hard-coded ranges in the outer loop -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** Two level blocked GEMM kernel @@ -277,21 +276,21 @@ calculates C_OUT = alpha * A.dot(B) + beta * C */ __attribute__((uses_global_work_offset(0))) __kernel -void gemm/*PY_CODE_GEN i*/( +void gemm{{ i }}( #ifdef ENABLE_MIXED_PRECISION // In mixed precision convert the values accordingly // from single precision to the target precision on the FPGA - __global /*PY_CODE_GEN kernel_param_attributes[i]["a"]*/ const float* restrict a, - __global /*PY_CODE_GEN kernel_param_attributes[i]["b"]*/ const float* restrict b, - __global /*PY_CODE_GEN kernel_param_attributes[i]["c"]*/ const float* restrict c, - __global /*PY_CODE_GEN kernel_param_attributes[i]["out"]*/ float* restrict c_out, + __global {{ kernel_param_attributes[i]["a"] }} const float* restrict a, + __global {{ kernel_param_attributes[i]["b"] }} const float* restrict b, + __global {{ kernel_param_attributes[i]["c"] }} const float* restrict c, + __global {{ kernel_param_attributes[i]["out"] }} float* restrict c_out, const float alpha, const float beta, #else - __global /*PY_CODE_GEN kernel_param_attributes[i]["a"]*/ const DEVICE_DATA_TYPE* restrict a, - __global /*PY_CODE_GEN kernel_param_attributes[i]["b"]*/ const DEVICE_DATA_TYPE* restrict b, - __global /*PY_CODE_GEN kernel_param_attributes[i]["c"]*/ const DEVICE_DATA_TYPE* restrict c, - __global /*PY_CODE_GEN kernel_param_attributes[i]["out"]*/ DEVICE_DATA_TYPE* restrict c_out, + __global {{ kernel_param_attributes[i]["a"] }} const DEVICE_DATA_TYPE* restrict a, + __global {{ kernel_param_attributes[i]["b"] }} const DEVICE_DATA_TYPE* restrict b, + __global {{ kernel_param_attributes[i]["c"] }} const DEVICE_DATA_TYPE* restrict c, + __global {{ kernel_param_attributes[i]["out"] }} DEVICE_DATA_TYPE* restrict c_out, const DEVICE_DATA_TYPE alpha, const DEVICE_DATA_TYPE beta, #endif @@ -445,4 +444,4 @@ __attribute__((opencl_unroll_hint(GLOBAL_MEM_UNROLL))) } } -// PY_CODE_GEN block_end +{% endfor %} diff --git a/GEMM/src/host/execution.h b/GEMM/src/host/execution.h index 9446c16f..c4ce1412 100644 --- a/GEMM/src/host/execution.h +++ b/GEMM/src/host/execution.h @@ -48,9 +48,9 @@ simple exchange of the different calculation methods. 
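The kernel source above is now a Jinja2 template: the `{% for i in range(num_replications) %}` loop emits one `gemm{{ i }}` kernel per replication, and the memory attributes are substituted from `kernel_param_attributes`. A minimal, self-contained sketch of how such a template expands is shown below; it uses a simplified template string and plain `jinja2`, and it omits the project-specific helpers (`generate_attributes`, `create_list`) that the repository's code generator provides to the real template.

```python
from jinja2 import Template

# Simplified stand-in for the replication loop used in gemm_base.cl
template = Template(
    "{% for i in range(num_replications) %}\n"
    "__kernel void gemm{{ i }}(__global {{ attrs[i]['a'] }} const float* restrict a) { /* ... */ }\n"
    "{% endfor %}\n"
)

num_replications = 2
# One attribute map per kernel replication (keys a, b, c, out in the real template)
attrs = [{"a": "", "b": "", "c": "", "out": ""} for _ in range(num_replications)]

# Rendering produces one kernel definition per replication: gemm0 and gemm1
print(template.render(num_replications=num_replications, attrs=attrs))
```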
execution in number of items @param blockSize Size of a block that is calculated by the kernel -@return The time measurements and the error rate counted from the executions +@return The time measurements */ -std::unique_ptr +std::map> calculate(hpcc_base::ExecutionSettings const& config, HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, HOST_DATA_TYPE* c_out, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta); } // namespace bm_execution diff --git a/GEMM/src/host/execution_default.cpp b/GEMM/src/host/execution_default.cpp index aa89d258..e608a35a 100644 --- a/GEMM/src/host/execution_default.cpp +++ b/GEMM/src/host/execution_default.cpp @@ -42,7 +42,7 @@ namespace bm_execution { @copydoc bm_execution::calculate() */ -std::unique_ptr +std::map> calculate(hpcc_base::ExecutionSettings const& config, HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, HOST_DATA_TYPE* c_out, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta) { @@ -257,10 +257,10 @@ calculate(hpcc_base::ExecutionSettings const& config, } #endif - - std::unique_ptr results( - new gemm::GEMMExecutionTimings{executionTimes}); - return results; + std::map> timings; + + timings["execution"] = executionTimes; + return timings; } } // namespace bm_execution diff --git a/GEMM/src/host/gemm_benchmark.cpp b/GEMM/src/host/gemm_benchmark.cpp index 8910aacf..f624ea57 100644 --- a/GEMM/src/host/gemm_benchmark.cpp +++ b/GEMM/src/host/gemm_benchmark.cpp @@ -35,7 +35,7 @@ SOFTWARE. #include "parameters.h" gemm::GEMMProgramSettings::GEMMProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), - matrixSize(results["b"].as() * results["m"].as()), blockSize(results["b"].as()), kernelReplications(results["r"].as()), + matrixSize(results["b"].as() * results["m"].as()), blockSize(results["b"].as()), replicateInputBuffers(results["replicate-inputs"].count() > 0) { } @@ -44,7 +44,7 @@ std::map gemm::GEMMProgramSettings::getSettingsMap() { auto map = hpcc_base::BaseSettings::getSettingsMap(); map["Matrix Size"] = std::to_string(matrixSize); - map["Kernel Replications"] = std::to_string(kernelReplications); + map["Block Size"] = std::to_string(blockSize); map["Replicate Inputs"] = replicateInputBuffers ? 
"Yes" : "No"; return map; } @@ -99,29 +99,25 @@ gemm::GEMMBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { ("replicate-inputs", "Also replicates the input buffer for each kernel"); } -std::unique_ptr +void gemm::GEMMBenchmark::executeKernel(GEMMData &data) { - return bm_execution::calculate(*executionSettings, data.A, data.B, data.C, data.C_out, data.alpha, data.beta); + timings = bm_execution::calculate(*executionSettings, data.A, data.B, data.C, data.C_out, data.alpha, data.beta); } void -gemm::GEMMBenchmark::collectAndPrintResults(const gemm::GEMMExecutionTimings &output) { +gemm::GEMMBenchmark::collectResults() { - uint number_measurements = output.timings.size(); + uint number_measurements = timings.at("execution").size(); std::vector avg_measures(number_measurements); #ifdef _USE_MPI_ // Copy the object variable to a local variable to make it accessible to the lambda function int mpi_size = mpi_comm_size; - MPI_Reduce(output.timings.data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + MPI_Reduce(timings.at("execution").data(), avg_measures.data(), number_measurements, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); std::for_each(avg_measures.begin(),avg_measures.end(), [mpi_size](double& x) {x /= mpi_size;}); #else - std::copy(output.timings.begin(), output.timings.end(), avg_measures.begin()); + std::copy(timings.at("execution").begin(), timings.at("execution").end(), avg_measures.begin()); #endif if (mpi_comm_rank == 0) { - std::cout << std::setw(ENTRY_SPACE) - << "best" << std::setw(ENTRY_SPACE) << "mean" - << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; - // Calculate performance for kernel execution double tmean = 0; double tmin = std::numeric_limits::max(); @@ -136,10 +132,21 @@ gemm::GEMMBenchmark::collectAndPrintResults(const gemm::GEMMExecutionTimings &ou } } tmean = tmean / avg_measures.size(); + results.emplace("t_mean", hpcc_base::HpccResult(tmean, "s")); + results.emplace("t_min", hpcc_base::HpccResult(tmin, "s")); + results.emplace("gflops", hpcc_base::HpccResult(gflops / tmin, "GFLOP/s")); + } +} + +void +gemm::GEMMBenchmark::printResults() { + if (mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) + << " best" << std::setw(ENTRY_SPACE) << " mean" + << std::setw(ENTRY_SPACE) << " GFLOPS" << std::right << std::endl; std::cout << std::setw(ENTRY_SPACE) - << tmin << std::setw(ENTRY_SPACE) << tmean - << std::setw(ENTRY_SPACE) << gflops / tmin + << results.at("t_min") << results.at("t_mean") << results.at("gflops") << std::endl; } } @@ -164,7 +171,7 @@ gemm::GEMMBenchmark::generateInputData() { } bool -gemm::GEMMBenchmark::validateOutputAndPrintError(gemm::GEMMData &data) { +gemm::GEMMBenchmark::validateOutput(gemm::GEMMData &data) { auto ref_data = generateInputData(); gemm_ref(ref_data->A, ref_data->B, ref_data->C, executionSettings->programSettings->matrixSize, OPTIONAL_CAST(0.5), OPTIONAL_CAST(2.0)); @@ -189,19 +196,24 @@ gemm::GEMMBenchmark::validateOutputAndPrintError(gemm::GEMMData &data) { double eps = std::numeric_limits::epsilon(); double residn = resid / (executionSettings->programSettings->matrixSize*executionSettings->programSettings->matrixSize*ref_data->normtotal*normx*eps); - std::cout << " norm. 
resid resid "\ - "machep" << std::endl; - std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE) - << resid << std::setw(ENTRY_SPACE) << eps - << std::endl; + errors.emplace("epsilon", eps); + errors.emplace("residual", resid); + errors.emplace("residual_norm", residn); return residn < 1.0; } - // All other ranks are always reporting success of the validation return true; } +void +gemm::GEMMBenchmark::printError() { + if (mpi_comm_rank == 0) { + std::cout << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. eps" << std::endl; + std::cout << std::setw(ENTRY_SPACE) << errors.at("residual_norm") << std::setw(ENTRY_SPACE) << errors.at("residual") << std::setw(ENTRY_SPACE) << errors.at("epsilon") << std::endl; + } +} + void gemm::gemm_ref(HOST_DATA_TYPE* a,HOST_DATA_TYPE* b, HOST_DATA_TYPE* c, int n, HOST_DATA_TYPE alpha, HOST_DATA_TYPE beta) { diff --git a/GEMM/src/host/gemm_benchmark.hpp b/GEMM/src/host/gemm_benchmark.hpp index fde2e2ae..a17a29f7 100644 --- a/GEMM/src/host/gemm_benchmark.hpp +++ b/GEMM/src/host/gemm_benchmark.hpp @@ -71,12 +71,6 @@ class GEMMProgramSettings : public hpcc_base::BaseSettings { */ uint blockSize; - /** - * @brief Number of times the kernel is replicated - * - */ - uint kernelReplications; - /** * @brief If True, replicate input buffers for each kernel replication */ @@ -170,25 +164,11 @@ class GEMMData { }; -/** - * @brief Measured execution timing from the kernel execution - * - */ -class GEMMExecutionTimings { -public: - /** - * @brief A vector containing the timings for all repetitions for the kernel execution - * - */ - std::vector timings; - -}; - /** * @brief Implementation of the GEMM benchmark * */ -class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark { +class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark { protected: @@ -203,7 +183,7 @@ class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark The input and output data of the benchmark */ @@ -216,7 +196,7 @@ class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark Measured runtimes of the kernel execution */ - std::unique_ptr + void executeKernel(GEMMData &data) override; /** @@ -227,15 +207,23 @@ class GEMMBenchmark : public hpcc_base::HpccFpgaBenchmark { */ TEST_P(GEMMKernelTest, FPGACorrectNumberOfRepetitionsIs1) { bm->getExecutionSettings().programSettings->numRepetitions = 1; - auto result = bm->executeKernel(*data); - EXPECT_EQ(result->timings.size(), 1); + bm->executeKernel(*data); + EXPECT_EQ(bm->getTimingsMap().at("execution").size(), 1); } /** @@ -49,8 +50,8 @@ TEST_P(GEMMKernelTest, FPGACorrectNumberOfRepetitionsIs1) { */ TEST_P(GEMMKernelTest, FPGACorrectNumberOfRepetitionsIs3) { bm->getExecutionSettings().programSettings->numRepetitions = 3; - auto result = bm->executeKernel(*data); - EXPECT_EQ(result->timings.size(), 3); + bm->executeKernel(*data); + EXPECT_EQ(bm->getTimingsMap().at("execution").size(), 3); } /** @@ -64,7 +65,7 @@ TEST_P(GEMMKernelTest, FPGACorrectCtimesBeta) { data->C[i * matrix_size + j] = OPTIONAL_CAST(1.0); } } - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { EXPECT_NEAR(data->C_out[i * matrix_size + j], 2.0 * data->C[i * matrix_size + j], std::numeric_limits::epsilon()); @@ -85,7 +86,7 @@ TEST_P(GEMMKernelTest, FPGACorrectAtimesAlpha) { data->alpha = 2.0; data->beta = 0.0; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < 
matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { EXPECT_NEAR(data->C_out[i * matrix_size + j], 2.0 * data->A[i * matrix_size + j], std::numeric_limits::epsilon()); @@ -105,7 +106,7 @@ TEST_P(GEMMKernelTest, FPGACorrectBtimesAlpha) { } data->alpha = 2.0; data->beta = 0.0; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { EXPECT_NEAR(data->C_out[i * matrix_size + j], 2.0 * data->B[i * matrix_size + j], std::numeric_limits::epsilon()); @@ -126,7 +127,7 @@ TEST_P(GEMMKernelTest, FPGACorrectAmulB) { } data->alpha = 1.0; data->beta = 1.0; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); HOST_DATA_TYPE c_ref_out[matrix_size * matrix_size]; ref_matmul(data->A,data->B,c_ref_out,matrix_size); @@ -150,7 +151,7 @@ TEST_P(GEMMKernelTest, FPGACorrectCplusA) { data->alpha = 1.0; data->beta = 1.0; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { EXPECT_FLOAT_EQ(data->C_out[i * matrix_size + j], data->A[i * matrix_size + j] + data->C[i * matrix_size + j]); @@ -165,7 +166,7 @@ TEST_P(GEMMKernelTest, FPGACorrectCplusA) { TEST_P(GEMMKernelTest, FPGACorrectbetaCplusalphaAB) { HOST_DATA_TYPE c_ref_out[matrix_size * matrix_size]; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < matrix_size; i++) { for (int j = 0; j < matrix_size; j++) { c_ref_out[i * matrix_size + j] = data->C[i * matrix_size + j]; @@ -179,6 +180,29 @@ TEST_P(GEMMKernelTest, FPGACorrectbetaCplusalphaAB) { } } +using json = nlohmann::json; + +TEST_P(GEMMKernelTest, JsonDump) { + bm->executeKernel(*data); + bm->collectResults(); + bm->dumpConfigurationAndResults("gemm.json"); + std::FILE *f = std::fopen("gemm.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + json j = json::parse(f); + EXPECT_TRUE(j.contains("timings")); + if (j.contains("timings")) { + EXPECT_TRUE(j["timings"].contains("execution")); + } + EXPECT_TRUE(j.contains("results")); + if (j.contains("results")) { + EXPECT_TRUE(j["results"].contains("gflops")); + EXPECT_TRUE(j["results"].contains("t_mean")); + EXPECT_TRUE(j["results"].contains("t_min")); + } + } +} + INSTANTIATE_TEST_CASE_P(Default, GEMMKernelTest, testing::Values(1,2)); diff --git a/LINPACK/CMakeLists.txt b/LINPACK/CMakeLists.txt index fb17db96..a33cc82f 100755 --- a/LINPACK/CMakeLists.txt +++ b/LINPACK/CMakeLists.txt @@ -19,6 +19,11 @@ if (TEST_UNIFORM) set(TEST_HOST_FLAGS "--uniform") endif() +if (USE_ACCL) + math(EXPR calculate_accl_buffer_size "(2^${LOCAL_MEM_BLOCK_LOG})^2 * 8") + set(ACCL_BUFFER_SIZE ${calculate_accl_buffer_size} CACHE STRING "Size of ACCL buffers in bytes") +endif() + if (TEST_EMULATION) set(TEST_HOST_FLAGS "--emulation") endif() diff --git a/LINPACK/Readme.md b/LINPACK/README.md similarity index 57% rename from LINPACK/Readme.md rename to LINPACK/README.md index ce7d508f..ae110a43 100644 --- a/LINPACK/Readme.md +++ b/LINPACK/README.md @@ -127,14 +127,13 @@ It will run an emulation of the kernel and execute some functionality tests. The host code will print the results of the execution to the standard output. The result summary looks similar to this: - norm. 
resid resid machep - 3.25054e-08 5.88298e-05 1.19209e-07 - Validation Time: 4.55059e+01 s - Method best mean GFLOPS - total 5.87510e+01 5.87510e+01 2.10546e+04 - GEFA 5.87510e+01 5.87510e+01 2.10541e+04 - GESL 4.70000e-08 4.70000e-08 6.42532e+08 - Validation: SUCCESS! + norm. residual res. error mach. eps + 4.35451e-03 5.96046e-07 1.19209e-07 + + Method best mean GFLOPS + total 1.12152e-01 s 1.16113e-01 s 2.13045e-04 GFLOP/s + GEFA 1.12152e-01 s 1.16113e-01 s 1.94784e-04 GFLOP/s + GESL 2.00000e-08 s 3.97000e-08 s 1.02400e+02 GFLOP/s The first row contains data from the correctness check that is done once when executing the benchmark: @@ -155,3 +154,172 @@ The columns of the table contain the following information: The last row of the output will always contain `Validation: SUCCESS!`, if the norm. residual is below 1. This will be interpreted as successful validation. In this case, the executable will return 0 as exit code, 1 otherwise. + +The json output looks like the following. + +```json + +{ + "config_time": "Wed Dec 14 08:41:58 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "epsilon": 1.1920928955078125e-07, + "residual": 5.960464477539062e-07, + "residual_norm": 0.004354506590071576 + }, + "execution_time": "Wed Dec 14 09:20:49 UTC 2022", + "git_commit": "be1a4e9-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "LINPACK", + "results": { + "gflops": { + "unit": "GFLOP/s", + "value": 0.0006047108051562395 + }, + "gflops_lu": { + "unit": "GFLOP/s", + 
"value": 0.0005528788702090362 + }, + "gflops_sl": { + "unit": "GFLOP/s", + "value": 68.26666666666668 + }, + "t_mean": { + "unit": "s", + "value": 0.041533081799999996 + }, + "t_min": { + "unit": "s", + "value": 0.039512 + }, + "tlu_mean": { + "unit": "s", + "value": 0.041533051599999996 + }, + "tlu_min": { + "unit": "s", + "value": 0.03951197 + }, + "tsl_mean": { + "unit": "s", + "value": 3.019999999999999e-08 + }, + "tsl_min": { + "unit": "s", + "value": 3e-08 + } + }, + "settings": { + "Block Size": 16, + "Communication Type": false, + "Data Type": false, + "Diagonally Dominant": true, + "Emulate": false, + "FPGA Torus": { + "P": 1, + "Q": 1 + }, + "Kernel File": false, + "Kernel Replications": 3, + "MPI Ranks": 1, + "Matrix Size": 32, + "Repetitions": 10, + "Test Mode": false + }, + "timings": { + "gefa": [ + { + "unit": "s", + "value": 0.040978706 + }, + { + "unit": "s", + "value": 0.041104108 + }, + { + "unit": "s", + "value": 0.040878394 + }, + { + "unit": "s", + "value": 0.040391036 + }, + { + "unit": "s", + "value": 0.044723132 + }, + { + "unit": "s", + "value": 0.03951197 + }, + { + "unit": "s", + "value": 0.043374308 + }, + { + "unit": "s", + "value": 0.04179909 + }, + { + "unit": "s", + "value": 0.041162129 + }, + { + "unit": "s", + "value": 0.041407643 + } + ], + "gesl": [ + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 3.1e-08 + }, + { + "unit": "s", + "value": 3.1e-08 + }, + { + "unit": "s", + "value": 3e-08 + }, + { + "unit": "s", + "value": 3e-08 + } + ] + }, + "validated": true, + "version": "2.6" +} + +``` diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake new file mode 100644 index 00000000..94a9c4f6 --- /dev/null +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R1_ACCL.cmake @@ -0,0 +1,31 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) + +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini" CACHE STRING "Link settings file" FORCE) +set(ACCL_BUFFER_SIZE 524288 CACHE STRING "Set ACCL buffer size to fit single matrix block" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) + diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake new file mode 100644 index 00000000..9d5cc02f --- /dev/null +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_DDR_PCIE.cmake @@ -0,0 +1,30 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL No CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE) +set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake new file mode 100644 index 00000000..9bc20f5c --- /dev/null +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R2_HBM_PCIE_profile.cmake @@ -0,0 +1,31 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL No CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_COMPILE_FLAGS -g --profile.stall all:all CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE) +set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) diff --git a/LINPACK/configs/Xilinx_U280_B8_SB3_R3_HBM_PCIE.cmake b/LINPACK/configs/Xilinx_U280_B8_SB3_R3_HBM_PCIE.cmake new file mode 100644 index 00000000..de080ee7 --- /dev/null +++ b/LINPACK/configs/Xilinx_U280_B8_SB3_R3_HBM_PCIE.cmake @@ -0,0 +1,30 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE) +set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 3 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake new file mode 100644 index 00000000..20afd309 --- /dev/null +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL.cmake @@ -0,0 +1,30 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) + +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini" CACHE STRING "Link settings file" FORCE) +set(ACCL_BUFFER_SIZE 524288 CACHE STRING "Set ACCL buffer size to fit single matrix block" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) + diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake new file mode 100644 index 00000000..ed8cc15a --- /dev/null +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R1_ACCL_profile.cmake @@ -0,0 +1,31 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) + +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "Names of all compute kernels" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) + diff --git a/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake new file mode 100644 index 00000000..dfd8611b --- /dev/null +++ b/LINPACK/configs/Xilinx_U55C_B8_SB3_R2_HBM_PCIE.cmake @@ -0,0 +1,30 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL No CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES lu top_update left_update inner_update_mm0 CACHE STRING "" FORCE) +set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) +# LINPACK specific options +set(DEFAULT_MATRIX_SIZE 1024 CACHE STRING "Default matrix size" FORCE) +set(LOCAL_MEM_BLOCK_LOG 8 CACHE STRING "Used to define the width and height of the block stored in local memory" FORCE) +set(REGISTER_BLOCK_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(REGISTER_BLOCK_MM_LOG 3 CACHE STRING "Size of the block that will be manipulated in registers" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of times the matrix multiplication kernel will be replicated" FORCE) + +set(XILINX_COMPILE_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.hpl_torus_pcie.ddr.ini" CACHE STRING "Compile settings file" FORCE) +set(XILINX_LINK_SETTINGS_FILE "${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini" CACHE STRING "Link settings file" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp instead of cl2.hpp" FORCE) +set(USE_PCIE_MPI_COMMUNICATION Yes CACHE BOOL "Use PCIe and MPI for communication between FPGAs" FORCE) diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini new file mode 100644 index 00000000..4783d320 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.generator.ini @@ -0,0 +1,88 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +slr=inner_update_mm0_1:SLR2 + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:4] +sp=lu_1.m_axi_gmem1:HBM[5:6] +sp=lu_1.m_axi_gmem2:HBM[5:6] + +sp=top_update_1.m_axi_gmem0:HBM[0:4] +sp=top_update_1.m_axi_gmem1:HBM[5:6] +sp=top_update_1.m_axi_gmem2:HBM[5:6] + +sp=left_update_1.m_axi_gmem0:HBM[0:4] +sp=left_update_1.m_axi_gmem1:HBM[5:6] +sp=left_update_1.m_axi_gmem2:HBM[5:6] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:4] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[5:6] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[5:6] +# PY_CODE_GEN block_end + +#ACCL +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_ops:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 + +sp=ccl_offload_0.m_axi_0:HBM[5:6] +sp=ccl_offload_0.m_axi_1:HBM[5:6] + + + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini new file mode 100644 index 00000000..289a6263 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u280.generator.ini @@ -0,0 +1,88 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +slr=inner_update_mm0_1:SLR1 + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:4] +sp=lu_1.m_axi_gmem1:HBM[5:6] +sp=lu_1.m_axi_gmem2:HBM[5:6] + +sp=top_update_1.m_axi_gmem0:HBM[0:4] +sp=top_update_1.m_axi_gmem1:HBM[5:6] +sp=top_update_1.m_axi_gmem2:HBM[5:6] + +sp=left_update_1.m_axi_gmem0:HBM[0:4] +sp=left_update_1.m_axi_gmem1:HBM[5:6] +sp=left_update_1.m_axi_gmem2:HBM[5:6] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:4] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[5:6] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[5:6] +# PY_CODE_GEN block_end + +#ACCL +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_ops:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR2 +slr=compression_0_1:SLR2 +slr=compression_0_2:SLR2 +slr=lb_user_krnl:SLR2 +slr=arith_0:SLR2 +slr=ccl_offload_0:SLR2 +slr=hostctrl_0:SLR2 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 + +sp=ccl_offload_0.m_axi_0:HBM[5:6] +sp=ccl_offload_0.m_axi_1:HBM[5:6] + + + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini new file mode 100644 index 00000000..4783d320 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.generator.ini @@ -0,0 +1,88 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +slr=inner_update_mm0_1:SLR2 + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:4] +sp=lu_1.m_axi_gmem1:HBM[5:6] +sp=lu_1.m_axi_gmem2:HBM[5:6] + +sp=top_update_1.m_axi_gmem0:HBM[0:4] +sp=top_update_1.m_axi_gmem1:HBM[5:6] +sp=top_update_1.m_axi_gmem2:HBM[5:6] + +sp=left_update_1.m_axi_gmem0:HBM[0:4] +sp=left_update_1.m_axi_gmem1:HBM[5:6] +sp=left_update_1.m_axi_gmem2:HBM[5:6] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:4] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[5:6] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[5:6] +# PY_CODE_GEN block_end + +#ACCL +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_ops:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 + +sp=ccl_offload_0.m_axi_0:HBM[5:6] +sp=ccl_offload_0.m_axi_1:HBM[5:6] + + + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini new file mode 100644 index 00000000..d4d128dc --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_accl.hbm.u55c.profile.generator.ini @@ -0,0 +1,94 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +slr=inner_update_mm0_1:SLR2 + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:4] +sp=lu_1.m_axi_gmem1:HBM[5:6] +sp=lu_1.m_axi_gmem2:HBM[5:6] + +sp=top_update_1.m_axi_gmem0:HBM[0:4] +sp=top_update_1.m_axi_gmem1:HBM[5:6] +sp=top_update_1.m_axi_gmem2:HBM[5:6] + +sp=left_update_1.m_axi_gmem0:HBM[0:4] +sp=left_update_1.m_axi_gmem1:HBM[5:6] +sp=left_update_1.m_axi_gmem2:HBM[5:6] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:4] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[5:6] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[5:6] +# PY_CODE_GEN block_end + +#ACCL +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_ops:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 + +sp=ccl_offload_0.m_axi_0:HBM[5:6] +sp=ccl_offload_0.m_axi_1:HBM[5:6] + + + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + +[profile] +data=all:all:all +memory=all +stall=all:all +exec=all:all \ No newline at end of file diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini index e032e407..e419e22e 100644 --- a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.ddr.generator.ini @@ -9,9 +9,9 @@ nk=inner_update_mm0:$PY_CODE_GEN num_replications$ slr=lu_1:SLR0 slr=left_update_1:SLR0 slr=top_update_1:SLR0 -# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] -slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +1) % 3$ -# PY_CODE_GEN block_end +{% for i in range(num_replications) %} 
+slr=inner_update_mm0_{{ i+1 }}:SLR{{ (i+1) % 3 }} +{% endfor %} # matrix ports sp=lu_1.m_axi_gmem0:DDR[0] @@ -26,9 +26,9 @@ sp=left_update_1.m_axi_gmem0:DDR[0] sp=left_update_1.m_axi_gmem1:DDR[1] sp=left_update_1.m_axi_gmem2:DDR[1] -# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] -sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:DDR[0] -sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:DDR[1] -sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:DDR[0] -# PY_CODE_GEN block_end +{% for i in range(num_replications) %} +sp=inner_update_mm0_{{ i+1 }}.m_axi_gmem0:DDR[0] +sp=inner_update_mm0_{{ i+1 }}.m_axi_gmem1:DDR[1] +sp=inner_update_mm0_{{ i+1 }}.m_axi_gmem2:DDR[0] +{% endfor %} diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini new file mode 100644 index 00000000..2815cc38 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.distribute_kernels.hbm.generator.ini @@ -0,0 +1,34 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR1 +slr=top_update_1:SLR2 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN i % 3$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:5] +sp=lu_1.m_axi_gmem1:HBM[6] +sp=lu_1.m_axi_gmem2:HBM[7] + +sp=top_update_1.m_axi_gmem0:HBM[0:5] +sp=top_update_1.m_axi_gmem1:HBM[6] +sp=top_update_1.m_axi_gmem2:HBM[8] + +sp=left_update_1.m_axi_gmem0:HBM[0:5] +sp=left_update_1.m_axi_gmem1:HBM[7] +sp=left_update_1.m_axi_gmem2:HBM[9] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:5] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] +# PY_CODE_GEN block_end + diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini new file mode 100644 index 00000000..fe68d728 --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.generator.ini @@ -0,0 +1,34 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. 
MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +1) % 3$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:5] +sp=lu_1.m_axi_gmem1:HBM[6] +sp=lu_1.m_axi_gmem2:HBM[7] + +sp=top_update_1.m_axi_gmem0:HBM[0:5] +sp=top_update_1.m_axi_gmem1:HBM[8] +sp=top_update_1.m_axi_gmem2:HBM[6] + +sp=left_update_1.m_axi_gmem0:HBM[0:5] +sp=left_update_1.m_axi_gmem1:HBM[9] +sp=left_update_1.m_axi_gmem2:HBM[7] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:5] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] +# PY_CODE_GEN block_end + diff --git a/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini new file mode 100644 index 00000000..aeea6acf --- /dev/null +++ b/LINPACK/settings/settings.link.xilinx.hpl_torus_pcie.hbm.u280.profile.generator.ini @@ -0,0 +1,41 @@ +[connectivity] +nk=lu:1 +nk=left_update:1 +nk=top_update:1 +nk=inner_update_mm0:$PY_CODE_GEN num_replications$ + +# slrs +# all special kernels are on SLR0. MM kernels are put on all remaining SLRs using RR +slr=lu_1:SLR0 +slr=left_update_1:SLR0 +slr=top_update_1:SLR0 +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=inner_update_mm0_$PY_CODE_GEN i +1$:SLR$PY_CODE_GEN (i +1) % 3$ +# PY_CODE_GEN block_end + +# matrix ports +sp=lu_1.m_axi_gmem0:HBM[0:5] +sp=lu_1.m_axi_gmem1:HBM[6] +sp=lu_1.m_axi_gmem2:HBM[7] + +sp=top_update_1.m_axi_gmem0:HBM[0:5] +sp=top_update_1.m_axi_gmem1:HBM[8] +sp=top_update_1.m_axi_gmem2:HBM[6] + +sp=left_update_1.m_axi_gmem0:HBM[0:5] +sp=left_update_1.m_axi_gmem1:HBM[9] +sp=left_update_1.m_axi_gmem2:HBM[7] + +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem0:HBM[0:5] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem1:HBM[9] +sp=inner_update_mm0_$PY_CODE_GEN i +1$.m_axi_gmem2:HBM[8] +# PY_CODE_GEN block_end + +[profile] +stall=all:all:all +data=all:all:all +exec=all:all:all +trace_memory=HBM[16]:SLR0 +trace_memory=HBM[17]:SLR1 +trace_memory=HBM[18]:SLR2 diff --git a/LINPACK/src/common/parameters.h.in b/LINPACK/src/common/parameters.h.in index 4c036fb9..b571e89d 100644 --- a/LINPACK/src/common/parameters.h.in +++ b/LINPACK/src/common/parameters.h.in @@ -1,14 +1,11 @@ #ifndef SRC_COMMON_PARAMETERS_H_ #define SRC_COMMON_PARAMETERS_H_ +#include "base_parameters.h" + /** * Host specific parameters */ -#define VERSION "@PROJECT_VERSION@" -#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ -#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ -#define DEFAULT_DEVICE @DEFAULT_DEVICE@ -#define HOST_DATA_TYPE @HOST_DATA_TYPE@ #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@ #define DEFAULT_P_VALUE @DEFAULT_P_VALUE@ #cmakedefine _DP @@ -22,11 +19,9 @@ /** * Device specific parameters */ -#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ #define LOCAL_MEM_BLOCK_LOG @LOCAL_MEM_BLOCK_LOG@ #define REGISTER_BLOCK_LOG @REGISTER_BLOCK_LOG@ #define REGISTER_BLOCK_MM_LOG @REGISTER_BLOCK_MM_LOG@ -#define NUM_REPLICATIONS @NUM_REPLICATIONS@ #cmakedefine USE_SVM #cmakedefine 
DISTRIBUTED_VALIDATION @@ -34,15 +29,11 @@ /* Short description of the program */ -#define PROGRAM_DESCRIPTION "Implementation of the LINPACK benchmark"\ +#define PROGRAM_NAME "LINPACK" +#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\ " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" -/** -Output separator -*/ -#define HLINE "-------------------------------------------------------------\n" - #define LEFT_BLOCK (1 << 1) #define TOP_BLOCK (1 << 2) #define LU_BLOCK_OUT (1 << 3) diff --git a/LINPACK/src/device/CMakeLists.txt b/LINPACK/src/device/CMakeLists.txt index 7a28cc56..2e9431a5 100644 --- a/LINPACK/src/device/CMakeLists.txt +++ b/LINPACK/src/device/CMakeLists.txt @@ -10,7 +10,7 @@ if (INTELFPGAOPENCL_FOUND) endif() if (VITIS_FOUND) - generate_kernel_targets_xilinx(hpl_torus_PCIE) + generate_kernel_targets_xilinx(hpl_torus_PCIE hpl_torus_ACCL_buffers) add_test(NAME test_emulation_xilinx COMMAND Linpack_xilinx -f hpl_torus_PCIE_emulate.xclbin -m 2 -n 1 ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Linpack_xilinx ${TEST_HOST_FLAGS} -f hpl_torus_PCIE_emulate.xclbin -m 2 -n 1 diff --git a/LINPACK/src/device/hpl_torus_ACCL_buffers.cpp b/LINPACK/src/device/hpl_torus_ACCL_buffers.cpp new file mode 120000 index 00000000..a11753b1 --- /dev/null +++ b/LINPACK/src/device/hpl_torus_ACCL_buffers.cpp @@ -0,0 +1 @@ +hpl_torus_PCIE.cpp \ No newline at end of file diff --git a/LINPACK/src/device/hpl_torus_IEC.cl b/LINPACK/src/device/hpl_torus_IEC.cl index fc3d0257..7e8f57ea 100644 --- a/LINPACK/src/device/hpl_torus_IEC.cl +++ b/LINPACK/src/device/hpl_torus_IEC.cl @@ -839,7 +839,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a, } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** Update the inner blocks using the left and right column and rows @@ -847,7 +847,7 @@ Update the inner blocks using the left and right column and rows */ __attribute__((uses_global_work_offset(0))) __kernel -void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, +void inner_update_mm{{ i }}(__global DEVICE_DATA_TYPE* restrict a, __global DEVICE_DATA_TYPE* restrict left_global_buffer, __global DEVICE_DATA_TYPE* restrict top_global_buffer, const uint block_col, @@ -945,4 +945,4 @@ void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, } } -// PY_CODE_GEN block_end +{% endfor %} diff --git a/LINPACK/src/device/hpl_torus_PCIE.cl b/LINPACK/src/device/hpl_torus_PCIE.cl index 2b3d312d..2b86657d 100644 --- a/LINPACK/src/device/hpl_torus_PCIE.cl +++ b/LINPACK/src/device/hpl_torus_PCIE.cl @@ -708,7 +708,7 @@ void left_update(__global DEVICE_DATA_TYPE* restrict a, } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** Update the inner blocks using the left and right column and rows @@ -716,7 +716,7 @@ Update the inner blocks using the left and right column and rows */ __attribute__((uses_global_work_offset(0))) __kernel -void inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, +void inner_update_mm{{ i }}(__global DEVICE_DATA_TYPE* restrict a, __global DEVICE_DATA_TYPE* restrict left_global_buffer, __global DEVICE_DATA_TYPE* restrict top_global_buffer, const uint block_col, @@ -862,4 +862,4 @@ void 
inner_update_mm/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE* restrict a, } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/LINPACK/src/device/hpl_torus_PCIE.cpp b/LINPACK/src/device/hpl_torus_PCIE.cpp new file mode 100644 index 00000000..391ee48d --- /dev/null +++ b/LINPACK/src/device/hpl_torus_PCIE.cpp @@ -0,0 +1,799 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#include "parameters.h" + +const unsigned block_size = (1 << LOCAL_MEM_BLOCK_LOG); +const unsigned gemm_block = (1 << REGISTER_BLOCK_LOG); +const unsigned gemm_block_mm = (1 << REGISTER_BLOCK_MM_LOG); + +#ifdef KERNEL_lu +/** +Executes a single step of the LU factorization. + +This method takes a partially solved 8x8 matrix and calculates the next step of +the LU factorization The method needs 7 (gemm_block-1) calls to perform a single +LU factorization. This is done to reduce resource usage, since all upcomng calls +are anyway depending on the results of the previous call and there is no way to +pipeline multiple executions. + +A is the input block that might be partially computed +step is the current step and must be a value between 0 to gemm_block-2. After +step gemm_block-2, the block is factorized + */ +void lu_block(const DEVICE_DATA_TYPE A[gemm_block][gemm_block], const int step, + DEVICE_DATA_TYPE A_out[gemm_block][gemm_block]) { + + // Read current line from input + DEVICE_DATA_TYPE line[gemm_block]; + for (int i = 0; i < gemm_block; i++) { + line[i] = A[step][i]; + } + + // calculate the inverse of the diagonal element for the scaling + DEVICE_DATA_TYPE inv_scale_a = -1.0 / line[step]; + + // Scale the current row + for (int i = 0; i < gemm_block; i++) { + if (i > step) { + line[i] = line[i] * inv_scale_a; + } + } + line[step] = inv_scale_a; + + // Update all rows fully unrolled + // The multiply adds are fully independent + //__attribute__((opencl_unroll_hint(gemm_block))) + // Unrolling disabled for this loop to save resources + for (int j = 0; j < gemm_block; j++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE curr_scale = A[j][step]; + // Update a single row. If it is already updated, just write back the value, + // if it is the current row write back the value in "line", else update the + // value + if (j != step) { + for (int i = 0; i < gemm_block; i++) { + A_out[j][i] = + (i > step && j > step) ? 
A[j][i] + line[i] * curr_scale : A[j][i]; + } + } else { + for (int i = 0; i < gemm_block; i++) { + A_out[j][i] = line[i]; + } + } + } +} + +/** +This function can be used to update blocks with three different +operations. It will execute the update for a single row in the block. The update +is completed after gemm_block calls of this update function. + +operation_type: 0 for top = the top row of blocks will need a triangular MM + 1 for left = the left column of blocks will need a triangular MM, matrices have to be transposed + 2 for inner block = all inner blocks will be updated with a MM + */ +void update_block(const DEVICE_DATA_TYPE a[gemm_block][gemm_block], + const DEVICE_DATA_TYPE top[gemm_block], + const DEVICE_DATA_TYPE left_or_lu[gemm_block], + DEVICE_DATA_TYPE out[gemm_block][gemm_block], + const int current_row, const int operation_type) { + + // Define the different operation types of the function + const int op_top = 0; + const int op_left = 1; + const int op_inner = 2; + + // Transpose the input matrices if the target is a left block + DEVICE_DATA_TYPE current_block[gemm_block][gemm_block]; + if (operation_type == op_left) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + current_block[ii][jj] = a[jj][ii]; + } + } + } else { + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + current_block[ii][jj] = a[ii][jj]; + } + } + } + + // Generate the first scaling array depending on the operation type + DEVICE_DATA_TYPE scale_row[gemm_block]; + if (operation_type == op_inner) { + for (int jj = 0; jj < gemm_block; jj++) { + scale_row[jj] = top[jj]; + } + } else { + for (int jj = 0; jj < gemm_block; jj++) { + scale_row[jj] = current_block[current_row][jj]; + } + } + if (operation_type == op_top) { + for (int jj = 0; jj < gemm_block; jj++) { + scale_row[jj] *= left_or_lu[current_row]; + } + } + + DEVICE_DATA_TYPE tmp[gemm_block][gemm_block]; + // scale all values with the precalculated scaling array and the second input + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + // left_or_lu_block are stored transposed to simplify the data access here + tmp[ii][jj] = current_block[ii][jj] + scale_row[jj] * left_or_lu[ii]; + } + } + + // overwrite results that were calculated although they are not needed for the + // triangular operations left and top + if (operation_type != op_inner) { + for (int ii = 0; ii < gemm_block; ii++) { + if (ii == current_row) { + for (int jj = 0; jj < gemm_block; jj++) { + tmp[ii][jj] = scale_row[jj]; + } + } else if (ii < current_row) { + for (int jj = 0; jj < gemm_block; jj++) { + tmp[ii][jj] = current_block[ii][jj]; + } + } + } + } + + // write result back and transpose if necessary + if (operation_type == op_left) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + out[ii][jj] = tmp[jj][ii]; + } + } + } else { + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + out[ii][jj] = tmp[ii][jj]; + } + } + } +} + +#endif + +extern "C" { + +#ifdef KERNEL_lu +void lu(DEVICE_DATA_TYPE *a, DEVICE_DATA_TYPE *a_block_trans, + DEVICE_DATA_TYPE *a_block, const unsigned int block_col, const unsigned int block_row, + const unsigned int blocks_per_row) { + + DEVICE_DATA_TYPE a_buffer[block_size / gemm_block][block_size / gemm_block] + [gemm_block][gemm_block]; + + // Store current row and column in separate buffers for + // easier access in the deep pipeline + // need to be declared as local 
to prevent the compiler from + DEVICE_DATA_TYPE top_buffer[block_size / gemm_block][gemm_block]; + DEVICE_DATA_TYPE left_buffer[block_size / gemm_block][gemm_block]; + + // Load block to local memory +load_a_block: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { +#pragma HLS PIPELINE + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj]; + } + } + } + } + + // For each row in the matrix update whole matrix. + // The iterations depend on each other, so loop pipelining is disabled here +loop_diag: + for (int gk = 0; gk < block_size; gk++) { + + int k = gk / gemm_block; + int kk = gk & (gemm_block - 1); + + // Read in current LU block + DEVICE_DATA_TYPE lu_a_buffer_in[gemm_block][gemm_block]; +load_a_sb: + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + lu_a_buffer_in[ii][jj] = a_buffer[k][k][ii][jj]; + } + } + + DEVICE_DATA_TYPE lu_a_buffer_out[gemm_block][gemm_block]; + DEVICE_DATA_TYPE lu_a_buffer_out_row[gemm_block]; + DEVICE_DATA_TYPE lu_a_buffer_out_col[gemm_block]; + // Calculate next row and column of LU factorization and store in local + // memory buffer + lu_block(lu_a_buffer_in, kk, lu_a_buffer_out); +write_lu_sb: + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[k][k][ii][jj] = lu_a_buffer_out[ii][jj]; + } + } +write_lu_row: + for (int jj = 0; jj < gemm_block; jj++) { + lu_a_buffer_out_row[jj] = lu_a_buffer_out[kk][jj]; + } +write_lu_col: + for (int jj = 0; jj < gemm_block; jj++) { + lu_a_buffer_out_col[jj] = lu_a_buffer_out[jj][kk]; + } + + // The update pipeline does not need to be executed for the last + // row of blocks + if (gk < block_size - gemm_block) { + +update_inner: + // update all left blocks + for (int tj = 1; tj < block_size / gemm_block; tj++) { +#pragma HLS PIPELINE II=1 + + int j = k; + int i = tj; + + if (i > k) { + // copy the correct block in the second input buffer + // this depends on the operations that has to be executed + DEVICE_DATA_TYPE second_input[gemm_block]; + + // left matrix block will be calculated + for (int jj = 0; jj < gemm_block; jj++) { + second_input[jj] = lu_a_buffer_out_row[jj]; + } + DEVICE_DATA_TYPE a_input[gemm_block][gemm_block]; + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_input[ii][jj] = a_buffer[i][j][ii][jj]; + } + } + DEVICE_DATA_TYPE top_input[gemm_block]; + DEVICE_DATA_TYPE out[gemm_block][gemm_block]; + update_block(a_input, top_input, second_input, out, kk, 1); + + for (int ii = 0; ii < gemm_block; ii++) { + left_buffer[i][ii] = out[ii][kk]; + } + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = out[ii][jj]; + } + } + } + } + + // Update all other blocks with the new calculated row and column + // First update top blocks, then update left blocks, then all inner blocks + // ti == 0: top blocks + // ti == 1: left blocks + // ti > 1: inner blocks +update_inner_2: + for (int ti = 0; ti < block_size / gemm_block - k; ti++) { + for (int tj = 1; tj < block_size / gemm_block; tj++) { +#pragma HLS PIPELINE II=1 + + int j = tj; + int i = ti + k; + // always execute the pipeline for whole rows of matrix blocks. + // Only execute update for blocks that are required. 
+ // This helps to keep constant latencies between data dependencies of + // the pipeline stages + if ((i > k || ti == 0) && j > k) { + + // copy the correct block in the second input buffer + // this depends on the operations that has to be executed + DEVICE_DATA_TYPE second_input[gemm_block]; + if (ti == 0) { + // top matrix block will be calculated + for (int jj = 0; jj < gemm_block; jj++) { + second_input[jj] = lu_a_buffer_out_col[jj]; + } + } else { + // inner block will be calculated + for (int jj = 0; jj < gemm_block; jj++) { + second_input[jj] = left_buffer[i][jj]; + } + } + DEVICE_DATA_TYPE a_input[gemm_block][gemm_block]; + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_input[ii][jj] = a_buffer[i][j][ii][jj]; + } + } + DEVICE_DATA_TYPE top_input[gemm_block]; + for (int jj = 0; jj < gemm_block; jj++) { + top_input[jj] = top_buffer[j][jj]; + } + DEVICE_DATA_TYPE out[gemm_block][gemm_block]; + update_block(a_input, top_input, second_input, out, kk, + (ti == 0) ? 0 : 2); + if (ti == 0) { + // only update in the first row + for (int jj = 0; jj < gemm_block; jj++) { + top_buffer[j][jj] = out[kk][jj]; + } + } + for (int ii = 0; ii < gemm_block; ii++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = out[ii][jj]; + } + } + } + } + } + } + } + + // Store block to global memory +store_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } + // Store current block in global memory also transposed to allow easier access + // from the top kernel + store_a_bt: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_block_trans[(i * gemm_block + ii) * block_size + j * gemm_block + + jj] = a_buffer[j][i][jj][ii]; + } + } + } + } + +store_a_b: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_block[(i * gemm_block + ii) * block_size + j * gemm_block + jj] = + a_buffer[i][j][ii][jj]; + } + } + } + } +} +#endif + +#ifdef KERNEL_top_update +/** +Update the blocks to the right of the current LU block + + */ +void top_update(DEVICE_DATA_TYPE *a, DEVICE_DATA_TYPE *top_block, + const DEVICE_DATA_TYPE *lu_global_buffer_transposed, + const unsigned int is_first_block, const unsigned int block_col, + const unsigned int block_row, const unsigned int blocks_per_row) { + + // Store current block in local memory + DEVICE_DATA_TYPE + a_buffer[block_size / gemm_block][block_size / gemm_block][gemm_block] + [gemm_block]; + + // Load block to local memory +load_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj]; + } + } + } + } + +// For each row in the matrix update whole matrix. 
+// The iterations depend on each other, so loop pipelining is disabled here +diag_exe: + for (int gk = 0; gk < block_size; gk++) { + + int k = gk / gemm_block; + int kk = gk & (gemm_block - 1); + + DEVICE_DATA_TYPE current_lu_col[block_size / gemm_block][gemm_block]; + DEVICE_DATA_TYPE current_row[block_size / gemm_block][gemm_block]; + DEVICE_DATA_TYPE current_scale; + +scale_row: + for (int col = 0; col < block_size / gemm_block; col++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE col_in[gemm_block]; +#pragma HLS array_partition variable=col_in type=complete dim=0 + DEVICE_DATA_TYPE scale_chunk[gemm_block]; +#pragma HLS array_partition variable=scale_chunk type=complete dim=0 + + // get current row chunk + for (int i = 0; i < gemm_block; i++) { + scale_chunk[i] = a_buffer[k][col][kk][i]; + } + + // if current column data is still available, read it in and store it in + // the buffer + if (col < block_size / gemm_block - k) { + // Load LU data from global memory instead of receiving it from the + // channel + for (int i = 0; i < gemm_block; i++) { + col_in[i] = + lu_global_buffer_transposed[gk * block_size + + (col + k) * gemm_block + i]; + } + if (col == 0) { + current_scale = col_in[kk]; + } + for (int i = 0; i < gemm_block; i++) { + current_lu_col[col][i] = (col > 0 || i > kk) ? col_in[i] : 0.f; + } + } + + // scale current row chunk with the row's scale factor loaded above + for (int i = 0; i < gemm_block; i++) { + scale_chunk[i] = scale_chunk[i] * current_scale; + } + + for (int i = 0; i < gemm_block; i++) { + current_row[col][i] = scale_chunk[i]; + } + + // Update local memory buffer with chunk + for (int i = 0; i < gemm_block; i++) { + a_buffer[k][col][kk][i] = scale_chunk[i]; + } + } + +// Update all remaining rows +update_rows: + for (int row = k; row < block_size / gemm_block; row++) { +#pragma HLS loop_tripcount min=0 max=block_size/gemm_block avg=block_size/gemm_block/2 + // Update whole rows! 
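+ // Each sub-block in the row is updated with the outer product of the row's LU column chunk and the corresponding scaled row chunk computed above.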
+ for (int curr_col = 0; curr_col < block_size / gemm_block; curr_col++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE colbuf[gemm_block]; + for (int j = 0; j < gemm_block; j++) { + colbuf[j] = current_lu_col[row - k][j]; + } + for (int i = 0; i < gemm_block; i++) { + for (int j = 0; j < gemm_block; j++) { + a_buffer[row][curr_col][i][j] += + colbuf[i] * current_row[curr_col][j]; + } + } + } + } + } + +// Store block to global memory +store_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } +// Store current block separately for easier transmission over host +store_top: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + top_block[(i * gemm_block + ii) * block_size + j * gemm_block + jj] = + a_buffer[i][j][ii][jj]; + } + } + } + } +} +#endif + +#ifdef KERNEL_left_update +/** +Update the blocks below the current LU block + + */ +void left_update(DEVICE_DATA_TYPE * a, + DEVICE_DATA_TYPE * left_block, + const DEVICE_DATA_TYPE * lu_global_buffer, + const unsigned int is_first_block, const unsigned int block_col, + const unsigned int block_row, const unsigned int blocks_per_row) { + + // Store current block in local memory + DEVICE_DATA_TYPE + a_buffer[block_size / gemm_block][block_size / gemm_block][gemm_block] + [gemm_block]; + + // Load block to local memory +load_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a_buffer[i][j][ii][jj] = + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj]; + } + } + } + } + + // For each row in the matrix update whole matrix. + // The iterations depend on each other, so loop pipelining is disabled here +diag: + for (int gk = 0; gk < block_size; gk++) { + + int k = gk / gemm_block; + int kk = gk & (gemm_block - 1); + + DEVICE_DATA_TYPE current_lu_row[block_size / gemm_block][gemm_block]; + DEVICE_DATA_TYPE current_col[block_size / gemm_block][gemm_block]; + +first_col: + for (int col = 0; col < block_size / gemm_block; col++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE chunk[gemm_block]; + // get current row chunk + for (int i = 0; i < gemm_block; i++) { + chunk[i] = a_buffer[col][k][i][kk]; + } + + // Store chunk for later update + for (int i = 0; i < gemm_block; i++) { + current_col[col][i] = chunk[i]; + } + + DEVICE_DATA_TYPE row_in[gemm_block]; + + // if current column data is still available read it in and store it in + // buffer + if (col < block_size / gemm_block - k) { + // Load LU data from global memory + for (int i = 0; i < gemm_block; i++) { + row_in[i] = + lu_global_buffer[gk * block_size + (col + k) * gemm_block + i]; + } + for (int i = 0; i < gemm_block; i++) { + current_lu_row[col][i] = (col > 0 || i > kk) ? 
row_in[i] : 0.f; + } + } + } + + // Update all rows + // Update only remaining row chunks +update: + for (int curr_col = 0; curr_col < block_size / gemm_block - k; curr_col++) { +#pragma HLS loop_tripcount min=0 max=block_size/gemm_block avg=block_size/gemm_block/2 + for (int row = 0; row < block_size / gemm_block; row++) { +#pragma HLS PIPELINE II=1 + DEVICE_DATA_TYPE colbuf[gemm_block]; + for (int j = 0; j < gemm_block; j++) { + colbuf[j] = current_col[row][j]; + } + for (int i = 0; i < gemm_block; i++) { + for (int j = 0; j < gemm_block; j++) { + a_buffer[row][curr_col + k][i][j] += + current_lu_row[curr_col][j] * colbuf[i]; + } + } + } + } + } + + // Store block to global memory +store_a: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + a[block_col * block_size + + (block_row * block_size + i * gemm_block + ii) * block_size * + blocks_per_row + + j * gemm_block + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } + + // Store current block separately for easier transmission over host +store_left: + for (int i = 0; i < block_size / gemm_block; i++) { + for (int ii = 0; ii < gemm_block; ii++) { + for (int j = 0; j < block_size / gemm_block; j++) { + for (int jj = 0; jj < gemm_block; jj++) { + left_block[(i * gemm_block + ii) * block_size + j * gemm_block + jj] = + a_buffer[j][i][jj][ii]; + } + } + } + } +} +#endif + +#ifdef KERNEL_inner_update_mm0 +/** +Update the inner blocks using the left and right column and rows + + */ +void inner_update_mm0( + DEVICE_DATA_TYPE *a, const DEVICE_DATA_TYPE *left_global_buffer, + const DEVICE_DATA_TYPE *top_global_buffer, const unsigned int block_col, + const unsigned int block_row, const unsigned int blocks_per_row) { + + // Store current block in local memory + DEVICE_DATA_TYPE a_buffer[block_size / gemm_block_mm] + [block_size / gemm_block_mm][gemm_block_mm] + [gemm_block_mm]; + DEVICE_DATA_TYPE top_buffer[block_size / gemm_block_mm] + [block_size / gemm_block_mm][gemm_block_mm] + [gemm_block_mm]; + DEVICE_DATA_TYPE left_buffer[block_size / gemm_block_mm] + [block_size / gemm_block_mm][gemm_block_mm] + [gemm_block_mm]; + + // If Xilinx FPGA, load blocks in separate pipelines to achieve memory bursts! 
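+ // Keeping these copy loops separate and simple helps the compiler infer wide memory bursts for each buffer.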
+ // Load blocks to local memory +load_a_block: + for (int i = 0; i < block_size / gemm_block_mm; i++) { + for (int ii = 0; ii < gemm_block_mm; ii++) { + for (int j = 0; j < block_size / gemm_block_mm; j++) { +#pragma HLS PIPELINE II=1 + for (int jj = 0; jj < gemm_block_mm; jj++) { + a_buffer[i][j][ii][jj] = + a[block_col * block_size + + (block_row * block_size + i * gemm_block_mm + ii) * block_size * + blocks_per_row + + j * gemm_block_mm + jj]; + } + } + } + } + +load_top_block: + for (int i = 0; i < block_size / gemm_block_mm; i++) { + for (int ii = 0; ii < gemm_block_mm; ii++) { + for (int j = 0; j < block_size / gemm_block_mm; j++) { +#pragma HLS PIPELINE II=1 + for (int jj = 0; jj < gemm_block_mm; jj++) { + top_buffer[i][j][ii][jj] = + top_global_buffer[(i * gemm_block_mm + ii) * block_size + + j * gemm_block_mm + jj]; + } + } + } + } + +load_left_block: + for (int i = 0; i < block_size / gemm_block_mm; i++) { + for (int ii = 0; ii < gemm_block_mm; ii++) { + for (int j = 0; j < block_size / gemm_block_mm; j++) { +#pragma HLS PIPELINE II=1 + for (int jj = 0; jj < gemm_block_mm; jj++) { + left_buffer[i][j][ii][jj] = + left_global_buffer[(i * gemm_block_mm + ii) * block_size + + j * gemm_block_mm + jj]; + } + } + } + } + + // Update whole block +calc_subblocks: + for (int c = 0; + c < (block_size / gemm_block_mm) * (block_size / gemm_block_mm) * + (block_size / gemm_block_mm); + c++) { +#pragma HLS PIPELINE II=1 + + int mcol = + c / ((block_size / gemm_block_mm) * (block_size / gemm_block_mm)); + int row = + (c / (block_size / gemm_block_mm)) % (block_size / gemm_block_mm); + int curr_col = c & ((block_size / gemm_block_mm) - 1); + + DEVICE_DATA_TYPE top_sub[gemm_block_mm][gemm_block_mm]; + DEVICE_DATA_TYPE left_sub[gemm_block_mm][gemm_block_mm]; + +load_top_sb: + for (int i = 0; i < gemm_block_mm; i++) { + for (int j = 0; j < gemm_block_mm; j++) { + top_sub[i][j] = top_buffer[mcol][curr_col][i][j]; + } + } + +load_left_sb: + for (int i = 0; i < gemm_block_mm; i++) { + for (int j = 0; j < gemm_block_mm; j++) { + left_sub[i][j] = left_buffer[mcol][row][i][j]; + } + } + + DEVICE_DATA_TYPE result_sub[gemm_block_mm][gemm_block_mm]; +mmul: + for (int i = 0; i < gemm_block_mm; i++) { + for (int j = 0; j < gemm_block_mm; j++) { + // Calculate sum of whole column and only write it back once + DEVICE_DATA_TYPE sum = 0.0; + for (int k = 0; k < gemm_block_mm; k++) { + sum += left_sub[k][i] * top_sub[k][j]; + } + result_sub[i][j] = sum; + } + } + +add_sb: + for (int i = 0; i < gemm_block_mm; i++) { + for (int j = 0; j < gemm_block_mm; j++) { + a_buffer[row][curr_col][i][j] += result_sub[i][j]; + } + } + } + + // Store block to global memory +store_result: + for (int i = 0; i < block_size / gemm_block_mm; i++) { + for (int ii = 0; ii < gemm_block_mm; ii++) { + for (int j = 0; j < block_size / gemm_block_mm; j++) { + for (int jj = 0; jj < gemm_block_mm; jj++) { + a[block_col * block_size + + (block_row * block_size + i * gemm_block_mm + ii) * block_size * + blocks_per_row + + j * gemm_block_mm + jj] = a_buffer[i][j][ii][jj]; + } + } + } + } +} + +#endif +} diff --git a/LINPACK/src/host/CMakeLists.txt b/LINPACK/src/host/CMakeLists.txt index d8feb95d..72abdf1c 100755 --- a/LINPACK/src/host/CMakeLists.txt +++ b/LINPACK/src/host/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) -set(HOST_SOURCE linpack_benchmark.cpp gmres.c blas.c) +set(HOST_SOURCE linpack_data.cpp gmres.c blas.c) set(HOST_EXE_NAME Linpack) set(LIB_NAME lp) @@ -17,11 +17,17 @@ if 
(INTELFPGAOPENCL_FOUND) target_compile_definitions(${LIB_NAME}_intel PRIVATE -DCL_VERSION_2_0) endif() target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA) + target_compile_definitions(${HOST_EXE_NAME}_intel PRIVATE -DINTEL_FPGA) target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_intel_host_executable COMMAND $ -h) endif() if (Vitis_FOUND) + if (USE_ACCL) + set(CMAKE_SKIP_BUILD_RPATH No) + set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) + list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) + endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) @@ -30,6 +36,7 @@ if (Vitis_FOUND) target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_xilinx_host_executable COMMAND $ -h) endif() diff --git a/LINPACK/src/host/execution_types/execution_accl_buffers.hpp b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp new file mode 100644 index 00000000..f2eed96a --- /dev/null +++ b/LINPACK/src/host/execution_types/execution_accl_buffers.hpp @@ -0,0 +1,526 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef EXECUTION_TYPES_EXECUTION_ACCL_BUFFERS_HPP +#define EXECUTION_TYPES_EXECUTION_ACCL_BUFFERS_HPP + +/* C++ standard library headers */ +#include +#include +#include +#include +#include +#include + +/* External library headers */ +#ifdef _OPENMP +#include "omp.h" +#endif + +#include "linpack_data.hpp" +#include "parameters.h" + +namespace linpack { +namespace execution { +namespace accl_buffers { + +/* + Prepare kernels and execute benchmark + + @copydoc bm_execution::calculate() +*/ +std::unique_ptr inline calculate( + const hpcc_base::ExecutionSettings &config, + linpack::LinpackData &data) { + + cl_int err; + + int num_omp_threads = 1; +#ifdef _OPENMP + num_omp_threads = omp_get_num_threads(); +#endif + + uint blocks_per_row = data.matrix_width / config.programSettings->blockSize; + uint blocks_per_col = data.matrix_height / config.programSettings->blockSize; + + // Communicate with all ranks in the same row of the torus + // Configure ACCL Communicators + + // Get group of global communicator + std::vector all_accl_ranks = + config.context->accl->get_comm_group(ACCL::GLOBAL_COMM); + + std::vector row_ranks; + std::vector col_ranks; + + // Create sub-groups for rows and columns + for (int i = config.programSettings->torus_width * + config.programSettings->torus_row; + i < config.programSettings->torus_width * + (config.programSettings->torus_row + 1); + i++) { + row_ranks.push_back(all_accl_ranks[i]); + } + for (int i = config.programSettings->torus_col; i < all_accl_ranks.size(); + i += config.programSettings->torus_width) { + col_ranks.push_back(all_accl_ranks[i]); + } + + // Create communicators from sub-groups + ACCL::communicatorId row_comm = config.context->accl->create_communicator( + row_ranks, config.programSettings->torus_col); + ACCL::communicatorId col_comm = config.context->accl->create_communicator( + col_ranks, config.programSettings->torus_row); + + // Create global memory buffers + auto lu_tmp_kernel = xrt::kernel(*config.device, *config.program, "lu"); + xrt::bo Buffer_a(*config.device, data.A, + sizeof(HOST_DATA_TYPE) * data.matrix_height * + data.matrix_width, + lu_tmp_kernel.group_id(0)); + xrt::bo Buffer_b(*config.device, data.b, + sizeof(HOST_DATA_TYPE) * data.matrix_width, + lu_tmp_kernel.group_id(0)); + xrt::bo Buffer_pivot(*config.device, data.ipvt, + sizeof(cl_int) * data.matrix_height, + lu_tmp_kernel.group_id(0)); + + // TODO: To make this code work with the ACCL simulator, we need to create + // buffers using bos. This vector is used to store these bos during execution. + // They will be accessed via the ACCL buffers are not required in the code + // itself. Fixing the simulator code of ACCL to always create a bo would fix + // this issue. 
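+ // The bos stored in this vector are only kept alive for the duration of the benchmark; all accesses go through the ACCL buffers created from them.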
+ std::vector tmp_bos; + + /* --- Setup MPI communication and required additional buffers --- */ + + // Buffers only used to store data received over the network layer + // The content will not be modified by the host + tmp_bos.emplace_back(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(1)); + auto Buffer_lu1 = config.context->accl->create_buffer( + tmp_bos.back(), + (config.programSettings->blockSize) * (config.programSettings->blockSize), + ACCL::dataType::float32); + tmp_bos.emplace_back(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(2)); + auto Buffer_lu2 = config.context->accl->create_buffer( + tmp_bos.back(), + (config.programSettings->blockSize) * (config.programSettings->blockSize), + ACCL::dataType::float32); + Buffer_lu1->sync_to_device(); + Buffer_lu2->sync_to_device(); + + std::vector>> Buffer_left_list; + std::vector>> Buffer_top_list; + + // Create two sets of communication buffers to allow overlap of communication + // and matrix multiplications + for (int rep = 0; rep < 2; rep++) { + Buffer_left_list.emplace_back(); + Buffer_top_list.emplace_back(); + for (int i = 0; i < blocks_per_row; i++) { + tmp_bos.emplace_back(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(1)); + Buffer_top_list.back().push_back( + config.context->accl->create_buffer( + tmp_bos.back(), + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + ACCL::dataType::float32)); + Buffer_top_list.back().back()->sync_to_device(); + } + + for (int i = 0; i < blocks_per_col; i++) { + tmp_bos.emplace_back(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + lu_tmp_kernel.group_id(2)); + Buffer_left_list.back().push_back( + config.context->accl->create_buffer( + tmp_bos.back(), + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + ACCL::dataType::float32)); + Buffer_left_list.back().back()->sync_to_device(); + } + } + + /* --- Execute actual benchmark kernels --- */ + + double t; + std::vector gefaExecutionTimes; + std::vector geslExecutionTimes; + std::vector gefaWaitTimes; + for (int i = 0; i < config.programSettings->numRepetitions; i++) { + + Buffer_a.sync(XCL_BO_SYNC_BO_TO_DEVICE); + Buffer_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // Command queues + // A new command queue is created for every iteration of the algorithm to + // reduce the overhead of too large queues + std::vector inner_mms; + std::thread flush_thread; + + std::chrono::time_point t1, t2, twait1, + twait2; + std::chrono::duration currentwaittime = + std::chrono::duration::zero(); + + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << "Start! 
" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + t1 = std::chrono::high_resolution_clock::now(); + + int kernel_offset = 0; +#pragma omp parallel + { + +#pragma omp single + uint current_replication = 0; + + // For every row of blocks create kernels and enqueue them + for (int block_row = 0; block_row < config.programSettings->matrixSize / + config.programSettings->blockSize; + block_row++) { + + int local_block_row_remainder = + (block_row % config.programSettings->torus_height); + int local_block_row = + (block_row / config.programSettings->torus_height); + int local_block_col_remainder = + (block_row % config.programSettings->torus_width); + int local_block_col = (block_row / config.programSettings->torus_width); + bool in_same_row_as_lu = + local_block_row_remainder == config.programSettings->torus_row; + bool in_same_col_as_lu = + local_block_col_remainder == config.programSettings->torus_col; + int start_row_index = + local_block_row + + ((local_block_row_remainder >= config.programSettings->torus_row) + ? 1 + : 0); + int start_col_index = + local_block_col + + ((local_block_col_remainder >= config.programSettings->torus_col) + ? 1 + : 0); + int num_left_blocks = + (in_same_col_as_lu) ? blocks_per_col - start_row_index : 0; + int num_top_blocks = + (in_same_row_as_lu) ? blocks_per_row - start_col_index : 0; + int num_inner_block_rows = (blocks_per_col - start_row_index); + int num_inner_block_cols = + (num_inner_block_rows > 0) ? (blocks_per_row - start_col_index) : 0; + num_inner_block_rows = + (num_inner_block_cols > 0) ? num_inner_block_rows : 0; + bool is_calulating_lu_block = (in_same_col_as_lu && in_same_row_as_lu); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col + << " Start iteration " << block_row << std::endl; +#endif + + uint total_inner_updates_first_row = num_inner_block_cols; + uint updates_per_replication = + total_inner_updates_first_row / + config.programSettings->kernelReplications; + uint total_inner_updates = + (num_inner_block_cols - 1) * (num_inner_block_rows - 1); + uint total_updates_per_replication = + total_inner_updates / config.programSettings->kernelReplications; + uint current_update = 0; + + std::vector comm_kernel_runs; + +#pragma omp single + { + + if (is_calulating_lu_block) { + // create the LU kernel + auto lu_kernel = xrt::kernel(*config.device, *config.program, "lu"); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " LU " + << local_block_row << "," << local_block_col << std::endl; +#endif + auto lu_run = + lu_kernel(Buffer_a, *Buffer_lu1->bo(), *Buffer_lu2->bo(), + local_block_col, local_block_row, blocks_per_row); + ert_cmd_state state = lu_run.wait(); + if (state != ERT_CMD_STATE_COMPLETED) { + std::cerr << "Execution Lu failed: " << state << std::endl; + } + } + + // Exchange LU blocks on all ranks to prevent stalls in MPI broadcast + // All tasks until now need to be executed so we can use the result of + // the LU factorization and communicate it via MPI with the other + // FPGAs + + // Broadcast LU block in column to update all left blocks + config.context->accl->bcast(*Buffer_lu2, + config.programSettings->blockSize * + config.programSettings->blockSize, + local_block_row_remainder, col_comm, true, true); + // Broadcast LU block in row to update all top blocks + config.context->accl->bcast(*Buffer_lu1, + config.programSettings->blockSize * + config.programSettings->blockSize, + 
local_block_col_remainder, row_comm, true, true); + } + if (num_top_blocks > 0) { + +// Create top kernels +#pragma omp for + for (int tops = start_col_index; tops < blocks_per_row; tops++) { + xrt::kernel k(*config.device, *config.program, "top_update"); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Top " + << local_block_row << "," << tops << std::endl; +#endif + + comm_kernel_runs.push_back( + k(Buffer_a, + *Buffer_top_list[block_row % 2][tops - start_col_index]->bo(), + *Buffer_lu1->bo(), (tops == start_col_index), tops, + local_block_row, blocks_per_row)); + } + } + if (num_left_blocks > 0) { + +// Create left kernels +#pragma omp for + for (int tops = start_row_index; tops < blocks_per_col; tops++) { + xrt::kernel k(*config.device, *config.program, "left_update"); +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Left " << tops + << "," << local_block_col << std::endl; +#endif + comm_kernel_runs.push_back(k( + Buffer_a, + *Buffer_left_list[block_row % 2][tops - start_row_index]->bo(), + *Buffer_lu2->bo(), (tops == start_row_index), local_block_col, + tops, blocks_per_row)); + } + } + +#pragma omp single + { + // Wait until all top and left blocks are calculated + for (auto &run : comm_kernel_runs) { + run.wait(); + } + + // Send the left and top blocks to all other ranks so they can be used + // to update all inner blocks + for (int lbi = 0; + lbi < + std::max(static_cast(blocks_per_col - local_block_col), 0); + lbi++) { + config.context->accl->bcast(*Buffer_left_list[block_row % 2][lbi], + config.programSettings->blockSize * + config.programSettings->blockSize, + local_block_col_remainder, row_comm, true, true); + } + for (int tbi = 0; + tbi < + std::max(static_cast(blocks_per_row - local_block_row), 0); + tbi++) { + config.context->accl->bcast(*Buffer_top_list[block_row % 2][tbi], + config.programSettings->blockSize * + config.programSettings->blockSize, + local_block_row_remainder, col_comm, true, true); + } + // update all remaining inner blocks using only global memory + } + + std::vector outer_mms; + + // Wait for previous inner MMs to complete. + // They may need to be reused by the next outer MM calls! 
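+ // inner_mms still holds the runs scheduled in the previous block_row iteration (it is empty on the first iteration).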
+ for (auto &run : inner_mms) { + run.wait(); + } + +#pragma omp for + for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { + + // select the matrix multiplication kernel that should be used for + // this block updated + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int current_block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols); + int current_block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows + lbi); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM col " + << current_block_row << "," << current_block_col + << std::endl; +#endif + + outer_mms.push_back( + k(Buffer_a, *Buffer_left_list[block_row % 2][lbi]->bo(), + *Buffer_top_list[block_row % 2][0]->bo(), current_block_col, + current_block_row, blocks_per_row)); + } + +#pragma omp for + for (int tbi = 0; tbi < num_inner_block_cols; tbi++) { + + // select the matrix multiplication kernel that should be used for + // this block updated + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int current_block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols + tbi); + int current_block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM row " + << current_block_row << "," << current_block_col + << std::endl; +#endif + + outer_mms.push_back( + k(Buffer_a, *Buffer_left_list[block_row % 2][0]->bo(), + *Buffer_top_list[block_row % 2][tbi]->bo(), current_block_col, + current_block_row, blocks_per_row)); + } + + // Clear inner MM runs vector for this iteration + // All runs have completed before scheduling the outer MMs + inner_mms.clear(); + +#pragma omp for collapse(2) schedule(static) + for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { + for (int tbi = 1; tbi < num_inner_block_cols; tbi++) { + // select the matrix multiplication kernel that should be used for + // this block updated + + xrt::kernel k(*config.device, *config.program, "inner_update_mm0"); + + int current_block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols + tbi); + int current_block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows + lbi); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM " + << current_block_row << "," << current_block_col + << std::endl; +#endif + + inner_mms.push_back( + k(Buffer_a, *Buffer_left_list[block_row % 2][lbi]->bo(), + *Buffer_top_list[block_row % 2][tbi]->bo(), current_block_col, + current_block_row, blocks_per_row)); + } + } + + // Wait for all outer MMs to complete because the results are required + // by the next communication phase + for (auto &run : outer_mms) { + run.wait(); + } + +#ifndef NDEBUG + MPI_Barrier(MPI_COMM_WORLD); + if (is_calulating_lu_block) + std::cout << "---------------" << std::endl; +#endif + } + } + + t2 = std::chrono::high_resolution_clock::now(); + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << "End! 
" << std::endl; + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col + << "Wait time: " << currentwaittime.count() << "s" << std::endl; + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Exit " << i + << std::endl; +#endif + + std::chrono::duration timespan = + std::chrono::duration_cast>(t2 - t1); + gefaExecutionTimes.push_back(timespan.count()); + + // Execute GESL + t1 = std::chrono::high_resolution_clock::now(); + t2 = std::chrono::high_resolution_clock::now(); + timespan = + std::chrono::duration_cast>(t2 - t1); + geslExecutionTimes.push_back(timespan.count()); + } + + /* --- Read back results from Device --- */ + + Buffer_a.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + if (!config.programSettings->isDiagonallyDominant) { + Buffer_pivot.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } + + std::unique_ptr results( + new linpack::LinpackExecutionTimings{gefaExecutionTimes, + geslExecutionTimes}); + + MPI_Barrier(MPI_COMM_WORLD); + + return results; +} + +} // namespace accl_buffers +} // namespace execution +} // namespace linpack + +#endif diff --git a/LINPACK/src/host/execution_types/execution_iec.hpp b/LINPACK/src/host/execution_types/execution_iec.hpp index b98bcc31..8168b6c1 100644 --- a/LINPACK/src/host/execution_types/execution_iec.hpp +++ b/LINPACK/src/host/execution_types/execution_iec.hpp @@ -35,7 +35,7 @@ SOFTWARE. #endif #include "parameters.h" -#include "linpack_benchmark.hpp" +#include "linpack_data.hpp" namespace linpack { namespace execution { @@ -44,9 +44,9 @@ namespace iec { /* Prepare kernels and execute benchmark for a bitstream that makes use of intel external channels */ -std::unique_ptr -calculate(const hpcc_base::ExecutionSettings&config, - linpack::LinpackData& data) { +std::map> inline +calculate(const hpcc_base::ExecutionSettings&config, + linpack::LinpackData& data) { int err; @@ -722,17 +722,18 @@ calculate(const hpcc_base::ExecutionSettings&co } buffer_queue.finish(); #endif + + std::map> timings; - std::unique_ptr results( - new linpack::LinpackExecutionTimings{gefaExecutionTimes, geslExecutionTimes}); + timings["gefa"] = gefaExecutionTimes; + timings["gesl"] = geslExecutionTimes; MPI_Barrier(MPI_COMM_WORLD); - - return results; + return timings; } } // namespace iec } // namespace execution } // namespace linpack -#endif \ No newline at end of file +#endif diff --git a/LINPACK/src/host/execution_types/execution_pcie.hpp b/LINPACK/src/host/execution_types/execution_pcie.hpp index 51b9c546..ac990091 100644 --- a/LINPACK/src/host/execution_types/execution_pcie.hpp +++ b/LINPACK/src/host/execution_types/execution_pcie.hpp @@ -39,7 +39,7 @@ SOFTWARE. 
#endif #include "parameters.h" -#include "linpack_benchmark.hpp" +#include "linpack_data.hpp" namespace linpack { namespace execution { @@ -50,9 +50,9 @@ namespace pcie { @copydoc bm_execution::calculate() */ -std::unique_ptr -calculate(const hpcc_base::ExecutionSettings&config, - linpack::LinpackData& data) { +std::map> inline +calculate(const hpcc_base::ExecutionSettings&config, + linpack::LinpackData& data) { cl_int err; @@ -717,16 +717,18 @@ calculate(const hpcc_base::ExecutionSettings&co MPI_Comm_free(&row_communicator); MPI_Comm_free(&col_communicator); - std::unique_ptr results( - new linpack::LinpackExecutionTimings{gefaExecutionTimes, geslExecutionTimes}); + std::map> timings; + + timings["gefa"] = gefaExecutionTimes; + timings["gesl"] = geslExecutionTimes; MPI_Barrier(MPI_COMM_WORLD); - return results; + return timings; } } // namespace pcie } // namespace execution } // namespace linpack -#endif \ No newline at end of file +#endif diff --git a/LINPACK/src/host/execution_types/execution_types.hpp b/LINPACK/src/host/execution_types/execution_types.hpp index 975dd4cf..457f4e85 100644 --- a/LINPACK/src/host/execution_types/execution_types.hpp +++ b/LINPACK/src/host/execution_types/execution_types.hpp @@ -22,7 +22,14 @@ SOFTWARE. #ifndef EXECUTION_TYPES_HPP #define EXECUTION_TYPES_HPP +#ifdef USE_OCL_HOST #include "execution_types/execution_pcie.hpp" #include "execution_types/execution_iec.hpp" - +#endif +#ifdef USE_XRT_HOST +#include "execution_types/execution_xrt_pcie.hpp" +#ifdef USE_ACCL +#include "execution_types/execution_accl_buffers.hpp" +#endif +#endif #endif \ No newline at end of file diff --git a/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp new file mode 100644 index 00000000..a4de60ad --- /dev/null +++ b/LINPACK/src/host/execution_types/execution_xrt_pcie.hpp @@ -0,0 +1,476 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef EXECUTION_TYPES_EXECUTION_XRT_PCIE_HPP +#define EXECUTION_TYPES_EXECUTION_XRT_PCIE_HPP + +/* C++ standard library headers */ +#include +#include +#include +#include +#include +#include + +/* External library headers */ +#ifdef _OPENMP +#include "omp.h" +#endif + +#include "linpack_data.hpp" +#include "parameters.h" + +namespace linpack { +namespace execution { +namespace xrt_pcie { + +/* + Prepare kernels and execute benchmark + + @copydoc bm_execution::calculate() +*/ +template +std::map> inline calculate( + const hpcc_base::ExecutionSettings &config, + linpack::LinpackData &data) { + + cl_int err; + + int num_omp_threads = 1; +#ifdef _OPENMP + num_omp_threads = omp_get_num_threads(); +#endif + + uint blocks_per_row = data.matrix_width / config.programSettings->blockSize; + uint blocks_per_col = data.matrix_height / config.programSettings->blockSize; + + // Communicate with all ranks in the same row of the torus + MPI_Comm row_communicator; + MPI_Comm col_communicator; + + MPI_Comm_split(MPI_COMM_WORLD, config.programSettings->torus_row, 0, + &row_communicator); + MPI_Comm_split(MPI_COMM_WORLD, config.programSettings->torus_col, 0, + &col_communicator); + + xrt::kernel kernel_mm(*config.device, *config.program, "inner_update_mm0"); + xrt::kernel kernel_lu(*config.device, *config.program, "lu"); + xrt::kernel kernel_top(*config.device, *config.program, "top_update"); + xrt::kernel kernel_left(*config.device, *config.program, "left_update"); + + xrt::bo Buffer_a(*config.device, data.A, + sizeof(HOST_DATA_TYPE) * data.matrix_height * + data.matrix_width, + kernel_lu.group_id(0)); + xrt::bo Buffer_b(*config.device, data.b, + sizeof(HOST_DATA_TYPE) * data.matrix_width, + kernel_lu.group_id(0)); + xrt::bo Buffer_pivot(*config.device, data.ipvt, + sizeof(cl_int) * data.matrix_height, + kernel_lu.group_id(0)); + + /* --- Setup MPI communication and required additional buffers --- */ + + // Buffers only used to store data received over the network layer + // The content will not be modified by the host + xrt::bo Buffer_lu1(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + kernel_lu.group_id(1)); + xrt::bo Buffer_lu2(*config.device, + sizeof(HOST_DATA_TYPE) * + (config.programSettings->blockSize) * + (config.programSettings->blockSize), + kernel_lu.group_id(2)); + + std::vector> Buffer_left_list(2); + std::vector> Buffer_top_list(2); + + for (int double_buffer = 0; double_buffer < 2; double_buffer++) { + for (int i = 0; i < blocks_per_row; i++) { + Buffer_top_list[double_buffer].emplace_back( + *config.device, + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize), + kernel_top.group_id(1)); + } + + for (int i = 0; i < blocks_per_col; i++) { + Buffer_left_list[double_buffer].emplace_back( + *config.device, + sizeof(HOST_DATA_TYPE) * (config.programSettings->blockSize) * + (config.programSettings->blockSize), + kernel_left.group_id(1)); + } + } + + /* --- Execute actual benchmark kernels --- */ + + double t; + std::vector gefaExecutionTimes; + std::vector geslExecutionTimes; + std::vector gefaWaitTimes; + for (int i = 0; i < config.programSettings->numRepetitions; i++) { + + Buffer_a.sync(XCL_BO_SYNC_BO_TO_DEVICE); + Buffer_b.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // Command queues + // A new command queue is created for every iteration of the + // algorithm to reduce the overhead of too large queues + std::vector inner_mms; + std::thread flush_thread; + + 
std::chrono::time_point t1, t2, twait1, + twait2; + std::chrono::duration currentwaittime = + std::chrono::duration::zero(); + + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << "Start! " << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + t1 = std::chrono::high_resolution_clock::now(); + + int kernel_offset = 0; +#pragma omp parallel + { + +#pragma omp single + uint current_replication = 0; + + // For every row of blocks create kernels and enqueue them + for (int block_row = 0; block_row < config.programSettings->matrixSize / + config.programSettings->blockSize; + block_row++) { + + int local_block_row_remainder = + (block_row % config.programSettings->torus_height); + int local_block_row = + (block_row / config.programSettings->torus_height); + int local_block_col_remainder = + (block_row % config.programSettings->torus_width); + int local_block_col = (block_row / config.programSettings->torus_width); + bool in_same_row_as_lu = + local_block_row_remainder == config.programSettings->torus_row; + bool in_same_col_as_lu = + local_block_col_remainder == config.programSettings->torus_col; + int start_row_index = + local_block_row + + ((local_block_row_remainder >= config.programSettings->torus_row) + ? 1 + : 0); + int start_col_index = + local_block_col + + ((local_block_col_remainder >= config.programSettings->torus_col) + ? 1 + : 0); + int num_left_blocks = + (in_same_col_as_lu) ? blocks_per_col - start_row_index : 0; + int num_top_blocks = + (in_same_row_as_lu) ? blocks_per_row - start_col_index : 0; + int num_inner_block_rows = (blocks_per_col - start_row_index); + int num_inner_block_cols = + (num_inner_block_rows > 0) ? (blocks_per_row - start_col_index) : 0; + num_inner_block_rows = + (num_inner_block_cols > 0) ? 
num_inner_block_rows : 0; + bool is_calulating_lu_block = (in_same_col_as_lu && in_same_row_as_lu); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col + << " Start iteration " << block_row << std::endl; +#endif + + uint total_inner_updates_first_row = num_inner_block_cols; + uint updates_per_replication = + total_inner_updates_first_row / + config.programSettings->kernelReplications; + uint total_inner_updates = + (num_inner_block_cols - 1) * (num_inner_block_rows - 1); + uint total_updates_per_replication = + total_inner_updates / config.programSettings->kernelReplications; + uint current_update = 0; + + std::vector comm_kernel_runs; + +#pragma omp single + { + + if (is_calulating_lu_block) { + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " LU " + << local_block_row << "," << local_block_col << std::endl; +#endif + auto lu_run = + kernel_lu(Buffer_a, Buffer_lu1, Buffer_lu2, local_block_col, + local_block_row, blocks_per_row); + ert_cmd_state state = lu_run.wait(); + if (state != ERT_CMD_STATE_COMPLETED) { + std::cerr << "Execution Lu failed: " << state << std::endl; + } + Buffer_lu1.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + Buffer_lu2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } + + // Broadcast LU block in column to update all left blocks + MPI_Bcast(Buffer_lu2.map(), + config.programSettings->blockSize * + config.programSettings->blockSize, + MPI_DATA_TYPE, local_block_row_remainder, col_communicator); + // Broadcast LU block in row to update all top blocks + MPI_Bcast(Buffer_lu1.map(), + config.programSettings->blockSize * + config.programSettings->blockSize, + MPI_DATA_TYPE, local_block_col_remainder, row_communicator); + } + + if (num_top_blocks > 0) { + + Buffer_lu1.sync(XCL_BO_SYNC_BO_TO_DEVICE); + +// Create top kernels +#pragma omp for + for (int tops = start_col_index; tops < blocks_per_row; tops++) { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Top " + << local_block_row << "," << tops << std::endl; +#endif + + comm_kernel_runs.push_back( + kernel_top(Buffer_a, + Buffer_top_list[block_row % 2][tops - start_col_index], + Buffer_lu1, (tops == start_col_index), tops, local_block_row, + blocks_per_row)); + } + } + if (num_left_blocks > 0) { + + Buffer_lu2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + +// Create left kernels +#pragma omp for + for (int tops = start_row_index; tops < blocks_per_col; tops++) { +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Left " << tops + << "," << local_block_col << std::endl; +#endif + comm_kernel_runs.push_back( + kernel_left(Buffer_a, + Buffer_left_list[block_row % 2][tops - start_row_index], + Buffer_lu2, (tops == start_row_index), local_block_col, tops, + blocks_per_row)); + } + } + +#pragma omp single + { + // Wait until all top and left blocks are calculated + for (auto &run : comm_kernel_runs) { + run.wait(); + } + + // Send the left and top blocks to all other ranks so they can be used + // to update all inner blocks + for (int lbi = 0; + lbi < + std::max(static_cast(blocks_per_col - local_block_col), 0); + lbi++) { + Buffer_left_list[block_row % 2][lbi].sync( + XCL_BO_SYNC_BO_FROM_DEVICE); + MPI_Bcast(Buffer_left_list[block_row % 2][lbi].map(), + config.programSettings->blockSize * + config.programSettings->blockSize, + MPI_DATA_TYPE, 
local_block_col_remainder, + row_communicator); + Buffer_left_list[block_row % 2][lbi].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + for (int tbi = 0; + tbi < + std::max(static_cast(blocks_per_row - local_block_row), 0); + tbi++) { + Buffer_top_list[block_row % 2][tbi].sync( + XCL_BO_SYNC_BO_FROM_DEVICE); + MPI_Bcast(Buffer_top_list[block_row % 2][tbi].map(), + config.programSettings->blockSize * + config.programSettings->blockSize, + MPI_DATA_TYPE, local_block_row_remainder, + col_communicator); + Buffer_top_list[block_row % 2][tbi].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + + // update all remaining inner blocks using only global memory + } + + std::vector outer_mms; + + // Wait for previous inner MMs to complete. + // They may need to be reused by the next outer MM calls! + for (auto &run : inner_mms) { + run.wait(); + } + +#pragma omp for + for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { + + int current_block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols); + int current_block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows + lbi); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM col " + << current_block_row << "," << current_block_col + << std::endl; +#endif + + outer_mms.push_back(kernel_mm(Buffer_a, Buffer_left_list[block_row % 2][lbi], + Buffer_top_list[block_row % 2][0], + current_block_col, current_block_row, + blocks_per_row)); + } + +#pragma omp for + for (int tbi = 0; tbi < num_inner_block_cols; tbi++) { + + int current_block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols + tbi); + int current_block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM row " + << current_block_row << "," << current_block_col + << std::endl; +#endif + + outer_mms.push_back(kernel_mm(Buffer_a, Buffer_left_list[block_row % 2][0], + Buffer_top_list[block_row % 2][tbi], + current_block_col, current_block_row, + blocks_per_row)); + } + + // Clear inner MM runs vector for this iteration + // All runs have completed before scheduling the outer MMs + inner_mms.clear(); + +#pragma omp for collapse(2) schedule(static) + for (int lbi = 1; lbi < num_inner_block_rows; lbi++) { + for (int tbi = 1; tbi < num_inner_block_cols; tbi++) { + + int current_block_col = static_cast( + (data.matrix_width / config.programSettings->blockSize) - + num_inner_block_cols + tbi); + int current_block_row = static_cast( + (data.matrix_height / config.programSettings->blockSize) - + num_inner_block_rows + lbi); + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " MM " + << current_block_row << "," << current_block_col + << std::endl; +#endif + + inner_mms.push_back( + kernel_mm(Buffer_a, Buffer_left_list[block_row % 2][lbi], + Buffer_top_list[block_row % 2][tbi], current_block_col, + current_block_row, blocks_per_row)); + } + } + + // Wait for all outer MMs to complete because the results are required + // by the next communication phase + for (auto &run : outer_mms) { + run.wait(); + } + +#ifndef NDEBUG + // Wait for iiner MMs in this communication round to keep + // sync with prints + for (auto &run : inner_mms) { + run.wait(); + } 
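+          // Debug builds only (NDEBUG not defined): the barrier below keeps
+          // the per-iteration output of all ranks together before the rank
+          // holding the LU block prints a separator line.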
+ MPI_Barrier(MPI_COMM_WORLD); + if (is_calulating_lu_block) + std::cout << "---------------" << std::endl; +#endif + } + } + + t2 = std::chrono::high_resolution_clock::now(); + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << "End! " << std::endl; + +#ifndef NDEBUG + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col + << "Wait time: " << currentwaittime.count() << "s" << std::endl; + std::cout << "Torus " << config.programSettings->torus_row << "," + << config.programSettings->torus_col << " Exit " << i + << std::endl; +#endif + + std::chrono::duration timespan = + std::chrono::duration_cast>(t2 - t1); + gefaExecutionTimes.push_back(timespan.count()); + + // Execute GESL + t1 = std::chrono::high_resolution_clock::now(); + t2 = std::chrono::high_resolution_clock::now(); + timespan = + std::chrono::duration_cast>(t2 - t1); + geslExecutionTimes.push_back(timespan.count()); + } + + /* --- Read back results from Device --- */ + + Buffer_a.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + if (!config.programSettings->isDiagonallyDominant) { + Buffer_pivot.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } + + std::map> timings; + + timings["gefa"] = gefaExecutionTimes; + timings["gesl"] = geslExecutionTimes; + + MPI_Barrier(MPI_COMM_WORLD); + + return timings; +} + +} // namespace xrt_pcie +} // namespace execution +} // namespace linpack + +#endif diff --git a/LINPACK/src/host/linpack_benchmark.cpp b/LINPACK/src/host/linpack_benchmark.cpp deleted file mode 100644 index d60be9d1..00000000 --- a/LINPACK/src/host/linpack_benchmark.cpp +++ /dev/null @@ -1,713 +0,0 @@ -// -// Created by Marius Meyer on 04.12.19. -// - -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
-*/ - -#include "linpack_benchmark.hpp" - -/* C++ standard library headers */ -#include -#include - -/* Project's headers */ -#include "communication_types.hpp" -#include "execution_types/execution_types.hpp" -#include "parameters.h" - -linpack::LinpackProgramSettings::LinpackProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), - matrixSize(results["m"].as() * (1 << (results["b"].as()))), blockSize(1 << (results["b"].as())), - isEmulationKernel(results.count("emulation") > 0), isDiagonallyDominant(results.count("uniform") == 0), - torus_width(results["p"].as()) { - int mpi_comm_rank; - int mpi_comm_size; - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); - MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size); - // calculate the row and column of the MPI rank in the torus - if (mpi_comm_size % torus_width != 0) { - throw std::runtime_error("MPI size not dividable by P=" + std::to_string(torus_width) + "!"); - } - torus_height = mpi_comm_size / torus_width; - torus_row = (mpi_comm_rank / torus_width); - torus_col = (mpi_comm_rank % torus_width); -} - -std::map -linpack::LinpackProgramSettings::getSettingsMap() { - auto map = hpcc_base::BaseSettings::getSettingsMap(); - map["Matrix Size"] = std::to_string(matrixSize); - map["Block Size"] = std::to_string(blockSize); - map["Emulate"] = (isEmulationKernel) ? "Yes" : "No"; - map["Data Type"] = STR(HOST_DATA_TYPE); - map["FPGA Torus"] = "P=" + std::to_string(torus_width) + ", Q=" + std::to_string(torus_height); - return map; -} - -linpack::LinpackData::LinpackData(cl::Context context, size_t width, size_t height) : norma(0.0), context(context), - matrix_width(width), matrix_height(height) { -#ifdef USE_SVM - A = reinterpret_cast( - clSVMAlloc(context(), 0 , - size * size * sizeof(HOST_DATA_TYPE), 1024)); - b = reinterpret_cast( - clSVMAlloc(context(), 0 , - size * sizeof(HOST_DATA_TYPE), 1024)); - ipvt = reinterpret_cast( - clSVMAlloc(context(), 0 , - size * sizeof(cl_int), 1024)); -#else - posix_memalign(reinterpret_cast(&A), 4096, width * height * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&b), 4096, width * sizeof(HOST_DATA_TYPE)); - posix_memalign(reinterpret_cast(&ipvt), 4096, height * sizeof(cl_int)); -#endif - } - -linpack::LinpackData::~LinpackData() { -#ifdef USE_SVM - clSVMFree(context(), reinterpret_cast(A)); - clSVMFree(context(), reinterpret_cast(b)); - clSVMFree(context(), reinterpret_cast(ipvt)); -#else - free(A); - free(b); - free(ipvt); -#endif -} - -linpack::LinpackBenchmark::LinpackBenchmark(int argc, char* argv[]) : HpccFpgaBenchmark(argc, argv) { - setupBenchmark(argc, argv); -} - -void -linpack::LinpackBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { - options.add_options() - ("m", "Global matrix size in number of blocks in one dimension. Local matrix sizes will be determined by PQ grid.", - cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) - ("b", "Log2 of the block size in number of values in one dimension", - cxxopts::value()->default_value(std::to_string(LOCAL_MEM_BLOCK_LOG))) - ("p", "Width of the FPGA grid. The heigth (Q) will be calculated from mpi_size / P.", - cxxopts::value()->default_value(std::to_string(DEFAULT_P_VALUE))) - ("uniform", "Generate a uniform matrix instead of a diagonally dominant. This has to be supported by the FPGA kernel!") - ("emulation", "Use kernel arguments for emulation. 
This may be necessary to simulate persistent local memory on the FPGA"); -} - -std::unique_ptr -linpack::LinpackBenchmark::executeKernel(LinpackData &data) { - std::unique_ptr timings; - switch (executionSettings->programSettings->communicationType) { - case hpcc_base::CommunicationType::pcie_mpi : timings = execution::pcie::calculate(*executionSettings, data); break; - case hpcc_base::CommunicationType::intel_external_channels: timings = execution::iec::calculate(*executionSettings, data); break; - default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType)); - } -#ifdef DISTRIBUTED_VALIDATION - distributed_gesl_nopvt_ref(data); -#endif - return timings; -} - -void -linpack::LinpackBenchmark::collectAndPrintResults(const linpack::LinpackExecutionTimings &output) { - // Calculate performance for kernel execution plus data transfer - double tmean = 0; - double tlumean = 0; - double tslmean = 0; - double tmin = std::numeric_limits::max(); - double lu_min = std::numeric_limits::max(); - double sl_min = std::numeric_limits::max(); - -#ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": Result collection started" << std::endl; -#endif - - std::vector global_lu_times(output.gefaTimings.size()); - MPI_Reduce(output.gefaTimings.data(), global_lu_times.data(), output.gefaTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - std::vector global_sl_times(output.geslTimings.size()); - MPI_Reduce(output.geslTimings.data(), global_sl_times.data(), output.geslTimings.size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); -#ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": Result collection done" << std::endl; -#endif - - - if (mpi_comm_rank > 0) { - // Only the master rank needs to calculate and print result - return; - } - - double total_matrix_size = static_cast(executionSettings->programSettings->matrixSize); - double gflops_lu = ((2.0e0*total_matrix_size * total_matrix_size * total_matrix_size)/ 3.0) / 1.0e9; - double gflops_sl = (2.0*(total_matrix_size * total_matrix_size))/1.0e9; - for (int i =0; i < global_lu_times.size(); i++) { - double currentTime = global_lu_times[i] + global_sl_times[i]; - tmean += currentTime; - tlumean += global_lu_times[i]; - tslmean += global_sl_times[i]; - if (currentTime < tmin) { - tmin = currentTime; - } - if (global_lu_times[i] < lu_min) { - lu_min = global_lu_times[i]; - } - if (global_sl_times[i] < sl_min) { - sl_min = global_sl_times[i]; - } - } - tmean = tmean / global_lu_times.size(); - tlumean = tlumean / global_lu_times.size(); - tslmean = tslmean / global_sl_times.size(); - - std::cout << std::setw(ENTRY_SPACE) - << "Method" << std::setw(ENTRY_SPACE) - << "best" << std::setw(ENTRY_SPACE) << "mean" - << std::setw(ENTRY_SPACE) << "GFLOPS" << std::endl; - - std::cout << std::setw(ENTRY_SPACE) << "total" << std::setw(ENTRY_SPACE) - << tmin << std::setw(ENTRY_SPACE) << tmean - << std::setw(ENTRY_SPACE) << ((gflops_lu + gflops_sl) / tmin) - << std::endl; - - std::cout << std::setw(ENTRY_SPACE) << "GEFA" << std::setw(ENTRY_SPACE) - << lu_min << std::setw(ENTRY_SPACE) << tlumean - << std::setw(ENTRY_SPACE) << ((gflops_lu) / lu_min) - << std::endl; - - std::cout << std::setw(ENTRY_SPACE) << "GESL" << std::setw(ENTRY_SPACE) - << sl_min << std::setw(ENTRY_SPACE) << tslmean - << std::setw(ENTRY_SPACE) << (gflops_sl / sl_min) - << std::endl; -} - -std::unique_ptr -linpack::LinpackBenchmark::generateInputData() { - int local_matrix_width = 
executionSettings->programSettings->matrixSize / executionSettings->programSettings->torus_width; - int local_matrix_height = executionSettings->programSettings->matrixSize / executionSettings->programSettings->torus_height; - - if ((executionSettings->programSettings->matrixSize / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width > 0 || - (executionSettings->programSettings->matrixSize / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_height > 0) { - throw std::runtime_error("Global matrix size must be multiple of LCM of PQ grid!"); - } - - auto d = std::unique_ptr(new linpack::LinpackData(*executionSettings->context ,local_matrix_width, local_matrix_height)); - std::mt19937 gen(this->mpi_comm_rank); - std::uniform_real_distribution<> dis(0.0, 1.0); - d->norma = 0.0; - d->normb = 0.0; - - - /* - Generate a matrix by using pseudo random number in the range (0,1) - */ - for (int j = 0; j < local_matrix_height; j++) { - // fill a single column of the matrix - for (int i = 0; i < local_matrix_width; i++) { - HOST_DATA_TYPE temp = dis(gen); - d->A[local_matrix_width*j+i] = temp; - d->norma = (temp > d->norma) ? temp : d->norma; - } - } - - - // If the matrix should be diagonally dominant, we need to exchange the sum of the rows with - // the ranks that share blocks in the same column - if (executionSettings->programSettings->isDiagonallyDominant) { - // create a communicator to exchange the rows - MPI_Comm row_communicator; - MPI_Comm_split(MPI_COMM_WORLD, executionSettings->programSettings->torus_row, 0,&row_communicator); - - // Caclulate the sum for every row and insert in into the matrix - for (int local_matrix_row = 0; local_matrix_row < local_matrix_height; local_matrix_row++) { - int blockSize = executionSettings->programSettings->blockSize; - int global_matrix_row = executionSettings->programSettings->torus_row * blockSize + (local_matrix_row / blockSize) * blockSize * executionSettings->programSettings->torus_height + (local_matrix_row % blockSize); - int local_matrix_col = (global_matrix_row - executionSettings->programSettings->torus_col * blockSize) / (blockSize * executionSettings->programSettings->torus_width) * blockSize + (global_matrix_row % blockSize); - int diagonal_rank = (global_matrix_row / blockSize) % executionSettings->programSettings->torus_width; - bool diagonal_on_this_rank = diagonal_rank == executionSettings->programSettings->torus_col; - // set the diagonal elements of the matrix to 0 - if (diagonal_on_this_rank) { - d->A[local_matrix_width*local_matrix_row + local_matrix_col] = 0.0; - } - HOST_DATA_TYPE local_row_sum = 0.0; - for (int i = 0; i < local_matrix_width; i++) { - local_row_sum += d->A[local_matrix_width*local_matrix_row + i]; - } - HOST_DATA_TYPE row_sum = 0.0; - MPI_Reduce(&local_row_sum, &row_sum, 1, MPI_DATA_TYPE, MPI_SUM, diagonal_rank, row_communicator); - // insert row sum into matrix if it contains the diagonal block - if (diagonal_on_this_rank) { - // update norm of local matrix - d->norma = (row_sum > d->norma) ? 
row_sum : d->norma; - d->A[local_matrix_width*local_matrix_row + local_matrix_col] = row_sum; - } - } - } - - // initialize other vectors - for (int i = 0; i < local_matrix_width; i++) { - d->b[i] = 0.0; - } - for (int i = 0; i < local_matrix_height; i++) { - d->ipvt[i] = i; - } - - MPI_Comm col_communicator; - MPI_Comm_split(MPI_COMM_WORLD, executionSettings->programSettings->torus_col, 0,&col_communicator); - - // Generate vector b by accumulating the columns of the matrix. - // This will lead to a result vector x with ones on every position - // Every rank will have a valid part of the final b vector stored - for (int j = 0; j < local_matrix_width; j++) { - HOST_DATA_TYPE local_col_sum = 0.0; - for (int i = 0; i < local_matrix_height; i++) { - local_col_sum += d->A[local_matrix_width*i+j]; - } - MPI_Allreduce(&local_col_sum, &(d->b[j]), 1, MPI_DATA_TYPE, MPI_SUM, col_communicator); - d->normb = (d->b[j] > d->normb) ? d->b[j] : d->normb; - } - return d; -} - -bool -linpack::LinpackBenchmark::validateOutputAndPrintError(linpack::LinpackData &data) { - uint n= executionSettings->programSettings->matrixSize; - uint matrix_width = data.matrix_width; - uint matrix_height = data.matrix_height; - double residn; - double resid = 0.0; - double normx = 0.0; -#ifndef DISTRIBUTED_VALIDATION - if (mpi_comm_rank > 0) { - for (int j = 0; j < matrix_height; j++) { - for (int i = 0; i < matrix_width; i+= executionSettings->programSettings->blockSize) { - MPI_Send(&data.A[matrix_width * j + i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD); - } - } - if (executionSettings->programSettings->torus_row == 0) { - for (int i = 0; i < matrix_width; i+= executionSettings->programSettings->blockSize) { - MPI_Send(&data.b[i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD); - } - } - residn = 0; - } - else { - MPI_Status status; - size_t current_offset = 0; - std::vector total_b_original(n); - std::vector total_b(n); - std::vector total_a(n*n); - for (int j = 0; j < n; j++) { - for (int i = 0; i < n; i+= executionSettings->programSettings->blockSize) { - int recvcol= (i / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width; - int recvrow= (j / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_height; - int recvrank = executionSettings->programSettings->torus_width * recvrow + recvcol; - if (recvrank > 0) { - MPI_Recv(&total_a[j * n + i],executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 0, MPI_COMM_WORLD, &status); - } - else { - for (int k=0; k < executionSettings->programSettings->blockSize; k++) { - total_a[j * n + i + k] = data.A[current_offset + k]; - } - current_offset += executionSettings->programSettings->blockSize; - } - } - } - current_offset = 0; - for (int i = 0; i < n; i+= executionSettings->programSettings->blockSize) { - int recvcol= (i / executionSettings->programSettings->blockSize) % executionSettings->programSettings->torus_width; - if (recvcol > 0) { - MPI_Recv(&total_b[i], executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvcol, 0, MPI_COMM_WORLD, &status); - } - else { - for (int k=0; k < executionSettings->programSettings->blockSize; k++) { - total_b[i + k] = data.b[current_offset + k]; - } - current_offset += executionSettings->programSettings->blockSize; - } - } - - std::copy(total_b.begin(), total_b.end(), total_b_original.begin()); - gesl_ref_nopvt(total_a.data(), total_b.data(), n, n); - - for 
(int i = 0; i < n; i++) { - resid = (resid > std::abs(total_b[i] - 1)) ? resid : std::abs(total_b[i] - 1); - normx = (normx > std::abs(total_b_original[i])) ? normx : std::abs(total_b_original[i]); - } - } -#else - double local_resid = 0; - double local_normx = data.normb; - #pragma omp parallel for reduction(max:local_resid) - for (int i = 0; i < data.matrix_width; i++) { - local_resid = (local_resid > std::abs(data.b[i] - 1)) ? local_resid : std::abs(data.b[i] - 1); - } -#ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": resid=" << local_resid << ", normx=" << local_normx << std::endl; -#endif - - MPI_Reduce(&local_resid, &resid, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - MPI_Reduce(&local_normx, &normx, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); -#endif - - - HOST_DATA_TYPE eps = std::numeric_limits::epsilon(); - residn = resid / (static_cast(n)*normx*eps); - - #ifndef NDEBUG - if (residn > 1 && mpi_comm_size == 1) { - auto ref_result = generateInputData(); - // For each column right of current diagonal element - for (int j = 0; j < n; j++) { - // For each element below it - for (int i = 0; i < n; i++) { - std::cout << ref_result->A[n * j + i] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; - // For each column right of current diagonal element - for (int j = 0; j < n; j++) { - // For each element below it - for (int i = 0; i < n; i++) { - std::cout << data.A[n * j + i] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; - if (executionSettings->programSettings->isDiagonallyDominant) { - linpack::gefa_ref_nopvt(ref_result->A, n, n); - linpack::gesl_ref_nopvt(ref_result->A, ref_result->b, n, n); - } - else { - linpack::gefa_ref(ref_result->A, n, n, ref_result->ipvt); - linpack::gesl_ref(ref_result->A, ref_result->b, ref_result->ipvt, n, n); - } - // For each column right of current diagonal element - for (int j = 0; j < n; j++) { - // For each element below it - for (int i = 0; i < n; i++) { - std::cout << std::abs(ref_result->A[n * j + i] - data.A[n * j + i]) << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; - } - #endif - - if (mpi_comm_rank == 0) { - //std::cout << resid << ", " << norma << ", " << normx << std::endl; - std::cout << " norm. 
resid resid "\ - "machep " << std::endl; - std::cout << std::setw(ENTRY_SPACE) << residn << std::setw(ENTRY_SPACE) - << resid << std::setw(ENTRY_SPACE) << eps << std::endl; - return residn < 1; - } - else { - return true; - } -} - -void -linpack::LinpackBenchmark::distributed_gesl_nopvt_ref(linpack::LinpackData& data) { - uint global_matrix_size = executionSettings->programSettings->matrixSize; - uint matrix_width = data.matrix_width; - uint matrix_height = data.matrix_height; - uint block_size = executionSettings->programSettings->blockSize; - // create a communicator to exchange the rows - MPI_Comm row_communicator; - MPI_Comm_split(MPI_COMM_WORLD, executionSettings->programSettings->torus_row, 0,&row_communicator); - MPI_Comm col_communicator; - MPI_Comm_split(MPI_COMM_WORLD, executionSettings->programSettings->torus_col, 0,&col_communicator); - std::vector b_tmp(matrix_width); - - for (int k = 0; k < b_tmp.size(); k++) { - b_tmp[k] = data.b[k]; - } - - // solve l*y = b - // For each row in matrix - for (int k = 0; k < global_matrix_size - 1; k++) { - size_t local_k_index_col = k / (block_size * executionSettings->programSettings->torus_width) * block_size; - size_t local_k_index_row = k / (block_size * executionSettings->programSettings->torus_height) * block_size; - size_t remaining_k_col = k % (block_size * executionSettings->programSettings->torus_width); - size_t remaining_k_row = k % (block_size * executionSettings->programSettings->torus_height); - size_t start_offset = local_k_index_col; - if (remaining_k_col / block_size > executionSettings->programSettings->torus_col){ - local_k_index_col += block_size; - start_offset = local_k_index_col; - } - else if (remaining_k_col / block_size == executionSettings->programSettings->torus_col) { - local_k_index_col += (remaining_k_col % block_size); - start_offset = local_k_index_col + 1; - } - if (remaining_k_row / block_size > executionSettings->programSettings->torus_row){ - local_k_index_row += block_size; - } - else if (remaining_k_row / block_size == executionSettings->programSettings->torus_row) { - local_k_index_row += (remaining_k_row % block_size); - } - - int row_diagonal_rank = (k / block_size) % executionSettings->programSettings->torus_height; - int col_diagonal_rank = (k / block_size) % executionSettings->programSettings->torus_width; - std::vector tmp_scaled_b(matrix_width, 0.0); - if (row_diagonal_rank == executionSettings->programSettings->torus_row) { - HOST_DATA_TYPE current_k; - current_k = (local_k_index_col < matrix_width) ? 
b_tmp[local_k_index_col] : 0.0; - MPI_Bcast(¤t_k, 1, MPI_DATA_TYPE, col_diagonal_rank, row_communicator); - // For each row below add - for (int i = start_offset; i < matrix_width; i++) { - // add solved upper row to current row - tmp_scaled_b[i] = current_k * data.A[matrix_width * local_k_index_row + i]; - } - } - MPI_Bcast(&tmp_scaled_b.data()[start_offset], matrix_width - start_offset, MPI_DATA_TYPE, row_diagonal_rank, col_communicator); - for (int i = start_offset; i < matrix_width; i++) { - // add solved upper row to current row - b_tmp[i] += tmp_scaled_b[i]; - } - } - - // now solve u*x = y - for (int k = global_matrix_size - 1; k >= 0; k--) { - size_t local_k_index_col = k / (block_size * executionSettings->programSettings->torus_width) * block_size; - size_t local_k_index_row = k / (block_size * executionSettings->programSettings->torus_height) * block_size; - size_t remaining_k_col = k % (block_size * executionSettings->programSettings->torus_width); - size_t remaining_k_row = k % (block_size * executionSettings->programSettings->torus_height); - if (remaining_k_col / block_size > executionSettings->programSettings->torus_col){ - local_k_index_col += block_size; - } - else if (remaining_k_col / block_size == executionSettings->programSettings->torus_col) { - local_k_index_col += remaining_k_col % block_size; - } - if (remaining_k_row / block_size > executionSettings->programSettings->torus_row){ - local_k_index_row += block_size; - } - else if (remaining_k_row / block_size == executionSettings->programSettings->torus_row) { - local_k_index_row += remaining_k_row % block_size; - } - - HOST_DATA_TYPE scale_element = (local_k_index_col < matrix_width && local_k_index_row < matrix_height) ? b_tmp[local_k_index_col] * data.A[matrix_width * local_k_index_row + local_k_index_col] : 0.0; - int row_diagonal_rank = (k / block_size) % executionSettings->programSettings->torus_height; - int col_diagonal_rank = (k / block_size) % executionSettings->programSettings->torus_width; - MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, row_diagonal_rank, col_communicator); - if (col_diagonal_rank == executionSettings->programSettings->torus_col) { - b_tmp[local_k_index_col] = -scale_element; - } - MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, col_diagonal_rank, row_communicator); - size_t end_offset = local_k_index_col; - - std::vector tmp_scaled_b(matrix_width, 0.0); - if (row_diagonal_rank == executionSettings->programSettings->torus_row) { - // For each row below add - for (int i = 0; i < end_offset; i++) { - tmp_scaled_b[i] = scale_element * data.A[matrix_width * local_k_index_row + i]; - } - } - MPI_Bcast(tmp_scaled_b.data(), end_offset, MPI_DATA_TYPE, row_diagonal_rank, col_communicator); - for (int i = 0; i < end_offset; i++) { - // add solved upper row to current row - b_tmp[i] += tmp_scaled_b[i]; - } - } - for (int k = 0; k < b_tmp.size(); k++) { - data.b[k] = b_tmp[k]; - } - -#ifndef NDEBUG - MPI_Barrier(MPI_COMM_WORLD); - for (int rank = 0; rank < mpi_comm_size; rank++) { - if (rank == mpi_comm_rank) { - double sum = 0; - double max = 0; - for (int k = 0; k < matrix_width; k++) { - sum += std::abs(data.b[k]); - if (std::abs(data.b[k] - 1) > 0.1 || data.b[k] == NAN) { - std::cout << "Rank " << mpi_comm_rank << " Pos: " << k << " Value: " << std::abs(data.b[k]) << std::endl; - } - } - std::cout << "Rank " << mpi_comm_rank << " Dist.Sum: " << sum << " Max: " << max << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } -#endif -} - -/** -Standard LU factorization on a block with fixed size - -Case 1 of 
Zhangs description -*/ -void -linpack::gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt) { - for (int i = 0; i < n; i++) { - ipvt[i] = i; - } - // For each diagnonal element - for (int k = 0; k < n - 1; k++) { - HOST_DATA_TYPE max_val = fabs(a[k * lda + k]); - int pvt_index = k; - for (int i = k + 1; i < n; i++) { - if (max_val < fabs(a[k * lda + i])) { - pvt_index = i; - max_val = fabs(a[k * lda + i]); - } - } - - for (int i = k; i < n; i++) { - HOST_DATA_TYPE tmp_val = a[i * lda + k]; - a[i * lda + k] = a[i * lda + pvt_index]; - a[i * lda + pvt_index] = tmp_val; - } - ipvt[k] = pvt_index; - - // For each element below it - for (int i = k + 1; i < n; i++) { - a[k * lda + i] *= -1.0 / a[k * lda + k]; - } - // For each column right of current diagonal element - for (int j = k + 1; j < n; j++) { - // For each element below it - for (int i = k+1; i < n; i++) { - a[j * lda + i] += a[k * lda + i] * a[j * lda + k]; - } - } - -#ifdef DEBUG - std::cout << "A(k=" << k <<"): " << std::endl; - for (int i= 0; i < n; i++) { - for (int j=0; j < n; j++) { - std::cout << a[i*lda + j] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; -#endif - - } -} - -void -linpack::gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda) { - auto b_tmp = new HOST_DATA_TYPE[n]; - { - for (int k = 0; k < n; k++) { - b_tmp[k] = b[k]; - } - - // solve l*y = b - // For each row in matrix - for (int k = 0; k < n - 1; k++) { - if (ipvt[k] != k) { - HOST_DATA_TYPE tmp = b_tmp[k]; - b_tmp[k] = b_tmp[ipvt[k]]; - b_tmp[ipvt[k]] = tmp; - } - // For each row below add - for (int i = k + 1; i < n; i++) { - // add solved upper row to current row - b_tmp[i] += b_tmp[k] * a[lda * k + i]; - } - } - - // now solve u*x = y - for (int k = n - 1; k >= 0; k--) { - b_tmp[k] = b_tmp[k] / a[lda * k + k]; - for (int i = 0; i < k; i++) { - b_tmp[i] -= b_tmp[k] * a[lda * k + i]; - } - } - for (int k = 0; k < n; k++) { - b[k] = b_tmp[k]; - } - } - delete [] b_tmp; -} - -void linpack::dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m, bool transposed) { - for (int i=0; i < n1; i++) { - for (int j=0; j < n2; j++) { - y[i] = y[i] + x[j] * (transposed ? m[ldm*i + j] :m[ldm*j + i]); - } - } -} - -void -linpack::gefa_ref_nopvt(HOST_DATA_TYPE* a, unsigned n, unsigned lda) { - // For each diagnonal element - for (int k = 0; k < n; k++) { - // Store negatie invers of diagonal elements to get rid of some divisions afterwards! 
- a[k * lda + k] = -1.0 / a[k * lda + k]; - // For each element below it - for (int i = k + 1; i < n; i++) { - a[k * lda + i] *= a[k * lda + k]; - } - // For each column right of current diagonal element - for (int j = k + 1; j < n; j++) { - // For each element below it - for (int i = k+1; i < n; i++) { - a[j * lda + i] += a[k * lda + i] * a[j * lda + k]; - } - } - -#ifdef DEBUG - std::cout << "A(k=" << k << "): " << std::endl; - for (int i= 0; i < n; i++) { - for (int j=0; j < n; j++) { - std::cout << a[i*lda + j] << ", "; - } - std::cout << std::endl; - } - std::cout << std::endl; -#endif - - } -} - - -void -linpack::gesl_ref_nopvt(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, unsigned n, unsigned lda) { - auto b_tmp = new HOST_DATA_TYPE[n]; - - for (int k = 0; k < n; k++) { - b_tmp[k] = b[k]; - } - - // solve l*y = b - // For each row in matrix - for (int k = 0; k < n - 1; k++) { - // For each row below add - for (int i = k + 1; i < n; i++) { - // add solved upper row to current row - b_tmp[i] += b_tmp[k] * a[lda * k + i]; - } - } - - // now solve u*x = y - for (int k = n - 1; k >= 0; k--) { - HOST_DATA_TYPE scale = b_tmp[k] * a[lda * k + k]; - b_tmp[k] = -scale; - for (int i = 0; i < k; i++) { - b_tmp[i] += scale * a[lda * k + i]; - } - } - for (int k = 0; k < n; k++) { - b[k] = b_tmp[k]; - } - delete [] b_tmp; -} diff --git a/LINPACK/src/host/linpack_benchmark.hpp b/LINPACK/src/host/linpack_benchmark.hpp index c05b323a..4b4da0aa 100644 --- a/LINPACK/src/host/linpack_benchmark.hpp +++ b/LINPACK/src/host/linpack_benchmark.hpp @@ -26,10 +26,13 @@ SOFTWARE. /* C++ standard library headers */ #include #include +#include /* Project's headers */ #include "hpcc_benchmark.hpp" +#include "execution_types/execution_types.hpp" #include "parameters.h" +#include "linpack_data.hpp" extern "C" { #include "gmres.h" } @@ -40,177 +43,12 @@ extern "C" { */ namespace linpack { -/** - * @brief The Linpack specific program settings - * - */ -class LinpackProgramSettings : public hpcc_base::BaseSettings { - -public: - /** - * @brief The size of the local matrix in number of blocks in one dimension - * - */ - uint matrixSize; - - /** - * @brief Size of a single block of the matrix in values in one dimension - * - */ - uint blockSize; - - /** - * @brief Indicates if the generated input matrix should be diagonally dominant - * - */ - bool isDiagonallyDominant; - - /** - * @brief True, if the used kernel is an emulation kernel. Different kernel arguments may be used in this case to - * simulate persistent local memory. - * - */ - bool isEmulationKernel; - - /** - * @brief The row position of this MPI rank in the torus - * - */ - int torus_row; - - /** - * @brief The rcolumn position of this MPI rank in the torus - * - */ - int torus_col; - - /** - * @brief Width of the torus in number of ranks - * - */ - int torus_width; - - /** - * @brief Height of the FPGA torus in number of ranks - * - */ - int torus_height; - - /** - * @brief Construct a new Linpack Program Settings object - * - * @param results the result map from parsing the program input parameters - */ - LinpackProgramSettings(cxxopts::ParseResult &results); - - /** - * @brief Get a map of the settings. This map will be used to print the final configuration. - * - * @return a map of program parameters. keys are the name of the parameter. 
- */ - std::map getSettingsMap() override; - -}; - -/** - * @brief Data class containing the data the kernel is exeucted with - * - */ -class LinpackData { - -public: - - /** - * @brief The input matrix representing the left side of the linear equation system - * - */ - HOST_DATA_TYPE *A; - - /** - * @brief The input vector the right side of the linear equation system - * - */ - HOST_DATA_TYPE *b; - - /** - * @brief A vector that can be used to store pivoting information - * - */ - cl_int* ipvt; - - /** - * @brief Width of the local matrix in values - * - */ - size_t matrix_width; - - /** - * @brief Height of the local matrix in values - * - */ - size_t matrix_height; - - /** - * @brief The context that is used to allocate memory in SVM mode - * - */ - cl::Context context; - - /** - * @brief The maximum value of A that will be used for the error calculation - * - */ - HOST_DATA_TYPE norma; - - /** - * @brief The maximum value of A that will be used for the error calculation - * - */ - HOST_DATA_TYPE normb; - - /** - * @brief Construct a new Linpack Data object - * - * @param context The OpenCL context used to allocate memory in SVM mode - * @param width width of the local matrix in values - * @param height height of the local matrix in values - */ - LinpackData(cl::Context context, size_t width, size_t height); - - /** - * @brief Destroy the Linpack Data object. Free the allocated memory - * - */ - ~LinpackData(); - -}; - -/** - * @brief Measured execution timing from the kernel execution - * - */ -class LinpackExecutionTimings { -public: - /** - * @brief A vector containing the timings for all repetitions for the kernel execution for the gefa kernel - * - */ - std::vector gefaTimings; - - /** - * @brief A vector containing the timings for all repetitions for the kernel execution for the gesl kernel - * - */ - std::vector geslTimings; - - -}; - /** * @brief Implementation of the Linpack benchmark * */ -class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark { +template +class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark> { protected: @@ -220,7 +58,17 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) + ("b", "Log2 of the block size in number of values in one dimension", + cxxopts::value()->default_value(std::to_string(LOCAL_MEM_BLOCK_LOG))) + ("p", "Width of the FPGA grid. The heigth (Q) will be calculated from mpi_size / P.", + cxxopts::value()->default_value(std::to_string(DEFAULT_P_VALUE))) + ("uniform", "Generate a uniform matrix instead of a diagonally dominant. This has to be supported by the FPGA kernel!") + ("emulation", "Use kernel arguments for emulation. 
This may be necessary to simulate persistent local memory on the FPGA"); + } /** * @brief Distributed solving of l*y=b and u*x = y @@ -228,7 +76,130 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark& data) { + uint global_matrix_size = this->executionSettings->programSettings->matrixSize; + uint matrix_width = data.matrix_width; + uint matrix_height = data.matrix_height; + uint block_size = this->executionSettings->programSettings->blockSize; + // create a communicator to exchange the rows + MPI_Comm row_communicator; + MPI_Comm_split(MPI_COMM_WORLD, this->executionSettings->programSettings->torus_row, 0,&row_communicator); + MPI_Comm col_communicator; + MPI_Comm_split(MPI_COMM_WORLD, this->executionSettings->programSettings->torus_col, 0,&col_communicator); + std::vector b_tmp(matrix_width); + + for (int k = 0; k < b_tmp.size(); k++) { + b_tmp[k] = data.b[k]; + } + + // solve l*y = b + // For each row in matrix + for (int k = 0; k < global_matrix_size - 1; k++) { + size_t local_k_index_col = k / (block_size * this->executionSettings->programSettings->torus_width) * block_size; + size_t local_k_index_row = k / (block_size * this->executionSettings->programSettings->torus_height) * block_size; + size_t remaining_k_col = k % (block_size * this->executionSettings->programSettings->torus_width); + size_t remaining_k_row = k % (block_size * this->executionSettings->programSettings->torus_height); + size_t start_offset = local_k_index_col; + if (remaining_k_col / block_size > this->executionSettings->programSettings->torus_col){ + local_k_index_col += block_size; + start_offset = local_k_index_col; + } + else if (remaining_k_col / block_size == this->executionSettings->programSettings->torus_col) { + local_k_index_col += (remaining_k_col % block_size); + start_offset = local_k_index_col + 1; + } + if (remaining_k_row / block_size > this->executionSettings->programSettings->torus_row){ + local_k_index_row += block_size; + } + else if (remaining_k_row / block_size == this->executionSettings->programSettings->torus_row) { + local_k_index_row += (remaining_k_row % block_size); + } + + int row_diagonal_rank = (k / block_size) % this->executionSettings->programSettings->torus_height; + int col_diagonal_rank = (k / block_size) % this->executionSettings->programSettings->torus_width; + std::vector tmp_scaled_b(matrix_width, 0.0); + if (row_diagonal_rank == this->executionSettings->programSettings->torus_row) { + HOST_DATA_TYPE current_k; + current_k = (local_k_index_col < matrix_width) ? 
b_tmp[local_k_index_col] : 0.0; + MPI_Bcast(¤t_k, 1, MPI_DATA_TYPE, col_diagonal_rank, row_communicator); + // For each row below add + for (int i = start_offset; i < matrix_width; i++) { + // add solved upper row to current row + tmp_scaled_b[i] = current_k * data.A[matrix_width * local_k_index_row + i]; + } + } + MPI_Bcast(&tmp_scaled_b.data()[start_offset], matrix_width - start_offset, MPI_DATA_TYPE, row_diagonal_rank, col_communicator); + for (int i = start_offset; i < matrix_width; i++) { + // add solved upper row to current row + b_tmp[i] += tmp_scaled_b[i]; + } + } + + // now solve u*x = y + for (int k = global_matrix_size - 1; k >= 0; k--) { + size_t local_k_index_col = k / (block_size * this->executionSettings->programSettings->torus_width) * block_size; + size_t local_k_index_row = k / (block_size * this->executionSettings->programSettings->torus_height) * block_size; + size_t remaining_k_col = k % (block_size * this->executionSettings->programSettings->torus_width); + size_t remaining_k_row = k % (block_size * this->executionSettings->programSettings->torus_height); + if (remaining_k_col / block_size > this->executionSettings->programSettings->torus_col){ + local_k_index_col += block_size; + } + else if (remaining_k_col / block_size == this->executionSettings->programSettings->torus_col) { + local_k_index_col += remaining_k_col % block_size; + } + if (remaining_k_row / block_size > this->executionSettings->programSettings->torus_row){ + local_k_index_row += block_size; + } + else if (remaining_k_row / block_size == this->executionSettings->programSettings->torus_row) { + local_k_index_row += remaining_k_row % block_size; + } + + HOST_DATA_TYPE scale_element = (local_k_index_col < matrix_width && local_k_index_row < matrix_height) ? b_tmp[local_k_index_col] * data.A[matrix_width * local_k_index_row + local_k_index_col] : 0.0; + int row_diagonal_rank = (k / block_size) % this->executionSettings->programSettings->torus_height; + int col_diagonal_rank = (k / block_size) % this->executionSettings->programSettings->torus_width; + MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, row_diagonal_rank, col_communicator); + if (col_diagonal_rank == this->executionSettings->programSettings->torus_col) { + b_tmp[local_k_index_col] = -scale_element; + } + MPI_Bcast(&scale_element, 1, MPI_DATA_TYPE, col_diagonal_rank, row_communicator); + size_t end_offset = local_k_index_col; + + std::vector tmp_scaled_b(matrix_width, 0.0); + if (row_diagonal_rank == this->executionSettings->programSettings->torus_row) { + // For each row below add + for (int i = 0; i < end_offset; i++) { + tmp_scaled_b[i] = scale_element * data.A[matrix_width * local_k_index_row + i]; + } + } + MPI_Bcast(tmp_scaled_b.data(), end_offset, MPI_DATA_TYPE, row_diagonal_rank, col_communicator); + for (int i = 0; i < end_offset; i++) { + // add solved upper row to current row + b_tmp[i] += tmp_scaled_b[i]; + } + } + for (int k = 0; k < b_tmp.size(); k++) { + data.b[k] = b_tmp[k]; + } + +#ifndef NDEBUG + MPI_Barrier(MPI_COMM_WORLD); + for (int rank = 0; rank < this->mpi_comm_size; rank++) { + if (rank == this->mpi_comm_rank) { + double sum = 0; + double max = 0; + for (int k = 0; k < matrix_width; k++) { + sum += std::abs(data.b[k]); + if (std::abs(data.b[k] - 1) > 0.1 || data.b[k] == NAN) { + std::cout << "Rank " << this->mpi_comm_rank << " Pos: " << k << " Value: " << std::abs(data.b[k]) << std::endl; + } + } + std::cout << "Rank " << this->mpi_comm_rank << " Dist.Sum: " << sum << " Max: " << max << std::endl; + } + 
MPI_Barrier(MPI_COMM_WORLD); + } +#endif +} + public: @@ -237,8 +208,94 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark The input and output data of the benchmark */ - std::unique_ptr - generateInputData() override; + std::unique_ptr> + generateInputData() override { + int local_matrix_width = this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->torus_width; + int local_matrix_height = this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->torus_height; + + if ((this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_width > 0 || + (this->executionSettings->programSettings->matrixSize / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_height > 0) { + throw std::runtime_error("Global matrix size must be multiple of LCM of PQ grid!"); + } + + auto d = std::unique_ptr>(new linpack::LinpackData(*this->executionSettings->context ,local_matrix_width, local_matrix_height)); + std::mt19937 gen(this->mpi_comm_rank); + std::uniform_real_distribution<> dis(0.0, 1.0); + d->norma = 0.0; + d->normb = 0.0; + + + /* + Generate a matrix by using pseudo random number in the range (0,1) + */ + for (int j = 0; j < local_matrix_height; j++) { + // fill a single column of the matrix + for (int i = 0; i < local_matrix_width; i++) { + HOST_DATA_TYPE temp = dis(gen); + d->A[local_matrix_width*j+i] = temp; + d->norma = (temp > d->norma) ? temp : d->norma; + } + } + + + // If the matrix should be diagonally dominant, we need to exchange the sum of the rows with + // the ranks that share blocks in the same column + if (this->executionSettings->programSettings->isDiagonallyDominant) { + // create a communicator to exchange the rows + MPI_Comm row_communicator; + MPI_Comm_split(MPI_COMM_WORLD, this->executionSettings->programSettings->torus_row, 0,&row_communicator); + + // Caclulate the sum for every row and insert in into the matrix + for (int local_matrix_row = 0; local_matrix_row < local_matrix_height; local_matrix_row++) { + int blockSize = this->executionSettings->programSettings->blockSize; + int global_matrix_row = this->executionSettings->programSettings->torus_row * blockSize + (local_matrix_row / blockSize) * blockSize * this->executionSettings->programSettings->torus_height + (local_matrix_row % blockSize); + int local_matrix_col = (global_matrix_row - this->executionSettings->programSettings->torus_col * blockSize) / (blockSize * this->executionSettings->programSettings->torus_width) * blockSize + (global_matrix_row % blockSize); + int diagonal_rank = (global_matrix_row / blockSize) % this->executionSettings->programSettings->torus_width; + bool diagonal_on_this_rank = diagonal_rank == this->executionSettings->programSettings->torus_col; + // set the diagonal elements of the matrix to 0 + if (diagonal_on_this_rank) { + d->A[local_matrix_width*local_matrix_row + local_matrix_col] = 0.0; + } + HOST_DATA_TYPE local_row_sum = 0.0; + for (int i = 0; i < local_matrix_width; i++) { + local_row_sum += d->A[local_matrix_width*local_matrix_row + i]; + } + HOST_DATA_TYPE row_sum = 0.0; + MPI_Reduce(&local_row_sum, &row_sum, 1, MPI_DATA_TYPE, MPI_SUM, diagonal_rank, row_communicator); + // insert row sum into matrix if it contains the diagonal block + if (diagonal_on_this_rank) { + // update norm of local matrix + d->norma = (row_sum > d->norma) ? 
row_sum : d->norma; + d->A[local_matrix_width*local_matrix_row + local_matrix_col] = row_sum; + } + } + } + + // initialize other vectors + for (int i = 0; i < local_matrix_width; i++) { + d->b[i] = 0.0; + } + for (int i = 0; i < local_matrix_height; i++) { + d->ipvt[i] = i; + } + + MPI_Comm col_communicator; + MPI_Comm_split(MPI_COMM_WORLD, this->executionSettings->programSettings->torus_col, 0,&col_communicator); + + // Generate vector b by accumulating the columns of the matrix. + // This will lead to a result vector x with ones on every position + // Every rank will have a valid part of the final b vector stored + for (int j = 0; j < local_matrix_width; j++) { + HOST_DATA_TYPE local_col_sum = 0.0; + for (int i = 0; i < local_matrix_height; i++) { + local_col_sum += d->A[local_matrix_width*i+j]; + } + MPI_Allreduce(&local_col_sum, &(d->b[j]), 1, MPI_DATA_TYPE, MPI_SUM, col_communicator); + d->normb = (d->b[j] > d->normb) ? d->b[j] : d->normb; + } + return d; +} + /** * @brief Linpack specific implementation of the kernel execution @@ -246,8 +303,26 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark Measured runtimes of the kernel execution */ - std::unique_ptr - executeKernel(LinpackData &data) override; + void + executeKernel(LinpackData &data) override { + switch (this->executionSettings->programSettings->communicationType) { +#ifdef USE_OCL_HOST + case hpcc_base::CommunicationType::pcie_mpi : this->timings = execution::pcie::calculate(*this->executionSettings, data); break; + case hpcc_base::CommunicationType::intel_external_channels: this->timings = execution::iec::calculate(*this->executionSettings, data); break; +#endif +#ifdef USE_XRT_HOST + case hpcc_base::CommunicationType::pcie_mpi : this->timings = execution::xrt_pcie::calculate(*this->executionSettings, data); break; +#ifdef USE_ACCL + case hpcc_base::CommunicationType::accl : this->timings = execution::accl_buffers::calculate(*this->executionSettings, data); break; +#endif +#endif + default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(this->executionSettings->programSettings->communicationType)); + } +#ifdef DISTRIBUTED_VALIDATION + distributed_gesl_nopvt_ref(data); +#endif +} + /** * @brief Linpack specific implementation of the execution validation @@ -257,15 +332,233 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark &data) override { + uint n= this->executionSettings->programSettings->matrixSize; + uint matrix_width = data.matrix_width; + uint matrix_height = data.matrix_height; + double residn; + double resid = 0.0; + double normx = 0.0; +#ifndef DISTRIBUTED_VALIDATION + if (this->mpi_comm_rank > 0) { + for (int j = 0; j < matrix_height; j++) { + for (int i = 0; i < matrix_width; i+= this->executionSettings->programSettings->blockSize) { + MPI_Send(&data.A[matrix_width * j + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD); + } + } + if (this->executionSettings->programSettings->torus_row == 0) { + for (int i = 0; i < matrix_width; i+= this->executionSettings->programSettings->blockSize) { + MPI_Send(&data.b[i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, 0, 0, MPI_COMM_WORLD); + } + } + residn = 0; + } + else { + MPI_Status status; + size_t current_offset = 0; + std::vector total_b_original(n); + std::vector total_b(n); + std::vector total_a(n*n); + for (int j = 0; j < n; j++) { + for (int i = 0; i < n; i+= this->executionSettings->programSettings->blockSize) { + int 
recvcol= (i / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_width; + int recvrow= (j / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_height; + int recvrank = this->executionSettings->programSettings->torus_width * recvrow + recvcol; + if (recvrank > 0) { + MPI_Recv(&total_a[j * n + i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvrank, 0, MPI_COMM_WORLD, &status); + } + else { + for (int k=0; k < this->executionSettings->programSettings->blockSize; k++) { + total_a[j * n + i + k] = data.A[current_offset + k]; + } + current_offset += this->executionSettings->programSettings->blockSize; + } + } + } + current_offset = 0; + for (int i = 0; i < n; i+= this->executionSettings->programSettings->blockSize) { + int recvcol= (i / this->executionSettings->programSettings->blockSize) % this->executionSettings->programSettings->torus_width; + if (recvcol > 0) { + MPI_Recv(&total_b[i], this->executionSettings->programSettings->blockSize, MPI_DATA_TYPE, recvcol, 0, MPI_COMM_WORLD, &status); + } + else { + for (int k=0; k < this->executionSettings->programSettings->blockSize; k++) { + total_b[i + k] = data.b[current_offset + k]; + } + current_offset += this->executionSettings->programSettings->blockSize; + } + } + + std::copy(total_b.begin(), total_b.end(), total_b_original.begin()); + gesl_ref_nopvt(total_a.data(), total_b.data(), n, n); + + for (int i = 0; i < n; i++) { + resid = (resid > std::abs(total_b[i] - 1)) ? resid : std::abs(total_b[i] - 1); + normx = (normx > std::abs(total_b_original[i])) ? normx : std::abs(total_b_original[i]); + } + } +#else + double local_resid = 0; + double local_normx = data.normb; + #pragma omp parallel for reduction(max:local_resid) + for (int i = 0; i < data.matrix_width; i++) { + local_resid = (local_resid > std::abs(data.b[i] - 1)) ? 
local_resid : std::abs(data.b[i] - 1); + } +#ifndef NDEBUG + std::cout << "Rank " << this->mpi_comm_rank << ": resid=" << local_resid << ", normx=" << local_normx << std::endl; +#endif + + MPI_Reduce(&local_resid, &resid, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(&local_normx, &normx, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); +#endif + + + HOST_DATA_TYPE eps = std::numeric_limits::epsilon(); + residn = resid / (static_cast(n)*normx*eps); + + #ifndef NDEBUG + if (residn > 1 && this->mpi_comm_size == 1) { + auto ref_result = generateInputData(); + // For each column right of current diagonal element + for (int j = 0; j < n; j++) { + // For each element below it + for (int i = 0; i < n; i++) { + std::cout << ref_result->A[n * j + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + // For each column right of current diagonal element + for (int j = 0; j < n; j++) { + // For each element below it + for (int i = 0; i < n; i++) { + std::cout << data.A[n * j + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + if (this->executionSettings->programSettings->isDiagonallyDominant) { + linpack::gefa_ref_nopvt(ref_result->A, n, n); + linpack::gesl_ref_nopvt(ref_result->A, ref_result->b, n, n); + } + else { + linpack::gefa_ref(ref_result->A, n, n, ref_result->ipvt); + linpack::gesl_ref(ref_result->A, ref_result->b, ref_result->ipvt, n, n); + } + // For each column right of current diagonal element + for (int j = 0; j < n; j++) { + // For each element below it + for (int i = 0; i < n; i++) { + std::cout << std::abs(ref_result->A[n * j + i] - data.A[n * j + i]) << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + #endif + + this->errors.emplace("epsilon", eps); + this->errors.emplace("residual", resid); + this->errors.emplace("residual_norm", residn); + + if (this->mpi_comm_rank == 0) { + return residn < 1; + } else { + return true; + } +} + +void +printError() override { + if (this->mpi_comm_rank == 0) { + std::cout << std::setw(ENTRY_SPACE) << " norm. residual" << std::setw(ENTRY_SPACE) << " res. error" << std::setw(ENTRY_SPACE) << " mach. 
eps" << std::endl; + std::cout << std::setw(ENTRY_SPACE) << this->errors.at("residual_norm") << std::setw(ENTRY_SPACE) << this->errors.at("residual") << std::setw(ENTRY_SPACE) << this->errors.at("epsilon") << std::endl; + } +} - /** - * @brief Linpack specific implementation of printing the execution results - * - * @param output Measured runtimes of the kernel execution - */ - void - collectAndPrintResults(const LinpackExecutionTimings &output) override; + +void +collectResults() { + // Calculate performance for kernel execution plus data transfer + double t = 0; + double tlu = 0; + double tsl = 0; + double tmin = std::numeric_limits::max(); + double lu_min = std::numeric_limits::max(); + double sl_min = std::numeric_limits::max(); + +#ifndef NDEBUG + std::cout << "Rank " << this->mpi_comm_rank << ": Result collection started" << std::endl; +#endif + + std::vector global_lu_times(this->timings["gefa"].size()); + MPI_Reduce(this->timings["gefa"].data(), global_lu_times.data(), this->timings["gefa"].size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + std::vector global_sl_times(this->timings["gesl"].size()); + MPI_Reduce(this->timings["gesl"].data(), global_sl_times.data(), this->timings["gesl"].size(), MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); +#ifndef NDEBUG + std::cout << "Rank " << this->mpi_comm_rank << ": Result collection done" << std::endl; +#endif + + + if (this->mpi_comm_rank > 0) { + // Only the master rank needs to calculate and print result + return; + } + + double total_matrix_size = static_cast(this->executionSettings->programSettings->matrixSize); + double gflop_lu = ((2.0e0*total_matrix_size * total_matrix_size * total_matrix_size)/ 3.0) / 1.0e9; + double gflop_sl = (2.0*(total_matrix_size * total_matrix_size))/1.0e9; + for (int i =0; i < global_lu_times.size(); i++) { + double currentTime = global_lu_times[i] + global_sl_times[i]; + t += currentTime; + tlu += global_lu_times[i]; + tsl += global_sl_times[i]; + if (currentTime < tmin) { + tmin = currentTime; + } + if (global_lu_times[i] < lu_min) { + lu_min = global_lu_times[i]; + } + if (global_sl_times[i] < sl_min) { + sl_min = global_sl_times[i]; + } + } + + this->results.emplace("t_mean", hpcc_base::HpccResult(t / global_lu_times.size(), "s")); + this->results.emplace("t_min", hpcc_base::HpccResult(tmin, "s")); + this->results.emplace("tlu_mean", hpcc_base::HpccResult(tlu / global_lu_times.size(), "s")); + this->results.emplace("tlu_min", hpcc_base::HpccResult(lu_min, "s")); + this->results.emplace("tsl_mean", hpcc_base::HpccResult(tsl / global_sl_times.size(), "s")); + this->results.emplace("tsl_min", hpcc_base::HpccResult(sl_min, "s")); + this->results.emplace("gflops", hpcc_base::HpccResult((gflop_lu + gflop_sl) / tmin, "GFLOP/s")); + this->results.emplace("gflops_lu", hpcc_base::HpccResult(gflop_lu / lu_min, "GFLOP/s")); + this->results.emplace("gflops_sl", hpcc_base::HpccResult(gflop_sl / sl_min, "GFLOP/s")); + + return; +} + +void +printResults() { + if (this->mpi_comm_rank == 0) { + std::cout << std::left << std::setw(ENTRY_SPACE) << " Method" + << std::setw(ENTRY_SPACE) << " best" + << std::setw(ENTRY_SPACE) << " mean" + << std::setw(ENTRY_SPACE) << " GFLOPS" + << std::endl; + + std::cout << std::left << std::setw(ENTRY_SPACE) << " total" + << this->results.at("t_min") << this->results.at("t_mean") << this->results.at("gflops") + << std::endl; + + std::cout << std::left << std::setw(ENTRY_SPACE) << " GEFA" + << this->results.at("tlu_min") << this->results.at("tlu_mean") << this->results.at("gflops_lu") + << 
std::endl; + + std::cout << std::left << std::setw(ENTRY_SPACE) << " GESL" + << this->results.at("tsl_min") << this->results.at("tsl_mean") << this->results.at("gflops_sl") + << std::right << std::endl; + } +} /** * @brief Construct a new Linpack Benchmark object @@ -273,7 +566,9 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark>(argc, argv) { + this->setupBenchmark(argc, argv); + } /** * @brief Construct a new Linpack Benchmark object @@ -282,69 +577,7 @@ class LinpackBenchmark : public hpcc_base::HpccFpgaBenchmark=n -@param ipvt array of pivoting indices - -*/ -void gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt); - -/** -Solve linear equations using its LU decomposition. -Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU -where A is a matrix of size n*n - -@param a the matrix a in LU representation calculated by gefa call -@param b vector b of the given equation -@param ipvt vector containing pivoting information -@param n size of matrix A -@param lda row with of the matrix. must be >=n - -*/ -void gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda); - -/** -Gaussian elemination reference implementation without pivoting. -Can be used in exchange with kernel functions for functionality testing - -@param a the matrix with size of n*n -@param n size of matrix A -@param lda row with of the matrix. must be >=n - -*/ -void gefa_ref_nopvt(HOST_DATA_TYPE* a, unsigned n, unsigned lda); - -/** -Solve linear equations using its LU decomposition without pivoting. -Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU -where A is a matrix of size n*n - -@param a the matrix a in LU representation calculated by gefa call -@param b vector b of the given equation -@param n size of matrix A -@param lda row with of the matrix. must be >=n - -*/ -void gesl_ref_nopvt(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, unsigned n, unsigned lda); - -} // namespace stream +} // namespace linpack #endif // SRC_HOST_STREAM_BENCHMARK_H_ diff --git a/LINPACK/src/host/linpack_data.cpp b/LINPACK/src/host/linpack_data.cpp new file mode 100644 index 00000000..b6c8f3b8 --- /dev/null +++ b/LINPACK/src/host/linpack_data.cpp @@ -0,0 +1,229 @@ +// +// Created by Marius Meyer on 04.12.19. +// + +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "linpack_data.hpp" + +/* C++ standard library headers */ +#include +#include + +/* Project's headers */ +#include "communication_types.hpp" +#include "parameters.h" + +linpack::LinpackProgramSettings::LinpackProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), + matrixSize(results["m"].as() * (1 << (results["b"].as()))), blockSize(1 << (results["b"].as())), + isEmulationKernel(results.count("emulation") > 0), isDiagonallyDominant(results.count("uniform") == 0), + torus_width(results["p"].as()) { + int mpi_comm_rank; + int mpi_comm_size; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); + MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size); + // calculate the row and column of the MPI rank in the torus + if (mpi_comm_size % torus_width != 0) { + throw std::runtime_error("MPI size not dividable by P=" + std::to_string(torus_width) + "!"); + } + torus_height = mpi_comm_size / torus_width; + torus_row = (mpi_comm_rank / torus_width); + torus_col = (mpi_comm_rank % torus_width); +} + +std::map +linpack::LinpackProgramSettings::getSettingsMap() { + auto map = hpcc_base::BaseSettings::getSettingsMap(); + map["Matrix Size"] = std::to_string(matrixSize); + map["Block Size"] = std::to_string(blockSize); + map["Emulate"] = (isEmulationKernel) ? "Yes" : "No"; + map["Diagonally Dominant"] = isDiagonallyDominant ? "Yes" : "No"; + map["Data Type"] = STR(HOST_DATA_TYPE); + map["FPGA Torus"] = "P=" + std::to_string(torus_width) + + ", Q=" + std::to_string(torus_height); + return map; +} + +/** +Standard LU factorization on a block with fixed size + +Case 1 of Zhangs description +*/ +void +linpack::gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt) { + for (int i = 0; i < n; i++) { + ipvt[i] = i; + } + // For each diagnonal element + for (int k = 0; k < n - 1; k++) { + HOST_DATA_TYPE max_val = fabs(a[k * lda + k]); + int pvt_index = k; + for (int i = k + 1; i < n; i++) { + if (max_val < fabs(a[k * lda + i])) { + pvt_index = i; + max_val = fabs(a[k * lda + i]); + } + } + + for (int i = k; i < n; i++) { + HOST_DATA_TYPE tmp_val = a[i * lda + k]; + a[i * lda + k] = a[i * lda + pvt_index]; + a[i * lda + pvt_index] = tmp_val; + } + ipvt[k] = pvt_index; + + // For each element below it + for (int i = k + 1; i < n; i++) { + a[k * lda + i] *= -1.0 / a[k * lda + k]; + } + // For each column right of current diagonal element + for (int j = k + 1; j < n; j++) { + // For each element below it + for (int i = k+1; i < n; i++) { + a[j * lda + i] += a[k * lda + i] * a[j * lda + k]; + } + } + +#ifdef DEBUG + std::cout << "A(k=" << k <<"): " << std::endl; + for (int i= 0; i < n; i++) { + for (int j=0; j < n; j++) { + std::cout << a[i*lda + j] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; +#endif + + } +} + +void +linpack::gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda) { + auto b_tmp = new HOST_DATA_TYPE[n]; + { + for (int k = 0; k < n; k++) { + b_tmp[k] = b[k]; + } + + // solve l*y = b + // For each row in matrix + for (int k = 0; k < n - 1; k++) { + if (ipvt[k] != k) { + HOST_DATA_TYPE tmp = b_tmp[k]; + b_tmp[k] = b_tmp[ipvt[k]]; + b_tmp[ipvt[k]] = tmp; + } + // For each row below add + for (int i = k + 1; i < n; i++) { + // add solved upper row to current row + b_tmp[i] += b_tmp[k] * a[lda * k + i]; + } + } + + // now solve u*x = y + for (int k = n - 1; k >= 0; k--) { + b_tmp[k] = b_tmp[k] / a[lda * k + k]; + for (int i = 0; i < k; i++) { + b_tmp[i] -= b_tmp[k] * a[lda * k + i]; + } + } + for (int k = 0; k 
< n; k++) { + b[k] = b_tmp[k]; + } + } + delete [] b_tmp; +} + +void linpack::dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m, bool transposed) { + for (int i=0; i < n1; i++) { + for (int j=0; j < n2; j++) { + y[i] = y[i] + x[j] * (transposed ? m[ldm*i + j] :m[ldm*j + i]); + } + } +} + +void +linpack::gefa_ref_nopvt(HOST_DATA_TYPE* a, unsigned n, unsigned lda) { + // For each diagnonal element + for (int k = 0; k < n; k++) { + // Store negatie invers of diagonal elements to get rid of some divisions afterwards! + a[k * lda + k] = -1.0 / a[k * lda + k]; + // For each element below it + for (int i = k + 1; i < n; i++) { + a[k * lda + i] *= a[k * lda + k]; + } + // For each column right of current diagonal element + for (int j = k + 1; j < n; j++) { + // For each element below it + for (int i = k+1; i < n; i++) { + a[j * lda + i] += a[k * lda + i] * a[j * lda + k]; + } + } + +#ifdef DEBUG + std::cout << "A(k=" << k << "): " << std::endl; + for (int i= 0; i < n; i++) { + for (int j=0; j < n; j++) { + std::cout << a[i*lda + j] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; +#endif + + } +} + + +void +linpack::gesl_ref_nopvt(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, unsigned n, unsigned lda) { + auto b_tmp = new HOST_DATA_TYPE[n]; + + for (int k = 0; k < n; k++) { + b_tmp[k] = b[k]; + } + + // solve l*y = b + // For each row in matrix + for (int k = 0; k < n - 1; k++) { + // For each row below add + for (int i = k + 1; i < n; i++) { + // add solved upper row to current row + b_tmp[i] += b_tmp[k] * a[lda * k + i]; + } + } + + // now solve u*x = y + for (int k = n - 1; k >= 0; k--) { + HOST_DATA_TYPE scale = b_tmp[k] * a[lda * k + k]; + b_tmp[k] = -scale; + for (int i = 0; i < k; i++) { + b_tmp[i] += scale * a[lda * k + i]; + } + } + for (int k = 0; k < n; k++) { + b[k] = b_tmp[k]; + } + delete [] b_tmp; +} diff --git a/LINPACK/src/host/linpack_data.hpp b/LINPACK/src/host/linpack_data.hpp new file mode 100644 index 00000000..341ce0a2 --- /dev/null +++ b/LINPACK/src/host/linpack_data.hpp @@ -0,0 +1,301 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#ifndef SRC_HOST_LINPACK_DATA_H_ +#define SRC_HOST_LINPACK_DATA_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "hpcc_benchmark.hpp" +#include "parameters.h" +extern "C" { + #include "gmres.h" +} + +/** + * @brief Contains all classes and methods needed by the LINPACK benchmark + * + */ +namespace linpack { + +/** + * @brief The Linpack specific program settings + * + */ +class LinpackProgramSettings : public hpcc_base::BaseSettings { + +public: + /** + * @brief The size of the local matrix in number of blocks in one dimension + * + */ + uint matrixSize; + + /** + * @brief Size of a single block of the matrix in values in one dimension + * + */ + uint blockSize; + + /** + * @brief Indicates if the generated input matrix should be diagonally dominant + * + */ + bool isDiagonallyDominant; + + /** + * @brief True, if the used kernel is an emulation kernel. Different kernel arguments may be used in this case to + * simulate persistent local memory. + * + */ + bool isEmulationKernel; + + /** + * @brief The row position of this MPI rank in the torus + * + */ + int torus_row; + + /** + * @brief The rcolumn position of this MPI rank in the torus + * + */ + int torus_col; + + /** + * @brief Width of the torus in number of ranks + * + */ + int torus_width; + + /** + * @brief Height of the FPGA torus in number of ranks + * + */ + int torus_height; + + /** + * @brief Construct a new Linpack Program Settings object + * + * @param results the result map from parsing the program input parameters + */ + LinpackProgramSettings(cxxopts::ParseResult &results); + + /** + * @brief Get a map of the settings. This map will be used to print the final configuration. + * + * @return a map of program parameters. keys are the name of the parameter. 
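+ *
+ * @note On top of the base settings, the Linpack-specific entries (matrix and block size,
+ * emulation and diagonal-dominance flags, data type and the P x Q torus shape) are added
+ * to the returned map.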
+ */ + std::map getSettingsMap() override; + +}; + +/** + * @brief Data class containing the data the kernel is exeucted with + * + */ +template +class LinpackData { + +public: + + /** + * @brief The input matrix representing the left side of the linear equation system + * + */ + HOST_DATA_TYPE *A; + + /** + * @brief The input vector the right side of the linear equation system + * + */ + HOST_DATA_TYPE *b; + + /** + * @brief A vector that can be used to store pivoting information + * + */ + cl_int* ipvt; + + /** + * @brief Width of the local matrix in values + * + */ + size_t matrix_width; + + /** + * @brief Height of the local matrix in values + * + */ + size_t matrix_height; + + /** + * @brief The context that is used to allocate memory in SVM mode + * + */ + TContext context; + + /** + * @brief The maximum value of A that will be used for the error calculation + * + */ + HOST_DATA_TYPE norma; + + /** + * @brief The maximum value of A that will be used for the error calculation + * + */ + HOST_DATA_TYPE normb; + + /** + * @brief Construct a new Linpack Data object + * + * @param context The OpenCL context used to allocate memory in SVM mode + * @param width width of the local matrix in values + * @param height height of the local matrix in values + */ + LinpackData(TContext &context, size_t width, size_t height) : norma(0.0), +#ifdef USE_SVM + context(context), +#endif + matrix_width(width), matrix_height(height) { +#ifdef USE_SVM + A = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * size * sizeof(HOST_DATA_TYPE), 1024)); + b = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * sizeof(HOST_DATA_TYPE), 1024)); + ipvt = reinterpret_cast( + clSVMAlloc(context(), 0 , + size * sizeof(cl_int), 1024)); +#else + posix_memalign(reinterpret_cast(&A), 4096, width * height * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&b), 4096, width * sizeof(HOST_DATA_TYPE)); + posix_memalign(reinterpret_cast(&ipvt), 4096, height * sizeof(cl_int)); +#endif + } + + ~LinpackData() { +#ifdef USE_SVM + clSVMFree(context(), reinterpret_cast(A)); + clSVMFree(context(), reinterpret_cast(b)); + clSVMFree(context(), reinterpret_cast(ipvt)); +#else + free(A); + free(b); + free(ipvt); +#endif + } + +}; + +/** + * @brief Measured execution timing from the kernel execution + * + */ +class LinpackExecutionTimings { +public: + /** + * @brief A vector containing the timings for all repetitions for the kernel execution for the gefa kernel + * + */ + std::vector gefaTimings; + + /** + * @brief A vector containing the timings for all repetitions for the kernel execution for the gesl kernel + * + */ + std::vector geslTimings; + + +}; + +/** + * + * + * @param n1 + * @param y + * @param n2 + * @param ldm + * @param x + * @param m + */ +void dmxpy(unsigned n1, HOST_DATA_TYPE* y, unsigned n2, unsigned ldm, HOST_DATA_TYPE* x, HOST_DATA_TYPE* m, bool transposed); + +/** +Gaussian elemination reference implementation with partial pivoting. +Can be used in exchange with kernel functions for functionality testing + +@param a the matrix with size of n*n +@param n size of matrix A +@param lda row with of the matrix. must be >=n +@param ipvt array of pivoting indices + +*/ +void gefa_ref(HOST_DATA_TYPE* a, unsigned n, unsigned lda, cl_int* ipvt); + +/** +Solve linear equations using its LU decomposition. 
+Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU +where A is a matrix of size n*n + +@param a the matrix a in LU representation calculated by gefa call +@param b vector b of the given equation +@param ipvt vector containing pivoting information +@param n size of matrix A +@param lda row with of the matrix. must be >=n + +*/ +void gesl_ref(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, cl_int* ipvt, unsigned n, unsigned lda); + +/** +Gaussian elemination reference implementation without pivoting. +Can be used in exchange with kernel functions for functionality testing + +@param a the matrix with size of n*n +@param n size of matrix A +@param lda row with of the matrix. must be >=n + +*/ +void gefa_ref_nopvt(HOST_DATA_TYPE* a, unsigned n, unsigned lda); + +/** +Solve linear equations using its LU decomposition without pivoting. +Therefore solves A*x = b by solving L*y = b and then U*x = y with A = LU +where A is a matrix of size n*n + +@param a the matrix a in LU representation calculated by gefa call +@param b vector b of the given equation +@param n size of matrix A +@param lda row with of the matrix. must be >=n + +*/ +void gesl_ref_nopvt(HOST_DATA_TYPE* a, HOST_DATA_TYPE* b, unsigned n, unsigned lda); + + +} +#endif // SRC_HOST_LINPACK_DATA_H__ diff --git a/LINPACK/src/host/main.cpp b/LINPACK/src/host/main.cpp index d05a7319..51c4d292 100644 --- a/LINPACK/src/host/main.cpp +++ b/LINPACK/src/host/main.cpp @@ -12,7 +12,16 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark - LinpackBenchmark bm(argc, argv); +#ifdef USE_OCL_HOST + LinpackBenchmark bm(argc, argv); +#endif +#ifdef USE_XRT_HOST +#ifndef USE_ACCL + LinpackBenchmark bm(argc, argv); +#else + LinpackBenchmark bm(argc, argv); +#endif +#endif bool success = bm.executeBenchmark(); if (success) { return 0; diff --git a/LINPACK/tests/test_host_reference_implementations.cpp b/LINPACK/tests/test_host_reference_implementations.cpp index b1c7c8fc..0775b906 100644 --- a/LINPACK/tests/test_host_reference_implementations.cpp +++ b/LINPACK/tests/test_host_reference_implementations.cpp @@ -8,12 +8,12 @@ struct LinpackHostTest : testing::Test { - std::unique_ptr bm; - std::unique_ptr data; + std::unique_ptr> bm; + std::unique_ptr> data; int array_size = 0; void SetUp() override { - bm = std::unique_ptr(new linpack::LinpackBenchmark(global_argc, global_argv)); + bm = std::unique_ptr>(new linpack::LinpackBenchmark(global_argc, global_argv)); bm->getExecutionSettings().programSettings->matrixSize = 1 << LOCAL_MEM_BLOCK_LOG; bm->getExecutionSettings().programSettings->isDiagonallyDominant = true; data = bm->generateInputData(); @@ -74,7 +74,8 @@ TEST_F(LinpackHostTest, ReferenceSolveGMRES) { for (int i=0; i < array_size; i++) { data->b[i] = static_cast(x[i]); } - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } #endif @@ -83,7 +84,8 @@ TEST_F(LinpackHostTest, ReferenceSolveWithPivoting) { data = bm->generateInputData(); linpack::gefa_ref(data->A, array_size, array_size, data->ipvt); linpack::gesl_ref(data->A, data->b, data->ipvt, array_size, array_size); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } @@ -91,7 +93,8 @@ TEST_F(LinpackHostTest, ReferenceSolveWithoutPivoting) { data = bm->generateInputData(); linpack::gefa_ref_nopvt(data->A, array_size, array_size); linpack::gesl_ref_nopvt(data->A, data->b, array_size, array_size); - 
EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } diff --git a/LINPACK/tests/test_kernel_communication.cpp b/LINPACK/tests/test_kernel_communication.cpp index dfcb8867..c2072974 100644 --- a/LINPACK/tests/test_kernel_communication.cpp +++ b/LINPACK/tests/test_kernel_communication.cpp @@ -14,14 +14,14 @@ class LinpackKernelCommunicationTest : public testing::Test { public: - std::unique_ptr bm; - std::unique_ptr data; + std::unique_ptr> bm; + std::unique_ptr> data; const unsigned numberOfChannels = 4; const std::string channelOutName = "kernel_output_ch"; const std::string channelInName = "kernel_input_ch"; virtual void SetUp() override { - bm = std::unique_ptr(new linpack::LinpackBenchmark(global_argc, global_argv)); + bm = std::unique_ptr>(new linpack::LinpackBenchmark(global_argc, global_argv)); bm->getExecutionSettings().programSettings->isDiagonallyDominant = true; bm->getExecutionSettings().programSettings->matrixSize = BLOCK_SIZE; if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { @@ -920,7 +920,8 @@ TEST_F(LinpackKernelCommunicationTestLU, LUBlockExternalResultisSameAsRef) { TEST_F(LinpackKernelCommunicationTestLU, LUBlockExternalResultisCorrect) { linpack::gesl_ref_nopvt(data->A, data->b, bm->getExecutionSettings().programSettings->matrixSize,bm->getExecutionSettings().programSettings->matrixSize); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } @@ -1206,8 +1207,10 @@ class LinpackKernelCommunicationTestAll : public LinpackKernelCommunicationTest } }; - -TEST_F(LinpackKernelCommunicationTestAll, AllBlockExternalResultisCorrect) { +// TODO: This test is disabled because it fails non-deterministicly although +// calculations with benchmark host are correct. +// Maybe this is related to a problem with intel external channels in emulation. 
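+// The test body is kept so it can still be run on demand by passing
+// --gtest_also_run_disabled_tests to the test binary when investigating this behavior.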
+TEST_F(LinpackKernelCommunicationTestAll, DISABLED_AllBlockExternalResultisCorrect) { uint matrix_size = bm->getExecutionSettings().programSettings->matrixSize; auto ref_data = bm->generateInputData(); diff --git a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp index 9432fb49..0961ac25 100644 --- a/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp +++ b/LINPACK/tests/test_kernel_functionality_and_host_integration.cpp @@ -5,6 +5,7 @@ #include "parameters.h" #include "test_program_settings.h" #include "linpack_benchmark.hpp" +#include "nlohmann/json.hpp" #ifdef _LAPACK_ #ifdef _DP @@ -16,13 +17,13 @@ extern "C" void sgesv_(int* size, int* lrhs, float* A, int* size2, int* ipvt, fl struct LinpackKernelTest : testing::TestWithParam { - std::unique_ptr bm; - std::unique_ptr data; + std::unique_ptr> bm; + std::unique_ptr> data; uint array_size = 0; void SetUp() override { uint matrix_blocks = GetParam(); - bm = std::unique_ptr(new linpack::LinpackBenchmark(global_argc, global_argv)); + bm = std::unique_ptr>(new linpack::LinpackBenchmark(global_argc, global_argv)); bm->getExecutionSettings().programSettings->matrixSize = matrix_blocks * (1 << LOCAL_MEM_BLOCK_LOG); data = bm->generateInputData(); array_size = bm->getExecutionSettings().programSettings->matrixSize; @@ -40,7 +41,7 @@ struct LinpackKernelTest : testing::TestWithParam { * Execution returns correct results for a single repetition */ TEST_P(LinpackKernelTest, FPGACorrectResultsOneRepetition) { - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < array_size; i++) { EXPECT_NEAR(data->b[i], 1.0, 1.0e-3); } @@ -50,7 +51,7 @@ TEST_P(LinpackKernelTest, FPGACorrectResultsOneRepetition) { * GEFA Execution returns correct results for a single repetition */ TEST_P(LinpackKernelTest, DISABLED_FPGACorrectResultsGEFA) { - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); auto data2 = bm->generateInputData(); if (bm->getExecutionSettings().programSettings->isDiagonallyDominant) { linpack::gefa_ref_nopvt(data2->A, array_size, array_size); @@ -87,14 +88,44 @@ TEST_P(LinpackKernelTest, DISABLED_ValidationWorksForMKL) { #else dgesv_(&s, &lrhs, data_cpu->A, &s, data_cpu->ipvt, data_cpu->b, &s, &info); #endif - bool success = bm->validateOutputAndPrintError(*data_cpu); - EXPECT_TRUE(success); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } #endif +using json = nlohmann::json; + +TEST_P(LinpackKernelTest, JsonDump) { + bm->executeKernel(*data); + bm->collectResults(); + bm->dumpConfigurationAndResults("linpack.json"); + std::FILE *f = std::fopen("linpack.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + json j = json::parse(f); + EXPECT_TRUE(j.contains("timings")); + if (j.contains("timings")) { + EXPECT_TRUE(j["timings"].contains("gefa")); + EXPECT_TRUE(j["timings"].contains("gesl")); + } + EXPECT_TRUE(j.contains("results")); + if (j.contains("results")) { + EXPECT_TRUE(j["results"].contains("gflops")); + EXPECT_TRUE(j["results"].contains("gflops_lu")); + EXPECT_TRUE(j["results"].contains("gflops_sl")); + EXPECT_TRUE(j["results"].contains("t_mean")); + EXPECT_TRUE(j["results"].contains("t_min")); + EXPECT_TRUE(j["results"].contains("tlu_mean")); + EXPECT_TRUE(j["results"].contains("tlu_min")); + EXPECT_TRUE(j["results"].contains("tsl_mean")); + EXPECT_TRUE(j["results"].contains("tsl_min")); + } + } +} + INSTANTIATE_TEST_CASE_P( LinpackKernelParametrizedTests, 
LinpackKernelTest, - ::testing::Values(1, 2, 3)); \ No newline at end of file + ::testing::Values(1, 2, 3)); diff --git a/PTRANS/CMakeLists.txt b/PTRANS/CMakeLists.txt index 71e64026..ef4c4a47 100755 --- a/PTRANS/CMakeLists.txt +++ b/PTRANS/CMakeLists.txt @@ -18,6 +18,11 @@ set(HOST_EMULATION_REORDER No CACHE BOOL "Reorder the scheduling of FPGA kernels mark_as_advanced(READ_KERNEL_NAME WRITE_KERNEL_NAME USE_BUFFER_WRITE_RECT_FOR_A XILINX_UNROLL_INNER_LOOPS) +if (USE_ACCL) + math(EXPR calculate_accl_buffer_size "${BLOCK_SIZE} * ${BLOCK_SIZE} * 8") + set(ACCL_BUFFER_SIZE ${calculate_accl_buffer_size} CACHE STRING "Size of ACCL buffers in bytes") +endif() + set(USE_MPI Yes) set(USE_OPENMP Yes) set(USE_DEPRECATED_HPP_HEADER No) diff --git a/PTRANS/README.md b/PTRANS/README.md index 55dfd8c4..521389a0 100644 --- a/PTRANS/README.md +++ b/PTRANS/README.md @@ -69,58 +69,58 @@ For the execution of the benchmark run: For more information on available input parameters run - $./Transpose_xilinx -h - ------------------------------------------------------------- - General setup: - C++ high resolution clock is used. - The clock precision seems to be 1.00000e+01ns - ------------------------------------------------------------- + ./Transpose_xilinx -h + Implementation of the matrix transposition benchmark proposed in the HPCC benchmark suite for FPGA. Version: 1.7 MPI Version: 3.1 - Config. Time: Fri Mar 04 10:31:13 UTC 2022 - Git Commit: caebda4-dirty + Config. Time: Thu Dec 08 10:41:51 UTC 2022 + Git Commit: 86e0064-dirty Usage: - bin/Transpose_intel [OPTION...] + ./bin/Transpose_intel [OPTION...] - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will speed up execution and helps when working with special data types. - --device arg Index of the device that has to be used. If not + --device arg Index of the device that has to be used. If not given you will be asked which device to use if - there are multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not + there are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not given you will be asked which platform to use if there are multiple platforms available. 
(default: - -1) - -r, arg Number of used kernel replications (default: 2) - --comm-type arg Used communication type for inter-FPGA + 0) + --platform_str arg Name of the platform that has to be used + (default: ) + -r, arg Number of used kernel replications (default: 2) + --comm-type arg Used communication type for inter-FPGA communication (default: AUTO) - --test Only test given configuration and skip execution + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution and validation - -h, --help Print this help - -m, arg Matrix size in number of blocks in one dimension - (default: 8) - -b, arg Block size in number of values in one dimension + -h, --help Print this help + -m, arg Matrix size in number of blocks in one dimension (default: 8) - -p, arg Value of P that equals the width of the PQ grid + -b, arg Block size in number of values in one dimension + (default: 512) + -p, arg Value of P that equals the width of the PQ grid of FPGAs. Q is determined by the world size. (default: 1) - --distribute-buffers Distribute buffers over memory banks. This will + --distribute-buffers Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs. - --handler arg Specify the used data handler that distributes + --handler arg Specify the used data handler that distributes the data over devices and memory banks (default: AUTO) - + Available options for `--comm-type`: - `CPU`: CPU only execution. MKL required. @@ -142,16 +142,12 @@ It will run an emulation of the kernel and execute some functionality tests. An example output from an emulation is given below: - ------------------------------------------------------------- - Validate output... - ------------------------------------------------------------- - Maximum error: 7.62939e-06 < 1.19209e-05 + Maximum error: 1.19209e-07 < 1.19209e-05 Mach. Epsilon: 1.19209e-07 - Validation Time: 4.66312e+00 s - total [s] transfer [s] calc [s] calc FLOPS Mem [B/s] PCIe [B/s] - avg: 1.15886e+00 1.04112e+00 1.17743e-01 9.11940e+09 1.09433e+11 1.23760e+10 - best: 1.13323e+00 1.02481e+00 1.08424e-01 9.90319e+09 1.18838e+11 1.25730e+10 - Validation: SUCCESS! + + total time transfer time calc time calc FLOPS Memory Bandwidth PCIe Bandwidth + avg: 6.05723e-02 s 1.30980e-02 s 4.74743e-02 s 3.53396e-01 GFLOP/s 4.24075e+00 GB/s 1.53708e+01 GB/s + best: 4.69977e-02 s 1.05343e-02 s 3.64633e-02 s 4.60112e-01 GFLOP/s 5.52134e+00 GB/s 1.91115e+01 GB/s The output gives the average and best calculation time for the transposition and important derived metrics based on these times. For the average and best timings, we have the following columns: @@ -171,3 +167,109 @@ The machine epsilon is given in the row below with `Mach. Epsilon`. Moreover, the total time that was needed for the validation of the result is given, which is just a debug information. The very last column summarizes the result: The last row will show `Validation: SUCCESS!` if the validation succeeded and the error is below the tolerated threshold. +The json output looks like the following. 
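+All entries under `results` and `timings` share a common `{"value": ..., "unit": ...}` layout, so the dump can be post-processed with any JSON library. A minimal sketch using nlohmann/json, which the host tests already include; the file name `ptrans.json` is only an example and stands for whatever was passed to `--dump-json`:
+
+```cpp
+#include <fstream>
+#include <iostream>
+
+#include "nlohmann/json.hpp"
+
+int main() {
+    // Open the dump written via --dump-json (file name is just an example)
+    std::ifstream dump("ptrans.json");
+    nlohmann::json j = nlohmann::json::parse(dump);
+    // Every result entry is a {"value": ..., "unit": ...} pair
+    for (const auto &[name, entry] : j["results"].items()) {
+        std::cout << name << ": " << entry["value"].get<double>() << " "
+                  << entry["unit"].get<std::string>() << std::endl;
+    }
+    return 0;
+}
+```
+
+The full dump of an emulation run is shown below: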
+ +```json + +{ + "config_time": "Wed Dec 14 08:42:29 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "epsilon": 1.1920928955078125e-07, + "max_error": 199.96849060058594 + }, + "execution_time": "Wed Dec 14 09:57:30 UTC 2022", + "git_commit": "be1a4e9-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "matrix transposition", + "results": { + "avg_calc_flops": { + "unit": "GFLOP/s", + "value": 0.011002914958427963 + }, + "avg_calc_t": { + "unit": "s", + "value": 1.524797389 + }, + "avg_mem_bandwidth": { + "unit": "GB/s", + "value": 0.13203497950113555 + }, + "avg_t": { + "unit": "s", + "value": 1.5332141689999998 + }, + "avg_transfer_bandwidth": { + "unit": "GB/s", + "value": 23.919669042080226 + }, + "avg_transfer_t": { + "unit": "s", + "value": 0.00841678 + }, + "max_calc_flops": { + "unit": "GFLOP/s", + "value": 0.011002914958427963 + }, + "max_mem_bandwidth": { + "unit": "GB/s", + "value": 0.13203497950113555 + }, + "max_transfer_bandwidth": { + "unit": "GB/s", + "value": 23.919669042080226 + }, + "min_calc_t": { + "unit": "s", + "value": 1.524797389 + }, + "min_t": { + "unit": "s", + "value": 1.5332141689999998 + }, + "min_transfer_t": { + "unit": "s", + "value": 0.00841678 + } + }, + "settings": { + "Block Size": 512, + "Communication Type": false, + "Data Handler": false, + "Dist. 
Buffers": false, + "FPGA Torus": { + "P": 1, + "Q": 3 + }, + "Kernel File": false, + "Kernel Replications": 2, + "MPI Ranks": 3, + "Matrix Size": 4096, + "Repetitions": 1, + "Test Mode": false + }, + "timings": { + "calculation": [ + { + "unit": "s", + "value": 1.523696949 + } + ], + "transfer": [ + { + "unit": "s", + "value": 0.008189295 + } + ] + }, + "validated": false, + "version": "1.7" +} + +``` diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_TCP_buffers.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_TCP_buffers.cmake new file mode 100644 index 00000000..e8e77751 --- /dev/null +++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_TCP_buffers.cmake @@ -0,0 +1,28 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(ACCL_STACK_TYPE "TCP" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake new file mode 100644 index 00000000..21c8ec77 --- /dev/null +++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_buffers.cmake @@ -0,0 +1,27 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake new file mode 100644 index 00000000..6b196634 --- /dev/null +++ b/PTRANS/configs/Xilinx_U280_DDR_ACCL_stream.cmake @@ -0,0 +1,28 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(ACCL_STACK_TYPE "TCP" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. 
Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake index 46ef245c..eb878f8d 100644 --- a/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake +++ b/PTRANS/configs/Xilinx_U280_DDR_PCIE.cmake @@ -9,9 +9,13 @@ set(USE_MPI Yes CACHE BOOL "" FORCE) set(USE_SVM No CACHE BOOL "" FORCE) set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini CACHE FILEPATH "" FORCE) set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.ddr.ini CACHE FILEPATH "" FORCE) +set(FORCE_FILE_ENDING "cpp" CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES transpose0 CACHE STRING "" FORCE) # STREAM specific options # Defaults to a total of ~12GB data diff --git a/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake new file mode 100644 index 00000000..d5223408 --- /dev/null +++ b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream.cmake @@ -0,0 +1,29 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream_profile.cmake b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream_profile.cmake new file mode 100644 index 00000000..1b1aa691 --- /dev/null +++ b/PTRANS/configs/Xilinx_U280_HBM_ACCL_stream_profile.cmake @@ -0,0 +1,30 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. 
+# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_gen3x16_xdma_1_202211_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake new file mode 100644 index 00000000..f84f73a2 --- /dev/null +++ b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream.cmake @@ -0,0 +1,28 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. 
Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream_profile.cmake b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream_profile.cmake new file mode 100644 index 00000000..a61bd058 --- /dev/null +++ b/PTRANS/configs/Xilinx_U55C_HBM_ACCL_stream_profile.cmake @@ -0,0 +1,30 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(ACCL_STACK_TYPE "UDP" CACHE STRING "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_KERNEL_NAMES transpose_read0 transpose_write0 CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 1 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake b/PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake new file mode 100644 index 00000000..c2f3cb4d --- /dev/null +++ b/PTRANS/configs/Xilinx_U55C_HBM_PCIE.cmake @@ -0,0 +1,23 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini CACHE FILEPATH "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(DEFAULT_MATRIX_SIZE 8 CACHE STRING "Default size of the used matrices" FORCE) +set(BLOCK_SIZE 256 CACHE STRING "Block size used in the FPGA kernel" FORCE) +set(CHANNEL_WIDTH 16 CACHE STRING "Width of a single channel in number of values. 
Also specifies the width of memory" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications (should match number of external channels here)" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/PTRANS/scripts/prepare_tests.sh b/PTRANS/scripts/prepare_tests.sh new file mode 100755 index 00000000..2705d74d --- /dev/null +++ b/PTRANS/scripts/prepare_tests.sh @@ -0,0 +1,11 @@ +#!/usr/bin/bash + +cd $1 +touch kernel_output_ch0 +touch kernel_output_ch1 +touch kernel_output_ch2 +touch kernel_output_ch3 +ln -s kernel_output_ch0 kernel_input_ch1 +ln -s kernel_output_ch2 kernel_input_ch3 +ln -s kernel_output_ch1 kernel_input_ch0 +ln -s kernel_output_ch3 kernel_input_ch2 diff --git a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini index 7e52533c..d259f88f 100644 --- a/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini +++ b/PTRANS/settings/settings.compile.xilinx.transpose_pq_pcie.hbm.ini @@ -1,4 +1,2 @@ -kernel_frequency=450 - [hls] max_memory_ports=all diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini new file mode 100644 index 00000000..1cb8cc27 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_buffers.ddr.ini @@ -0,0 +1,76 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl +nk=transpose0:2:transpose0.transpose1 + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 +slr=transpose0:SLR0 +slr=transpose1:SLR1 + +sp=ccl_offload_0.m_axi_0:DDR[0:1] +sp=ccl_offload_0.m_axi_1:DDR[0:1] +sp=transpose0.m_axi_gmem:DDR[0] +sp=transpose1.m_axi_gmem:DDR[1] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini new file mode 100644 index 00000000..3c7fcf31 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.ddr.ini @@ -0,0 +1,76 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=transpose_read0:1 +nk=transpose_write0:1 + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR2 +slr=transpose_read0_1:SLR0 +slr=transpose_write0_1:SLR0 + +sp=ccl_offload_0.m_axi_0:DDR[0:1] +sp=ccl_offload_0.m_axi_1:DDR[0:1] +sp=transpose_read0_1.m_axi_gmem0:DDR[0:1] +sp=transpose_write0_1.m_axi_gmem0:DDR[0] +sp=transpose_write0_1.m_axi_gmem1:DDR[1] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl +stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl + diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini new file mode 100644 index 00000000..559ff34f --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.ini @@ -0,0 +1,77 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_ops:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:0 +nk=transpose_read0:1 +nk=transpose_write0:1 + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 +slr=transpose_read0_1:SLR2 +slr=transpose_write0_1:SLR0 + +sp=ccl_offload_0.m_axi_0:HBM[31] +sp=ccl_offload_0.m_axi_1:HBM[31] +sp=transpose_read0_1.m_axi_gmem0:HBM[0:7] +sp=transpose_write0_1.m_axi_gmem0:HBM[8:15] +sp=transpose_write0_1.m_axi_gmem1:HBM[16:23] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl +stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:32 + diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini new file mode 100644 index 00000000..9dec51d7 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.profile.ini @@ -0,0 +1,83 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_ops:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:0 +nk=transpose_read0:1 +nk=transpose_write0:1 + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 +slr=transpose_read0_1:SLR2 +slr=transpose_write0_1:SLR0 + +sp=ccl_offload_0.m_axi_0:HBM[31] +sp=ccl_offload_0.m_axi_1:HBM[31] +sp=transpose_read0_1.m_axi_gmem0:HBM[0:7] +sp=transpose_write0_1.m_axi_gmem0:HBM[8:15] +sp=transpose_write0_1.m_axi_gmem1:HBM[16:23] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl +stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:512 + +[profile] +data=transpose_read0:all:all +data=transpose_write0:all:all +memory=transpose_read0_1.m_axi_gmem0 +memory=transpose_write0_1.m_axi_gmem0 +memory=transpose_write0_1.m_axi_gmem1 diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini new file mode 100644 index 00000000..83150287 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.ini @@ -0,0 +1,76 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_ops:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:0 +nk=transpose_read0:1 +nk=transpose_write0:1 + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 +slr=transpose_read0_1:SLR1 +slr=transpose_write0_1:SLR0 + +sp=ccl_offload_0.m_axi_0:HBM[31] +sp=ccl_offload_0.m_axi_1:HBM[31] +sp=transpose_read0_1.m_axi_gmem0:HBM[0:7] +sp=transpose_write0_1.m_axi_gmem0:HBM[8:15] +sp=transpose_write0_1.m_axi_gmem1:HBM[16:23] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl +stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:512 diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini new file mode 100644 index 00000000..3860eb41 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_stream.hbm.u280.profile.ini @@ -0,0 +1,82 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_ops:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:0 +nk=transpose_read0:1 +nk=transpose_write0:1 + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 +slr=transpose_read0_1:SLR1 +slr=transpose_write0_1:SLR0 + +sp=ccl_offload_0.m_axi_0:HBM[31] +sp=ccl_offload_0.m_axi_1:HBM[31] +sp=transpose_read0_1.m_axi_gmem0:HBM[0:7] +sp=transpose_write0_1.m_axi_gmem0:HBM[8:15] +sp=transpose_write0_1.m_axi_gmem1:HBM[16:23] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:transpose_write0_1.cclo2krnl +stream_connect=transpose_read0_1.krnl2cclo:ccl_offload_0.s_axis_krnl:512 + +[profile] +data=all:all:all +memory=all +stall=all:all +exec=all:all \ No newline at end of file diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini new file mode 100644 index 00000000..a1492b0a --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_accl_tcp_buffers.ddr.ini @@ -0,0 +1,86 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=network_krnl:1:network_krnl_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_krnl:1:cmac_krnl_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl +nk=transpose0:2:transpose0.transpose1 + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=lb_user_krnl:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=network_krnl_0:SLR1 +slr=cmac_krnl_0:SLR2 +slr=transpose0:SLR0 +slr=transpose1:SLR1 + +sp=network_krnl_0.m00_axi:DDR[0] +sp=network_krnl_0.m01_axi:DDR[0] +sp=ccl_offload_0.m_axi_0:DDR[0:1] +sp=ccl_offload_0.m_axi_1:DDR[0:1] +sp=transpose0.m_axi_gmem:DDR[0] +sp=transpose1.m_axi_gmem:DDR[1] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to TCP Network Kernel +stream_connect=network_krnl_0.m_axis_tcp_port_status:ccl_offload_0.s_axis_eth_port_status:512 +stream_connect=network_krnl_0.m_axis_tcp_open_status:ccl_offload_0.s_axis_eth_open_status:512 +stream_connect=network_krnl_0.m_axis_tcp_notification:ccl_offload_0.s_axis_eth_notification:512 +stream_connect=network_krnl_0.m_axis_tcp_rx_meta:ccl_offload_0.s_axis_eth_rx_meta:512 +stream_connect=network_krnl_0.m_axis_tcp_rx_data:ccl_offload_0.s_axis_eth_rx_data:512 +stream_connect=network_krnl_0.m_axis_tcp_tx_status:ccl_offload_0.s_axis_eth_tx_status:512 +stream_connect=ccl_offload_0.m_axis_eth_listen_port:network_krnl_0.s_axis_tcp_listen_port:512 +stream_connect=ccl_offload_0.m_axis_eth_open_connection:network_krnl_0.s_axis_tcp_open_connection:512 +stream_connect=ccl_offload_0.m_axis_eth_read_pkg:network_krnl_0.s_axis_tcp_read_pkg:512 +stream_connect=ccl_offload_0.m_axis_eth_tx_meta:network_krnl_0.s_axis_tcp_tx_meta:512 +stream_connect=ccl_offload_0.m_axis_eth_tx_data:network_krnl_0.s_axis_tcp_tx_data:512 + +# Connect Network Kernel to CMAC Kernel +stream_connect=cmac_krnl_0.axis_net_rx:network_krnl_0.axis_net_rx +stream_connect=network_krnl_0.axis_net_tx:cmac_krnl_0.axis_net_tx + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini index 882d5af1..3b7b0497 100644 --- a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.ddr.generator.ini @@ -13,7 +13,5 @@ slr=transpose0_$PY_CODE_GEN i + 
1$:SLR$PY_CODE_GEN i % num_slrs$ # Assign the kernels to the memory ports # PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] -sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem0:DDR[$PY_CODE_GEN i % num_ddrs$] -sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem1:DDR[$PY_CODE_GEN i % num_ddrs$] -sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem2:DDR[$PY_CODE_GEN i % num_ddrs$] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem:DDR[$PY_CODE_GEN i % num_ddrs$] # PY_CODE_GEN block_end diff --git a/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini new file mode 100644 index 00000000..e6f72be5 --- /dev/null +++ b/PTRANS/settings/settings.link.xilinx.transpose_pq_pcie.hbm.generator.ini @@ -0,0 +1,17 @@ + +# Set number of available SLRs +# PY_CODE_GEN num_slrs = 3 +# PY_CODE_GEN num_ddrs = 2 + +[connectivity] +nk=transpose0:$PY_CODE_GEN num_replications$ + +# Assign kernels to the SLRs +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +slr=transpose0_$PY_CODE_GEN i + 1$:SLR$PY_CODE_GEN i % num_slrs$ +# PY_CODE_GEN block_end + +# Assign the kernels to the memory ports +# PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +sp=transpose0_$PY_CODE_GEN i +1$.m_axi_gmem:HBM[$PY_CODE_GEN i*8$:$PY_CODE_GEN (i+1)*8$] +# PY_CODE_GEN block_end diff --git a/PTRANS/src/common/parameters.h.in b/PTRANS/src/common/parameters.h.in index 68b50dd7..364e12ce 100644 --- a/PTRANS/src/common/parameters.h.in +++ b/PTRANS/src/common/parameters.h.in @@ -1,30 +1,21 @@ #ifndef SRC_COMMON_PARAMETERS_H_ #define SRC_COMMON_PARAMETERS_H_ -#define VERSION "@PROJECT_VERSION@" +#include "base_parameters.h" #define READ_KERNEL_NAME "@READ_KERNEL_NAME@" #define WRITE_KERNEL_NAME "@WRITE_KERNEL_NAME@" -#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ #define DEFAULT_MATRIX_SIZE @DEFAULT_MATRIX_SIZE@ #define DEFAULT_COMM_TYPE "@DEFAULT_COMM_TYPE@" #define DEFAULT_DIST_TYPE "@DEFAULT_DIST_TYPE@" -#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ -#define DEFAULT_DEVICE @DEFAULT_DEVICE@ #define DEFAULT_P_VALUE @DEFAULT_P_VALUE@ -#define NUM_REPLICATIONS @NUM_REPLICATIONS@ -#cmakedefine HOST_EMULATION_REORDER - /** * Kernel Parameters */ #define BLOCK_SIZE @BLOCK_SIZE@ #define CHANNEL_WIDTH @CHANNEL_WIDTH@ -#define HOST_DATA_TYPE @HOST_DATA_TYPE@ -#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ - #cmakedefine USE_SVM #cmakedefine USE_BUFFER_WRITE_RECT_FOR_A #cmakedefine XILINX_UNROLL_INNER_LOOPS @@ -33,13 +24,9 @@ Short description of the program. Moreover the version and build time is also compiled into the description. 
*/ -#define PROGRAM_DESCRIPTION "Implementation of the matrix transposition benchmark"\ +#define PROGRAM_NAME "matrix transposition" +#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\ " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" -/** -Output separator -*/ -#define HLINE "-------------------------------------------------------------\n" - #endif // SRC_COMMON_PARAMETERS_H_ \ No newline at end of file diff --git a/PTRANS/src/device/CMakeLists.txt b/PTRANS/src/device/CMakeLists.txt index 7542a861..bbee1bff 100644 --- a/PTRANS/src/device/CMakeLists.txt +++ b/PTRANS/src/device/CMakeLists.txt @@ -11,7 +11,7 @@ if (INTELFPGAOPENCL_FOUND) endif() if (VITIS_FOUND) - generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE) + generate_kernel_targets_xilinx(transpose_PQ_PCIE transpose_DIAG_PCIE transpose_PQ_ACCL_buffers transpose_PQ_ACCL_stream transpose_PQ_ACCL_stream_sendrecv) add_test(NAME test_emulation_PQ_PCIE_xilinx COMMAND Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Transpose_xilinx -f transpose_PQ_PCIE_emulate.xclbin -n 1 -m 1 WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) diff --git a/PTRANS/src/device/transpose_DIAG_IEC.cl b/PTRANS/src/device/transpose_DIAG_IEC.cl index 513b39e8..a5ab3a03 100644 --- a/PTRANS/src/device/transpose_DIAG_IEC.cl +++ b/PTRANS/src/device/transpose_DIAG_IEC.cl @@ -16,11 +16,11 @@ typedef struct { DEVICE_DATA_TYPE data[CHANNEL_WIDTH]; } ch_data; -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(i) + "\""*/), depth(1))); -channel ch_data chan_a_in/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2 * (i // 2) + ((i + 1) % 2)) + "\""*/), depth(1))); -// PY_CODE_GEN block_end +channel ch_data chan_a_out{{ i }} __attribute((io("kernel_output_ch{{ i }}"), depth(1))); +channel ch_data chan_a_in{{ i }} __attribute((io("kernel_input_ch{{ (2 * (i // 2) + ((i + 1) % 2)) }}"), depth(1))); +{% endfor %} #endif /** @@ -64,7 +64,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} /** * send a chunk of A into local memory in a reordered fashion @@ -77,7 +77,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) * */ void -send_chunk_of_a/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], +send_chunk_of_a{{ i }}(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], const ulong row, const ulong col) { @@ -104,7 +104,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) data.data[unroll_count] = rotate_out[(unroll_count + rot_out) & (CHANNEL_WIDTH - 1)]; } - write_channel_intel(chan_a_out/*PY_CODE_GEN i*/, data); + write_channel_intel(chan_a_out{{ i }}, data); } /** @@ -121,7 +121,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, +void transpose_read{{ i }}(__global DEVICE_DATA_TYPE 
*restrict A, const ulong block_offset, const ulong number_of_blocks) { @@ -139,7 +139,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, load_chunk_of_a(A, a_block[block & 1], block, row, col); } if (block > 0) { - send_chunk_of_a/*PY_CODE_GEN i*/(a_block[(block - 1) & 1], row, col); + send_chunk_of_a{{ i }}(a_block[(block - 1) & 1], row, col); } } } @@ -162,7 +162,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, +void transpose_write{{ i }}(__global DEVICE_DATA_TYPE *restrict B, __global DEVICE_DATA_TYPE *restrict A_out, const ulong block_offset, const ulong number_of_blocks) { @@ -173,7 +173,7 @@ void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, for (ulong row = 0; row < BLOCK_SIZE; row++) { for (ulong col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { - ch_data data = read_channel_intel(chan_a_in/*PY_CODE_GEN i*/); + ch_data data = read_channel_intel(chan_a_in{{ i }}); unsigned rot_out = row & (CHANNEL_WIDTH - 1); // rotate temporary buffer to store data into local buffer @@ -188,4 +188,4 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_end +{% endfor %} diff --git a/PTRANS/src/device/transpose_DIAG_PCIE.cl b/PTRANS/src/device/transpose_DIAG_PCIE.cl index 614800f3..b443803d 100644 --- a/PTRANS/src/device/transpose_DIAG_PCIE.cl +++ b/PTRANS/src/device/transpose_DIAG_PCIE.cl @@ -127,7 +127,7 @@ store_a(__global DEVICE_DATA_TYPE *restrict A_out, } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** * Read blocks of matrix A and transpose them in memory. 
@@ -144,7 +144,7 @@ store_a(__global DEVICE_DATA_TYPE *restrict A_out, */ __attribute__((max_global_work_dim(0))) __kernel -void transpose/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, +void transpose{{ i }}(__global DEVICE_DATA_TYPE *restrict A, __global DEVICE_DATA_TYPE *restrict B, __global DEVICE_DATA_TYPE *restrict A_out, const uint number_of_blocks) { @@ -172,4 +172,4 @@ void transpose/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, } } -// PY_CODE_GEN block_end +{% endfor %} diff --git a/PTRANS/src/device/transpose_PQ_ACCL_buffers.cpp b/PTRANS/src/device/transpose_PQ_ACCL_buffers.cpp new file mode 120000 index 00000000..58aeb801 --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_ACCL_buffers.cpp @@ -0,0 +1 @@ +transpose_PQ_PCIE.cpp \ No newline at end of file diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp new file mode 100644 index 00000000..15a16edf --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream.cpp @@ -0,0 +1,199 @@ +/****************************************************************************** + * Author: Arjun Ramaswami + * + * Edited by Marius Meyer: + * - Adapt to used kernel signature + * - Change to row-column loop structure + *****************************************************************************/ +#include "parameters.h" +#include "ap_int.h" +#include "ap_utils.h" +#include "ap_axi_sdata.h" +#include "accl_hls.h" + +const unsigned int block_size = BLOCK_SIZE; +const unsigned int channel_width = CHANNEL_WIDTH; + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} + +/** + * Read blocks of matrix A and transpose them in memory. + * Write the block into an external channel. + * + * Will do the following: + * + * A -> trans(A) -> ext. ch + * + * @param A Buffer for matrix A + * @param B Buffer for matrix B + * @param A_out Buffer for result matrix + * @param offset Offset in blocks that is used to read the current block of A. Since A is read column-wise + on the block level, the whole matrix A might be written to global memory and the relevant columns + need to be picked using this offset. 
+ * @param number_of_blocks The number of blocks that will be processed starting from the block offset + * @param width_in_blocks The width of matrix A in blocks + * @param height_in_blocks The height of matrix A in blocks + */ +void transpose_read{{ i }}( const DEVICE_DATA_TYPE *A, + const unsigned int offset_a, + const unsigned int number_of_blocks, + const unsigned int width_in_blocks, + const unsigned int height_in_blocks, + STREAM<stream_word> &krnl2cclo) { +#pragma HLS INTERFACE axis register both port=krnl2cclo + + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_block[2][block_size * block_size / channel_width][channel_width]; +#pragma HLS ARRAY_PARTITION variable = a_block complete dim = 3 + + // transpose the matrix block-wise from global memory +block_loop: + for (unsigned int block = 0; block < number_of_blocks + 1; block++) { +#pragma HLS loop_tripcount min=1 max=1024 avg=1 + +read_A: + for (unsigned int row = 0; row < block_size; row++) { +read_A_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { +#pragma HLS PIPELINE + unsigned long block_row_a = (block + offset_a) / width_in_blocks; + unsigned long block_col_a = (block + offset_a) % width_in_blocks; + unsigned long ls_address_trans = block_col_a * block_size * block_size * height_in_blocks + + block_row_a * block_size + + row * block_size * height_in_blocks; + +#ifdef EMULATE + // This condition is actually required to not read out of bounds + // but prevents memory bursts, so for hardware this should be removed + // In emulation it prevents segfaults + if (block < number_of_blocks) { +#endif + // read in block of A from global memory and store it in a memory efficient manner for transpose + DEVICE_DATA_TYPE rotate_in[channel_width]; +#pragma HLS ARRAY_PARTITION variable = rotate_in complete dim = 0 + + // Blocks of a will be stored columnwise in global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + rotate_in[unroll_count] = A[ls_address_trans + col * channel_width + unroll_count]; + } + + unsigned int chunk = row * (block_size / channel_width) + col; + + unsigned rot = (row) % (channel_width); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + // every block of (N / channel_width), rotates the index by 1 + // store in double buffer + a_block[block & 1][chunk][unroll_count] = rotate_in[(unroll_count + channel_width - rot) + % (channel_width)]; + } +#ifdef EMULATE + } +#endif + if (block > 0) { + DEVICE_DATA_TYPE data_chunk[channel_width]; +#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 + DEVICE_DATA_TYPE rotate_out[channel_width]; +#pragma HLS ARRAY_PARTITION variable = rotate_out complete dim = 0 + + unsigned int base = col * block_size; + unsigned int offset = row / channel_width; + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + unsigned rot = ((channel_width + unroll_count - row) * (block_size / channel_width)) % + (block_size); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = a_block[(block - 1) & 1][row_rotate][unroll_count]; + } + + unsigned rot_out = row % (channel_width); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % (channel_width)]; + } + + stream_word tmp; + + // load transposed A from global memory + 
for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + DEVICE_DATA_TYPE v = data_chunk[unroll_count]; + tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8) + = *reinterpret_cast<ap_uint<sizeof(DEVICE_DATA_TYPE)*8>*>(&v); + } + tmp.dest = 9; + tmp.last = 1; + tmp.keep = -1; + STREAM_WRITE(krnl2cclo,tmp); + } + } + } + } +} + +/** + * + * ext. channel -> trans(A) + B -> A_out + * + * @param B Buffer for matrix B + * @param A_out Buffer for result matrix + * @param offset Offset in blocks that is used to read the current block of A. Since A is read column-wise + on the block level, the whole matrix A might be written to global memory and the relevant columns + need to be picked using this offset. + * @param number_of_blocks The number of blocks that will be processed starting from the block offset + * @param width_in_blocks The width of matrix A in blocks + * @param height_in_blocks The height of matrix A in blocks + */ +void transpose_write{{ i }}(const DEVICE_DATA_TYPE *B, + DEVICE_DATA_TYPE *A_out, + const unsigned int offset_b, + const unsigned int number_of_blocks, + const unsigned int width_in_blocks, + const unsigned int height_in_blocks, + STREAM<stream_word> &cclo2krnl) { +#pragma HLS INTERFACE axis register both port=cclo2krnl + + // transpose the matrix block-wise from global memory +block_loop: + for (unsigned int block = 0; block < number_of_blocks; block++) { +#pragma HLS loop_tripcount min=1 max=1024 avg=1 + // Read transposed A from local memory and add B +read_B: + for (unsigned int row = 0; row < block_size; row++) { +read_B_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { + unsigned long block_row = (block + offset_b) / width_in_blocks; + unsigned long block_col = (block + offset_b) % width_in_blocks; + unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks + + block_col * block_size + + row * block_size * width_in_blocks; + unsigned int chunk = row * (block_size / channel_width) + col; + + DEVICE_DATA_TYPE data_chunk[channel_width]; +#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 + + stream_word tmp = STREAM_READ(cclo2krnl); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + ap_uint<sizeof(DEVICE_DATA_TYPE)*8> v = tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8); + data_chunk[unroll_count] = *reinterpret_cast<DEVICE_DATA_TYPE*>(&v); + } + + // load transposed A from global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] += B[ls_address_row + col * channel_width + unroll_count]; + } + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + A_out[ls_address_row + col * channel_width + unroll_count] = data_chunk[unroll_count]; + } + } + } + } +} + +{% endfor %} + diff --git a/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp new file mode 100644 index 00000000..c43736d9 --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_ACCL_stream_sendrecv.cpp @@ -0,0 +1,252 @@ +/****************************************************************************** + * Author: Arjun Ramaswami + * + * Edited by Marius Meyer: + * - Adapt to used kernel signature + * - Change to row-column loop structure + *****************************************************************************/ +#include "parameters.h" +#include "ap_int.h" 
+#include "ap_utils.h" +#include "ap_axi_sdata.h" +#include "accl_hls.h" + + +const int block_size = BLOCK_SIZE; +const int channel_width = CHANNEL_WIDTH; + +/** + * @brief Modulo operation that always produces positive values in range [0,op-1]. This is required for the PQ transpose algorithm and is different from the usual remainder calculation done with %! + * + * @tparam T Data type used for the modulo operation. + * @param number Number the modulo is calculated from + * @param op Modulo operator + * @return T number mod op + */ +template +T mod(T number, T op) { + T result = number % op; + // result >= op required for unsinged data types + return (result < 0 || result >= op) ? op + result : result; +} + + +void transpose_block_transpose(const DEVICE_DATA_TYPE *A, + DEVICE_DATA_TYPE a_block[][channel_width], + const unsigned int offset_a, + const unsigned int width_in_blocks, + const unsigned int height_in_blocks) { + +#pragma HLS INTERFACE axis register both port=krnl2cclo + + // transpose the matrix block-wise from global memory +read_A: + for (unsigned int row = 0; row < block_size; row++) { +read_A_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { +#pragma HLS PIPELINE + unsigned long block_row_a = (offset_a) / width_in_blocks; + unsigned long block_col_a = (offset_a) % width_in_blocks; + unsigned long ls_address_trans = block_col_a * block_size * block_size * height_in_blocks + + block_row_a * block_size + + row * block_size * height_in_blocks; + + + // read in block of A from global memory and store it in a memory efficient manner for transpose + DEVICE_DATA_TYPE rotate_in[channel_width]; +#pragma HLS ARRAY_PARTITION variable = rotate_in complete dim = 0 + + // Blocks of a will be stored columnwise in global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + rotate_in[unroll_count] = A[ls_address_trans + col * channel_width + unroll_count]; + } + + unsigned int chunk = row * (block_size / channel_width) + col; + + unsigned rot = (row) % (channel_width); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + // every block of (N / channel_width), rotates the index by 1 + // store in double buffer + a_block[chunk][unroll_count] = rotate_in[(unroll_count + channel_width - rot) + % (channel_width)]; + } + } + } +} + +void transpose_block_forward(DEVICE_DATA_TYPE a_block[][channel_width], + STREAM &krnl2cclo) { + +read_A: + for (unsigned int row = 0; row < block_size; row++) { +read_A_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { + DEVICE_DATA_TYPE data_chunk[channel_width]; +#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 + DEVICE_DATA_TYPE rotate_out[channel_width]; +#pragma HLS ARRAY_PARTITION variable = rotate_out complete dim = 0 + + unsigned int base = col * block_size; + unsigned int offset = row / channel_width; + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + unsigned rot = ((channel_width + unroll_count - row) * (block_size / channel_width)) % + (block_size); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = a_block[row_rotate][unroll_count]; + } + + unsigned rot_out = row % (channel_width); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % 
(channel_width)]; + } + + stream_word tmp; + + // load transposed A from global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + DEVICE_DATA_TYPE v = data_chunk[unroll_count]; + tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8) + = *reinterpret_cast<ap_uint<sizeof(DEVICE_DATA_TYPE)*8>*>(&v); + } + tmp.dest = 9; + tmp.last = 1; + tmp.keep = -1; + STREAM_WRITE(krnl2cclo,tmp); + } + } +} + +/** + * + * ext. channel -> trans(A) + B -> A_out + * + * @param B Buffer for matrix B + * @param A_out Buffer for result matrix + * @param offset Offset in blocks that is used to read the current block of A. Since A is read column-wise + on the block level, the whole matrix A might be written to global memory and the relevant columns + need to be picked using this offset. + * @param number_of_blocks The number of blocks that will be processed starting from the block offset + * @param width_in_blocks The width of matrix A in blocks + * @param height_in_blocks The height of matrix A in blocks + */ +void transpose_block_receive(const DEVICE_DATA_TYPE *B, + DEVICE_DATA_TYPE *A_out, + const unsigned int offset_b, + const unsigned int width_in_blocks, + STREAM<stream_word> &cclo2krnl) { +#pragma HLS INTERFACE axis register both port=cclo2krnl + + // transpose the matrix block-wise from global memory +#pragma HLS loop_tripcount min=1 max=1024 avg=1 + // Read transposed A from local memory and add B +read_B: + for (unsigned int row = 0; row < block_size; row++) { +read_B_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { + unsigned long block_row = (offset_b) / width_in_blocks; + unsigned long block_col = (offset_b) % width_in_blocks; + unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks + + block_col * block_size + + row * block_size * width_in_blocks; + unsigned int chunk = row * (block_size / channel_width) + col; + + DEVICE_DATA_TYPE data_chunk[channel_width]; +#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 + + stream_word tmp = STREAM_READ(cclo2krnl); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + ap_uint<sizeof(DEVICE_DATA_TYPE)*8> v = tmp.data((unroll_count + 1) * sizeof(DEVICE_DATA_TYPE)*8 - 1, unroll_count * sizeof(DEVICE_DATA_TYPE)*8); + data_chunk[unroll_count] = *reinterpret_cast<DEVICE_DATA_TYPE*>(&v); + } + + // load transposed A from global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] += B[ls_address_row + col * channel_width + unroll_count]; + } + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + A_out[ls_address_row + col * channel_width + unroll_count] = data_chunk[unroll_count]; + } + } + } +} + +void transpose_read_sendrecv(const DEVICE_DATA_TYPE* A, + const int* target_list, + int pq_row, int pq_col, + int pq_width, int pq_height, + int gcd, int least_common_multiple, + int height_per_rank, + int width_per_rank, + STREAM<stream_word> &krnl2cclo) { + + // Begin algorithm from Figure 14 for general case + int g = mod(pq_row - pq_col, gcd); + int p = mod(pq_col + g, pq_width); + int q = mod(pq_row - g, pq_height); + + for (int j = 0; j < least_common_multiple/pq_width; j++) { + for (int i = 0; i < least_common_multiple/pq_height; i++) { + // Determine sender and receiver rank of current rank for current communication step + int send_rank = mod(p + i * gcd, pq_width) + mod(q - j * gcd, pq_height) * pq_width; + + for (int col = 0; 
col < least_common_multiple/pq_width; col++) { + for (int row = 0; row < least_common_multiple/pq_height; row++) { + if (target_list[row * least_common_multiple/pq_width + col] == send_rank) { + for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_height); lcm_col++) { + for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_width); lcm_row++) { + unsigned int matrix_buffer_offset = (row + lcm_col * least_common_multiple/pq_height) + (col + lcm_row * least_common_multiple/pq_width) * width_per_rank; + DEVICE_DATA_TYPE a_block[block_size * block_size / channel_width][channel_width]; + transpose_block_transpose(A, a_block, matrix_buffer_offset, width_per_rank, height_per_rank); + transpose_block_forward(a_block, krnl2cclo); + } + } + } + } + } + } + } +} + +void transpose_write_sendrecv(const DEVICE_DATA_TYPE* B, + DEVICE_DATA_TYPE* C, + const int* target_list, + int pq_row, int pq_col, + int pq_width, int pq_height, + int gcd, int least_common_multiple, + int height_per_rank, + int width_per_rank, + STREAM<stream_word> &cclo2krnl) { + + // Begin algorithm from Figure 14 for general case + int g = mod(pq_row - pq_col, gcd); + int p = mod(pq_col + g, pq_width); + int q = mod(pq_row - g, pq_height); + for (int j = 0; j < least_common_multiple/pq_width; j++) { + for (int i = 0; i < least_common_multiple/pq_height; i++) { + + int recv_rank = mod(p - i * gcd, pq_width) + mod(q + j * gcd, pq_height) * pq_width; + + for (int col = 0; col < least_common_multiple/pq_width; col++) { + for (int row = 0; row < least_common_multiple/pq_height; row++) { + if (target_list[row * least_common_multiple/pq_width + col] == recv_rank) { + for (int lcm_row = 0; lcm_row < (height_per_rank)/(least_common_multiple/pq_width); lcm_row++) { + for (int lcm_col = 0; lcm_col < (width_per_rank)/(least_common_multiple/pq_height); lcm_col++) { + unsigned int matrix_buffer_offset = (row + lcm_col * least_common_multiple/pq_height) + (col + lcm_row * least_common_multiple/pq_width) * width_per_rank; + transpose_block_receive(B,C,matrix_buffer_offset,width_per_rank, cclo2krnl); + } + } + } + } + } + } + } +} + diff --git a/PTRANS/src/device/transpose_PQ_IEC.cl b/PTRANS/src/device/transpose_PQ_IEC.cl index e219ae1c..9bfb6485 100644 --- a/PTRANS/src/device/transpose_PQ_IEC.cl +++ b/PTRANS/src/device/transpose_PQ_IEC.cl @@ -16,11 +16,11 @@ typedef struct { DEVICE_DATA_TYPE data[CHANNEL_WIDTH]; } ch_data; -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(i) + "\""*/), depth(1))); -channel ch_data chan_a_in/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2 * (i // 2) + ((i + 1) % 2)) + "\""*/), depth(1))); -// PY_CODE_GEN block_end +channel ch_data chan_a_out{{ i }} __attribute((io("kernel_output_ch{{ i }}"), depth(1))); +channel ch_data chan_a_in{{ i }} __attribute((io("kernel_input_ch{{ 2 * (i // 2) + ((i + 1) % 2) }}"), depth(1))); +{% endfor %} #endif /** @@ -69,7 +69,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} /** * send a chunk of A into local memory in a reordered fashion @@ -82,7 +82,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) * */ 
-send_chunk_of_a/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], +send_chunk_of_a{{ i }}(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], const ulong row, const ulong col) { @@ -109,7 +109,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) data.data[unroll_count] = rotate_out[(unroll_count + rot_out) & (CHANNEL_WIDTH - 1)]; } - write_channel_intel(chan_a_out/*PY_CODE_GEN i*/, data); + write_channel_intel(chan_a_out{{ i }}, data); } /** @@ -126,7 +126,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, +void transpose_read{{ i }}(__global DEVICE_DATA_TYPE *restrict A, const ulong offset, const ulong width_in_blocks, const ulong height_in_blocks, @@ -148,7 +148,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, load_chunk_of_a(A, a_block[block & 1], block_row, block_col, width_in_blocks, row, col); } if (block > offset) { - send_chunk_of_a/*PY_CODE_GEN i*/(a_block[(block - 1) & 1], row, col); + send_chunk_of_a{{ i }}(a_block[(block - 1) & 1], row, col); } } } @@ -171,7 +171,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, +void transpose_write{{ i }}(__global DEVICE_DATA_TYPE *restrict B, __global DEVICE_DATA_TYPE *restrict A_out, const ulong offset, const ulong width_in_blocks, @@ -183,7 +183,7 @@ void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, for (ulong row = 0; row < BLOCK_SIZE; row++) { for (ulong col = 0; col < BLOCK_SIZE / CHANNEL_WIDTH; col++) { - ch_data data = read_channel_intel(chan_a_in/*PY_CODE_GEN i*/); + ch_data data = read_channel_intel(chan_a_in{{ i }}); ulong block_col = block % width_in_blocks; ulong block_row = block / width_in_blocks; @@ -202,4 +202,4 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cl b/PTRANS/src/device/transpose_PQ_PCIE.cl index 161fcb88..3fccf79a 100644 --- a/PTRANS/src/device/transpose_PQ_PCIE.cl +++ b/PTRANS/src/device/transpose_PQ_PCIE.cl @@ -8,14 +8,13 @@ #include "parameters.h" -/* PY_CODE_GEN -try: - kernel_param_attributes = generate_attributes(num_replications) -except: - kernel_param_attributes = ["" for i in range(num_replications)] -*/ +{% if generate_attributes is defined %} + {% set kernel_param_attributes = generate_attributes(num_replications) %} +{% else %} + {% set kernel_param_attributes = create_list("", num_replications) %} +{% endif %} -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /** * Read blocks of matrix A and transpose them in memory. 
@@ -37,9 +36,9 @@ except: */ __attribute__((max_global_work_dim(0))) __kernel -void transpose/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE *restrict A, - __global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE *restrict B, - __global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE *restrict A_out, +void transpose{{ i }}(__global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict A, + __global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict B, + __global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE *restrict A_out, const uint offset_a, const uint offset_b, const uint number_of_blocks, @@ -190,4 +189,4 @@ void transpose/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/PTRANS/src/device/transpose_PQ_PCIE.cpp b/PTRANS/src/device/transpose_PQ_PCIE.cpp new file mode 100644 index 00000000..be7e6828 --- /dev/null +++ b/PTRANS/src/device/transpose_PQ_PCIE.cpp @@ -0,0 +1,161 @@ +/****************************************************************************** + * Author: Arjun Ramaswami + * + * Edited by Marius Meyer: + * - Adapt to used kernel signature + * - Change to row-column loop structure + *****************************************************************************/ +#include "parameters.h" + +const unsigned int block_size = BLOCK_SIZE; +const unsigned int channel_width = CHANNEL_WIDTH; + + + +extern "C" { + +// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] + +/** + * Read blocks of matrix A and transpose them in memory. + * Write the block into an external channel. + * + * Will do the following: + * + * A -> trans(A) -> ext. ch + * + * @param A Buffer for matrix A + * @param B Buffer for matrix B + * @param A_out Buffer for result matrix + * @param offset Offset in blocks that is used to read the current block of A. Since A is read column-wise + on the block level, the whole matrix A might be written to global memory and the relevant columns + need to be picked using this offset. 
+ * @param number_of_blocks The number of blocks that will be processed starting from the block offset + * @param width_in_blocks The with of matrix A in blocks + * @param height_in_blocks The height of matix A in blocks + */ +void transpose/*PY_CODE_GEN i*/( const DEVICE_DATA_TYPE *A, + const DEVICE_DATA_TYPE *B, + DEVICE_DATA_TYPE *A_out, + const unsigned int offset_a, + const unsigned int offset_b, + const unsigned int number_of_blocks, + const unsigned int width_in_blocks, + const unsigned int height_in_blocks) { + + // transpose the matrix block-wise from global memory +block_loop: + for (unsigned int block = 0; block < number_of_blocks; block++) { + + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_block[block_size * block_size / channel_width][channel_width]; +#pragma HLS ARRAY_PARTITION variable = a_block complete dim = 2 +// #pragma HLS BIND_STORAGE variable = a_block type = RAM_1P impl = URAM + // local memory double buffer for a matrix block + DEVICE_DATA_TYPE a_plus_b_block[block_size * block_size / channel_width][channel_width]; +#pragma HLS ARRAY_PARTITION variable = a_plus_b_block complete dim = 2 +// #pragma HLS BIND_STORAGE variable = a_plus_b_block type = RAM_1P impl = URAM + +read_A: + for (unsigned int row = 0; row < block_size; row++) { +read_A_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { +#pragma HLS PIPELINE + unsigned long block_row_a = (block + offset_a) / width_in_blocks; + unsigned long block_col_a = (block + offset_a) % width_in_blocks; + unsigned long ls_address_trans = block_col_a * block_size * block_size * height_in_blocks + + block_row_a * block_size + + row * block_size * height_in_blocks; + + // read in block of A from global memory and store it in a memory efficient manner for transpose + DEVICE_DATA_TYPE rotate_in[channel_width]; +#pragma HLS ARRAY_PARTITION variable = rotate_in complete dim = 0 + + // Blocks of a will be stored columnwise in global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + rotate_in[unroll_count] = A[ls_address_trans + col * channel_width + unroll_count]; + } + + unsigned int chunk = row * (block_size / channel_width) + col; + + unsigned rot = (row) % (channel_width); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + // every block of (N / channel_width), rotates the index by 1 + // store in double buffer + a_block[chunk][unroll_count] = rotate_in[(unroll_count + channel_width - rot) + % (channel_width)]; + } + } + } + + // Read transposed A from local memory and add B +read_B: + for (unsigned int row = 0; row < block_size; row++) { +read_B_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { +#pragma HLS PIPELINE + unsigned long block_row = (block + offset_b) / width_in_blocks; + unsigned long block_col = (block + offset_b) % width_in_blocks; + unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks + + block_col * block_size + + row * block_size * width_in_blocks; + unsigned int chunk = row * (block_size / channel_width) + col; + + DEVICE_DATA_TYPE data_chunk[channel_width]; +#pragma HLS ARRAY_PARTITION variable = data_chunk complete dim = 0 + DEVICE_DATA_TYPE rotate_out[channel_width]; +#pragma HLS ARRAY_PARTITION variable = rotate_out complete dim = 0 + + unsigned int base = col * block_size; + unsigned int offset = row / channel_width; + + for (unsigned unroll_count = 0; 
unroll_count < channel_width; unroll_count++) { + unsigned rot = ((channel_width + unroll_count - row) * (block_size / channel_width)) % + (block_size); + unsigned row_rotate = base + offset + rot; + rotate_out[unroll_count] = a_block[row_rotate][unroll_count]; + } + + unsigned rot_out = row % (channel_width); + + // rotate temporary buffer to store data into local buffer + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] = rotate_out[(unroll_count + rot_out) % (channel_width)]; + } + + // load tranposed A from global memory + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + data_chunk[unroll_count] += B[ls_address_row + col * channel_width + unroll_count]; + } + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + a_plus_b_block[chunk][unroll_count] = data_chunk[unroll_count]; + } + } + } + // Write back result +write_result: + for (unsigned int row = 0; row < block_size; row++) { +write_result_line: + for (unsigned int col = 0; col < block_size / channel_width; col++) { +#pragma HLS PIPELINE + unsigned long block_row = (block + offset_b) / width_in_blocks; + unsigned long block_col = (block + offset_b) % width_in_blocks; + unsigned long ls_address_row = block_row * block_size * block_size * width_in_blocks + + block_col * block_size + + row * block_size * width_in_blocks; + unsigned int chunk = row * (block_size / channel_width) + col; + + for (unsigned unroll_count = 0; unroll_count < channel_width; unroll_count++) { + A_out[ls_address_row + col * channel_width + unroll_count] = a_plus_b_block[chunk][unroll_count]; + } + } + } + } +} + +// PY_CODE_GEN block_end + +} diff --git a/PTRANS/src/device/transpose_c2_DIAG_IEC.cl b/PTRANS/src/device/transpose_c2_DIAG_IEC.cl index dfad9f87..a40d6bb0 100644 --- a/PTRANS/src/device/transpose_c2_DIAG_IEC.cl +++ b/PTRANS/src/device/transpose_c2_DIAG_IEC.cl @@ -16,13 +16,13 @@ typedef struct { DEVICE_DATA_TYPE data[CHANNEL_WIDTH/2]; } ch_data; -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_replications) %} // Channel used to send the transposed blocks of A -channel ch_data chan_a_out1/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(2*i) + "\""*/), depth(1))); -channel ch_data chan_a_out2/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(2*i + 1) + "\""*/), depth(1))); -channel ch_data chan_a_in1/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2*i + 1) + "\""*/), depth(1))); -channel ch_data chan_a_in2/*PY_CODE_GEN i*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(2*i) + "\""*/), depth(1))); -// PY_CODE_GEN block_end +channel ch_data chan_a_out1{{ i }} __attribute((io("kernel_output_ch{{ 2*i }}"), depth(1))); +channel ch_data chan_a_out2{{ i }} __attribute((io("kernel_output_ch{{ 2*i + 1 }}"), depth(1))); +channel ch_data chan_a_in1{{ i }} __attribute((io("kernel_input_ch{{ 2*i + 1 }}"), depth(1))); +channel ch_data chan_a_in2{{ i }} __attribute((io("kernel_input_ch{{ 2*i }}"), depth(1))); +{% endfor %} #endif /** @@ -65,7 +65,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_total_replications)] +{% for i in range(num_total_replications) %} /** * send a chunk of A into local memory in a reordered fashion @@ -78,7 +78,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) * */ 
void -send_chunk_of_a/*PY_CODE_GEN i*/(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], +send_chunk_of_a{{ i }}(const DEVICE_DATA_TYPE local_buffer[BLOCK_SIZE * BLOCK_SIZE / CHANNEL_WIDTH][CHANNEL_WIDTH], const ulong row, const ulong col) { @@ -111,7 +111,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH/2; unroll_count++) { data1.data[unroll_count] = channel_data[unroll_count]; } - write_channel_intel(chan_a_out1/*PY_CODE_GEN i*/, data1); + write_channel_intel(chan_a_out1{{ i }}, data1); ch_data data2; // rotate temporary buffer to store data into local buffer @@ -119,7 +119,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH/2; unroll_count++) { data2.data[unroll_count] = channel_data[CHANNEL_WIDTH/2 + unroll_count]; } - write_channel_intel(chan_a_out2/*PY_CODE_GEN i*/, data2); + write_channel_intel(chan_a_out2{{ i }}, data2); } /** @@ -136,7 +136,7 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, +void transpose_read{{ i }}(__global DEVICE_DATA_TYPE *restrict A, const ulong block_offset, const ulong number_of_blocks) { @@ -154,7 +154,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, load_chunk_of_a(A, a_block[block & 1], block, row, col); } if (block > 0) { - send_chunk_of_a/*PY_CODE_GEN i*/(a_block[(block - 1) & 1], row, col); + send_chunk_of_a{{ i }}(a_block[(block - 1) & 1], row, col); } } } @@ -177,7 +177,7 @@ void transpose_read/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict A, */ __attribute__((max_global_work_dim(0))) __kernel -void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, +void transpose_write{{ i }}(__global DEVICE_DATA_TYPE *restrict B, __global DEVICE_DATA_TYPE *restrict A_out, const ulong block_offset, const ulong number_of_blocks) { @@ -190,13 +190,13 @@ void transpose_write/*PY_CODE_GEN i*/(__global DEVICE_DATA_TYPE *restrict B, DEVICE_DATA_TYPE channel_data[CHANNEL_WIDTH]; - ch_data data1 = read_channel_intel(chan_a_in1/*PY_CODE_GEN i*/); + ch_data data1 = read_channel_intel(chan_a_in1{{ i }}); __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH/2; unroll_count++) { channel_data[unroll_count] = data1.data[unroll_count]; } - ch_data data2 = read_channel_intel(chan_a_in2/*PY_CODE_GEN i*/); + ch_data data2 = read_channel_intel(chan_a_in2{{ i }}); __attribute__((opencl_unroll_hint(CHANNEL_WIDTH/2))) for (unsigned unroll_count = 0; unroll_count < CHANNEL_WIDTH/2; unroll_count++) { channel_data[CHANNEL_WIDTH/2 + unroll_count] = data2.data[unroll_count]; @@ -217,4 +217,4 @@ __attribute__((opencl_unroll_hint(CHANNEL_WIDTH))) } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/PTRANS/src/host/CMakeLists.txt b/PTRANS/src/host/CMakeLists.txt index 89b45ff8..554b4a3e 100755 --- a/PTRANS/src/host/CMakeLists.txt +++ b/PTRANS/src/host/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(../../../shared ${CMAKE_BINARY_DIR}/lib/hpccbase) -set(HOST_SOURCE transpose_benchmark.cpp transpose_data.cpp) +set(HOST_SOURCE transpose_data.cpp) set(HOST_EXE_NAME Transpose) set(LIB_NAME trans) @@ -27,19 +27,39 @@ if (INTELFPGAOPENCL_FOUND) target_include_directories(${LIB_NAME}_intel PRIVATE "$ENV{MKL_ROOT}/include") endif() 
target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA) + target_compile_definitions(${HOST_EXE_NAME}_intel PRIVATE -DINTEL_FPGA) target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_intel_host_executable COMMAND $ -h) endif() if (Vitis_FOUND) + if (USE_ACCL) + set(CMAKE_SKIP_BUILD_RPATH No) + set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) + list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) + list(APPEND kernel_files transpose_PQ_ACCL_stream_sendrecv.cpp transpose_PQ_ACCL_stream.cpp) + foreach (files ${kernel_files}) + set(source_f "${CMAKE_BINARY_DIR}/src/device/${files}") + set(base_file "${CMAKE_SOURCE_DIR}/src/device/${files}") + add_custom_command(OUTPUT ${source_f} + COMMAND ${Python3_EXECUTABLE} ${CODE_GENERATOR} -o ${source_f} -p num_replications=1 -p num_total_replications=1 ${base_file} + MAIN_DEPENDENCY ${base_file}) + list(APPEND HOST_SOURCE ${source_f}) + endforeach() + endif() add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + target_include_directories(${LIB_NAME}_xilinx PRIVATE ${extern_accl_SOURCE_DIR}/hlslib/include/xilinx) add_executable(${HOST_EXE_NAME}_xilinx main.cpp) target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) + if (USE_ACCL) + target_link_libraries(${HOST_EXE_NAME}_xilinx zmqpp) + endif() target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME test_xilinx_host_executable COMMAND $ -h) endif() diff --git a/PTRANS/src/host/data_handlers/diagonal.hpp b/PTRANS/src/host/data_handlers/diagonal.hpp index e1d72f3b..9f601105 100644 --- a/PTRANS/src/host/data_handlers/diagonal.hpp +++ b/PTRANS/src/host/data_handlers/diagonal.hpp @@ -44,7 +44,8 @@ namespace transpose { * the missing data. e.g. for N ranks, the pairs will be (0, N/2), (1, N/2 + 1), ... 
* */ -class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler { +template +class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler { private: @@ -68,44 +69,44 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler { * @param settings The execution settings that contain information about the data size * @return std::unique_ptr The generated data */ - std::unique_ptr - generateData(hpcc_base::ExecutionSettings& settings) override { + std::unique_ptr> + generateData(hpcc_base::ExecutionSettings& settings) override { MPI_Type_contiguous(settings.programSettings->blockSize * settings.programSettings->blockSize, MPI_FLOAT, &data_block); MPI_Type_commit(&data_block); int width_in_blocks = settings.programSettings->matrixSize / settings.programSettings->blockSize; - int avg_blocks_per_rank = (width_in_blocks * width_in_blocks) / mpi_comm_size; + int avg_blocks_per_rank = (width_in_blocks * width_in_blocks) / this->mpi_comm_size; int avg_diagonal_blocks = width_in_blocks; if (avg_blocks_per_rank > 0) { avg_diagonal_blocks = (width_in_blocks / avg_blocks_per_rank); } num_diagonal_ranks = std::max(avg_diagonal_blocks, 1); - if (num_diagonal_ranks % 2 != mpi_comm_size % 2) { + if (num_diagonal_ranks % 2 != this->mpi_comm_size % 2) { #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": Fail 1!" << std::endl; + std::cout << "Rank " << this->mpi_comm_rank << ": Fail 1!" << std::endl; #endif // Abort if there is a too high difference in the number of matrix blocks between the MPI ranks throw std::runtime_error("Matrix size and MPI ranks to not allow fair distribution of blocks! Increase or reduce the number of MPI ranks by 1."); } - if ((mpi_comm_size - num_diagonal_ranks) % 2 != 0 || (mpi_comm_size - num_diagonal_ranks) == 0 && width_in_blocks > 1) { + if ((this->mpi_comm_size - num_diagonal_ranks) % 2 != 0 || (this->mpi_comm_size - num_diagonal_ranks) == 0 && width_in_blocks > 1) { #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": Fail 2!" << std::endl; + std::cout << "Rank " << this->mpi_comm_rank << ": Fail 2!" << std::endl; #endif throw std::runtime_error("Not possible to create pairs of MPI ranks for lower and upper half of matrix. Increase number of MPI ranks!."); } - bool this_rank_is_diagonal = mpi_comm_rank >= (mpi_comm_size - num_diagonal_ranks); - int blocks_if_diagonal = width_in_blocks / num_diagonal_ranks + ( (mpi_comm_rank - (mpi_comm_size - num_diagonal_ranks)) < (width_in_blocks % num_diagonal_ranks) ? 1 : 0); + bool this_rank_is_diagonal = this->mpi_comm_rank >= (this->mpi_comm_size - num_diagonal_ranks); + int blocks_if_diagonal = width_in_blocks / num_diagonal_ranks + ( (this->mpi_comm_rank - (this->mpi_comm_size - num_diagonal_ranks)) < (width_in_blocks % num_diagonal_ranks) ? 1 : 0); int blocks_if_not_diagonal = 0; - if ((mpi_comm_size - num_diagonal_ranks) > 0 ) { - blocks_if_not_diagonal = (width_in_blocks * (width_in_blocks - 1)) / (mpi_comm_size - num_diagonal_ranks) + (mpi_comm_rank < ((width_in_blocks * (width_in_blocks - 1)) % (mpi_comm_size - num_diagonal_ranks)) ? 1 : 0); + if ((this->mpi_comm_size - num_diagonal_ranks) > 0 ) { + blocks_if_not_diagonal = (width_in_blocks * (width_in_blocks - 1)) / (this->mpi_comm_size - num_diagonal_ranks) + (this->mpi_comm_rank < ((width_in_blocks * (width_in_blocks - 1)) % (this->mpi_comm_size - num_diagonal_ranks)) ? 1 : 0); } int blocks_per_rank = (this_rank_is_diagonal) ? 
blocks_if_diagonal : blocks_if_not_diagonal; - if (mpi_comm_rank == 0) { + if (this->mpi_comm_rank == 0) { std::cout << "Diag. blocks per rank: " << blocks_if_diagonal << std::endl; std::cout << "Blocks per rank: " << blocks_if_not_diagonal << std::endl; std::cout << "Loopback ranks for diagonal blocks: " << num_diagonal_ranks << std::endl; @@ -114,14 +115,14 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler { int data_height_per_rank = blocks_per_rank * settings.programSettings->blockSize; #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": NumBlocks = " << blocks_per_rank << std::endl; + std::cout << "Rank " << this->mpi_comm_rank << ": NumBlocks = " << blocks_per_rank << std::endl; #endif // Allocate memory for a single device and all its memory banks - auto d = std::unique_ptr(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); + auto d = std::unique_ptr>(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); // Fill the allocated memory with pseudo random values - std::mt19937 gen(mpi_comm_rank); + std::mt19937 gen(this->mpi_comm_rank); std::uniform_real_distribution<> dis(-100.0, 100.0); for (size_t i = 0; i < data_height_per_rank; i++) { for (size_t j = 0; j < settings.programSettings->blockSize; j++) { @@ -141,16 +142,16 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler { * Exchanged data will be stored in the same object. */ void - exchangeData(TransposeData& data) override { + exchangeData(TransposeData& data) override { #ifndef NDEBUG // std::cout << "Start data exchange " << mpi_comm_rank << std::endl; #endif // Only need to exchange data, if rank has a partner - if (mpi_comm_rank < mpi_comm_size - num_diagonal_ranks) { + if (this->mpi_comm_rank < this->mpi_comm_size - num_diagonal_ranks) { - int first_upper_half_rank = (mpi_comm_size - num_diagonal_ranks)/2; - int pair_rank = (mpi_comm_rank >= first_upper_half_rank) ? mpi_comm_rank - first_upper_half_rank : mpi_comm_rank + first_upper_half_rank; + int first_upper_half_rank = (this->mpi_comm_size - num_diagonal_ranks)/2; + int pair_rank = (this->mpi_comm_rank >= first_upper_half_rank) ? 
this->mpi_comm_rank - first_upper_half_rank : this->mpi_comm_rank + first_upper_half_rank; // To re-calculate the matrix transposition locally on this host, we need to // exchange matrix A for every kernel replication @@ -184,7 +185,7 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler { } void - reference_transpose(TransposeData& data) { + reference_transpose(TransposeData& data) { size_t block_offset = data.blockSize * data.blockSize; for (size_t b = 0; b < data.numBlocks; b++) { for (size_t i = 0; i < data.blockSize; i++) { @@ -196,7 +197,7 @@ class DistributedDiagonalTransposeDataHandler : public TransposeDataHandler { } } - DistributedDiagonalTransposeDataHandler(int mpi_rank, int mpi_size): TransposeDataHandler(mpi_rank, mpi_size) { + DistributedDiagonalTransposeDataHandler(int mpi_rank, int mpi_size): TransposeDataHandler(mpi_rank, mpi_size) { if (mpi_rank >= mpi_size) { throw std::runtime_error("MPI rank must be smaller the MPI world size!"); } diff --git a/PTRANS/src/host/data_handlers/handler.hpp b/PTRANS/src/host/data_handlers/handler.hpp index fe1293fe..646fcdbf 100644 --- a/PTRANS/src/host/data_handlers/handler.hpp +++ b/PTRANS/src/host/data_handlers/handler.hpp @@ -43,6 +43,7 @@ namespace data_handler { * calculate the overall validation error. * */ +template class TransposeDataHandler { protected: @@ -67,8 +68,8 @@ class TransposeDataHandler { * @param settings The execution settings that contain information about the data size * @return std::unique_ptr The generated data */ - virtual std::unique_ptr - generateData(hpcc_base::ExecutionSettings& settings) = 0; + virtual std::unique_ptr> + generateData(hpcc_base::ExecutionSettings& settings) = 0; /** * @brief Exchange the data blocks for verification @@ -77,10 +78,10 @@ class TransposeDataHandler { * Exchanged data will be stored in the same object. */ virtual void - exchangeData(TransposeData& data) = 0; + exchangeData(TransposeData& data) = 0; virtual void - reference_transpose(TransposeData& data) = 0; + reference_transpose(TransposeData& data) = 0; /** * @brief Construct a new Transpose Data Handler object and initialize the MPI rank and MPI size variables if MPI is used diff --git a/PTRANS/src/host/data_handlers/pq.hpp b/PTRANS/src/host/data_handlers/pq.hpp index 01f2261d..87e7d15f 100644 --- a/PTRANS/src/host/data_handlers/pq.hpp +++ b/PTRANS/src/host/data_handlers/pq.hpp @@ -26,6 +26,7 @@ SOFTWARE. /* C++ standard library headers */ #include #include +#include /* Project's headers */ #include "handler.hpp" @@ -52,7 +53,8 @@ static T mod(T number, T op) { return (result < 0 || result >= op) ? 
op + result : result; } -class DistributedPQTransposeDataHandler : public TransposeDataHandler { +template +class DistributedPQTransposeDataHandler : public TransposeDataHandler { private: @@ -128,21 +130,29 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler { return height_per_rank; } + int getP() { + return pq_width; + } + + int getQ() { + return pq_height; + } + /** * @brief Generate data for transposition based on the implemented distribution scheme * * @param settings The execution settings that contain information about the data size * @return std::unique_ptr The generated data */ - std::unique_ptr - generateData(hpcc_base::ExecutionSettings& settings) override { + std::unique_ptr> + generateData(hpcc_base::ExecutionSettings& settings) override { int width_in_blocks = settings.programSettings->matrixSize / settings.programSettings->blockSize; global_width = width_in_blocks; width_per_rank = width_in_blocks / pq_width; height_per_rank = width_in_blocks / pq_height; - pq_row = mpi_comm_rank / pq_width; - pq_col = mpi_comm_rank % pq_width; + pq_row = this->mpi_comm_rank / pq_width; + pq_col = this->mpi_comm_rank % pq_width; // If the torus width is not a divisor of the matrix size, // distribute remaining blocks to the ranks @@ -163,10 +173,10 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler { } // Allocate memory for a single device and all its memory banks - auto d = std::unique_ptr(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); + auto d = std::unique_ptr>(new transpose::TransposeData(*settings.context, settings.programSettings->blockSize, blocks_per_rank)); // Fill the allocated memory with pseudo random values - std::mt19937 gen(mpi_comm_rank); + std::mt19937 gen(this->mpi_comm_rank); std::uniform_real_distribution<> dis(-100.0, 100.0); for (size_t i = 0; i < blocks_per_rank * settings.programSettings->blockSize; i++) { for (size_t j = 0; j < settings.programSettings->blockSize; j++) { @@ -186,7 +196,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler { * Exchanged data will be stored in the same object. */ void - exchangeData(TransposeData& data) override { + exchangeData(TransposeData& data) override { MPI_Status status; @@ -206,14 +216,14 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler { // 3 2 . . - size_t remaining_data_size = data.numBlocks; + size_t remaining_data_size = data.numBlocks * data.blockSize * data.blockSize; size_t offset = 0; while (remaining_data_size > 0) { int next_chunk = (remaining_data_size > std::numeric_limits::max()) ? 
std::numeric_limits::max(): remaining_data_size; - MPI_Sendrecv(&data.A[offset], next_chunk, data_block, pair_rank, 0, &data.exchange[offset], next_chunk, data_block, pair_rank, 0, MPI_COMM_WORLD, &status); + MPI_Sendrecv(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, &data.exchange[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD, &status); remaining_data_size -= next_chunk; - offset += static_cast(next_chunk) * static_cast(data.blockSize * data.blockSize); + offset += static_cast(next_chunk); } // Exchange window pointers @@ -307,7 +317,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler { // Do actual MPI communication #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << ": blocks (" << sending_size / (data.blockSize * data.blockSize) << "," << receiving_size / (data.blockSize * data.blockSize) << ") send " << send_rank << ", recv " << recv_rank << std::endl << std::flush; + std::cout << "Rank " << this->mpi_comm_rank << ": blocks (" << sending_size / (data.blockSize * data.blockSize) << "," << receiving_size / (data.blockSize * data.blockSize) << ") send " << send_rank << ", recv " << recv_rank << std::endl << std::flush; #endif MPI_Isend(send_buffers[current_parallel_execution].data(), sending_size, MPI_FLOAT, send_rank, 0, MPI_COMM_WORLD, &mpi_requests[current_parallel_execution]); MPI_Irecv(recv_buffers[current_parallel_execution].data(), receiving_size, MPI_FLOAT, recv_rank, 0, MPI_COMM_WORLD, &mpi_requests[gcd + current_parallel_execution]); @@ -369,7 +379,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler { } void - reference_transpose(TransposeData& data) { + reference_transpose(TransposeData& data) override { for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { data.A[i * height_per_rank * data.blockSize + j] -= (data.result[j * width_per_rank * data.blockSize + i] - data.B[j * width_per_rank * data.blockSize + i]); @@ -384,7 +394,7 @@ class DistributedPQTransposeDataHandler : public TransposeDataHandler { * @param mpi_size Size of the communication world * @param p Width of the PQ grid the FPGAs are arranged in */ - DistributedPQTransposeDataHandler(int mpi_rank, int mpi_size, int p) : TransposeDataHandler(mpi_rank, mpi_size) { + DistributedPQTransposeDataHandler(int mpi_rank, int mpi_size, int p) : TransposeDataHandler(mpi_rank, mpi_size) { if (mpi_size % p != 0) { throw std::runtime_error("Number of MPI ranks must be multiple of P! 
P=" + std::to_string(p)); } diff --git a/PTRANS/src/host/execution_types/execution_cpu.hpp b/PTRANS/src/host/execution_types/execution_cpu.hpp index ab74fdc9..604640f9 100644 --- a/PTRANS/src/host/execution_types/execution_cpu.hpp +++ b/PTRANS/src/host/execution_types/execution_cpu.hpp @@ -50,8 +50,9 @@ namespace transpose * @param data data object that contains all required data for the execution * @return std::unique_ptr The measured execution times */ - static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) + template + static std::map> + calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) { int err; @@ -115,10 +116,10 @@ namespace transpose transferTimings.push_back(transferTime.count()); } - std::unique_ptr result(new transpose::TransposeExecutionTimings{ - transferTimings, - calculationTimings}); - return result; + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; } } // namespace bm_execution diff --git a/PTRANS/src/host/execution_types/execution_intel.hpp b/PTRANS/src/host/execution_types/execution_intel.hpp index d95bf578..fb60aa5d 100644 --- a/PTRANS/src/host/execution_types/execution_intel.hpp +++ b/PTRANS/src/host/execution_types/execution_intel.hpp @@ -40,11 +40,10 @@ namespace intel { * * @param config The progrma configuration * @param data data object that contains all required data for the execution on the FPGA - * @return std::unique_ptr The measured execution times + * @return std::map> The measured execution times */ -static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) { - int err; + static std::map> + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data) { if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::diagonal) { throw std::runtime_error("Used data handler not supported by execution handler!"); @@ -58,6 +57,7 @@ static std::unique_ptr std::vector transposeWriteKernelList; std::vector readCommandQueueList; std::vector writeCommandQueueList; + int err; // Setup the kernels depending on the number of kernel replications for (int r = 0; r < config.programSettings->kernelReplications; r++) { @@ -264,15 +264,14 @@ static std::unique_ptr transferTimings.push_back(transferTime.count()); } - std::unique_ptr result(new transpose::TransposeExecutionTimings{ - transferTimings, - calculationTimings - }); - return result; + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; } } // namespace transpose } // namespace fpga_execution } // namespace intel -#endif \ No newline at end of file +#endif diff --git a/PTRANS/src/host/execution_types/execution_intel_pq.hpp b/PTRANS/src/host/execution_types/execution_intel_pq.hpp index 431ff40d..de478aa4 100644 --- a/PTRANS/src/host/execution_types/execution_intel_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_intel_pq.hpp @@ -43,8 +43,8 @@ namespace intel_pq { * @param data data object that contains all required data for the execution on the FPGA * @return std::unique_ptr The measured execution times */ -static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, 
transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + static std::map> + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { @@ -343,15 +343,14 @@ static std::unique_ptr transferTimings.push_back(transferTime.count()); } - std::unique_ptr result(new transpose::TransposeExecutionTimings{ - transferTimings, - calculationTimings - }); - return result; + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; } } // namespace transpose } // namespace fpga_execution } // namespace intel -#endif \ No newline at end of file +#endif diff --git a/PTRANS/src/host/execution_types/execution_pcie.hpp b/PTRANS/src/host/execution_types/execution_pcie.hpp index 5e29ad2e..2676f32d 100644 --- a/PTRANS/src/host/execution_types/execution_pcie.hpp +++ b/PTRANS/src/host/execution_types/execution_pcie.hpp @@ -48,8 +48,8 @@ namespace transpose * @param handler data handler instance that should be used to exchange data between hosts * @return std::unique_ptr The measured execution times */ - static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) + static std::map> + calculate(const hpcc_base::ExecutionSettings &config, transpose::TransposeData &data, transpose::data_handler::TransposeDataHandler &handler) { int err; @@ -227,10 +227,10 @@ namespace transpose transferTimings.push_back(transferTime.count()); } - std::unique_ptr result(new transpose::TransposeExecutionTimings{ - transferTimings, - calculationTimings}); - return result; + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; } } // namespace bm_execution diff --git a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp index d2cfae7e..e04224a9 100644 --- a/PTRANS/src/host/execution_types/execution_pcie_pq.hpp +++ b/PTRANS/src/host/execution_types/execution_pcie_pq.hpp @@ -44,8 +44,8 @@ namespace pcie_pq { * @param handler data handler instance that should be used to exchange data between hosts * @return std::unique_ptr The measured execution times */ -static std::unique_ptr - calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { + static std::map> + calculate(const hpcc_base::ExecutionSettings& config, transpose::TransposeData& data, transpose::data_handler::DistributedPQTransposeDataHandler &handler) { int err; if (config.programSettings->dataHandlerIdentifier != transpose::data_handler::DataHandlerType::pq) { @@ -366,16 +366,14 @@ static std::unique_ptr transferTimings.push_back(transferTime.count()); } - std::unique_ptr result(new transpose::TransposeExecutionTimings{ - transferTimings, - calculationTimings - }); - - return result; + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; } } // namespace transpose } // namespace fpga_execution } // namespace intel -#endif \ No newline at end of file +#endif diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp new file 
mode 100644 index 00000000..13c7c263 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_pq.hpp @@ -0,0 +1,599 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_ACCL_PQ_EXECUTION_H_ +#define SRC_HOST_ACCL_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "accl.hpp" +#include "data_handlers/data_handler_types.h" +#include "data_handlers/pq.hpp" +#include "transpose_data.hpp" + +namespace transpose { +namespace fpga_execution { +namespace accl_pq { + +void accl_exchangeData( + ACCL::ACCL &accl, + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, fpga_setup::ACCLContext, xrt::uuid> &handler, + transpose::TransposeData &data, std::vector &bufferAXrt, + int global_width) { + + int pq_width = handler.getP(); + int pq_height = handler.getQ(); + int width_per_rank = handler.getWidthforRank(); + int height_per_rank = handler.getHeightforRank(); + + int mpi_comm_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); + int pq_row = mpi_comm_rank / pq_width; + int pq_col = mpi_comm_rank % pq_width; + + std::vector> acclBuffersA; + for (auto &bo : bufferAXrt) { + acclBuffersA.push_back(accl.create_buffer( + bo, data.blockSize * data.blockSize * data.numBlocks, + ACCL::dataType::float32)); + } + + if (pq_width == pq_height) { + if (pq_col != pq_row) { + + int pair_rank = pq_width * pq_col + pq_row; + + // To re-calculate the matrix transposition locally on this host, we need + // to exchange matrix A for every kernel replication The order of the + // matrix blocks does not change during the exchange, because they are + // distributed diagonally and will be handled in the order below: + // + // . . 1 3 + // . . . 2 + // 1 . . . + // 3 2 . . 
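+      // Assumed worked example for the pairing above: with pair_rank = pq_width * pq_col + pq_row
+      // on a 2x2 grid, rank 1 (row 0, col 1) exchanges its blocks with rank 2 (row 1, col 0),
+      // while the diagonal ranks 0 and 3 keep their blocks locally.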
+ auto acclBufferA_recv = accl.create_buffer( + data.blockSize * data.blockSize * data.numBlocks, + ACCL::dataType::float32); + // Send and receive matrix A using ACCL directly on FPGA + for (int block_chunk = 0; block_chunk < data.numBlocks; block_chunk+= 16) { + for (int block_num = block_chunk; block_num < std::min(data.numBlocks, block_chunk + 16); block_num++) { + accl.send(*acclBuffersA[0]->slice( + data.blockSize * data.blockSize * block_num, + data.blockSize * data.blockSize * (block_num + 1)), + data.blockSize * data.blockSize, pair_rank, 0, ACCL::GLOBAL_COMM, true); + } + for (int block_num = block_chunk; block_num < std::min(data.numBlocks, block_chunk + 16); block_num++) { + accl.recv(*acclBufferA_recv->slice( + data.blockSize * data.blockSize * block_num, + data.blockSize * data.blockSize * (block_num + 1)), + data.blockSize * data.blockSize, pair_rank, + 1, ACCL::GLOBAL_COMM, true); + } + } + + accl.copy(*acclBufferA_recv, *acclBuffersA[0], + data.blockSize * data.blockSize * data.numBlocks, true, true); + } + } else { + // Taken from "Parallel matrix transpose algorithms on distributed memory + // concurrent computers" by J. Choi, J. J. Dongarra, D. W. Walker and + // translated to C++ This will do a diagonal exchange of matrix blocks. + + // Determine LCM using GCD from standard library using the C++14 call + // In C++17 this changes to std::gcd in numeric, also std::lcm is directly + // available in numeric + int gcd = std::__gcd(pq_height, pq_width); + int least_common_multiple = pq_height * pq_width / gcd; + + // If the global matrix size is not a multiple of the LCM block size, the + // numbers of send and received blocks may be wrongly calculated. Throw + // exception to prevent this and make aware of this issue! + if (global_width % least_common_multiple > 0) { + throw std::runtime_error( + "Implementation does not support matrix sizes that are not multiple " + "of LCM blocks! Results may be wrong!"); + } + + // MPI requests for non-blocking communication + // First half of vector is for Isend, second half for Irecv! + std::vector accl_requests(2 * gcd); + + // Begin algorithm from Figure 14 for general case + int g = transpose::data_handler::mod(pq_row - pq_col, gcd); + int p = transpose::data_handler::mod(pq_col + g, pq_width); + int q = transpose::data_handler::mod(pq_row - g, pq_height); + + // Pre-calculate target ranks in LCM block + // The vector list variable can be interpreted as 2D matrix. Every entry + // represents the target rank of the sub-block Since the LCM block will + // repeat, we only need to store this small amount of data! + std::vector target_list(least_common_multiple / pq_height * + least_common_multiple / pq_width); + for (int row = 0; row < least_common_multiple / pq_height; row++) { + for (int col = 0; col < least_common_multiple / pq_width; col++) { + int global_block_col = pq_col + col * pq_width; + int global_block_row = pq_row + row * pq_height; + int destination_rank = (global_block_col % pq_height) * pq_width + + (global_block_row % pq_width); + target_list[row * least_common_multiple / pq_width + col] = + destination_rank; + } + } + + // Create some ACCL buffers to send and receive from other FPGAs + // They can reside completely on FPGA + std::vector> send_buffers; + std::vector> recv_buffers; + for (int i = 0; i < gcd; i++) { + // TODO Is there a way to initialize buffer only in FPGA memory with ACCL? 
+ send_buffers.push_back(accl.create_buffer( + data.blockSize * data.blockSize * data.numBlocks, + ACCL::dataType::float32)); + recv_buffers.push_back(accl.create_buffer( + data.blockSize * data.blockSize * data.numBlocks, + ACCL::dataType::float32)); + send_buffers.back()->sync_to_device(); + recv_buffers.back()->sync_to_device(); + } + int current_parallel_execution = 0; + for (int j = 0; j < least_common_multiple / pq_width; j++) { + for (int i = 0; i < least_common_multiple / pq_height; i++) { + // Determine sender and receiver rank of current rank for current + // communication step + int send_rank = + transpose::data_handler::mod(p + i * gcd, pq_width) + + transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width; + int recv_rank = + transpose::data_handler::mod(p - i * gcd, pq_width) + + transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width; + + // Also count receiving buffer size because sending and receiving buffer + // size may differ in certain scenarios! + int receiving_size = 0; + int sending_size = 0; + + std::vector send_rows; + std::vector send_cols; + // Look up which blocks are affected by the current rank + for (int row = 0; row < least_common_multiple / pq_height; row++) { + for (int col = 0; col < least_common_multiple / pq_width; col++) { + if (target_list[row * least_common_multiple / pq_width + col] == + send_rank) { + send_rows.push_back(row); + send_cols.push_back(col); + sending_size += data.blockSize * data.blockSize; + } + if (target_list[row * least_common_multiple / pq_width + col] == + recv_rank) { + receiving_size += data.blockSize * data.blockSize; + } + } + } + receiving_size *= + (height_per_rank) / (least_common_multiple / pq_height) * + ((width_per_rank) / (least_common_multiple / pq_width)); + sending_size *= (height_per_rank) / + (least_common_multiple / pq_height) * + ((width_per_rank) / (least_common_multiple / pq_width)); + +#ifndef NDEBUG + std::cout << "Copy data to send buffers" << std::endl; +#endif + // Copy the required date for this communication step to the send + // buffer! + for (int t = 0; t < send_rows.size(); t++) { + for (int lcm_row = 0; + lcm_row < + (height_per_rank) / (least_common_multiple / pq_height); + lcm_row++) { + for (int lcm_col = 0; + lcm_col < + (width_per_rank) / (least_common_multiple / pq_width); + lcm_col++) { + size_t sending_buffer_offset = + lcm_row * data.blockSize * data.blockSize * + ((width_per_rank) / (least_common_multiple / pq_width)) + + lcm_col * data.blockSize * data.blockSize; + size_t matrix_buffer_offset = + (send_cols[t] + lcm_col * least_common_multiple / pq_width) * + data.blockSize + + (send_rows[t] + lcm_row * least_common_multiple / pq_height) * + width_per_rank * data.blockSize * data.blockSize; + for (int block_row = 0; block_row < data.blockSize; block_row++) { + // TODO May be more efficient when done async! 
+ std::cout << "A(" + << matrix_buffer_offset + + block_row * width_per_rank * data.blockSize + << "," + << matrix_buffer_offset + + block_row * width_per_rank * data.blockSize + + data.blockSize + << ") send(" << sending_buffer_offset << "," + << sending_buffer_offset + data.blockSize << ")" + << std::endl; + accl.copy(*acclBuffersA[0]->slice( + matrix_buffer_offset + + block_row * width_per_rank * data.blockSize, + matrix_buffer_offset + + block_row * width_per_rank * data.blockSize + + data.blockSize), + *send_buffers[current_parallel_execution]->slice( + sending_buffer_offset, + sending_buffer_offset + data.blockSize), + data.blockSize, true, true); + std::cout << "Copy done!" << std::endl; + } + } + } + } + + // Do actual MPI communication +#ifndef NDEBUG + std::cout << "Rank " << mpi_comm_rank << ": blocks (" + << sending_size / (data.blockSize * data.blockSize) << "," + << receiving_size / (data.blockSize * data.blockSize) + << ") send " << send_rank << ", recv " << recv_rank + << std::endl + << std::flush; +#endif + accl_requests[current_parallel_execution] = (accl.send( + *send_buffers[current_parallel_execution], sending_size, + send_rank, 0, ACCL::GLOBAL_COMM, true, + ACCL::dataType::none, true)); + accl_requests[current_parallel_execution + gcd] = (accl.recv( + *recv_buffers[current_parallel_execution], sending_size, + send_rank, 0, ACCL::GLOBAL_COMM, true, + ACCL::dataType::none, true)); + // Increase the counter for parallel executions + current_parallel_execution = (current_parallel_execution + 1) % gcd; + + // Wait for MPI requests if GCD MPI calls are scheduled in parallel + if ((current_parallel_execution) % gcd == 0) { + + for (auto &req : accl_requests) { + + MPI_Status status; + int index; +#ifndef NDEBUG + std::cout << "Wait for all requests to complete" << std::endl; +#endif + // Wait for all send and recv events to complete + // TODO do the CCLO pointers need to be freed? + accl.nop(false, accl_requests); + // For each message that was received in parallel + if (index >= gcd) { + std::vector recv_rows; + std::vector recv_cols; + // Look up which blocks are affected by the current rank + for (int row = 0; row < least_common_multiple / pq_height; + row++) { + for (int col = 0; col < least_common_multiple / pq_width; + col++) { + if (target_list[row * least_common_multiple / pq_width + + col] == status.MPI_SOURCE) { + recv_rows.push_back(row); + recv_cols.push_back(col); + } + } + } + // Copy received data to matrix A buffer + for (int t = 0; t < recv_rows.size(); t++) { + for (int lcm_row = 0; + lcm_row < + (height_per_rank) / (least_common_multiple / pq_height); + lcm_row++) { + for (int lcm_col = 0; + lcm_col < + (width_per_rank) / (least_common_multiple / pq_width); + lcm_col++) { + size_t receiving_buffer_offset = + lcm_row * data.blockSize * data.blockSize * + ((width_per_rank) / + (least_common_multiple / pq_width)) + + lcm_col * data.blockSize * data.blockSize; + size_t matrix_buffer_offset = + (recv_cols[t] + + lcm_col * least_common_multiple / pq_width) * + data.blockSize + + (recv_rows[t] + + lcm_row * least_common_multiple / pq_height) * + width_per_rank * data.blockSize * data.blockSize; + for (int block_row = 0; block_row < data.blockSize; + block_row++) { + // TODO May be more efficient when done async! 
+ accl.copy( + *recv_buffers[current_parallel_execution]->slice( + receiving_buffer_offset, + receiving_buffer_offset + data.blockSize), + *acclBuffersA[0]->slice( + matrix_buffer_offset + + block_row * width_per_rank * data.blockSize, + matrix_buffer_offset + + block_row * width_per_rank * data.blockSize + + data.blockSize), + data.blockSize, true, true); + } + } + } + } + } + } + } + } + } + } + // Copy received matrix A to the buffers of other kernel replications that + // may be placed on different memory banks + for (int b = 1; b < acclBuffersA.size(); b++) { + accl.copy(*acclBuffersA[0], *acclBuffersA[b], + data.blockSize * data.blockSize * data.numBlocks, true, true); + } +} + +/** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ + * distribution and PCIe+MPI over the host for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on + * the FPGA + * @param handler data handler instance that should be used to exchange data + * between hosts + * @return std::unique_ptr The measured + * execution times + */ +static std::map> calculate( + const hpcc_base::ExecutionSettings &config, + transpose::TransposeData &data, + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, fpga_setup::ACCLContext, xrt::uuid> &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != + transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error( + "Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation " + "of this communication method"); +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + throw new std::runtime_error( + "Using the Write Rect method is not supported in this host " + "implementation of this communication method"); +#endif + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = + local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; +#ifndef NDEBUG + std::cout << "Start kernel creation" << std::endl; +#endif + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to + // process. 
+ size_t blocks_per_replication = + (local_matrix_height * local_matrix_width / + config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % + config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the + // number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / + local_matrix_width * local_matrix_width * + data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / + local_matrix_width * local_matrix_width; + + // create the kernels + xrt::kernel transposeKernel( + *config.device, *config.program, + ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); + + if (r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, + data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), + transposeKernel.group_id(0)); + bufferListA.push_back(bufferA); + } + xrt::bo bufferB( + *config.device, + &data.B[bufferStartList[r] * data.blockSize * data.blockSize], + buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + transposeKernel.group_id(2)); + + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; + repetition++) { + +#ifndef NDEBUG + std::cout << "Start data transfer" << std::endl; +#endif + auto startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + if (r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast>( + endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + + // Exchange A data via ACCL +#ifndef NDEBUG + std::cout << "Start data exchange with ACCL" << std::endl; +#endif + accl_exchangeData(*(config.context->accl), handler, data, bufferListA, + config.programSettings->matrixSize / data.blockSize); +#ifndef NDEBUG + std::cout << "End data exchange with ACCL" << std::endl; +#endif + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) { + runs.push_back(transposeKernelList[r]( + (config.programSettings->copyA ? 
bufferListA[r] : bufferListA[0]), bufferListB[r], bufferListA_out[r], + static_cast(bufferOffsetList[r]), + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + } +#ifndef NDEBUG + std::cout << "Wait for kernels to complete" << std::endl; +#endif + for (int r = 0; r < transposeKernelList.size(); r++) { + runs[r].wait(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " + << std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() + << "s (" + << ((config.programSettings->matrixSize * + config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * + 3) / + std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() * + 1.0e-9) + << " GB/s)" << std::endl; +#endif + + std::chrono::duration calculationTime = + std::chrono::duration_cast>( + endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer( + local_matrix_height * local_matrix_width * data.blockSize * + data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; + col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + + col]; + } + } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + + local_matrix_width * data.blockSize * data.blockSize, + tmp_write_buffer.begin() + bufferSizeList[r], + &data.result[(bufferStartList[r] + local_matrix_width) * + data.blockSize * data.blockSize]); + } else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read( + &data.result[bufferStartList[r] * data.blockSize * data.blockSize]); + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + + accl_exchangeData(*(config.context->accl), handler, data, bufferListA, + config.programSettings->matrixSize / data.blockSize); + + transferTime += std::chrono::duration_cast>( + endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; +} + +} // namespace accl_pq +} // namespace fpga_execution +} // namespace transpose + +#endif diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp new file mode 100644 index 00000000..84121480 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq.hpp @@ -0,0 +1,361 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the 
Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_ACCL_STREAM_PQ_EXECUTION_H_ +#define SRC_HOST_ACCL_STREAM_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "data_handlers/data_handler_types.h" +#include "data_handlers/pq.hpp" +#include "transpose_data.hpp" +#include "cclo_bfm.h" +#include "Simulation.h" +#include "accl.hpp" + +extern void transpose_write0(const DEVICE_DATA_TYPE *B, + DEVICE_DATA_TYPE *A_out, + const unsigned int offset_b, + const unsigned int number_of_blocks, + const unsigned int width_in_blocks, + const unsigned int height_in_blocks, + hlslib::Stream &cclo2krnl); + +extern void transpose_read0( const DEVICE_DATA_TYPE *A, + const unsigned int offset_a, + const unsigned int number_of_blocks, + const unsigned int width_in_blocks, + const unsigned int height_in_blocks, + hlslib::Stream &krnl2cclo); + +namespace transpose { +namespace fpga_execution { +namespace accl_stream_pq { + +/** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ + * distribution and PCIe+MPI over the host for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on + * the FPGA + * @param handler data handler instance that should be used to exchange data + * between hosts + * @return std::unique_ptr The measured + * execution times + */ +static std::map> calculate( + const hpcc_base::ExecutionSettings &config, + transpose::TransposeData &data, + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, fpga_setup::ACCLContext, xrt::uuid> &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != + transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error( + "Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation " + "of this communication method"); +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + throw new std::runtime_error( + "Using the Write Rect method is not supported in this host " + "implementation of this communication method"); +#endif + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeReadKernelList; + std::vector transposeWriteKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = + local_matrix_width * data.blockSize * 
sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; +#ifndef NDEBUG + std::cout << "Start kernel creation" << std::endl; +#endif + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to + // process. + size_t blocks_per_replication = + (local_matrix_height * local_matrix_width / + config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % + config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the + // number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / + local_matrix_width * local_matrix_width * + data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / + local_matrix_width * local_matrix_width; + + if (!config.programSettings->useAcclEmulation) { + // create the kernels + xrt::kernel transposeReadKernel( + *config.device, *config.program, + ("transpose_read0:{transpose_read0_" + std::to_string(r + 1) + "}").c_str()); + xrt::kernel transposeWriteKernel( + *config.device, *config.program, + ("transpose_write0:{transpose_write0_" + std::to_string(r + 1) + "}").c_str()); + + if (r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, + data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), + transposeReadKernel.group_id(0)); + bufferListA.push_back(bufferA); + } + xrt::bo bufferB( + *config.device, + &data.B[bufferStartList[r] * data.blockSize * data.blockSize], + buffer_size * sizeof(HOST_DATA_TYPE), transposeWriteKernel.group_id(0)); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + transposeWriteKernel.group_id(1)); + + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeReadKernelList.push_back(transposeReadKernel); + transposeWriteKernelList.push_back(transposeWriteKernel); + } + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; + repetition++) { + +#ifndef NDEBUG + std::cout << "Start data transfer" << std::endl; +#endif + auto startTransfer = std::chrono::high_resolution_clock::now(); + + if (!config.programSettings->useAcclEmulation) { + for (int r = 0; r < transposeReadKernelList.size(); r++) { + if (r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast>( + endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + + HLSLIB_DATAFLOW_INIT(); + hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); + hlslib::Stream cmd, sts; + + int pq_width = handler.getP(); + + int mpi_comm_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); + int mpi_comm_size; + 
MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size); + int pq_row = mpi_comm_rank / pq_width; + int pq_col = mpi_comm_rank % pq_width; + + int pair_rank = pq_width * pq_col + pq_row; + std::vector dest = {0, 9}; + std::unique_ptr cclo; + if (config.programSettings->useAcclEmulation) { +#ifndef NDEBUG + std::cout << "Start BFM" << std::endl; +#endif + cclo = std::make_unique(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + cclo->run(); + } + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + +#ifndef NDEBUG + std::cout << "Start kernel execution" << std::endl; +#endif + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + if (!config.programSettings->useAcclEmulation) { + runs.push_back(transposeReadKernelList[r]( + (config.programSettings->copyA ? bufferListA[r] : bufferListA[0]), + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + runs.push_back(transposeWriteKernelList[r]( + bufferListB[r], bufferListA_out[r], + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + } else { + HLSLIB_DATAFLOW_FUNCTION(transpose_read0, + (config.programSettings->copyA ? data.A : data.A), + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)), + krnl2cclo); + HLSLIB_DATAFLOW_FUNCTION(transpose_write0, + data.B, data.result, + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)), + cclo2krnl); + } + } + // Exchange A data via ACCL + config.context->accl->stream_put(ACCL::dataType::float32, data.blockSize * data.blockSize * data.numBlocks, + pair_rank, 0); +#ifndef NDEBUG + std::cout << "Wait for kernels to complete" << std::endl; +#endif + for (int r = 0; r < runs.size(); r++) { + runs[r].wait(); + } + HLSLIB_DATAFLOW_FINALIZE(); + MPI_Barrier(MPI_COMM_WORLD); + if (config.programSettings->useAcclEmulation) { + cclo->stop(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " + << std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() + << "s (" + << ((config.programSettings->matrixSize * + config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * + 3) / + std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() * + 1.0e-9) + << " GB/s)" << std::endl; +#endif + + std::chrono::duration calculationTime = + std::chrono::duration_cast>( + endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer( + local_matrix_height * local_matrix_width * data.blockSize * + data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + if 
(!config.programSettings->useAcclEmulation) { + for (int r = 0; r < transposeReadKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; + col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + + col]; + } + } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + + local_matrix_width * data.blockSize * data.blockSize, + tmp_write_buffer.begin() + bufferSizeList[r], + &data.result[(bufferStartList[r] + local_matrix_width) * + data.blockSize * data.blockSize]); + } else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read( + &data.result[bufferStartList[r] * data.blockSize * data.blockSize]); + } + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + + transferTime += std::chrono::duration_cast>( + endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; +} + +} // namespace accl_pq +} // namespace fpga_execution +} // namespace transpose + +#endif diff --git a/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp new file mode 100644 index 00000000..20c9f596 --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp @@ -0,0 +1,468 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ +#ifndef SRC_HOST_ACCL_STREAM_PQ_SENDRECV_EXECUTION_H_ +#define SRC_HOST_ACCL_STREAM_PQ_SENDRECV_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "data_handlers/data_handler_types.h" +#include "data_handlers/pq.hpp" +#include "transpose_data.hpp" +#include "cclo_bfm.h" +#include "Simulation.h" +#include "accl.hpp" + +void transpose_write_sendrecv(const DEVICE_DATA_TYPE* B, + DEVICE_DATA_TYPE* C, + const int* target_list, + int pq_row, int pq_col, + int pq_width, int pq_height, + int gcd, int least_common_multiple, + int height_per_rank, + int width_per_rank, + STREAM &cclo2krnl); + +void transpose_read_sendrecv(const DEVICE_DATA_TYPE* A, + const int* target_list, + int pq_row, int pq_col, + int pq_width, int pq_height, + int gcd, int least_common_multiple, + int height_per_rank, + int width_per_rank, + STREAM &krnl2cclo); + +namespace transpose { +namespace fpga_execution { +namespace accl_stream_sendrecv_pq { + +/** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ + * distribution and PCIe+MPI over the host for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on + * the FPGA + * @param handler data handler instance that should be used to exchange data + * between hosts + * @return std::unique_ptr The measured + * execution times + */ +static std::map> calculate( + const hpcc_base::ExecutionSettings &config, + transpose::TransposeData &data, + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, fpga_setup::ACCLContext, xrt::uuid> &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != + transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error( + "Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation " + "of this communication method"); +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + throw new std::runtime_error( + "Using the Write Rect method is not supported in this host " + "implementation of this communication method"); +#endif + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector>> bufferListTargets; + std::vector>> bufferListCopy; + std::vector transposeReadKernelList; + std::vector transposeWriteKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t local_matrix_width_bytes = + local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; + + // Algorithm defines + int pq_width = handler.getP(); + int pq_height = handler.getQ(); + + int mpi_comm_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_comm_rank); + int mpi_comm_size; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size); + int pq_row = mpi_comm_rank / pq_width; + int pq_col = mpi_comm_rank % pq_width; + + int gcd = std::__gcd(pq_height, pq_width); + int least_common_multiple = pq_height * pq_width / gcd; + +#ifndef NDEBUG + std::cout << "Start kernel creation" << std::endl; +#endif + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel 
replication will need to + // process. + size_t blocks_per_replication = + (local_matrix_height * local_matrix_width / + config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % + config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the + // number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / + local_matrix_width * local_matrix_width * + data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + +#ifndef NDEBUG + std::cout << "Blocks per replication: " << blocks_per_replication << std::endl; +#endif + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / + local_matrix_width * local_matrix_width; + + // Pre-calculate target ranks in LCM block + // The vector list variable can be interpreted as 2D matrix. Every entry + // represents the target rank of the sub-block Since the LCM block will + // repeat, we only need to store this small amount of data! + auto target_list = config.context->accl->create_buffer(least_common_multiple / pq_height * + least_common_multiple / pq_width, ACCL::dataType::int32); + bufferListCopy.push_back(config.context->accl->create_buffer(buffer_size, ACCL::dataType::float32)); + for (int row = 0; row < least_common_multiple / pq_height; row++) { + for (int col = 0; col < least_common_multiple / pq_width; col++) { + int global_block_col = pq_col + col * pq_width; + int global_block_row = pq_row + row * pq_height; + int destination_rank = (global_block_col % pq_height) * pq_width + + (global_block_row % pq_width); + target_list->buffer()[row * least_common_multiple / pq_width + col] = + destination_rank; + } + } + target_list->sync_to_device(); + bufferListTargets.push_back(std::move(target_list)); + + if (!config.programSettings->useAcclEmulation) { + // create the kernels + xrt::kernel transposeReadKernel( + *config.device, *config.program, + ("transpose_read_sendrecv0:{transpose_read_sendrecv0_" + std::to_string(r + 1) + "}").c_str()); + xrt::kernel transposeWriteKernel( + *config.device, *config.program, + ("transpose_write_sendrecv0:{transpose_write_sendrecv0_" + std::to_string(r + 1) + "}").c_str()); + + if (r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, + data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), + transposeReadKernel.group_id(0)); + bufferListA.push_back(bufferA); + } + + xrt::bo bufferB( + *config.device, + &data.B[bufferStartList[r] * data.blockSize * data.blockSize], + buffer_size * sizeof(HOST_DATA_TYPE), transposeWriteKernel.group_id(0)); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + transposeWriteKernel.group_id(1)); + + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeReadKernelList.push_back(transposeReadKernel); + transposeWriteKernelList.push_back(transposeWriteKernel); + } + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; + repetition++) { + +#ifndef NDEBUG + std::cout << 
"Start data transfer" << std::endl; +#endif + auto startTransfer = std::chrono::high_resolution_clock::now(); + + if (!config.programSettings->useAcclEmulation) { + for (int r = 0; r < transposeReadKernelList.size(); r++) { + if (r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast>( + endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + +#ifndef NDEBUG + std::cout << "Start BFM" << std::endl; +#endif + + HLSLIB_DATAFLOW_INIT(); + hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); + hlslib::Stream cmd, sts; + + std::vector dest = {0, 9}; + std::unique_ptr cclo; + if (config.programSettings->useAcclEmulation) { +#ifndef NDEBUG + std::cout << "Start BFM" << std::endl; +#endif + cclo = std::make_unique(6000, mpi_comm_rank, mpi_comm_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + cclo->run(); + } + MPI_Barrier(MPI_COMM_WORLD); + + auto startCalculation = std::chrono::high_resolution_clock::now(); + +#ifndef NDEBUG + std::cout << "Start kernel execution" << std::endl; +#endif + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + if (!config.programSettings->useAcclEmulation) { + runs.push_back(transposeReadKernelList[r]( + (config.programSettings->copyA ? bufferListA[r] : bufferListA[0]), + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + runs.push_back(transposeWriteKernelList[r]( + bufferListB[r], bufferListA_out[r], + static_cast(bufferOffsetList[r]), + static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)))); + } else { + HLSLIB_DATAFLOW_FUNCTION(transpose_read_sendrecv, + (config.programSettings->copyA ? 
data.A : data.A), + bufferListTargets[r]->buffer(), + pq_row, pq_col, pq_width, pq_height, + gcd, least_common_multiple, + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)), + krnl2cclo); + HLSLIB_DATAFLOW_FUNCTION(transpose_write_sendrecv, + data.B, data.result, + bufferListTargets[r]->buffer(), + pq_row, pq_col, pq_width, pq_height, + gcd, least_common_multiple, + static_cast(handler.getWidthforRank()), + static_cast( + (bufferSizeList[r]) / + (local_matrix_width * data.blockSize * data.blockSize)), + cclo2krnl); + } + } +#ifndef NDEBUG + std::cout << "Start ACCL send/recv" << std::endl; +#endif + auto dbuffer = config.context->accl->create_buffer(1,ACCL::dataType::float32); + int g = transpose::data_handler::mod(pq_row - pq_col, gcd); + int p = transpose::data_handler::mod(pq_col + g, pq_width); + int q = transpose::data_handler::mod(pq_row - g, pq_height); + // Exchange A data via ACCL + for (int k=0; k < 2; k++) { + for (int j = 0; j < least_common_multiple/pq_width; j++) { + for (int i = 0; i < least_common_multiple/pq_height; i++) { + // Determine sender and receiver rank of current rank for current communication step + int send_rank = transpose::data_handler::mod(p + i * gcd, pq_width) + transpose::data_handler::mod(q - j * gcd, pq_height) * pq_width; + int recv_rank = transpose::data_handler::mod(p - i * gcd, pq_width) + transpose::data_handler::mod(q + j * gcd, pq_height) * pq_width; + + // Also count receiving buffer size because sending and receiving buffer size may differ in certain scenarios! + int receiving_size = 0; + int sending_size = 0; + + std::vector send_rows; + std::vector send_cols; + // Look up which blocks are affected by the current rank + for (int row = 0; row < least_common_multiple/pq_height; row++) { + for (int col = 0; col < least_common_multiple/pq_width; col++) { +#ifndef NDEBUG + std::cout << "Check" << row * least_common_multiple/pq_width + col << std::endl; +#endif + if (bufferListTargets[0]->buffer()[row * least_common_multiple/pq_width + col] == send_rank) { + send_rows.push_back(row); + send_cols.push_back(col); + sending_size += data.blockSize * data.blockSize; + } + if (bufferListTargets[0]->buffer()[row * least_common_multiple/pq_width + col] == recv_rank) { + receiving_size += data.blockSize * data.blockSize; + } + } + } + receiving_size *= (local_matrix_height)/(least_common_multiple/pq_height) * ((local_matrix_width)/(least_common_multiple/pq_width)); + sending_size *= (local_matrix_height)/(least_common_multiple/pq_height) * ((local_matrix_width)/(least_common_multiple/pq_width)); + + // Do actual MPI communication + if (k==0) { + // First schedule all sends, then all receives. This works if communication rounds <= ACCL buffers. 
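+          // Each rank posts at most (least_common_multiple / pq_width) *
+          // (least_common_multiple / pq_height) sends in the k == 0 pass before the
+          // matching receives are issued in the k == 1 pass, so the CCLO spare
+          // buffers have to absorb that many in-flight messages per rank.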
+ // Non-blocking communication would not offer many benefits, because the CCLO can only execute send OR recv +#ifndef NDEBUG + std::cout << "Send blocks " << sending_size / (data.blockSize * data.blockSize) << " to " << send_rank << std::endl << std::flush; +#endif + if (send_rank == mpi_comm_rank) { + //TODO copy from and to string not implemented in driver yet + // config.accl->copy_from_stream(*bufferListCopy[0], sending_size); + } else { + config.context->accl->send(ACCL::dataType::float32, sending_size, send_rank, 0); + } + } else { + #ifndef NDEBUG + std::cout << "Recv blocks " << receiving_size / (data.blockSize * data.blockSize) << " from " << recv_rank << std::endl << std::flush; + #endif + if (recv_rank == mpi_comm_rank) { + //TODO copy from and to string not implemented in driver yet + // config.accl->copy_to_stream(*bufferListCopy[0], receiving_size); + } else { + config.context->accl->recv(ACCL::dataType::float32, receiving_size, recv_rank, 0); + } + } + } + } + } + +#ifndef NDEBUG + std::cout << "Wait for kernels to complete" << std::endl; +#endif + for (int r = 0; r < runs.size(); r++) { + runs[r].wait(); + } + MPI_Barrier(MPI_COMM_WORLD); + HLSLIB_DATAFLOW_FINALIZE(); + if (config.programSettings->useAcclEmulation) { + cclo->stop(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " + << std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() + << "s (" + << ((config.programSettings->matrixSize * + config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * + 3) / + std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() * + 1.0e-9) + << " GB/s)" << std::endl; +#endif + + std::chrono::duration calculationTime = + std::chrono::duration_cast>( + endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer( + local_matrix_height * local_matrix_width * data.blockSize * + data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + if (!config.programSettings->useAcclEmulation) { + for (int r = 0; r < transposeReadKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; + col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + + col]; + } + } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + + local_matrix_width * data.blockSize * data.blockSize, + tmp_write_buffer.begin() + bufferSizeList[r], + &data.result[(bufferStartList[r] + local_matrix_width) * + data.blockSize * data.blockSize]); + } else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read( + &data.result[bufferStartList[r] * data.blockSize * data.blockSize]); + } + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + + transferTime += std::chrono::duration_cast>( + endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::map> 
timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; +} + +} // namespace accl_pq +} // namespace fpga_execution +} // namespace transpose + +#endif diff --git a/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp new file mode 100644 index 00000000..f621394a --- /dev/null +++ b/PTRANS/src/host/execution_types/execution_xrt_pcie_pq.hpp @@ -0,0 +1,284 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_XRT_PCIE_PQ_EXECUTION_H_ +#define SRC_HOST_XRT_PCIE_PQ_EXECUTION_H_ + +/* C++ standard library headers */ +#include +#include +#include + +/* Project's headers */ +#include "data_handlers/data_handler_types.h" +#include "data_handlers/pq.hpp" +#include "transpose_benchmark.hpp" + +namespace transpose { +namespace fpga_execution { +namespace pcie_pq { + +/** + * @brief Transpose and add the matrices using the OpenCL kernel using a PQ + * distribution and PCIe+MPI over the host for communication + * + * @param config The progrma configuration + * @param data data object that contains all required data for the execution on + * the FPGA + * @param handler data handler instance that should be used to exchange data + * between hosts + * @return std::unique_ptr The measured + * execution times + */ +template +static std::map> calculate( + const hpcc_base::ExecutionSettings &config, + transpose::TransposeData &data, + transpose::data_handler::DistributedPQTransposeDataHandler< + xrt::device, TContext, xrt::uuid> &handler) { + int err; + + if (config.programSettings->dataHandlerIdentifier != + transpose::data_handler::DataHandlerType::pq) { + throw std::runtime_error( + "Used data handler not supported by execution handler!"); + } +#ifdef USE_SVM + throw new std::runtime_error("SVM not supported in the host implementation " + "of this communication method"); +#endif +#ifdef USE_BUFFER_WRITE_RECT_FOR_A + throw new std::runtime_error( + "Using the Write Rect method is not supported in this host " + "implementation of this communication method"); +#endif + + std::vector bufferSizeList; + std::vector bufferStartList; + std::vector bufferOffsetList; + std::vector bufferListA; + std::vector bufferListB; + std::vector bufferListA_out; + std::vector transposeKernelList; + std::vector blocksPerReplication; + + size_t local_matrix_width = handler.getWidthforRank(); + size_t local_matrix_height = handler.getHeightforRank(); + size_t 
local_matrix_width_bytes = + local_matrix_width * data.blockSize * sizeof(HOST_DATA_TYPE); + + size_t total_offset = 0; + size_t row_offset = 0; + // Setup the kernels depending on the number of kernel replications + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + // Calculate how many blocks the current kernel replication will need to + // process. + size_t blocks_per_replication = + (local_matrix_height * local_matrix_width / + config.programSettings->kernelReplications); + size_t blocks_remainder = (local_matrix_height * local_matrix_width) % + config.programSettings->kernelReplications; + if (blocks_remainder > r) { + // Catch the case, that the number of blocks is not divisible by the + // number of kernel replications + blocks_per_replication += 1; + } + if (blocks_per_replication < 1) { + continue; + } + blocksPerReplication.push_back(blocks_per_replication); + size_t buffer_size = (blocks_per_replication + local_matrix_width - 1) / + local_matrix_width * local_matrix_width * + data.blockSize * data.blockSize; + bufferSizeList.push_back(buffer_size); + bufferStartList.push_back(total_offset); + bufferOffsetList.push_back(row_offset); + + row_offset = (row_offset + blocks_per_replication) % local_matrix_width; + + total_offset += (bufferOffsetList.back() + blocks_per_replication) / + local_matrix_width * local_matrix_width; + + // create the kernels + xrt::kernel transposeKernel(*config.device, *config.program, + ("transpose0:{transpose0_" + std::to_string(r + 1) + "}").c_str()); + + if ( r == 0 || config.programSettings->copyA) { + xrt::bo bufferA(*config.device, data.A, + data.numBlocks * data.blockSize * data.blockSize * + sizeof(HOST_DATA_TYPE), + transposeKernel.group_id(0)); + bufferListA.push_back(bufferA); + } + xrt::bo bufferB( + *config.device, + &data.B[bufferStartList[r] * data.blockSize * data.blockSize], + buffer_size * sizeof(HOST_DATA_TYPE), transposeKernel.group_id(1)); + // TODO For small matrices, the 4KB alignment might fail for buffer B. 
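+    // bufferB wraps the host pointer &data.B[bufferStartList[r] * data.blockSize *
+    // data.blockSize]. XRT expects user pointers handed to xrt::bo to be 4 KiB
+    // aligned; data.B itself is allocated with posix_memalign(..., 4096, ...), but
+    // for small block sizes the byte offset bufferStartList[r] * blockSize^2 *
+    // sizeof(HOST_DATA_TYPE) is not necessarily a multiple of 4096, so the
+    // construction above can fail.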
+ // Temporary fix seen in lines below (requires extra copying) xrt::bo + // bufferB(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + // transposeKernel.group_id(1)); bufferB.write(data.B + bufferStartList[r] * + // data.blockSize * data.blockSize); + xrt::bo bufferA_out(*config.device, buffer_size * sizeof(HOST_DATA_TYPE), + transposeKernel.group_id(2)); + + bufferListB.push_back(bufferB); + bufferListA_out.push_back(bufferA_out); + transposeKernelList.push_back(transposeKernel); + } + + std::vector transferTimings; + std::vector calculationTimings; + + for (int repetition = 0; repetition < config.programSettings->numRepetitions; repetition++) { + + auto startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + bufferListB[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + auto endTransfer = std::chrono::high_resolution_clock::now(); + + std::chrono::duration transferTime = + std::chrono::duration_cast>( + endTransfer - startTransfer); + + MPI_Barrier(MPI_COMM_WORLD); + int mpi_size; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + auto startCalculation = std::chrono::high_resolution_clock::now(); + + if (mpi_size > 1) { + for (int r = 0; r < transposeKernelList.size(); r++) { + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + } + } + + // Exchange A data via PCIe and MPI + handler.exchangeData(data); + + std::copy(data.A, data.A + data.numBlocks * data.blockSize * data.blockSize, + data.exchange); + for (int r = 0; r < transposeKernelList.size(); r++) { + if ( r == 0 || config.programSettings->copyA) { + bufferListA[r].sync(XCL_BO_SYNC_BO_TO_DEVICE); + } + } + } + + std::vector runs; + auto startKernelCalculation = std::chrono::high_resolution_clock::now(); + for (int r = 0; r < transposeKernelList.size(); r++) { + runs.push_back(transposeKernelList[r]( + (config.programSettings->copyA ? bufferListA[r] : bufferListA[0]), bufferListB[r], bufferListA_out[r], + static_cast(bufferStartList[r] + bufferOffsetList[r]), + static_cast(bufferOffsetList[r]), static_cast(blocksPerReplication[r]), + static_cast(handler.getWidthforRank()), + static_cast(handler.getHeightforRank()))); + } + for (int r = 0; r < transposeKernelList.size(); r++) { + runs[r].wait(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); +#ifndef NDEBUG + int mpi_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + std::cout << "Rank " << mpi_rank << ": " + << "Done i=" << repetition << std::endl; + std::cout << "Kernel execution time: " + << std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() + << "s (" + << ((config.programSettings->matrixSize * + config.programSettings->matrixSize * sizeof(HOST_DATA_TYPE) * + 3) / + std::chrono::duration_cast>( + endCalculation - startKernelCalculation) + .count() * + 1.0e-9) + << " GB/s)" << std::endl; +#endif + + // Transfer back data for next repetition! 
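+      // The PQ exchange swaps blocks with the same partner ranks in both
+      // directions, so invoking it a second time restores the original block
+      // distribution of A before the next repetition starts.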
+ handler.exchangeData(data); + + std::chrono::duration calculationTime = + std::chrono::duration_cast>( + endCalculation - startCalculation); + calculationTimings.push_back(calculationTime.count()); + + std::vector tmp_write_buffer( + local_matrix_height * local_matrix_width * data.blockSize * + data.blockSize); + + startTransfer = std::chrono::high_resolution_clock::now(); + + for (int r = 0; r < transposeKernelList.size(); r++) { + // Copy possibly incomplete first block row + if (bufferOffsetList[r] != 0) { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read(tmp_write_buffer.data()); + for (int row = 0; row < data.blockSize; row++) { + for (int col = bufferOffsetList[r] * data.blockSize; + col < local_matrix_width * data.blockSize; col++) { + data.result[bufferStartList[r] * data.blockSize * data.blockSize + + row * local_matrix_width * data.blockSize + col] = + tmp_write_buffer[row * local_matrix_width * data.blockSize + + col]; + } + } + // Copy remaining buffer + std::copy(tmp_write_buffer.begin() + + local_matrix_width * data.blockSize * data.blockSize, + tmp_write_buffer.begin() + bufferSizeList[r], + &data.result[(bufferStartList[r] + local_matrix_width) * + data.blockSize * data.blockSize]); + } else { + bufferListA_out[r].sync(XCL_BO_SYNC_BO_FROM_DEVICE); + bufferListA_out[r].read( + &data.result[bufferStartList[r] * data.blockSize * data.blockSize]); + } + } + endTransfer = std::chrono::high_resolution_clock::now(); + transferTime += std::chrono::duration_cast>( + endTransfer - startTransfer); + transferTimings.push_back(transferTime.count()); + } + + std::map> timings; + timings["transfer"] = transferTimings; + timings["calculation"] = calculationTimings; + return timings; + +} + +} // namespace pcie_pq +} // namespace fpga_execution +} // namespace transpose + +#endif diff --git a/PTRANS/src/host/main.cpp b/PTRANS/src/host/main.cpp index a054f7dd..126f6ff3 100644 --- a/PTRANS/src/host/main.cpp +++ b/PTRANS/src/host/main.cpp @@ -8,7 +8,15 @@ The program entry point int main(int argc, char *argv[]) { // Setup benchmark - TransposeBenchmark bm(argc, argv); +#ifdef USE_OCL_HOST + TransposeBenchmark bm(argc, argv); +#else +#ifndef USE_ACCL + TransposeBenchmark bm(argc, argv); +#else + TransposeBenchmark bm(argc, argv); +#endif +#endif bool success = bm.executeBenchmark(); if (success) { return 0; diff --git a/PTRANS/src/host/transpose_benchmark.cpp b/PTRANS/src/host/transpose_benchmark.cpp deleted file mode 100644 index 755b11a0..00000000 --- a/PTRANS/src/host/transpose_benchmark.cpp +++ /dev/null @@ -1,185 +0,0 @@ -// -// Created by Marius Meyer on 04.12.19. -// - -/* -Copyright (c) 2019 Marius Meyer - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -#include "transpose_benchmark.hpp" - -/* C++ standard library headers */ -#include -#include - -/* Project's headers */ -#include "execution_types/execution_intel.hpp" -#include "execution_types/execution_intel_pq.hpp" -#include "execution_types/execution_pcie.hpp" -#include "execution_types/execution_pcie_pq.hpp" -#include "execution_types/execution_cpu.hpp" -#include "communication_types.hpp" - -#include "data_handlers/data_handler_types.h" -#include "data_handlers/diagonal.hpp" -#include "data_handlers/pq.hpp" - -#include "parameters.h" - - -transpose::TransposeBenchmark::TransposeBenchmark(int argc, char* argv[]) : HpccFpgaBenchmark(argc, argv) { - if (setupBenchmark(argc, argv)) { - setTransposeDataHandler(executionSettings->programSettings->dataHandlerIdentifier); - } -} - -void -transpose::TransposeBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { - options.add_options() - ("m", "Matrix size in number of blocks in one dimension", - cxxopts::value()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) - ("b", "Block size in number of values in one dimension", - cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))) - ("p", "Value of P that equals the width of the PQ grid of FPGAs. Q is determined by the world size.", - cxxopts::value()->default_value(std::to_string(DEFAULT_P_VALUE))) - ("distribute-buffers", "Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. 
This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs.") - ("handler", "Specify the used data handler that distributes the data over devices and memory banks", - cxxopts::value()->default_value(DEFAULT_DIST_TYPE)); -} - -std::unique_ptr -transpose::TransposeBenchmark::executeKernel(TransposeData &data) { - switch (executionSettings->programSettings->communicationType) { - case hpcc_base::CommunicationType::intel_external_channels: - if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { - return transpose::fpga_execution::intel::calculate(*executionSettings, data); - } - else { - return transpose::fpga_execution::intel_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); - } break; - case hpcc_base::CommunicationType::pcie_mpi : - if (executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { - return transpose::fpga_execution::pcie::calculate(*executionSettings, data, *dataHandler); - } - else { - return transpose::fpga_execution::pcie_pq::calculate(*executionSettings, data, reinterpret_cast(*dataHandler)); - } break; -#ifdef MKL_FOUND - case hpcc_base::CommunicationType::cpu_only : return transpose::fpga_execution::cpu::calculate(*executionSettings, data, *dataHandler); break; -#endif - default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(executionSettings->programSettings->communicationType)); - } -} - -void -transpose::TransposeBenchmark::collectAndPrintResults(const transpose::TransposeExecutionTimings &output) { - double flops = static_cast(executionSettings->programSettings->matrixSize) * executionSettings->programSettings->matrixSize; - - // Number of experiment repetitions - uint number_measurements = output.calculationTimings.size(); - std::vector max_measures(number_measurements); - std::vector max_transfers(number_measurements); -#ifdef _USE_MPI_ - // Copy the object variable to a local variable to make it accessible to the lambda function - int mpi_size = mpi_comm_size; - MPI_Reduce(output.calculationTimings.data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - MPI_Reduce(output.transferTimings.data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); -#else - std::copy(output.calculationTimings.begin(), output.calculationTimings.end(), max_measures.begin()); - std::copy(output.transferTimings.begin(), output.transferTimings.end(), max_transfers.begin()); -#endif - - double avgCalculationTime = accumulate(max_measures.begin(), max_measures.end(), 0.0) - / max_measures.size(); - double minCalculationTime = *min_element(max_measures.begin(), max_measures.end()); - - double avgTransferTime = accumulate(max_transfers.begin(), max_transfers.end(), 0.0) - / max_transfers.size(); - double minTransferTime = *min_element(max_transfers.begin(), max_transfers.end()); - - double avgCalcFLOPS = flops / avgCalculationTime; - double maxCalcFLOPS = flops / minCalculationTime; - double avgMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime; - double maxMemBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime; - double avgTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime; - double maxTransferBandwidth = flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime; - - - - - if (mpi_comm_rank == 0) { - std::cout << " total [s] transfer [s] 
calc [s] calc FLOPS Mem [B/s] PCIe [B/s]" << std::endl; - std::cout << "avg: " << (avgTransferTime + avgCalculationTime) - << " " << avgTransferTime - << " " << avgCalculationTime - << " " << avgCalcFLOPS - << " " << avgMemBandwidth - << " " << avgTransferBandwidth - << std::endl; - std::cout << "best: " << (minTransferTime + minCalculationTime) - << " " << minTransferTime - << " " << minCalculationTime - << " " << maxCalcFLOPS - << " " << maxMemBandwidth - << " " << maxTransferBandwidth - << std::endl; - } -} - -std::unique_ptr -transpose::TransposeBenchmark::generateInputData() { - return dataHandler->generateData(*executionSettings); -} - -bool -transpose::TransposeBenchmark::validateOutputAndPrintError(transpose::TransposeData &data) { - - // exchange the data using MPI depending on the chosen distribution scheme - dataHandler->exchangeData(data); - - dataHandler->reference_transpose(data); - - double max_error = 0.0; - for (size_t i = 0; i < executionSettings->programSettings->blockSize * executionSettings->programSettings->blockSize * data.numBlocks; i++) { - max_error = std::max(fabs(data.A[i]), max_error); - } - - double global_max_error = 0; - MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - - if (mpi_comm_rank == 0) { - std::cout << "Maximum error: " << global_max_error << " < " << 100 * std::numeric_limits::epsilon() << std::endl; - std::cout << "Mach. Epsilon: " << std::numeric_limits::epsilon() << std::endl; - } - - return static_cast(global_max_error) < 100 * std::numeric_limits::epsilon(); -} - -void -transpose::TransposeBenchmark::setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier) { - switch (dataHandlerIdentifier) { - case transpose::data_handler::DataHandlerType::diagonal: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(mpi_comm_rank, mpi_comm_size)); break; - case transpose::data_handler::DataHandlerType::pq: dataHandler = std::unique_ptr(new transpose::data_handler::DistributedPQTransposeDataHandler(mpi_comm_rank, mpi_comm_size, executionSettings->programSettings->p)); break; - default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); - } - - -} diff --git a/PTRANS/src/host/transpose_benchmark.hpp b/PTRANS/src/host/transpose_benchmark.hpp index 5de333ca..c00adbd9 100644 --- a/PTRANS/src/host/transpose_benchmark.hpp +++ b/PTRANS/src/host/transpose_benchmark.hpp @@ -28,13 +28,29 @@ SOFTWARE. 
#include /* Project's headers */ +#include "parameters.h" #include "hpcc_benchmark.hpp" #include "transpose_data.hpp" +#ifdef USE_OCL_HOST +#include "execution_types/execution_intel.hpp" +#include "execution_types/execution_intel_pq.hpp" +#include "execution_types/execution_pcie.hpp" +#include "execution_types/execution_pcie_pq.hpp" +#endif +#ifdef USE_XRT_HOST +#include "execution_types/execution_xrt_pcie_pq.hpp" +#ifdef USE_ACCL +#include "execution_types/execution_xrt_accl_pq.hpp" +#include "execution_types/execution_xrt_accl_stream_pq_sendrecv.hpp" +#include "execution_types/execution_xrt_accl_stream_pq.hpp" +#endif +#endif +#include "execution_types/execution_cpu.hpp" +#include "communication_types.hpp" #include "data_handlers/data_handler_types.h" -#include "data_handlers/handler.hpp" - -#include "parameters.h" +#include "data_handlers/diagonal.hpp" +#include "data_handlers/pq.hpp" /** * @brief Contains all classes and methods needed by the Transpose benchmark @@ -46,8 +62,9 @@ namespace transpose { * @brief Implementation of the transpose benchmark * */ -class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark { - +template +class TransposeBenchmark : +public hpcc_base::HpccFpgaBenchmark> { protected: /** @@ -56,9 +73,22 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark()->default_value(std::to_string(DEFAULT_MATRIX_SIZE))) + ("b", "Block size in number of values in one dimension", + cxxopts::value()->default_value(std::to_string(BLOCK_SIZE))) + ("p", "Value of P that equals the width of the PQ grid of FPGAs. Q is determined by the world size.", + cxxopts::value()->default_value(std::to_string(DEFAULT_P_VALUE))) + ("distribute-buffers", "Distribute buffers over memory banks. This will use three memory banks instead of one for a single kernel replication, but kernel replications may interfere. 
This is an Intel only attribute, since buffer placement is decided at compile time for Xilinx FPGAs.") + ("handler", "Specify the used data handler that distributes the data over devices and memory banks", + cxxopts::value()->default_value(DEFAULT_DIST_TYPE)) + ("copy-a", "Create a copy of matrix A for each kernel replication") + ("accl-stream", "Use design with user kernels directly connected to CCLO"); + } - std::unique_ptr dataHandler; + std::unique_ptr> dataHandler; public: @@ -67,24 +97,74 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark The input and output data of the benchmark */ - std::unique_ptr - generateInputData() override; + std::unique_ptr> + generateInputData() override { + return this->dataHandler->generateData(*(this->executionSettings)); + } /** * @brief Set the data handler object by calling the function with the matching template argument * */ void - setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier); + setTransposeDataHandler(transpose::data_handler::DataHandlerType dataHandlerIdentifier) { + switch (dataHandlerIdentifier) { + case transpose::data_handler::DataHandlerType::diagonal: this->dataHandler = std::unique_ptr>(new transpose::data_handler::DistributedDiagonalTransposeDataHandler(this->mpi_comm_rank, this->mpi_comm_size)); break; + case transpose::data_handler::DataHandlerType::pq: this->dataHandler = std::unique_ptr>(new transpose::data_handler::DistributedPQTransposeDataHandler(this->mpi_comm_rank, this->mpi_comm_size, this->executionSettings->programSettings->p)); break; + default: throw std::runtime_error("Could not match selected data handler: " + transpose::data_handler::handlerToString(dataHandlerIdentifier)); + } + } /** * @brief Transpose specific implementation of the kernel execution * * @param data The input and output data of the benchmark - * @return std::unique_ptr Measured runtimes of the kernel execution */ - std::unique_ptr - executeKernel(TransposeData &data) override; + void + executeKernel(TransposeData &data) override { + switch (this->executionSettings->programSettings->communicationType) { +#ifdef USE_OCL_HOST + case hpcc_base::CommunicationType::intel_external_channels: + if (this->executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + this->timings = transpose::fpga_execution::intel::calculate(*(this->executionSettings), data); + } + else { + this->timings = transpose::fpga_execution::intel_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); + } break; + case hpcc_base::CommunicationType::pcie_mpi : + if (this->executionSettings->programSettings->dataHandlerIdentifier == transpose::data_handler::DataHandlerType::diagonal) { + this->timings = transpose::fpga_execution::pcie::calculate(*(this->executionSettings), data, *dataHandler); + } + else { + this->timings = transpose::fpga_execution::pcie_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); + } break; +#endif +#ifdef USE_XRT_HOST + case hpcc_base::CommunicationType::pcie_mpi: + this->timings = transpose::fpga_execution::pcie_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; +#ifdef USE_ACCL + // case hpcc_base::CommunicationType::accl: + // return transpose::fpga_execution::accl_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); break; + case hpcc_base::CommunicationType::accl: + if 
(this->executionSettings->programSettings->useAcclStreams) { + auto h = reinterpret_cast&>(*this->dataHandler); + if (!h.getP() == h.getQ()) { + this->timings = transpose::fpga_execution::accl_stream_sendrecv_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); + } + else { + this->timings = transpose::fpga_execution::accl_stream_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); + } + } else { + this->timings = transpose::fpga_execution::accl_pq::calculate(*(this->executionSettings), data, reinterpret_cast&>(*this->dataHandler)); + } break; +#endif +#endif +#ifdef MKL_FOUND + case hpcc_base::CommunicationType::cpu_only : this->timings = transpose::fpga_execution::cpu::calculate(*(this->executionSettings), data, *dataHandler); break; +#endif + default: throw std::runtime_error("No calculate method implemented for communication type " + commToString(this->executionSettings->programSettings->communicationType)); + } + } /** * @brief Transpose specific implementation of the execution validation @@ -94,15 +174,164 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark &data) override { + + // exchange the data using MPI depending on the chosen distribution scheme + this->dataHandler->exchangeData(data); + +#ifndef NDEBUG + std::vector oldA(this->executionSettings->programSettings->blockSize * this->executionSettings->programSettings->blockSize * data.numBlocks); + std::copy(data.A, data.A + oldA.size(), oldA.data()); +#endif + + this->dataHandler->reference_transpose(data); + + double max_error = 0.0; + int error_count = 0; + for (size_t i = 0; i < this->executionSettings->programSettings->blockSize * this->executionSettings->programSettings->blockSize * data.numBlocks; i++) { + max_error = std::max(std::abs(data.A[i]), max_error); + if (std::abs(data.A[i]) - 100 * std::numeric_limits::epsilon() > 0.0) { + error_count++; + } + } + +#ifndef NDEBUG + long height_per_rank = reinterpret_cast*>(this->dataHandler.get())->getHeightforRank(); + long width_per_rank = reinterpret_cast*>(this->dataHandler.get())->getWidthforRank(); + if (error_count > 0) { + if ( this->mpi_comm_rank == 0) { + std::cout << "A:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << oldA[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << "B:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.B[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << "Transposed A:" << std::endl; + for (size_t j = 0; j < height_per_rank * data.blockSize; j++) { + for (size_t i = 0; i < width_per_rank * data.blockSize; i++) { + std::cout << data.A[j * width_per_rank * data.blockSize + i] << ", "; + } + std::cout << std::endl; + } + std::cout << std::endl; + } + } + +#endif + + double global_max_error = 0; + int global_error_count = 0; + MPI_Reduce(&max_error, &global_max_error, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(&error_count, &global_error_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); + + this->errors.emplace("epsilon", std::numeric_limits::epsilon()); + this->errors.emplace("max_error", global_max_error); + + return static_cast(global_max_error) < 100 * 
std::numeric_limits::epsilon(); + } + /** + * @brief Transpose specific impelmentation of the error printing + * + */ + void + printError() override { + if (this->mpi_comm_rank == 0) { + std::cout << "Maximum error: " << this->errors.at("max_error") << " < " << 100 * this->errors.at("epsilon") << std::endl; + std::cout << "Mach. Epsilon: " << this->errors.at("epsilon") << std::endl; + } + } + + /** + * @brief Transpose specific implementation of collecting the execution results + * + */ + void + collectResults() override { + double flops = static_cast(this->executionSettings->programSettings->matrixSize) * this->executionSettings->programSettings->matrixSize; + + // Number of experiment repetitions + uint number_measurements = this->timings.at("calculation").size(); + std::vector max_measures(number_measurements); + std::vector max_transfers(number_measurements); +#ifdef _USE_MPI_ + // Copy the object variable to a local variable to make it accessible to the lambda function + int mpi_size = this->mpi_comm_size; + MPI_Reduce(this->timings.at("calculation").data(), max_measures.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(this->timings.at("transfer").data(), max_transfers.data(), number_measurements, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); +#else + std::copy(this->timings.at("calculation").begin(), this->timings.at("calculation").end(), max_measures.begin()); + std::copy(this->timings.at("transfer").begin(), this->timings.at("transfer").end(), max_transfers.begin()); +#endif + + double avgCalculationTime = accumulate(max_measures.begin(), max_measures.end(), 0.0) + / max_measures.size(); + this->results.emplace("avg_calc_t", hpcc_base::HpccResult(avgCalculationTime, "s")); + + double minCalculationTime = *min_element(max_measures.begin(), max_measures.end()); + this->results.emplace("min_calc_t", hpcc_base::HpccResult(minCalculationTime, "s")); + + double avgTransferTime = accumulate(max_transfers.begin(), max_transfers.end(), 0.0) + / max_transfers.size(); + this->results.emplace("avg_transfer_t", hpcc_base::HpccResult(avgTransferTime, "s")); + + double minTransferTime = *min_element(max_transfers.begin(), max_transfers.end()); + this->results.emplace("min_transfer_t", hpcc_base::HpccResult(minTransferTime, "s")); + + this->results.emplace("avg_t", hpcc_base::HpccResult(avgCalculationTime + avgTransferTime, "s")); + this->results.emplace("min_t", hpcc_base::HpccResult(minCalculationTime + minTransferTime, "s")); + + this->results.emplace("avg_calc_flops", hpcc_base::HpccResult(flops / avgCalculationTime * 1.0e-9, "GFLOP/s")); + this->results.emplace("max_calc_flops", hpcc_base::HpccResult(flops / minCalculationTime * 1.0e-9, "GFLOP/s")); + this->results.emplace("avg_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgCalculationTime * 1.0e-9, "GB/s")); + this->results.emplace("max_mem_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minCalculationTime * 1.0e-9, "GB/s")); + this->results.emplace("avg_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / avgTransferTime * 1.0e-9, "GB/s")); + this->results.emplace("max_transfer_bandwidth", hpcc_base::HpccResult(flops * sizeof(HOST_DATA_TYPE) * 3 / minTransferTime * 1.0e-9, "GB/s")); + } + /** * @brief Transpose specific implementation of printing the execution results * - * @param output Measured runtimes of the kernel execution */ void - collectAndPrintResults(const TransposeExecutionTimings &output) override; + printResults() 
override { + if (this->mpi_comm_rank == 0) { + std::cout << std::setw(ENTRY_SPACE) << " " + << std::left << std::setw(ENTRY_SPACE) << "total time" + << std::setw(ENTRY_SPACE) << "transfer time" + << std::setw(ENTRY_SPACE) << "calc time" + << std::setw(ENTRY_SPACE) << "calc FLOPS" + << std::setw(ENTRY_SPACE) << "Memory Bandwidth" + << std::setw(ENTRY_SPACE) << "PCIe Bandwidth" + << std::right << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "avg: " + << this->results.at("avg_t") + << this->results.at("avg_transfer_t") + << this->results.at("avg_calc_t") + << this->results.at("avg_calc_flops") + << this->results.at("avg_mem_bandwidth") + << this->results.at("avg_transfer_bandwidth") + << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "best: " + << this->results.at("min_t") + << this->results.at("min_transfer_t") + << this->results.at("min_calc_t") + << this->results.at("max_calc_flops") + << this->results.at("max_mem_bandwidth") + << this->results.at("max_transfer_bandwidth") + << std::endl; + } + } /** * @brief Construct a new Transpose Benchmark object @@ -110,16 +339,20 @@ class TransposeBenchmark : public hpcc_base::HpccFpgaBenchmark>(argc, argv) { + if (this->setupBenchmark(argc, argv)) { + this->setTransposeDataHandler(this->executionSettings->programSettings->dataHandlerIdentifier); + } + } - /** + /** * @brief Construct a new Transpose Benchmark object */ - TransposeBenchmark(); + TransposeBenchmark() : hpcc_base::HpccFpgaBenchmark>() {} }; -} // namespace stream +} // namespace transpose -#endif // SRC_HOST_STREAM_BENCHMARK_H_ +#endif // SRC_HOST_TRANSPOSE_BENCHMARK_H_ diff --git a/PTRANS/src/host/transpose_data.cpp b/PTRANS/src/host/transpose_data.cpp index af794f30..62f44263 100644 --- a/PTRANS/src/host/transpose_data.cpp +++ b/PTRANS/src/host/transpose_data.cpp @@ -7,7 +7,8 @@ transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), matrixSize(results["m"].as() * results["b"].as()), blockSize(results["b"].as()), dataHandlerIdentifier(transpose::data_handler::stringToHandler(results["handler"].as())), - distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()) { + distributeBuffers(results["distribute-buffers"].count() > 0), p(results["p"].as()), copyA(results["copy-a"].count() > 0), + useAcclStreams(results["accl-stream"].count() > 0) { // auto detect data distribution type if required if (dataHandlerIdentifier == transpose::data_handler::DataHandlerType::automatic) { @@ -26,58 +27,17 @@ transpose::TransposeProgramSettings::TransposeProgramSettings(cxxopts::ParseResu std::map transpose::TransposeProgramSettings::getSettingsMap() { auto map = hpcc_base::BaseSettings::getSettingsMap(); - int mpi_size; -#ifdef _USE_MPI_ - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); -#endif + int mpi_comm_size; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_size); + // calculate the row and column of the MPI rank in the torus + if (mpi_comm_size % p != 0) { + throw std::runtime_error("MPI Comm size not dividable by P=" + std::to_string(p) + "!"); + } map["Matrix Size"] = std::to_string(matrixSize); map["Block Size"] = std::to_string(blockSize); map["Dist. Buffers"] = distributeBuffers ? 
"Yes" : "No"; map["Data Handler"] = transpose::data_handler::handlerToString(dataHandlerIdentifier); + map["FPGA Torus"] = "P=" + std::to_string(p) + " ,Q=" + std::to_string(mpi_comm_size / p); return map; } -transpose::TransposeData::TransposeData(cl::Context context, uint block_size, uint y_size) : context(context), - numBlocks(y_size), blockSize(block_size) { - if (numBlocks * blockSize > 0) { -#ifdef USE_SVM - A = reinterpret_cast( - clSVMAlloc(context(), 0 , - block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); - B = reinterpret_cast( - clSVMAlloc(context(), 0 , - block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); - result = reinterpret_cast( - clSVMAlloc(context(), 0 , - block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); - exchange = reinterpret_cast( - clSVMAlloc(context(), 0 , - block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 1024)); -#else - posix_memalign(reinterpret_cast(&A), 64, - sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); - posix_memalign(reinterpret_cast(&B), 64, - sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); - posix_memalign(reinterpret_cast(&result), 64, - sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); - posix_memalign(reinterpret_cast(&exchange), 64, - sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); -#endif - } -} - -transpose::TransposeData::~TransposeData() { - if (numBlocks * blockSize > 0) { -#ifdef USE_SVM - clSVMFree(context(), reinterpret_cast(A));}); - clSVMFree(context(), reinterpret_cast(B));}); - clSVMFree(context(), reinterpret_cast(result));}); - clSVMFree(context(), reinterpret_cast(exchange));}); -#else - free(A); - free(B); - free(result); - free(exchange); -#endif - } -} diff --git a/PTRANS/src/host/transpose_data.hpp b/PTRANS/src/host/transpose_data.hpp index a223353f..a4086525 100644 --- a/PTRANS/src/host/transpose_data.hpp +++ b/PTRANS/src/host/transpose_data.hpp @@ -74,6 +74,17 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { */ bool distributeBuffers; + /** + * @brief If true, create a copy of matrix A for each kernel replication + * + */ + bool copyA; + + /** + * @brief Indicate, if a design is used where the user kernels are directly connected to the ACCL CCLO + */ + bool useAcclStreams; + /** * @brief Construct a new Transpose Program Settings object * @@ -94,6 +105,7 @@ class TransposeProgramSettings : public hpcc_base::BaseSettings { * @brief Data class cotnaining the data the kernel is exeucted with * */ +template class TransposeData { public: @@ -138,7 +150,7 @@ class TransposeData { * @brief The context that is used to allocate memory in SVM mode * */ - cl::Context context; + TContext context; /** * @brief Construct a new Transpose Data object @@ -147,33 +159,57 @@ class TransposeData { * @param block_size size of the quadratic blocks that are stored within this object * @param y_size number of blocks that are stored within this object per replication */ - TransposeData(cl::Context context, uint block_size, uint size_y); + TransposeData(TContext &context, uint block_size, uint y_size): +#ifdef USE_SVM + context(context), +#endif + numBlocks(y_size), blockSize(block_size) { + if (numBlocks * blockSize > 0) { +#ifdef USE_SVM + A = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 4096)); + B = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 4096)); + result = reinterpret_cast( + 
clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 4096)); + exchange = reinterpret_cast( + clSVMAlloc(context(), 0 , + block_size * block_size * y_size * sizeof(HOST_DATA_TYPE), 4096)); +#else + posix_memalign(reinterpret_cast(&A), 4096, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); + posix_memalign(reinterpret_cast(&B), 4096, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); + posix_memalign(reinterpret_cast(&result), 4096, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); + posix_memalign(reinterpret_cast(&exchange), 4096, + sizeof(HOST_DATA_TYPE) * block_size * block_size * y_size); +#endif + } + } /** * @brief Destroy the Transpose Data object. Free the allocated memory * */ - ~TransposeData(); - -}; - -/** - * @brief Measured execution timing from the kernel execution - * - */ -class TransposeExecutionTimings { -public: - /** - * @brief A vector containing the timings for all repetitions for the data transfer - * - */ - std::vector transferTimings; - - /** - * @brief A vector containing the timings for all repetitions for the calculation - * - */ - std::vector calculationTimings; + ~TransposeData() { + if (numBlocks * blockSize > 0) { +#ifdef USE_SVM + clSVMFree(context(), reinterpret_cast(A));}); + clSVMFree(context(), reinterpret_cast(B));}); + clSVMFree(context(), reinterpret_cast(result));}); + clSVMFree(context(), reinterpret_cast(exchange));}); +#else + free(A); + free(B); + free(result); + free(exchange); +#endif + } + } }; diff --git a/PTRANS/tests/test_host_functionality.cpp b/PTRANS/tests/test_host_functionality.cpp index 0f7c64a0..a0424fc6 100644 --- a/PTRANS/tests/test_host_functionality.cpp +++ b/PTRANS/tests/test_host_functionality.cpp @@ -9,10 +9,10 @@ struct TransposeHostTest : testing::Test { - std::unique_ptr bm; + std::unique_ptr> bm; TransposeHostTest() { - bm = std::unique_ptr( new transpose::TransposeBenchmark(global_argc, global_argv)); + bm = std::unique_ptr>( new transpose::TransposeBenchmark(global_argc, global_argv)); } }; @@ -24,22 +24,22 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatHeader) { std::vector calculateTimings; transferTimings.push_back(1.0); calculateTimings.push_back(1.0); - std::shared_ptr results( - new transpose::TransposeExecutionTimings{transferTimings, calculateTimings}); - + bm->addTimings("transfer", transferTimings); + bm->addTimings("calculation", calculateTimings); // Redirect stout buffer to local buffer to make checks possible std::stringstream newStdOutBuffer; std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - bm->collectAndPrintResults(*results); + bm->collectResults(); + bm->printResults(); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex("(\\s+)total\\s\\[s\\](\\s+)transfer\\s\\[s\\](\\s+)calc\\s\\[s\\](\\s+)calc\\sFLOPS(\\s+)Mem\\s\\[B/s\\](\\s+)PCIe\\s\\[B/s\\]\n.*")); + ::testing::MatchesRegex("(\\s+)total\\stime(\\s+)transfer\\stime(\\s+)calc\\s+time(\\s+)calc\\sFLOPS(\\s+)Memory\\sBandwidth(\\s+)PCIe\\sBandwidth(\\s+)\n.*")); } /** @@ -50,8 +50,8 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatValues) { std::vector calculateTimings; transferTimings.push_back(1.0); calculateTimings.push_back(1.0); - std::shared_ptr results( - new transpose::TransposeExecutionTimings{transferTimings, calculateTimings}); + bm->addTimings("transfer", transferTimings); + bm->addTimings("calculation", calculateTimings); // Redirect stout 
buffer to local buffer to make checks possible @@ -59,13 +59,14 @@ TEST_F(TransposeHostTest, OutputsCorrectFormatValues) { std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - bm->collectAndPrintResults(*results); + bm->collectResults(); + bm->printResults(); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); EXPECT_THAT(newStdOutBuffer.str(), - ::testing::MatchesRegex(".*\navg:\\s+2\\.00000e\\+00\\s+1\\.00000e\\+00\\s+1\\.00000e\\+00.*\n.*\n")); + ::testing::MatchesRegex(".*\n\\s+avg:\\s+2\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s\\s+1\\.00000e\\+00\\s+s.*\n.*\n")); } /** @@ -89,7 +90,8 @@ TEST_F(TransposeHostTest, AggregatedErrorIsPrinted) { std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - bool success = bm->validateOutputAndPrintError(*data); + bool success = bm->validateOutput(*data); + bm->printError(); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); @@ -127,7 +129,8 @@ TEST_F(TransposeHostTest, ValidationIsSuccess) { std::streambuf *oldStdOutBuffer = std::cout.rdbuf(); std::cout.rdbuf(newStdOutBuffer.rdbuf()); - bool success = bm->validateOutputAndPrintError(*data); + bool success = bm->validateOutput(*data); + bm->printError(); // Redirect stdout to old buffer std::cout.rdbuf(oldStdOutBuffer); diff --git a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp index d7bc0c7f..3d8c0d4a 100644 --- a/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp +++ b/PTRANS/tests/test_kernel_functionality_and_host_integration.cpp @@ -6,18 +6,18 @@ #include "gtest/gtest.h" #include "parameters.h" #include "test_program_settings.h" - +#include "nlohmann/json.hpp" struct TransposeKernelTest : testing::Test { - std::shared_ptr data; - std::unique_ptr bm; + std::shared_ptr> data; + std::unique_ptr> bm; uint matrix_size = BLOCK_SIZE; unsigned numberOfChannels = 4; std::string channelOutName = "kernel_output_ch"; std::string channelInName = "kernel_input_ch"; TransposeKernelTest() { - bm = std::unique_ptr( new transpose::TransposeBenchmark(global_argc, global_argv)); + bm = std::unique_ptr>( new transpose::TransposeBenchmark(global_argc, global_argv)); } void SetUp() override { @@ -195,12 +195,44 @@ TEST_F(TransposeKernelTest, FPGAAAndBAreSummedUp4Blocks) { */ TEST_F(TransposeKernelTest, FPGATimingsMeasuredForEveryIteration) { bm->getExecutionSettings().programSettings->numRepetitions = 10; - auto result = bm->executeKernel(*data); - EXPECT_EQ(result->calculationTimings.size(), 10); - EXPECT_EQ(result->transferTimings.size(), 10); + bm->executeKernel(*data); + EXPECT_EQ(bm->getTimingsMap().at("calculation").size(), 10); + EXPECT_EQ(bm->getTimingsMap().at("transfer").size(), 10); for (int t = 0; t < 10; t++) { - EXPECT_GE(result->transferTimings[t], 0.0); - EXPECT_GE(result->calculationTimings[t], 0.0); + EXPECT_GE(bm->getTimingsMap().at("transfer")[t], 0.0); + EXPECT_GE(bm->getTimingsMap().at("calculation")[t], 0.0); } } +using json = nlohmann::json; + +TEST_F(TransposeKernelTest, JsonDump) { + bm->executeKernel(*data); + bm->collectResults(); + bm->dumpConfigurationAndResults("ptrans.json"); + std::FILE *f = std::fopen("ptrans.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + json j = json::parse(f); + EXPECT_TRUE(j.contains("timings")); + if (j.contains("timings")) { + EXPECT_TRUE(j["timings"].contains("calculation")); + EXPECT_TRUE(j["timings"].contains("transfer")); + } + 
EXPECT_TRUE(j.contains("timings")); + if (j.contains("results")) { + EXPECT_TRUE(j["results"].contains("avg_calc_flops")); + EXPECT_TRUE(j["results"].contains("avg_calc_t")); + EXPECT_TRUE(j["results"].contains("avg_mem_bandwidth")); + EXPECT_TRUE(j["results"].contains("avg_t")); + EXPECT_TRUE(j["results"].contains("avg_transfer_bandwidth")); + EXPECT_TRUE(j["results"].contains("avg_transfer_t")); + EXPECT_TRUE(j["results"].contains("max_calc_flops")); + EXPECT_TRUE(j["results"].contains("max_mem_bandwidth")); + EXPECT_TRUE(j["results"].contains("max_transfer_bandwidth")); + EXPECT_TRUE(j["results"].contains("min_calc_t")); + EXPECT_TRUE(j["results"].contains("min_t")); + EXPECT_TRUE(j["results"].contains("min_transfer_t")); + } + } +} diff --git a/PTRANS/tests/test_transpose_data_handlers.cpp b/PTRANS/tests/test_transpose_data_handlers.cpp index f8531615..0caa3f5e 100644 --- a/PTRANS/tests/test_transpose_data_handlers.cpp +++ b/PTRANS/tests/test_transpose_data_handlers.cpp @@ -10,10 +10,10 @@ struct TransposeHandlersTest : testing::Test { - std::unique_ptr bm; + std::unique_ptr> bm; TransposeHandlersTest() { - bm = std::unique_ptr( new transpose::TransposeBenchmark(global_argc, global_argv)); + bm = std::unique_ptr>( new transpose::TransposeBenchmark(global_argc, global_argv)); bm->setTransposeDataHandler(transpose::data_handler::DataHandlerType::diagonal); } @@ -29,11 +29,11 @@ struct TransposeHandlersTest : testing::Test { * Test DitExt class instantiation */ TEST_F(TransposeHandlersTest, DistDiagCreateHandlerSuccess) { - EXPECT_NO_THROW(transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1)); + EXPECT_NO_THROW((transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1))); } TEST_F(TransposeHandlersTest, DistDiagCreateHandlerFail) { - EXPECT_THROW(transpose::data_handler::DistributedDiagonalTransposeDataHandler(1,1), std::runtime_error); + EXPECT_THROW((transpose::data_handler::DistributedDiagonalTransposeDataHandler(1,1)), std::runtime_error); } /** @@ -48,7 +48,7 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI1Block1) { bm->getExecutionSettings().programSettings->matrixSize = 4* matrix_size_in_blocks; uint block_count = 0; for (int i=0; i < mpi_size; i++) { - auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); + auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); auto d = h.generateData(bm->getExecutionSettings()); block_count += d->numBlocks; } @@ -63,7 +63,7 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI3Block3) { bm->getExecutionSettings().programSettings->matrixSize = 4* matrix_size_in_blocks; uint block_count = 0; for (int i=0; i < mpi_size; i++) { - auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); + auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); auto d = h.generateData(bm->getExecutionSettings()); block_count += d->numBlocks; } @@ -78,7 +78,7 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI9Block3) { bm->getExecutionSettings().programSettings->matrixSize = 4* matrix_size_in_blocks; uint block_count = 0; for (int i=0; i < mpi_size; i++) { - auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); + auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); auto d = h.generateData(bm->getExecutionSettings()); block_count += d->numBlocks; } @@ -93,7 +93,7 @@ 
TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI5Block4) { bm->getExecutionSettings().programSettings->matrixSize = 4* matrix_size_in_blocks; uint block_count = 0; for (int i=0; i < mpi_size; i++) { - auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); + auto h = transpose::data_handler::DistributedDiagonalTransposeDataHandler(i, mpi_size); auto d = h.generateData(bm->getExecutionSettings()); block_count += d->numBlocks; } @@ -105,35 +105,35 @@ TEST_F(TransposeHandlersTest, DistDiagNumberOfBlocksCorrectForMPI5Block4) { * */ TEST_F(TransposeHandlersTest, DataGenerationDistDiagSucceedsForMPISizeEquals1SingleBlock) { - auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4; EXPECT_NO_THROW(handler.generateData(bm->getExecutionSettings())); } TEST_F(TransposeHandlersTest, DataGenerationDistDiagSucceedsForMPISizeEquals1Blocks9) { - auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4*3; EXPECT_THROW(handler.generateData(bm->getExecutionSettings()), std::runtime_error); } TEST_F(TransposeHandlersTest, DataGenerationDistDiagSucceedsForMPISizeEquals3Blocks9) { - auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,3); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,3); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4*3; EXPECT_NO_THROW(handler.generateData(bm->getExecutionSettings())); } TEST_F(TransposeHandlersTest, DataGenerationDistDiagFailsForMPISizeEquals3Blocks1) { - auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,3); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,3); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4; EXPECT_NO_THROW(handler.generateData(bm->getExecutionSettings())); } TEST_F(TransposeHandlersTest, DataGenerationDistDiagFailsForMPISizeEquals3Blocks4) { - auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,3); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,3); bm->getExecutionSettings().programSettings->blockSize = 4; bm->getExecutionSettings().programSettings->matrixSize = 4 * 2; EXPECT_THROW(handler.generateData(bm->getExecutionSettings()), std::runtime_error); @@ -142,7 +142,7 @@ TEST_F(TransposeHandlersTest, DataGenerationDistDiagFailsForMPISizeEquals3Blocks TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagForOneReplication) { bm->getExecutionSettings().programSettings->kernelReplications = 1; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); auto data = handler.generateData(bm->getExecutionSettings()); EXPECT_EQ(data->blockSize, 
bm->getExecutionSettings().programSettings->blockSize); EXPECT_EQ(data->numBlocks, 1); @@ -151,7 +151,7 @@ TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagForOneReplication) { TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagForTwoReplications) { bm->getExecutionSettings().programSettings->kernelReplications = 2; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); auto data = handler.generateData(bm->getExecutionSettings()); EXPECT_EQ(data->blockSize, bm->getExecutionSettings().programSettings->blockSize); EXPECT_EQ(data->numBlocks, 1); @@ -160,7 +160,7 @@ TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagForTwoReplications) { TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagReproducableA) { bm->getExecutionSettings().programSettings->kernelReplications = 2; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); auto data = handler.generateData(bm->getExecutionSettings()); auto data2 = handler.generateData(bm->getExecutionSettings()); double aggregated_error = 0.0; @@ -173,7 +173,7 @@ TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagReproducableA) { TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagReproducableB) { bm->getExecutionSettings().programSettings->kernelReplications = 2; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); auto data = handler.generateData(bm->getExecutionSettings()); auto data2 = handler.generateData(bm->getExecutionSettings()); double aggregated_error = 0.0; @@ -186,7 +186,7 @@ TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagReproducableB) { TEST_F(TransposeHandlersTest, DataGenerationWorksDistDiagExchangeWorksForSingleRank) { bm->getExecutionSettings().programSettings->kernelReplications = 2; bm->getExecutionSettings().programSettings->matrixSize = bm->getExecutionSettings().programSettings->blockSize; - auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); + auto handler = transpose::data_handler::DistributedDiagonalTransposeDataHandler(0,1); auto data = handler.generateData(bm->getExecutionSettings()); auto data2 = handler.generateData(bm->getExecutionSettings()); handler.exchangeData(*data); diff --git a/README.md b/README.md index d148fc83..fefd250b 100755 --- a/README.md +++ b/README.md @@ -41,13 +41,14 @@ All benchmarks come with the following build dependencies: - CMake >= 3.13 - C++ compiler with C++11 and support (GCC 4.9.0+) - Intel OpenCL FPGA SDK or Xilinx Vitis -- Python 3 for code generation and with [pandas](https://pandas.pydata.org) installed for the evaluation scripts +- Python 3 with [jinja2](https://jinja.palletsprojects.com) for code generation and [pandas](https://pandas.pydata.org) for the evaluation scripts. 
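The device kernels are generated from Jinja2 templates (see the `*.cl` files further down, which use `{% for i in range(num_replications) %}` blocks and optionally a `generate_attributes` helper). As a rough illustration only — a minimal sketch of rendering such a template with jinja2, with a simplified kernel body and without the helper functions the project's code generator provides:

```python
from jinja2 import Template

# Simplified template following the pattern used in the device code files;
# the real templates are the *.cl sources rendered by the code generator.
template = Template(
    "{% for i in range(num_replications) %}\n"
    "__kernel void copy_{{ i }}(__global const float *restrict in,\n"
    "                           __global float *restrict out,\n"
    "                           const uint array_size) { /* ... */ }\n"
    "{% endfor %}\n"
)

# Emits one kernel per requested replication into the generated source
print(template.render(num_replications=4))
```
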
Moreover, additional libraries are fetched by the build system during configuration: - [cxxopts](https://github.com/jarro2783/cxxopts) for option parsing - [hlslib](https://github.com/definelicht/hlslib) for CMake FindPackages - [Googletest](https://github.com/google/googletest) for unit testing +- [json](https://github.com/nlohmann/json) for json output These dependencies will be downloaded automatically when configuring a benchmark for the first time. The exact version that are used can be found in the `CMakeLists.txt`located in the `extern` directory where all extern dependencies are defined. diff --git a/RandomAccess/README.md b/RandomAccess/README.md index 12e665d7..ede6a47d 100644 --- a/RandomAccess/README.md +++ b/RandomAccess/README.md @@ -76,6 +76,40 @@ For more information on available input parameters run ./RandomAccess_intel -h + Implementation of the random access benchmark proposed in the HPCC benchmark suite for FPGA. + Version: 2.5 + + MPI Version: 3.1 + Config. Time: Thu Dec 08 10:42:40 UTC 2022 + Git Commit: 86e0064-dirty + + Usage: + ./bin/RandomAccess_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 4) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -d, arg Log2 of the size of the data array (default: 29) + -g, arg Log2 of the number of random number generators + (default: 5) + To execute the unit and integration tests for Intel devices run CL_CONTEXT_EMULATOR_DEVICE=1 ./RandomAccess_test_intel -f KERNEL_FILE_NAME @@ -88,9 +122,10 @@ It will run an emulation of the kernel and execute some functionality tests. The host code will print the results of the execution to the standard output. The result summary looks similar to this: - Error: 9.87137e-03% - best mean GUPS - 1.73506e+01 1.73507e+01 2.47540e-01 + Error: 3.90625e-03 + + best mean GUOPS + 5.04258e-04 s 7.85656e-04 s 2.03071e-03 GUOP/s - `best` and `mean` are the fastest and the mean kernel execution time. The pure kernel execution time is measured without transferring the buffer @@ -105,3 +140,97 @@ The result summary looks similar to this: Benchmark results can be found in the `results` folder in this repository. + +The json output looks like the following. 
+ +```json + +{ + "config_time": "Wed Dec 14 08:43:07 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "ratio": 0.00390625 + }, + "execution_time": "Wed Dec 14 09:54:47 UTC 2022", + "git_commit": "be1a4e9-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "random access", + "results": { + "guops": { + "unit": "GUOP/s", + "value": 0.0021329867229908477 + }, + "t_mean": { + "unit": "s", + "value": 0.0005428726000000001 + }, + "t_min": { + "unit": "s", + "value": 0.000480078 + } + }, + "settings": { + "#RNGs": 32, + "Array Size": 256, + "Communication Type": false, + "Kernel File": false, + "Kernel Replications": 4, + "MPI Ranks": 1, + "Repetitions": 10, + "Test Mode": false + }, + "timings": { + "execution": [ + { + "unit": "s", + "value": 0.000643471 + }, + { + "unit": "s", + "value": 0.000516849 + }, + { + "unit": "s", + "value": 0.000606361 + }, + { + "unit": "s", + "value": 0.00058182 + }, + { + "unit": "s", + "value": 0.00060401 + }, + { + "unit": "s", + "value": 0.000485259 + }, + { + "unit": "s", + "value": 0.000484699 + }, + { + "unit": "s", + "value": 0.00053713 + }, + { + "unit": "s", + "value": 0.000489049 + }, + { + "unit": "s", + "value": 0.000480078 + } + ] + }, + "validated": true, + "version": "2.5" +} + +``` diff --git a/RandomAccess/src/common/parameters.h.in b/RandomAccess/src/common/parameters.h.in index 837d3c74..a47f850e 100644 --- a/RandomAccess/src/common/parameters.h.in +++ b/RandomAccess/src/common/parameters.h.in @@ -35,8 +35,8 @@ 
Short description of the program. Moreover the version and build time is also compiled into the description. */ - -#define PROGRAM_DESCRIPTION "Implementation of the random access benchmark"\ +#define PROGRAM_NAME "random access" +#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\ " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" @@ -62,4 +62,4 @@ Output separator -#endif // SRC_COMMON_PARAMETERS_H_ \ No newline at end of file +#endif // SRC_COMMON_PARAMETERS_H_ diff --git a/RandomAccess/src/device/random_access_kernels_single.cl b/RandomAccess/src/device/random_access_kernels_single.cl index f7c59260..5ebc1376 100644 --- a/RandomAccess/src/device/random_access_kernels_single.cl +++ b/RandomAccess/src/device/random_access_kernels_single.cl @@ -34,14 +34,13 @@ Constant used to update the pseudo random number #define BLOCK_SIZE_LOG GLOBAL_MEM_UNROLL_LOG #define BLOCK_SIZE (1 << BLOCK_SIZE_LOG) -/* PY_CODE_GEN -try: - kernel_param_attributes = generate_attributes(num_replications) -except: - kernel_param_attributes = ["" for i in range(num_replications)] -*/ +{% if generate_attributes is defined %} + {% set kernel_param_attributes = generate_attributes(num_replications) %} +{% else %} + {% set kernel_param_attributes = create_list("", num_replications) %} +{% endif %} -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} /* Kernel, that will update the given data array accoring to a predefined pseudo- @@ -56,8 +55,8 @@ to the kernel. */ __attribute__((max_global_work_dim(0),uses_global_work_offset(0))) __kernel -void accessMemory_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_DATA_TYPE_UNSIGNED volatile * restrict data, - __constant /*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_DATA_TYPE_UNSIGNED * restrict random_init, +void accessMemory_{{ i }}(__global {{ kernel_param_attributes[i] }} DEVICE_DATA_TYPE_UNSIGNED volatile * restrict data, + __constant {{ kernel_param_attributes[i] }} const DEVICE_DATA_TYPE_UNSIGNED * restrict random_init, const DEVICE_DATA_TYPE_UNSIGNED m, const DEVICE_DATA_TYPE_UNSIGNED data_chunk, const uint num_cache_operations, @@ -190,4 +189,4 @@ void accessMemory_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attribut } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/RandomAccess/src/host/execution.h b/RandomAccess/src/host/execution.h index 88cf6736..51d1796d 100644 --- a/RandomAccess/src/host/execution.h +++ b/RandomAccess/src/host/execution.h @@ -40,7 +40,7 @@ namespace bm_execution { * @param data The data that is used as input and output of the random accesses * @return std::unique_ptr The measured runtimes of the kernel */ -std::unique_ptr +std::map> calculate(hpcc_base::ExecutionSettings const& config, HOST_DATA_TYPE * data, int mpi_rank, int mpi_size); } // namespace bm_execution diff --git a/RandomAccess/src/host/execution_single.cpp b/RandomAccess/src/host/execution_single.cpp index 486234bf..d4718083 100644 --- a/RandomAccess/src/host/execution_single.cpp +++ b/RandomAccess/src/host/execution_single.cpp @@ -40,7 +40,7 @@ namespace bm_execution { Implementation for the single kernel. 
@copydoc bm_execution::calculate() */ - std::unique_ptr + std::map> calculate(hpcc_base::ExecutionSettings const& config, HOST_DATA_TYPE * data, int mpi_rank, int mpi_size) { // int used to check for OpenCL errors int err; @@ -204,7 +204,10 @@ namespace bm_execution { free(random_inits); - return std::unique_ptr(new random_access::RandomAccessExecutionTimings{executionTimes}); - } + std::map> timings; + + timings["execution"] = executionTimes; + return timings; + } } // namespace bm_execution diff --git a/RandomAccess/src/host/random_access_benchmark.cpp b/RandomAccess/src/host/random_access_benchmark.cpp index e51e1fe2..94c63d0a 100644 --- a/RandomAccess/src/host/random_access_benchmark.cpp +++ b/RandomAccess/src/host/random_access_benchmark.cpp @@ -36,7 +36,6 @@ SOFTWARE. random_access::RandomAccessProgramSettings::RandomAccessProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), dataSize((1UL << results["d"].as())), - kernelReplications(results["r"].as()), numRngs((1UL << results["g"].as())) { } @@ -51,7 +50,6 @@ random_access::RandomAccessProgramSettings::getSettingsMap() { std::stringstream ss; ss << dataSize << " (" << static_cast(dataSize * sizeof(HOST_DATA_TYPE) * mpi_size) << " Byte )"; map["Array Size"] = ss.str(); - map["Kernel Replications"] = std::to_string(kernelReplications); map["#RNGs"] = std::to_string(numRngs); return map; } @@ -87,43 +85,49 @@ random_access::RandomAccessBenchmark::addAdditionalParseOptions(cxxopts::Options cxxopts::value()->default_value(std::to_string(HPCC_FPGA_RA_RNG_COUNT_LOG))); } -std::unique_ptr +void random_access::RandomAccessBenchmark::executeKernel(RandomAccessData &data) { - return bm_execution::calculate(*executionSettings, data.data, mpi_comm_rank, mpi_comm_size); + timings = bm_execution::calculate(*executionSettings, data.data, mpi_comm_rank, mpi_comm_size); } void -random_access::RandomAccessBenchmark::collectAndPrintResults(const random_access::RandomAccessExecutionTimings &output) { +random_access::RandomAccessBenchmark::collectResults() { - std::vector avgTimings(output.times.size()); + std::vector avgTimings(timings.at("execution").size()); #ifdef _USE_MPI_ // Copy the object variable to a local variable to make it accessible to the lambda function int mpi_size = mpi_comm_size; - MPI_Reduce(output.times.data(),avgTimings.data(),output.times.size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); - std::for_each(avgTimings.begin(),avgTimings.end(), [mpi_size](double& x) {x /= mpi_size;}); + MPI_Reduce(timings.at("execution").data(), avgTimings.data(),timings.at("execution").size(), MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + std::for_each(avgTimings.begin(), avgTimings.end(), [mpi_size](double& x) {x /= mpi_size;}); #else - std::copy(output.times.begin(), output.times.end(), avgTimings.begin()); + std::copy(timings.at("execution").begin(), timings.at("execution").end(), avgTimings.begin()); #endif + // Calculate performance for kernel execution + double tmean = 0; + double tmin = std::numeric_limits::max(); + double gups = static_cast(4 * executionSettings->programSettings->dataSize * mpi_comm_size) / 1000000000; + for (double currentTime : avgTimings) { + tmean += currentTime; + if (currentTime < tmin) { + tmin = currentTime; + } + } + tmean = tmean / timings.at("execution").size(); + + results.emplace("t_min", hpcc_base::HpccResult(tmin, "s")); + results.emplace("t_mean", hpcc_base::HpccResult(tmean, "s")); + results.emplace("guops", hpcc_base::HpccResult(gups / tmin, "GUOP/s")); +} + +void 
random_access::RandomAccessBenchmark::printResults() { if (mpi_comm_rank == 0) { - std::cout << std::setw(ENTRY_SPACE) + std::cout << std::left << std::setw(ENTRY_SPACE) << "best" << std::setw(ENTRY_SPACE) << "mean" - << std::setw(ENTRY_SPACE) << "GUOPS" << std::endl; - - // Calculate performance for kernel execution - double tmean = 0; - double tmin = std::numeric_limits::max(); - double gups = static_cast(4 * executionSettings->programSettings->dataSize * mpi_comm_size) / 1000000000; - for (double currentTime : avgTimings) { - tmean += currentTime; - if (currentTime < tmin) { - tmin = currentTime; - } - } - tmean = tmean / output.times.size(); + << std::setw(ENTRY_SPACE) << "GUOPS" << std::right << std::endl; std::cout << std::setw(ENTRY_SPACE) - << tmin << std::setw(ENTRY_SPACE) << tmean - << std::setw(ENTRY_SPACE) << gups / tmin + << results.at("t_min") << std::setw(ENTRY_SPACE) << results.at("t_mean") + << std::setw(ENTRY_SPACE) << results.at("guops") << std::endl; } } @@ -155,7 +159,7 @@ random_access::RandomAccessBenchmark::generateInputData() { } bool -random_access::RandomAccessBenchmark::validateOutputAndPrintError(random_access::RandomAccessData &data) { +random_access::RandomAccessBenchmark::validateOutput(random_access::RandomAccessData &data) { HOST_DATA_TYPE* rawdata; if (mpi_comm_size > 1) { @@ -186,19 +190,18 @@ random_access::RandomAccessBenchmark::validateOutputAndPrintError(random_access: rawdata[(temp >> 3) & (executionSettings->programSettings->dataSize * mpi_comm_size - 1)] ^= temp; } - double errors = 0; -#pragma omp parallel for reduction(+:errors) + double error_count = 0; +#pragma omp parallel for reduction(+:error_count) for (HOST_DATA_TYPE i=0; i< executionSettings->programSettings->dataSize * mpi_comm_size; i++) { if (rawdata[i] != i) { // If the array at index i does not contain i, it differs from the initial value and is counted as an error - errors++; + error_count++; } } // The overall error is calculated in percent of the overall array size - double error_ratio = static_cast(errors) / (executionSettings->programSettings->dataSize * mpi_comm_size); - std::cout << "Error: " << error_ratio * 100 - << "%" << std::endl; + double error_ratio = static_cast(error_count) / (executionSettings->programSettings->dataSize * mpi_comm_size); + errors.emplace("ratio", error_ratio); #ifdef _USE_MPI_ if (mpi_comm_rank == 0 && mpi_comm_size > 1) { @@ -212,3 +215,10 @@ random_access::RandomAccessBenchmark::validateOutputAndPrintError(random_access: // All other ranks skip validation and always return true return true; } + +void +random_access::RandomAccessBenchmark::printError() { + if (mpi_comm_rank == 0) { + std::cout << "Error: " << errors.at("ratio") * 100 << " %" << std::endl; + } +} diff --git a/RandomAccess/src/host/random_access_benchmark.hpp b/RandomAccess/src/host/random_access_benchmark.hpp index 393c9b53..0bbd02e2 100644 --- a/RandomAccess/src/host/random_access_benchmark.hpp +++ b/RandomAccess/src/host/random_access_benchmark.hpp @@ -50,12 +50,6 @@ class RandomAccessProgramSettings : public hpcc_base::BaseSettings { */ size_t dataSize; - /** - * @brief The number of used kernel replications - * - */ - uint kernelReplications; - /** * @brief Number of random number generators that are used per kernel replication * @@ -114,25 +108,11 @@ class RandomAccessData { }; -/** - * @brief Measured execution timing from the kernel execution - * - */ -class RandomAccessExecutionTimings { -public: - /** - * @brief A vector containing the timings for all repetitions - * - */ - 
std::vector times; - -}; - /** * @brief Implementation of the random access benchmark * */ -class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark { +class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark { protected: @@ -158,9 +138,8 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmark */ - std::unique_ptr + void executeKernel(RandomAccessData &data) override; /** @@ -171,7 +150,14 @@ class RandomAccessBenchmark : public hpcc_base::HpccFpgaBenchmarkgenerateInputData(); // do random accesses - bm->validateOutputAndPrintError(*data); + bm->validateOutput(*data); // check correctness of random accesses - bool success = bm->validateOutputAndPrintError(*data); - EXPECT_TRUE(success); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } /** @@ -53,6 +53,6 @@ TEST_F(RandomAccessHostCodeTest, ValidDataSizeAreDetected) { TEST_F(RandomAccessHostCodeTest, ResultValidationWorksForWrongUpdates) { auto data = bm->generateInputData(); // check correctness of random accesses - bool success = bm->validateOutputAndPrintError( *data); - EXPECT_FALSE(success); + EXPECT_FALSE(bm->validateOutput(*data)); + bm->printError(); } diff --git a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp index 067ea7ab..a52ce55f 100644 --- a/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp +++ b/RandomAccess/tests/test_kernel_functionality_and_host_integration.cpp @@ -5,7 +5,7 @@ #include "parameters.h" #include "random_access_benchmark.hpp" #include "test_program_settings.h" - +#include "nlohmann/json.hpp" struct RandomAccessKernelTest : testing::Test { std::unique_ptr data; @@ -28,8 +28,8 @@ struct RandomAccessKernelTest : testing::Test { * Check if the number of measurements from the calculation matches the number of repetitions */ TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements1Rep) { - auto result = bm->executeKernel( *data); - EXPECT_EQ(result->times.size(), 1); + bm->executeKernel( *data); + EXPECT_EQ(bm->getTimingsMap().at("execution").size(), 1); } /** @@ -37,15 +37,38 @@ TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements1Rep) { */ TEST_F(RandomAccessKernelTest, FPGACorrectNumberOfMeasurements3Rep) { bm->getExecutionSettings().programSettings->numRepetitions = 3; - auto result = bm->executeKernel(*data); - EXPECT_EQ(result->times.size(), 3); + bm->executeKernel(*data); + EXPECT_EQ(bm->getTimingsMap().at("execution").size(), 3); } /** * Execution returns correct results for a single repetition */ TEST_F(RandomAccessKernelTest, FPGAErrorBelow1Percent) { - auto result = bm->executeKernel(*data); - bool success = bm->validateOutputAndPrintError(*data); - EXPECT_TRUE(success); + bm->executeKernel(*data); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); +} + +using json = nlohmann::json; + +TEST_F(RandomAccessKernelTest, JsonDump) { + bm->executeKernel(*data); + bm->collectResults(); + bm->dumpConfigurationAndResults("fft.json"); + std::FILE *f = std::fopen("fft.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + json j = json::parse(f); + EXPECT_TRUE(j.contains("timings")); + if (j.contains("timings")) { + EXPECT_TRUE(j["timings"].contains("execution")); + } + EXPECT_TRUE(j.contains("results")); + if (j.contains("results")) { + EXPECT_TRUE(j["results"].contains("guops")); + EXPECT_TRUE(j["results"].contains("t_mean")); + EXPECT_TRUE(j["results"].contains("t_min")); + } + } } diff --git a/STREAM/README.md 
b/STREAM/README.md index 4c5fa5ff..298777b3 100644 --- a/STREAM/README.md +++ b/STREAM/README.md @@ -73,24 +73,40 @@ For execution of the benchmark run: For more information on available input parameters run $./STREAM_FPGA_intel -h + Implementation of the STREAM benchmark proposed in the HPCC benchmark suite for FPGA. + Version: 2.6 + + MPI Version: 3.1 + Config. Time: Thu Dec 08 10:43:26 UTC 2022 + Git Commit: 86e0064-dirty + Usage: - ./STREAM_FPGA_xilinx [OPTION...] - - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -s, arg Size of the data arrays (default: 134217728) - -r, arg Number of kernel replications used (default: 1) - --multi-kernel Use the legacy multi-kernel implementation - --device arg Index of the device that has to be used. If not given - you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there are - multiple platforms available. (default: -1) - -h, --help Print this help + ./bin/STREAM_FPGA_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 4) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -s, arg Size of the data arrays (default: 134217728) + --multi-kernel Use the legacy multi kernel implementation - To execute the unit and integration tests for Intel devices run CL_CONTEXT_EMULATOR_DEVICE=1 ./STREAM_FPGA_test_intel -f KERNEL_FILE_NAME @@ -102,13 +118,13 @@ It will run an emulation of the kernel and execute some functionality tests. The output of the host application is similar to the original STREAM benchmark: - Function Best Rate MB/s Avg time Min time Max time - Copy: 30875.9 0.025914 0.025910 0.025919 - Scale: 30885.6 0.025905 0.025902 0.025911 - Add: 46289.2 0.025928 0.025924 0.025935 - Triad: 45613.4 0.026310 0.026308 0.026312 - PCI Write: 6324.0 0.189800 0.189753 0.189862 - PCI Read: 5587.3 0.214869 0.214773 0.214943 + Function Best Rate Avg time Min time Max time + PCI_write 2.68152e+04 MB/s 6.36535e-02 s 6.00633e-02 s 8.45139e-02 s + PCI_read 2.47220e+04 MB/s 6.72553e-02 s 6.51490e-02 s 6.82519e-02 s + Copy 4.75583e+04 MB/s 2.32275e-02 s 2.25774e-02 s 2.55071e-02 s + Scale 5.35745e+04 MB/s 2.13423e-02 s 2.00420e-02 s 2.42722e-02 s + Add 5.36221e+04 MB/s 3.33479e-02 s 3.00364e-02 s 3.68116e-02 s + Triad 4.84564e+04 MB/s 3.46477e-02 s 3.32384e-02 s 3.70085e-02 s In addition it also measures the bandwidth of the connection between host and device. It is distinguished between writing to and reading from the devices @@ -143,4 +159,400 @@ The raw data of these runs can be found in the folder `csv_result_export`. 
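In addition to the CSV exports, the report written with `--dump-json` (an example is shown below) can be loaded directly for evaluation. A minimal sketch using pandas, assuming the benchmark was run with `--dump-json stream.json`:

```python
import json
import pandas as pd

# Load the report written by --dump-json (file name is only an example)
with open("stream.json") as f:
    report = json.load(f)

# Each timings entry is a list of {"unit": ..., "value": ...} samples per function
rows = [
    {"function": name, "time_s": sample["value"]}
    for name, samples in report["timings"].items()
    for sample in samples
]

df = pd.DataFrame(rows)
# Summarize per function, similar to the min/avg/max table printed by the host code
print(df.groupby("function")["time_s"].agg(["min", "mean", "max"]))
```
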
![Single precision results](csv_result_export/sp_global_ring_plot.jpeg) ##### Double Precision -![Double precision results](csv_result_export/dp_global_ring_plot.jpeg) \ No newline at end of file +![Double precision results](csv_result_export/dp_global_ring_plot.jpeg) + +```json + +{ + "config_time": "Wed Dec 14 08:43:42 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": { + "a_average_error": 0, + "a_average_relative_error": 0, + "a_expected": 1153300692992, + "b_average_error": 0, + "b_average_relative_error": 0, + "b_expected": 230660145152, + "c_average_error": 0, + "c_average_relative_error": 0, + "c_expected": 307546849280, + "epsilon": 1.1920928955078125e-07 + }, + "execution_time": "Wed Dec 14 09:29:17 UTC 2022", + "git_commit": "be1a4e9-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "STREAM", + "results": { + "Add_avg_t": { + "unit": "s", + "value": 0.0530118015 + }, + "Add_best_rate": { + "unit": "MB/s", + "value": 30506.44534004568 + }, + "Add_max_t": { + "unit": "s", + "value": 0.053374228 + }, + "Add_min_t": { + "unit": "s", + "value": 0.052795818 + }, + "Copy_avg_t": { + "unit": "s", + "value": 0.0389517071 + }, + "Copy_best_rate": { + "unit": "MB/s", + "value": 27731.67753145461 + }, + "Copy_max_t": { + "unit": "s", + "value": 0.040187928 + }, + "Copy_min_t": { + "unit": "s", + "value": 0.038718964 + }, + "PCI_read_avg_t": { + "unit": "s", + "value": 0.0597715322 + }, + "PCI_read_best_rate": { + "unit": "MB/s", + "value": 27479.82304062059 + }, + 
"PCI_read_max_t": { + "unit": "s", + "value": 0.063351971 + }, + "PCI_read_min_t": { + "unit": "s", + "value": 0.058610739 + }, + "PCI_write_avg_t": { + "unit": "s", + "value": 0.0685080558 + }, + "PCI_write_best_rate": { + "unit": "MB/s", + "value": 25765.843668891466 + }, + "PCI_write_max_t": { + "unit": "s", + "value": 0.120777629 + }, + "PCI_write_min_t": { + "unit": "s", + "value": 0.062509606 + }, + "Scale_avg_t": { + "unit": "s", + "value": 0.03978323250000001 + }, + "Scale_best_rate": { + "unit": "MB/s", + "value": 27084.469403573872 + }, + "Scale_max_t": { + "unit": "s", + "value": 0.039983335 + }, + "Scale_min_t": { + "unit": "s", + "value": 0.039644189 + }, + "Triad_avg_t": { + "unit": "s", + "value": 0.052600337100000005 + }, + "Triad_best_rate": { + "unit": "MB/s", + "value": 30701.997665172144 + }, + "Triad_max_t": { + "unit": "s", + "value": 0.052735936 + }, + "Triad_min_t": { + "unit": "s", + "value": 0.052459542 + } + }, + "settings": { + "Array Size": 134217728, + "Communication Type": false, + "Data Type": false, + "Kernel File": false, + "Kernel Replications": 4, + "Kernel Type": false, + "MPI Ranks": 1, + "Repetitions": 10, + "Test Mode": false + }, + "timings": { + "Add": [ + { + "unit": "s", + "value": 0.052848008 + }, + { + "unit": "s", + "value": 0.052795818 + }, + { + "unit": "s", + "value": 0.053294617 + }, + { + "unit": "s", + "value": 0.053374228 + }, + { + "unit": "s", + "value": 0.052812528 + }, + { + "unit": "s", + "value": 0.053091652 + }, + { + "unit": "s", + "value": 0.052962381 + }, + { + "unit": "s", + "value": 0.052992892 + }, + { + "unit": "s", + "value": 0.052880469 + }, + { + "unit": "s", + "value": 0.053065422 + } + ], + "Copy": [ + { + "unit": "s", + "value": 0.040187928 + }, + { + "unit": "s", + "value": 0.038718964 + }, + { + "unit": "s", + "value": 0.038728084 + }, + { + "unit": "s", + "value": 0.038760534 + }, + { + "unit": "s", + "value": 0.038793734 + }, + { + "unit": "s", + "value": 0.039005018 + }, + { + "unit": "s", + "value": 0.038862845 + }, + { + "unit": "s", + "value": 0.038731043 + }, + { + "unit": "s", + "value": 0.038891176 + }, + { + "unit": "s", + "value": 0.038837745 + } + ], + "PCI_read": [ + { + "unit": "s", + "value": 0.058610739 + }, + { + "unit": "s", + "value": 0.059211539 + }, + { + "unit": "s", + "value": 0.059094178 + }, + { + "unit": "s", + "value": 0.063351971 + }, + { + "unit": "s", + "value": 0.059738369 + }, + { + "unit": "s", + "value": 0.059645487 + }, + { + "unit": "s", + "value": 0.059697218 + }, + { + "unit": "s", + "value": 0.059381852 + }, + { + "unit": "s", + "value": 0.059468254 + }, + { + "unit": "s", + "value": 0.059515715 + } + ], + "PCI_write": [ + { + "unit": "s", + "value": 0.120777629 + }, + { + "unit": "s", + "value": 0.062600188 + }, + { + "unit": "s", + "value": 0.062606179 + }, + { + "unit": "s", + "value": 0.062711891 + }, + { + "unit": "s", + "value": 0.062509606 + }, + { + "unit": "s", + "value": 0.062803592 + }, + { + "unit": "s", + "value": 0.062787151 + }, + { + "unit": "s", + "value": 0.062679419 + }, + { + "unit": "s", + "value": 0.06271488 + }, + { + "unit": "s", + "value": 0.062890023 + } + ], + "Scale": [ + { + "unit": "s", + "value": 0.039983335 + }, + { + "unit": "s", + "value": 0.039644189 + }, + { + "unit": "s", + "value": 0.039831532 + }, + { + "unit": "s", + "value": 0.039766591 + }, + { + "unit": "s", + "value": 0.039660679 + }, + { + "unit": "s", + "value": 0.039933614 + }, + { + "unit": "s", + "value": 0.039789862 + }, + { + "unit": "s", + "value": 0.03967413 + }, + { + 
"unit": "s", + "value": 0.039722601 + }, + { + "unit": "s", + "value": 0.039825792 + } + ], + "Triad": [ + { + "unit": "s", + "value": 0.052583184 + }, + { + "unit": "s", + "value": 0.052564403 + }, + { + "unit": "s", + "value": 0.052735936 + }, + { + "unit": "s", + "value": 0.052644865 + }, + { + "unit": "s", + "value": 0.052699956 + }, + { + "unit": "s", + "value": 0.052459542 + }, + { + "unit": "s", + "value": 0.052657585 + }, + { + "unit": "s", + "value": 0.052493212 + }, + { + "unit": "s", + "value": 0.052600984 + }, + { + "unit": "s", + "value": 0.052563704 + } + ] + }, + "validated": true, + "version": "2.6" +} + +``` diff --git a/STREAM/src/common/parameters.h.in b/STREAM/src/common/parameters.h.in index 57bb0d0a..8d822247 100644 --- a/STREAM/src/common/parameters.h.in +++ b/STREAM/src/common/parameters.h.in @@ -33,7 +33,8 @@ #cmakedefine USE_SVM #cmakedefine USE_HBM -#define PROGRAM_DESCRIPTION "Implementation of the STREAM benchmark"\ +#define PROGRAM_NAME "STREAM" +#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\ " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" @@ -48,4 +49,4 @@ Output separator #define TRIAD_KERNEL_TYPE 3 -#endif // SRC_COMMON_PARAMETERS_H_ \ No newline at end of file +#endif // SRC_COMMON_PARAMETERS_H_ diff --git a/STREAM/src/device/stream_kernels.cl b/STREAM/src/device/stream_kernels.cl index cd569727..c8a99e2b 100644 --- a/STREAM/src/device/stream_kernels.cl +++ b/STREAM/src/device/stream_kernels.cl @@ -6,11 +6,11 @@ KERNEL_NUMBER will be replaced by the build script with the ID of the current re */ #include "parameters.h" -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} __kernel __attribute__((uses_global_work_offset(0))) -void copy_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, +void copy_{{ i }}(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const uint array_size) { uint number_elements = array_size / VECTOR_COUNT; @@ -22,7 +22,7 @@ void copy_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __kernel __attribute__((uses_global_work_offset(0))) -void add_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, +void add_{{ i }}(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, __global const DEVICE_ARRAY_DATA_TYPE * restrict in2, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const uint array_size) { @@ -35,7 +35,7 @@ void add_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, __kernel __attribute__((uses_global_work_offset(0))) -void scale_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, +void scale_{{ i }}(__global const DEVICE_ARRAY_DATA_TYPE * restrict in, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const DEVICE_SCALAR_DATA_TYPE scalar, const uint array_size) { @@ -48,7 +48,7 @@ void scale_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in __kernel __attribute__((uses_global_work_offset(0))) -void triad_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, +void triad_{{ i }}(__global const DEVICE_ARRAY_DATA_TYPE * restrict in1, __global const DEVICE_ARRAY_DATA_TYPE * restrict in2, __global DEVICE_ARRAY_DATA_TYPE * restrict out, const DEVICE_SCALAR_DATA_TYPE scalar, @@ -60,4 +60,4 @@ void triad_/*PY_CODE_GEN i*/(__global const DEVICE_ARRAY_DATA_TYPE * restrict in } } -// PY_CODE_GEN block_end +{% endfor %} \ 
No newline at end of file diff --git a/STREAM/src/device/stream_kernels_single.cl b/STREAM/src/device/stream_kernels_single.cl index 678d4fc1..b3bfe7fc 100644 --- a/STREAM/src/device/stream_kernels_single.cl +++ b/STREAM/src/device/stream_kernels_single.cl @@ -15,19 +15,18 @@ KERNEL_NUMBER will be replaced by the build script with the ID of the current re #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif -/* PY_CODE_GEN -try: - kernel_param_attributes = generate_attributes(num_replications) -except: - kernel_param_attributes = ["" for i in range(num_replications)] -*/ +{% if generate_attributes is defined %} + {% set kernel_param_attributes = generate_attributes(num_replications) %} +{% else %} + {% set kernel_param_attributes = create_list("", num_replications) %} +{% endif %} -// PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(num_replications)] +{% for i in range(num_replications) %} __kernel __attribute__((uses_global_work_offset(0))) -void calc_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_ARRAY_DATA_TYPE *restrict in1, - __global /*PY_CODE_GEN kernel_param_attributes[i]*/ const DEVICE_ARRAY_DATA_TYPE *restrict in2, - __global /*PY_CODE_GEN kernel_param_attributes[i]*/ DEVICE_ARRAY_DATA_TYPE *restrict out, +void calc_{{ i }}(__global {{ kernel_param_attributes[i] }} const DEVICE_ARRAY_DATA_TYPE *restrict in1, + __global {{ kernel_param_attributes[i] }} const DEVICE_ARRAY_DATA_TYPE *restrict in2, + __global {{ kernel_param_attributes[i] }} DEVICE_ARRAY_DATA_TYPE *restrict out, const DEVICE_SCALAR_DATA_TYPE scalar, const uint array_size, const uint operation_type) { @@ -126,4 +125,4 @@ void calc_/*PY_CODE_GEN i*/(__global /*PY_CODE_GEN kernel_param_attributes[i]*/ } } -// PY_CODE_GEN block_end +{% endfor %} \ No newline at end of file diff --git a/STREAM/src/host/execution.hpp b/STREAM/src/host/execution.hpp index 70d6f948..d3e1c31b 100644 --- a/STREAM/src/host/execution.hpp +++ b/STREAM/src/host/execution.hpp @@ -35,13 +35,15 @@ SOFTWARE. #include "half.hpp" // Map keys for execution timings -#define PCIE_WRITE_KEY "PCI write" -#define PCIE_READ_KEY "PCI read" +#define PCIE_WRITE_KEY "PCI_write" +#define PCIE_READ_KEY "PCI_read" #define COPY_KEY "Copy" #define SCALE_KEY "Scale" #define ADD_KEY "Add" #define TRIAD_KEY "Triad" +const std::string keys[] = {PCIE_WRITE_KEY, PCIE_READ_KEY, COPY_KEY, SCALE_KEY, ADD_KEY, TRIAD_KEY}; + namespace bm_execution { static std::map multiplicatorMap = { @@ -62,7 +64,7 @@ namespace bm_execution { * @param C The array C of the stream benchmark * @return std::unique_ptr The measured timings for all stream operations */ - std::unique_ptr + std::map> calculate(const hpcc_base::ExecutionSettings& config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, diff --git a/STREAM/src/host/execution_default.cpp b/STREAM/src/host/execution_default.cpp index 71a4d04f..a8cc5d83 100644 --- a/STREAM/src/host/execution_default.cpp +++ b/STREAM/src/host/execution_default.cpp @@ -67,7 +67,7 @@ namespace bm_execution { Implementation for the single kernel. 
@copydoc bm_execution::calculate() */ - std::unique_ptr + std::map> calculate(const hpcc_base::ExecutionSettings& config, HOST_DATA_TYPE* A, HOST_DATA_TYPE* B, @@ -105,7 +105,7 @@ namespace bm_execution { add_kernels, triad_kernels, command_queues); } if (!success) { - return std::unique_ptr(nullptr); + return std::map>(); } // @@ -331,11 +331,7 @@ namespace bm_execution { } - std::unique_ptr result(new stream::StreamExecutionTimings{ - timingMap, - config.programSettings->streamArraySize - }); - return result; + return timingMap; } bool initialize_queues_and_kernels(const hpcc_base::ExecutionSettings &config, diff --git a/STREAM/src/host/stream_benchmark.cpp b/STREAM/src/host/stream_benchmark.cpp index 4dac0ea0..f0cc01f3 100644 --- a/STREAM/src/host/stream_benchmark.cpp +++ b/STREAM/src/host/stream_benchmark.cpp @@ -36,7 +36,6 @@ SOFTWARE. stream::StreamProgramSettings::StreamProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), streamArraySize(results["s"].as()), - kernelReplications(results["r"].as()), useSingleKernel(!static_cast(results.count("multi-kernel"))) { } @@ -48,7 +47,6 @@ stream::StreamProgramSettings::getSettingsMap() { std::stringstream ss; ss << streamArraySize << " (" << static_cast(streamArraySize * sizeof(HOST_DATA_TYPE)) << " Byte )"; map["Array Size"] = ss.str(); - map["Kernel Replications"] = std::to_string(kernelReplications); map["Kernel Type"] = (useSingleKernel ? "Single" : "Separate"); return map; } @@ -102,19 +100,18 @@ stream::StreamBenchmark::addAdditionalParseOptions(cxxopts::Options &options) { ("multi-kernel", "Use the legacy multi kernel implementation"); } -std::unique_ptr +void stream::StreamBenchmark::executeKernel(StreamData &data) { - return bm_execution::calculate(*executionSettings, + timings = bm_execution::calculate(*executionSettings, data.A, data.B, data.C); } void -stream::StreamBenchmark::collectAndPrintResults(const stream::StreamExecutionTimings &output) { - +stream::StreamBenchmark::collectResults() { std::map> totalTimingsMap; - for (auto v : output.timings) { + for (auto v : timings) { // Number of experiment repetitions uint number_measurements = v.second.size(); // create a new @@ -127,28 +124,37 @@ stream::StreamBenchmark::collectAndPrintResults(const stream::StreamExecutionTim #else std::copy(v.second.begin(), v.second.end(), avg_measures.begin()); #endif - totalTimingsMap.insert({v.first,avg_measures}); + + double minTime = *min_element(v.second.begin(), v.second.end()); + double avgTime = accumulate(v.second.begin(), v.second.end(), 0.0) + / v.second.size(); + double maxTime = *max_element(v.second.begin(), v.second.end()); + + double bestRate = (static_cast(sizeof(HOST_DATA_TYPE)) * executionSettings->programSettings->streamArraySize * bm_execution::multiplicatorMap[v.first] / minTime) * 1.0e-6 * mpi_comm_size; + + results.emplace(v.first + "_min_t", hpcc_base::HpccResult(minTime, "s")); + results.emplace(v.first + "_avg_t", hpcc_base::HpccResult(avgTime, "s")); + results.emplace(v.first + "_max_t", hpcc_base::HpccResult(maxTime, "s")); + results.emplace(v.first + "_best_rate", hpcc_base::HpccResult(bestRate, "MB/s")); } +} +void +stream::StreamBenchmark::printResults() { if (mpi_comm_rank == 0) { - std::cout << std::setw(ENTRY_SPACE) << "Function"; - std::cout << std::setw(ENTRY_SPACE) << "Best Rate MB/s"; - std::cout << std::setw(ENTRY_SPACE) << "Avg time s"; + std::cout << std::left << std::setw(ENTRY_SPACE) << "Function"; + std::cout << std::setw(ENTRY_SPACE) << "Best Rate"; + std::cout << 
std::setw(ENTRY_SPACE) << "Avg time"; std::cout << std::setw(ENTRY_SPACE) << "Min time" ; - std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::endl; - - for (auto v : totalTimingsMap) { - double minTime = *min_element(v.second.begin(), v.second.end()); - double avgTime = accumulate(v.second.begin(), v.second.end(), 0.0) - / v.second.size(); - double maxTime = *max_element(v.second.begin(), v.second.end()); - - std::cout << std::setw(ENTRY_SPACE) << v.first; - std::cout << std::setw(ENTRY_SPACE) - << (static_cast(sizeof(HOST_DATA_TYPE)) * output.arraySize * bm_execution::multiplicatorMap[v.first] / minTime) * 1.0e-6 * mpi_comm_size - << std::setw(ENTRY_SPACE) << avgTime - << std::setw(ENTRY_SPACE) << minTime - << std::setw(ENTRY_SPACE) << maxTime << std::endl; + std::cout << std::setw(ENTRY_SPACE) << "Max time" << std::right << std::endl; + + for (auto key : keys) { + std::cout << std::left << std::setw(ENTRY_SPACE) << key + << results.at(key + "_best_rate") + << results.at(key + "_avg_t") + << results.at(key + "_min_t") + << results.at(key + "_max_t") + << std::right << std::endl; } } } @@ -166,7 +172,7 @@ stream::StreamBenchmark::generateInputData() { } bool -stream::StreamBenchmark::validateOutputAndPrintError(stream::StreamData &data) { +stream::StreamBenchmark::validateOutput(stream::StreamData &data) { HOST_DATA_TYPE aj,bj,cj,scalar; double aSumErr,bSumErr,cSumErr; double aAvgErr,bAvgErr,cAvgErr; @@ -215,54 +221,86 @@ stream::StreamBenchmark::validateOutputAndPrintError(stream::StreamData &data) { bAvgErr = totalBAvgErr / mpi_comm_size; #endif + bool success = true; if (mpi_comm_rank == 0) { + errors.emplace("a_expected", aj); + errors.emplace("a_average_error", aAvgErr); + errors.emplace("a_average_relative_error", abs(aAvgErr)/aj); + + errors.emplace("b_expected", bj); + errors.emplace("b_average_error", bAvgErr); + errors.emplace("b_average_relative_error", abs(bAvgErr)/bj); + + errors.emplace("c_expected", cj); + errors.emplace("c_average_error", cAvgErr); + errors.emplace("c_average_relative_error", abs(cAvgErr)/cj); epsilon = std::numeric_limits::epsilon(); + errors.emplace("epsilon", epsilon); - err = 0; if (abs(aAvgErr/aj) > epsilon) { - err++; - printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon); - printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj); + success = false; ierr = 0; for (j=0; jprogramSettings->streamArraySize; j++) { if (abs(data.A[j]/aj-1.0) > epsilon) { ierr++; } } - printf(" For array a[], %d errors were found.\n",ierr); + errors.emplace("a_error_count", ierr); + ierr = 0; } if (abs(bAvgErr/bj) > epsilon) { - err++; - printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon); - printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj); - printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + success = false; ierr = 0; for (j=0; jprogramSettings->streamArraySize; j++) { if (abs(data.B[j]/bj-1.0) > epsilon) { ierr++; } } - printf(" For array b[], %d errors were found.\n",ierr); + errors.emplace("b_error_count", ierr); } if (abs(cAvgErr/cj) > epsilon) { - err++; - printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon); - printf (" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj); - printf (" AvgRelAbsErr > Epsilon (%e)\n",epsilon); + success = false; ierr = 0; for (j=0; jprogramSettings->streamArraySize; j++) { if (abs(data.C[j]/cj-1.0) > epsilon) { ierr++; } } - printf(" 
For array c[], %d errors were found.\n",ierr); + errors.emplace("c_error_count", ierr); + } + } + return success; +} + +void +stream::StreamBenchmark::printError() { + if (mpi_comm_rank == 0) { + int err = 0; + double epsilon = errors.at("epsilon"); + if (errors.at("a_average_relative_error") > epsilon) { + err++; + printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon")); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("a_expected"), errors.at("a_average_error"), errors.at("a_average_relative_error")); + printf(" For array a[], %d errors were found.\n", errors.at("a_error_count")); + } + + if (errors.at("b_average_relative_error") > epsilon) { + err++; + printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon")); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("b_expected"), errors.at("b_average_error"), errors.at("b_average_relative_error")); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon")); + printf(" For array b[], %d errors were found.\n", errors.at("b_error_count")); + } + if (errors.at("c_average_relative_error") > epsilon) { + err++; + printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", errors.at("epsilon")); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", errors.at("c_expected"), errors.at("c_average_error"), errors.at("c_average_relative_error")); + printf(" AvgRelAbsErr > Epsilon (%e)\n", errors.at("epsilon")); + printf(" For array c[], %d errors were found.\n", errors.at("c_error_count")); + } + if (err == 0) { - printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon); - return true; + printf ("Solution Validates: avg error less than %e on all three arrays\n", errors.at("epsilon")); } - return false; } - return true; -} \ No newline at end of file +} diff --git a/STREAM/src/host/stream_benchmark.hpp b/STREAM/src/host/stream_benchmark.hpp index 401a899d..638868da 100644 --- a/STREAM/src/host/stream_benchmark.hpp +++ b/STREAM/src/host/stream_benchmark.hpp @@ -52,12 +52,6 @@ class StreamProgramSettings : public hpcc_base::BaseSettings { */ uint streamArraySize; - /** - * @brief The number of used kernel replications - * - */ - uint kernelReplications; - /** * @brief Indicator if the single kernel or the legacy kernel are used for execution * @@ -127,30 +121,11 @@ class StreamData { }; -/** - * @brief Measured execution timing from the kernel execution - * - */ -class StreamExecutionTimings { -public: - /** - * @brief A map containing the timings for all stream operation types - * - */ - std::map> timings; - - /** - * @brief The used array size - * - */ - uint arraySize; -}; - /** * @brief Implementation of the Stream benchmark * */ -class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark { +class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark { protected: @@ -176,9 +151,8 @@ class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark */ - std::unique_ptr + void executeKernel( StreamData &data) override; /** @@ -189,15 +163,24 @@ class StreamBenchmark : public hpcc_base::HpccFpgaBenchmark data; @@ -29,7 +29,7 @@ struct StreamKernelTest :public ::testing::Test { */ TEST_F(StreamKernelTest, FPGACorrectResultsOneRepetition) { bm->getExecutionSettings().programSettings->numRepetitions = 1; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < bm->getExecutionSettings().programSettings->streamArraySize; i++) {
EXPECT_FLOAT_EQ(data->A[i], 30.0); EXPECT_FLOAT_EQ(data->B[i], 6.0); @@ -42,10 +42,59 @@ TEST_F(StreamKernelTest, FPGACorrectResultsOneRepetition) { */ TEST_F(StreamKernelTest, FPGACorrectResultsThreeRepetition) { bm->getExecutionSettings().programSettings->numRepetitions = 3; - auto result = bm->executeKernel(*data); + bm->executeKernel(*data); for (int i = 0; i < bm->getExecutionSettings().programSettings->streamArraySize; i++) { EXPECT_FLOAT_EQ(data->A[i], 6750.0); EXPECT_FLOAT_EQ(data->B[i], 1350.0); EXPECT_FLOAT_EQ(data->C[i], 1800.0); } } + +using json = nlohmann::json; + +TEST_F(StreamKernelTest, JsonDump) { + bm->executeKernel(*data); + bm->collectResults(); + bm->dumpConfigurationAndResults("stream.json"); + std::FILE *f = std::fopen("stream.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + json j = json::parse(f); + EXPECT_TRUE(j.contains("timings")); + if (j.contains("timings")) { + EXPECT_TRUE(j["timings"].contains("Add")); + EXPECT_TRUE(j["timings"].contains("Copy")); + EXPECT_TRUE(j["timings"].contains("PCI_read")); + EXPECT_TRUE(j["timings"].contains("PCI_write")); + EXPECT_TRUE(j["timings"].contains("Scale")); + EXPECT_TRUE(j["timings"].contains("Triad")); + } + EXPECT_TRUE(j.contains("results")); + if (j.contains("results")) { + EXPECT_TRUE(j["results"].contains("Add_avg_t")); + EXPECT_TRUE(j["results"].contains("Add_best_rate")); + EXPECT_TRUE(j["results"].contains("Add_max_t")); + EXPECT_TRUE(j["results"].contains("Add_min_t")); + EXPECT_TRUE(j["results"].contains("Copy_avg_t")); + EXPECT_TRUE(j["results"].contains("Copy_best_rate")); + EXPECT_TRUE(j["results"].contains("Copy_max_t")); + EXPECT_TRUE(j["results"].contains("Copy_min_t")); + EXPECT_TRUE(j["results"].contains("PCI_read_avg_t")); + EXPECT_TRUE(j["results"].contains("PCI_read_best_rate")); + EXPECT_TRUE(j["results"].contains("PCI_read_max_t")); + EXPECT_TRUE(j["results"].contains("PCI_read_min_t")); + EXPECT_TRUE(j["results"].contains("PCI_write_avg_t")); + EXPECT_TRUE(j["results"].contains("PCI_write_best_rate")); + EXPECT_TRUE(j["results"].contains("PCI_write_max_t")); + EXPECT_TRUE(j["results"].contains("PCI_write_min_t")); + EXPECT_TRUE(j["results"].contains("Scale_avg_t")); + EXPECT_TRUE(j["results"].contains("Scale_best_rate")); + EXPECT_TRUE(j["results"].contains("Scale_max_t")); + EXPECT_TRUE(j["results"].contains("Scale_min_t")); + EXPECT_TRUE(j["results"].contains("Triad_avg_t")); + EXPECT_TRUE(j["results"].contains("Triad_best_rate")); + EXPECT_TRUE(j["results"].contains("Triad_max_t")); + EXPECT_TRUE(j["results"].contains("Triad_min_t")); + } + } +} diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt index b894bb48..bfbb3e59 100755 --- a/b_eff/CMakeLists.txt +++ b/b_eff/CMakeLists.txt @@ -19,11 +19,12 @@ set(USE_DEPRECATED_HPP_HEADER No) set(COMMUNICATION_TYPE_SUPPORT_ENABLED Yes) +if (USE_ACCL) + math(EXPR calculate_accl_buffer_size "2 ^ ${DEFAULT_MAX_MESSAGE_SIZE} * 4") + set(ACCL_BUFFER_SIZE ${calculate_accl_buffer_size} CACHE STRING "Size of ACCL buffers in bytes") +endif() + set(DATA_TYPE char) include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake) unset(DATA_TYPE CACHE) find_package(MPI REQUIRED) - -if (NOT INTELFPGAOPENCL_FOUND) - message(ERROR "Benchmark does only support the Intel OpenCL SDK") -endif() diff --git a/b_eff/README.md b/b_eff/README.md index ad2a9c27..cdbb8c92 100644 --- a/b_eff/README.md +++ b/b_eff/README.md @@ -71,38 +71,51 @@ For execution of the benchmark run: For more information on available input parameters run - $./Network_intel -h 
+ ./Network_intel -h Implementation of the effective bandwidth benchmark proposed in the HPCC benchmark suite for FPGA. Version: 1.3 + MPI Version: 3.1 + Config. Time: Thu Dec 08 10:38:28 UTC 2022 + Git Commit: 86e0064-dirty + Usage: - bin/Network_intel [OPTION...] - - -f, --file arg Kernel file name - -n, arg Number of repetitions (default: 10) - -i, Use memory Interleaving - --skip-validation Skip the validation of the output data. This will - speed up execution and helps when working with special - data types. - --device arg Index of the device that has to be used. If not - given you will be asked which device to use if there are - multiple devices available. (default: -1) - --platform arg Index of the platform that has to be used. If not - given you will be asked which platform to use if there - are multiple platforms available. (default: -1) - -h, --help Print this help - -u, --upper arg Maximum number of repetitions per data size - (default: 32768) - -l, --lower arg Minimum number of repetitions per data size - (default: 1) - --min-size arg Minimum Message Size (default: 0) - -m, arg Maximum message size (default: 20) - -o, arg Offset used before reducing repetitions (default: 1) - -d, arg Number os steps the repetitions are decreased to its - minimum (default: 5) + ./bin/Network_intel [OPTION...] + + -f, --file arg Kernel file name + -n, arg Number of repetitions (default: 10) + -i, Use memory Interleaving + --skip-validation Skip the validation of the output data. This will + speed up execution and helps when working with + special data types. + --device arg Index of the device that has to be used. If not + given you will be asked which device to use if there + are multiple devices available. (default: 0) + --platform arg Index of the platform that has to be used. If not + given you will be asked which platform to use if + there are multiple platforms available. (default: 0) + --platform_str arg Name of the platform that has to be used (default: + ) + -r, arg Number of used kernel replications (default: 2) + --comm-type arg Used communication type for inter-FPGA + communication (default: AUTO) + --dump-json arg dump benchmark configuration and results to this + file in json format (default: ) + --test Only test given configuration and skip execution + and validation + -h, --help Print this help + -u, --upper arg Maximum number of repetitions per data size + (default: 65536) + -l, --lower arg Minimum number of repetitions per data size + (default: 256) + --min-size arg Minimum Message Size (default: 0) + -m, arg Maximum message size (default: 20) + -o, arg Offset used before reducing repetitions (default: + 11) + -d, arg Number os steps the repetitions are decreased to + its minimum (default: 7) - To execute the unit and integration tests run ./Network_test_intel -f KERNEL_FILE_NAME @@ -140,30 +153,12 @@ This might still lead to inaccuracies in the time measurements depending on the The benchmark will output a result table to the standard output after execution. 
This is an example output using a single rank in emulation: - MSize looplength time B/s - 1 16384 5.46779e-02 5.99292e+05 - 2 8192 5.19651e-02 6.30578e+05 - 4 4096 2.58565e-02 1.26730e+06 - 8 2048 7.51376e-03 4.36107e+06 - 16 1024 3.01288e-03 1.08760e+07 - 32 512 1.66958e-03 1.96265e+07 - 64 256 4.60622e-03 7.11386e+06 - 128 128 1.86568e-03 1.75636e+07 - 256 64 3.75094e-03 8.73594e+06 - 512 32 3.81549e-03 8.58814e+06 - 1024 16 3.44074e-03 9.52354e+06 - 2048 8 3.83420e-03 8.54624e+06 - 4096 4 3.34786e-03 9.78775e+06 - 16384 2 7.84717e-03 8.35154e+06 - 32768 1 7.42386e-03 8.82775e+06 - 65536 1 1.40822e-02 9.30761e+06 - 131072 1 1.28135e-02 2.04585e+07 - 262144 1 5.52680e-02 9.48628e+06 - 524288 1 9.99676e-02 1.04892e+07 - 1048576 1 1.21861e-01 1.72094e+07 - 2097152 1 4.20120e-01 9.98360e+06 - - b_eff = 9.58731e+06 B/s + MSize looplength transfer B/s + 64 5 4.38310e-05 1.46015e+07 + 128 5 7.07010e-05 1.81044e+07 + 256 5 7.73410e-05 3.31002e+07 + + b_eff = 2.19354e+07 B/s The table contains the measurements for all tested message sizes. It is split into the following four columns: @@ -177,4 +172,202 @@ It is possible to set the number of repetitions of the experiment. In this case, the best measured time will be used to calculate the bandwidth. Under the table the calculated effective bandwidth is printed. -It is the mean of the achieved bandwidths for all used message sizes. \ No newline at end of file +It is the mean of the achieved bandwidths for all used message sizes. + +The json output looks like the following. + +```json + +{ + "config_time": "Wed Dec 14 08:39:42 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_s
dk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "errors": {}, + "execution_time": "Wed Dec 14 09:56:29 UTC 2022", + "git_commit": "be1a4e9-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "effective bandwidth", + "results": { + "b_eff": { + "unit": "B/s", + "value": 22061624.19637537 + } + }, + "settings": { + "Communication Type": false, + "Kernel File": false, + "Kernel Replications": 2, + "Loop Length": 5, + "MPI Ranks": 1, + "Message Sizes": 2, + "Repetitions": 10, + "Test Mode": false + }, + "timings": { + "6": { + "maxCalcBW": 9880812.696844315, + "maxMinCalculationTime": 6.4772e-05, + "timings": [ + { + "looplength": 5, + "messageSize": 6, + "timings": [ + { + "unit": "s", + "value": 0.010991125 + }, + { + "unit": "s", + "value": 8.8202e-05 + }, + { + "unit": "s", + "value": 0.000133323 + }, + { + "unit": "s", + "value": 8.5442e-05 + }, + { + "unit": "s", + "value": 0.000272905 + }, + { + "unit": "s", + "value": 0.000168784 + }, + { + "unit": "s", + "value": 6.4772e-05 + }, + { + "unit": "s", + "value": 0.000171733 + }, + { + "unit": "s", + "value": 0.000163393 + }, + { + "unit": "s", + "value": 8.0391e-05 + } + ] + } + ] + }, + "7": { + "maxCalcBW": 19143908.348538782, + "maxMinCalculationTime": 6.6862e-05, + "timings": [ + { + "looplength": 5, + "messageSize": 7, + "timings": [ + { + "unit": "s", + "value": 0.000135662 + }, + { + "unit": "s", + "value": 0.000119343 + }, + { + "unit": "s", + "value": 0.000178914 + }, + { + "unit": "s", + "value": 7.7691e-05 + }, + { + "unit": "s", + "value": 9.1922e-05 + }, + { + "unit": "s", + "value": 0.000259545 + }, + { + "unit": "s", + "value": 0.000143233 + }, + { + "unit": "s", + "value": 0.000149763 + }, + { + "unit": "s", + "value": 6.6862e-05 + }, + { + "unit": "s", + "value": 7.2351e-05 + } + ] + } + ] + }, + "8": { + "maxCalcBW": 37160151.543743014, + "maxMinCalculationTime": 6.8891e-05, + "timings": [ + { + "looplength": 5, + "messageSize": 8, + "timings": [ + { + "unit": "s", + "value": 0.000159723 + }, + { + "unit": "s", + "value": 0.000104432 + }, + { + "unit": "s", + "value": 0.000166953 + }, + { + "unit": "s", + "value": 7.7492e-05 + }, + { + "unit": "s", + "value": 7.8241e-05 + }, + { + "unit": "s", + "value": 9.5762e-05 + }, + { + "unit": "s", + "value": 0.000235084 + }, + { + "unit": "s", + "value": 0.000280265 + }, + { + "unit": "s", + "value": 0.000130013 + }, + { + "unit": "s", + "value": 6.8891e-05 + } + ] + } + ] + } + }, + "validated": true, + "version": "1.3" +} + +``` diff --git a/b_eff/configs/Bittware_520N_PCIE.cmake b/b_eff/configs/Bittware_520N_PCIE.cmake new file mode 100644 index 00000000..b5fb6dad --- /dev/null +++ b/b_eff/configs/Bittware_520N_PCIE.cmake @@ -0,0 +1,17 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "p520_hpc_sg280l" CACHE STRING "" FORCE) +set(AOC_FLAGS "-fpc -fp-relaxed -seed=7" CACHE STRING "" FORCE) + +# GEMM specific options +set(CHANNEL_WIDTH 32 CACHE STRING "Width of a single external channel in Byte" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications" FORCE) \ No newline at end of file diff --git a/b_eff/configs/Xilinx_U280_DDR.cmake b/b_eff/configs/Xilinx_U280_DDR.cmake new file mode 100644 index 00000000..61d9003b --- /dev/null +++ b/b_eff/configs/Xilinx_U280_DDR.cmake @@ -0,0 +1,14 @@ + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.u280.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.u280.ini CACHE FILEPATH "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) \ No newline at end of file diff --git a/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_ddr.cmake b/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_ddr.cmake new file mode 100644 index 00000000..523c8761 --- /dev/null +++ b/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_ddr.cmake @@ -0,0 +1,25 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.ddr.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_hbm.cmake b/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_hbm.cmake new file mode 100644 index 00000000..f097ebd9 --- /dev/null +++ b/b_eff/configs/Xilinx_U280_DDR_ACCL_buffers_hbm.cmake @@ -0,0 +1,25 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake b/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake new file mode 100644 index 00000000..c40efff7 --- /dev/null +++ b/b_eff/configs/Xilinx_U280_HBM_ACCL_pl_profile.cmake @@ -0,0 +1,27 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u280_xdma_201920_3" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) +set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES "send_recv" CACHE STRING "" FORCE) +set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to connect multiple kernels to the CCLO cmd stream" FORCE) +# STREAM specific options +# Defaults to a total of ~12GB data +set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/b_eff/configs/Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake b/b_eff/configs/Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake new file mode 100644 index 00000000..ed6ec1f9 --- /dev/null +++ b/b_eff/configs/Xilinx_U55C_DDR_ACCL_buffers_hbm.cmake @@ -0,0 +1,25 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) + +# STREAM specific options +# Defaults to a total of ~12GB data +set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake new file mode 100644 index 00000000..81c20e1d --- /dev/null +++ b/b_eff/configs/Xilinx_U55C_DDR_ACCL_pl_hbm.cmake @@ -0,0 +1,27 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES "send_recv" CACHE STRING "" FORCE) +set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to connect multiple kernels to the CCLO cmd stream" FORCE) +# STREAM specific options +# Defaults to a total of ~12GB data +set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE) +set(ACCL_BUFFER_SIZE 4194304 CACHE STRING "Size of the ACCL buffers" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/b_eff/configs/Xilinx_U55C_HBM_ACCL_pl_profile.cmake b/b_eff/configs/Xilinx_U55C_HBM_ACCL_pl_profile.cmake new file mode 100644 index 00000000..5cd3ed0a --- /dev/null +++ b/b_eff/configs/Xilinx_U55C_HBM_ACCL_pl_profile.cmake @@ -0,0 +1,27 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] 
-DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(USE_ACCL Yes CACHE BOOL "" FORCE) +set(USE_XRT_HOST Yes CACHE BOOL "" FORCE) +set(USE_OCL_HOST No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "xilinx_u55c_gen3x16_xdma_3_202210_1" CACHE STRING "" FORCE) +set(XILINX_LINK_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini CACHE FILEPATH "" FORCE) +set(XILINX_COMPILE_SETTINGS_FILE ${CMAKE_SOURCE_DIR}/settings/settings.compile.xilinx.accl_buffers.ini CACHE FILEPATH "" FORCE) +set(XILINX_ADDITIONAL_COMPILE_FLAGS -g CACHE STRING "" FORCE) +set(XILINX_ADDITIONAL_LINK_FLAGS -g --kernel_frequency 250 CACHE STRING "" FORCE) +set(XILINX_KERNEL_NAMES "send_recv" CACHE STRING "" FORCE) +set(USE_ACCL_CLIENT_ARBITER Yes CACHE BOOL "Use the client arbiter kernel to connect multiple kernels to the CCLO cmd stream" FORCE) +# STREAM specific options +# Defaults to a total of ~12GB data +set(CHANNEL_WIDTH 0 CACHE STRING "Width of a single external channel in Byte will not be considered" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications will not be considered" FORCE) + +set(USE_DEPRECATED_HPP_HEADER Yes CACHE BOOL "Use cl.hpp intead of cl2.hpp" FORCE) diff --git a/b_eff/scripts/prepare_tests.sh b/b_eff/scripts/prepare_tests.sh new file mode 100755 index 00000000..2705d74d --- /dev/null +++ b/b_eff/scripts/prepare_tests.sh @@ -0,0 +1,11 @@ +#!/usr/bin/bash + +cd $1 +touch kernel_output_ch0 +touch kernel_output_ch1 +touch kernel_output_ch2 +touch kernel_output_ch3 +ln -s kernel_output_ch0 kernel_input_ch1 +ln -s kernel_output_ch2 kernel_input_ch3 +ln -s kernel_output_ch1 kernel_input_ch0 +ln -s kernel_output_ch3 kernel_input_ch2 diff --git a/b_eff/settings/settings.compile.xilinx.accl_buffers.ini b/b_eff/settings/settings.compile.xilinx.accl_buffers.ini new file mode 100644 index 00000000..e69de29b diff --git a/b_eff/settings/settings.compile.xilinx.u280.ini b/b_eff/settings/settings.compile.xilinx.u280.ini new file mode 100644 index 00000000..e69de29b diff --git a/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini b/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini new file mode 100644 index 00000000..2ee98436 --- /dev/null +++ b/b_eff/settings/settings.link.xilinx.accl_buffers.ddr.ini @@ -0,0 +1,71 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 + +sp=ccl_offload_0.m_axi_0:DDR[0:1] +sp=ccl_offload_0.m_axi_1:DDR[0:1] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + diff --git a/b_eff/settings/settings.link.xilinx.accl_buffers.hbm.ini b/b_eff/settings/settings.link.xilinx.accl_buffers.hbm.ini new file mode 100644 index 00000000..e6352198 --- /dev/null +++ b/b_eff/settings/settings.link.xilinx.accl_buffers.hbm.ini @@ -0,0 +1,71 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 + +sp=ccl_offload_0.m_axi_0:HBM[0:5] +sp=ccl_offload_0.m_axi_1:HBM[0:5] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + diff --git a/b_eff/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini b/b_eff/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini new file mode 100644 index 00000000..61850b2a --- /dev/null +++ b/b_eff/settings/settings.link.xilinx.accl_buffers.u55c.hbm.ini @@ -0,0 +1,71 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_sum:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=lb_user_krnl:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 + +sp=ccl_offload_0.m_axi_0:HBM[0:5] +sp=ccl_offload_0.m_axi_1:HBM[0:5] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:hostctrl_0.sts + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini new file mode 100644 index 00000000..374a41c9 --- /dev/null +++ b/b_eff/settings/settings.link.xilinx.accl_pl.u280.hbm.profile.ini @@ -0,0 +1,91 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_ops:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl +nk=client_arbiter:1:client_arbiter +nk=send_recv:1:sendrecv + +# Kernels Foorplaning +slr=compression_0_0:SLR1 +slr=compression_0_1:SLR1 +slr=compression_0_2:SLR1 +slr=lb_user_krnl:SLR1 +slr=arith_0:SLR1 +slr=ccl_offload_0:SLR1 +slr=hostctrl_0:SLR1 +slr=networklayer_0:SLR2 +slr=cmac_0:SLR2 +slr=client_arbiter:SLR1 +slr=sendrecv:SLR1 + +sp=ccl_offload_0.m_axi_0:HBM[0:5] +sp=ccl_offload_0.m_axi_1:HBM[0:5] +sp=sendrecv.m_axi_gmem:HBM[0:5] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:client_arbiter.cmd_clients_0 +stream_connect=client_arbiter.ack_clients_0:hostctrl_0.sts +stream_connect=sendrecv.cmd:client_arbiter.cmd_clients_1 +stream_connect=client_arbiter.ack_clients_1:sendrecv.sts +stream_connect=client_arbiter.cmd_cclo:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:client_arbiter.ack_cclo + + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + +[profile] +data=send_recv:all:all +data=client_arbiter:all:all +data=ccl_offload:all:m_axis_eth_tx_data +data=networklayer:all:M_AXIS_nl2sk +data=networklayer:all:M_AXIS_nl2eth +data=cmac_0:all:M_AXIS +memory=all +stall=all +exec=all:all diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini new file mode 100644 index 00000000..a59018d2 --- /dev/null +++ b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.ini @@ -0,0 +1,81 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_ops:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl +nk=client_arbiter:1:client_arbiter +nk=send_recv:1:sendrecv + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=lb_user_krnl:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 +slr=client_arbiter:SLR0 +slr=sendrecv:SLR0 + +sp=ccl_offload_0.m_axi_0:HBM[0:5] +sp=ccl_offload_0.m_axi_1:HBM[0:5] +sp=sendrecv.m_axi_gmem:HBM[0:5] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:client_arbiter.cmd_clients_0 +stream_connect=client_arbiter.ack_clients_0:hostctrl_0.sts +stream_connect=sendrecv.cmd:client_arbiter.cmd_clients_1 +stream_connect=client_arbiter.ack_clients_1:sendrecv.sts +stream_connect=client_arbiter.cmd_cclo:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:client_arbiter.ack_cclo + + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + diff --git a/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini new file mode 100644 index 00000000..778054e5 --- /dev/null +++ b/b_eff/settings/settings.link.xilinx.accl_pl.u55c.hbm.profile.ini @@ -0,0 +1,91 @@ +# /******************************************************************************* +# Copyright (C) 2021 Xilinx, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# *******************************************************************************/ +[connectivity] +# Define number of kernels and their name +nk=networklayer:1:networklayer_0 +nk=ccl_offload:1:ccl_offload_0 +nk=hostctrl:1:hostctrl_0 +nk=cmac_0:1:cmac_0 +nk=reduce_ops:1:arith_0 +nk=hp_compression:3:compression_0_0.compression_0_1.compression_0_2 +nk=loopback:1:lb_user_krnl +nk=client_arbiter:1:client_arbiter +nk=send_recv:1:sendrecv + +# Kernels Foorplaning +slr=compression_0_0:SLR0 +slr=compression_0_1:SLR0 +slr=compression_0_2:SLR0 +slr=lb_user_krnl:SLR0 +slr=arith_0:SLR0 +slr=ccl_offload_0:SLR0 +slr=hostctrl_0:SLR0 +slr=networklayer_0:SLR1 +slr=cmac_0:SLR1 +slr=client_arbiter:SLR0 +slr=sendrecv:SLR0 + +sp=ccl_offload_0.m_axi_0:HBM[0:5] +sp=ccl_offload_0.m_axi_1:HBM[0:5] +sp=sendrecv.m_axi_gmem:HBM[0:5] + +# Connect host controllers to CCL Offload +stream_connect=hostctrl_0.cmd:client_arbiter.cmd_clients_0 +stream_connect=client_arbiter.ack_clients_0:hostctrl_0.sts +stream_connect=sendrecv.cmd:client_arbiter.cmd_clients_1 +stream_connect=client_arbiter.ack_clients_1:sendrecv.sts +stream_connect=client_arbiter.cmd_cclo:ccl_offload_0.s_axis_call_req +stream_connect=ccl_offload_0.m_axis_call_ack:client_arbiter.ack_cclo + + +# Connect CCL Offload kernel to UDP Network Kernel +stream_connect=ccl_offload_0.m_axis_eth_tx_data:networklayer_0.S_AXIS_sk2nl:512 +stream_connect=networklayer_0.M_AXIS_nl2sk:ccl_offload_0.s_axis_eth_rx_data:512 + +# Connect UDP Network Kernel to CMAC Kernel +stream_connect=cmac_0.M_AXIS:networklayer_0.S_AXIS_eth2nl +stream_connect=networklayer_0.M_AXIS_nl2eth:cmac_0.S_AXIS + +# arithmetic connections +stream_connect=ccl_offload_0.m_axis_arith_op0:arith_0.in0 +stream_connect=ccl_offload_0.m_axis_arith_op1:arith_0.in1 +stream_connect=arith_0.out_r:ccl_offload_0.s_axis_arith_res + +# caster connections +stream_connect=ccl_offload_0.m_axis_compression0:compression_0_0.in_r +stream_connect=compression_0_0.out_r:ccl_offload_0.s_axis_compression0 + +stream_connect=ccl_offload_0.m_axis_compression1:compression_0_1.in_r +stream_connect=compression_0_1.out_r:ccl_offload_0.s_axis_compression1 + +stream_connect=ccl_offload_0.m_axis_compression2:compression_0_2.in_r +stream_connect=compression_0_2.out_r:ccl_offload_0.s_axis_compression2 + +# Tie off user kernel interface +stream_connect=ccl_offload_0.m_axis_krnl:lb_user_krnl.in +stream_connect=lb_user_krnl.out:ccl_offload_0.s_axis_krnl + +[profile] +data=send_recv:all:all +data=client_arbiter:all:all +data=ccl_offload:all:m_axis_eth_tx_data +data=networklayer:all:M_AXIS_nl2sk +data=networklayer:all:M_AXIS_nl2eth +data=cmac_0:all:M_AXIS +memory=all +stall=all +exec=all:all diff --git a/b_eff/settings/settings.link.xilinx.u280.ddr.ini b/b_eff/settings/settings.link.xilinx.u280.ddr.ini new file mode 100644 index 00000000..4d8fb9bd --- /dev/null +++ b/b_eff/settings/settings.link.xilinx.u280.ddr.ini @@ -0,0 +1,4 @@ +[connectivity] +nk=dummyKernel:1:dummyKernel + +sp=dummyKernel.m_axi_gmem:DDR[0] \ No newline at end of file diff --git a/b_eff/src/common/parameters.h.in b/b_eff/src/common/parameters.h.in index d404bfd7..80805817 100644 --- a/b_eff/src/common/parameters.h.in +++ b/b_eff/src/common/parameters.h.in @@ -1,12 +1,10 @@ #ifndef SRC_COMMON_PARAMETERS_H_ #define SRC_COMMON_PARAMETERS_H_ -#define VERSION "@PROJECT_VERSION@" +#include "base_parameters.h" + #define SEND_KERNEL_NAME "@SEND_KERNEL_NAME@" #define RECV_KERNEL_NAME "@RECV_KERNEL_NAME@" -#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ -#define DEFAULT_PLATFORM 
@DEFAULT_PLATFORM@ -#define DEFAULT_DEVICE @DEFAULT_DEVICE@ #define DEFAULT_MAX_MESSAGE_SIZE @DEFAULT_MAX_MESSAGE_SIZE@ #define DEFAULT_MAX_LOOP_LENGTH @DEFAULT_MAX_LOOP_LENGTH@ #define DEFAULT_MIN_LOOP_LENGTH @DEFAULT_MIN_LOOP_LENGTH@ @@ -17,25 +15,16 @@ * Kernel Parameters */ #define CHANNEL_WIDTH @CHANNEL_WIDTH@ -#define NUM_REPLICATIONS @NUM_REPLICATIONS@ - -#define HOST_DATA_TYPE @HOST_DATA_TYPE@ -#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ - -#cmakedefine HOST_EMULATION_REORDER /* Short description of the program. Moreover the version and build time is also compiled into the description. */ -#define PROGRAM_DESCRIPTION "Implementation of the effective bandwidth benchmark"\ +#define PROGRAM_NAME "effective bandwidth" + +#define PROGRAM_DESCRIPTION "Implementation of the " PROGRAM_NAME " benchmark"\ " proposed in the HPCC benchmark suite for FPGA.\n"\ "Version: " VERSION "\n" -/** -Output separator -*/ -#define HLINE "-------------------------------------------------------------\n" - #endif // SRC_COMMON_PARAMETERS_H_ \ No newline at end of file diff --git a/b_eff/src/device/CMakeLists.txt b/b_eff/src/device/CMakeLists.txt index 8316a884..e1b372ea 100644 --- a/b_eff/src/device/CMakeLists.txt +++ b/b_eff/src/device/CMakeLists.txt @@ -3,12 +3,32 @@ set(KERNEL_REPLICATION_ENABLED Yes CACHE INTERNAL "Enables kernel replication in set(NUM_REPLICATIONS 2) include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) -generate_kernel_targets_intel(communication_bw520n_IEC) -add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx --comm-type PCIE -l 1 -u 10 -m 0 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) -add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 1 -m 20 -n 1 - WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +if (INTELFPGAOPENCL_FOUND) + generate_kernel_targets_intel(communication_bw520n_IEC communication_PCIE) + add_test(NAME test_emulation_iec_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_bw520n_IEC_emulate.aocx -l 1 -u 10 --min-size 6 -m 6 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_cpu_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_PCIE_emulate.aocx --comm-type CPU -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_pcie_intel COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_intel -f communication_PCIE_emulate.aocx -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_intel COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_intel -f 
communication_PCIE_emulate.aocx -l 1 -u 1 -m 20 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +endif() + +if (Vitis_FOUND) + generate_kernel_targets_xilinx(communication_PCIE) + if (USE_ACCL) + generate_kernel_targets_xilinx(communication_ACCL communication_ACCL_pl + communication_ACCL_pl_stream) + endif() + add_test(NAME test_emulation_pcie_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_cpu_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin --comm-type CPU -l 1 -u 10 -m 0 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_output_parsing_xilinx COMMAND ${CMAKE_SOURCE_DIR}/../scripts/evaluation/execute_and_parse.sh ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 1 -m 20 -n 1 + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_pcie_reverse_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1 --kernel-latency --pcie-write --pcie-read + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) + add_test(NAME test_emulation_pcie_reverse_batch_xilinx COMMAND ${CMAKE_SOURCE_DIR}/scripts/clean_emulation_output_files.sh ${CMAKE_BINARY_DIR} ./Network_xilinx -f communication_PCIE_emulate.xclbin -l 1 -u 10 -m 0 -n 1 --kernel-latency --pcie-write --pcie-read --pcie-batch + WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) +endif() diff --git a/b_eff/src/device/communication_ACCL.cl b/b_eff/src/device/communication_ACCL.cl new file mode 100644 index 00000000..80c12a86 --- /dev/null +++ b/b_eff/src/device/communication_ACCL.cl @@ -0,0 +1,27 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +__kernel +void dummy(__global void *nothing) { + // Do nothing. 
+ // Will be exluded during linking process and will not be in final bitstream +} \ No newline at end of file diff --git a/b_eff/src/device/communication_ACCL_pl.cpp b/b_eff/src/device/communication_ACCL_pl.cpp new file mode 100644 index 00000000..c32a3af5 --- /dev/null +++ b/b_eff/src/device/communication_ACCL_pl.cpp @@ -0,0 +1,56 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#include "accl_hls.h" + + +void send_recv(ap_uint<64> read_buffer,ap_uint<64> write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &cmd, STREAM &sts) { +#pragma HLS INTERFACE s_axilite port=read_buffer +#pragma HLS INTERFACE s_axilite port=write_buffer +#pragma HLS INTERFACE s_axilite port=size +#pragma HLS INTERFACE s_axilite port=num_iterations +#pragma HLS INTERFACE s_axilite port=neighbor_rank +#pragma HLS INTERFACE s_axilite port=communicator_addr +#pragma HLS INTERFACE s_axilite port=datapath_cfg +#pragma HLS INTERFACE axis port=cmd +#pragma HLS INTERFACE axis port=sts +#pragma HLS INTERFACE s_axilite port=return + accl_hls::ACCLCommand accl(cmd, sts); + for (int i = 0; i < num_iterations; i++) { + #pragma HLS protocol fixed + accl.start_call( + ACCL_SEND, size, communicator_addr, neighbor_rank, 0, 0, + datapath_cfg, 0, 0, + read_buffer, 0, 0); + ap_wait(); + accl.finalize_call(); + ap_wait(); + accl.start_call( + ACCL_RECV, size, communicator_addr, neighbor_rank, 0, 0, + datapath_cfg, 0, 0, + 0, 0, write_buffer); + ap_wait(); + accl.finalize_call(); + } +} + diff --git a/b_eff/src/device/communication_ACCL_pl_stream.cpp b/b_eff/src/device/communication_ACCL_pl_stream.cpp new file mode 100644 index 00000000..eb68fe8e --- /dev/null +++ b/b_eff/src/device/communication_ACCL_pl_stream.cpp @@ -0,0 +1,143 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
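For reference, the host code added later in this patch (execution_accl_pl.hpp) drives this command kernel through XRT's native C++ API: it passes the device addresses of the ACCL buffers together with the communicator and datapath configuration addresses, while the cmd/sts streams are wired to the CCLO at link time and never touched by the host. A minimal launch sketch under those assumptions (header names and the xrt::kernel constructor follow the XRT native API and may differ between XRT versions):

#include <cstdint>
#include <xrt/xrt_device.h>
#include <xrt/xrt_kernel.h>

// Hypothetical helper: the argument order mirrors the HLS signature of send_recv above.
void run_send_recv(xrt::device &device, const xrt::uuid &xclbin_uuid,
                   uint64_t send_addr, uint64_t recv_addr,
                   uint32_t size_in_values, uint32_t iterations,
                   uint32_t neighbor_rank, uint32_t comm_addr, uint32_t dpcfg_addr) {
    xrt::kernel k(device, xclbin_uuid, "send_recv");
    // Only the scalar/axilite arguments are set; cmd and sts are axis ports.
    auto run = k(send_addr, recv_addr, size_in_values, iterations,
                 neighbor_rank, comm_addr, dpcfg_addr);
    run.wait();
}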
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#include "accl_hls.h" + +typedef ap_axiu<1, 0, 0, 0> notify_word; + +void +write_data(ap_uint<512>* read_buffer, ap_uint<32> size, STREAM &data_out) { + // receive the incoming data while send may still be in progress + for (int chunk = 0; chunk < (size + 15) / 16; chunk++) { + #pragma HLS pipeline II=1 + stream_word word; + word.last = 1; + word.keep = -1; + word.dest = 0; + word.data = read_buffer[chunk]; + data_out.write(word); + } +} + +void +read_data(ap_uint<512>* write_buffer, ap_uint<32> size, STREAM &data_in) { + // receive the incoming data while send may still be in progress + for (int chunk = 0; chunk < (size + 15) / 16 ; chunk++) { + #pragma HLS pipeline II=1 + stream_word word = data_in.read(); + write_buffer[chunk] = word.data; + } +} + +void +schedule_send(ap_uint<32> size, ap_uint<32> neighbor_rank, + ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &cmd, STREAM &sts) { + send_fixed: { + #pragma HLS protocol fixed + // Send data from stream to the remote FPGA. + // Remote FPGA will immediatly move data to stream. + // This will allow overlapping of send and recv. + accl_hls::ACCLCommand accl(cmd, sts); + accl.start_call( + ACCL_SEND, size, communicator_addr, neighbor_rank, 0, 0, + datapath_cfg, 0, 3, + 0, 0, 0); + ap_wait(); + accl.finalize_call(); + } +} + +void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + ap_uint<32> notify_enabled, + STREAM &data_in, + STREAM ¬ify) { +#pragma HLS INTERFACE m_axi port=write_buffer bundle=gmem_out +#pragma HLS INTERFACE s_axilite port=size +#pragma HLS INTERFACE s_axilite port=num_iterations +#pragma HLS INTERFACE s_axilite port=notify_enabled +#pragma HLS INTERFACE axis port=data_in +#pragma HLS INTERFACE axis port=notify +#pragma HLS INTERFACE s_axilite port=return + + notify_word w; + for (int i = 0; i < num_iterations; i++) { + #pragma HLS protocol fixed + read_data(write_buffer, size, data_in); + ap_wait(); + if (notify_enabled != 0) { + notify.write(w); + } + } +} + +void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, + ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &cmd, STREAM &sts, + STREAM ¬ify) { +#pragma HLS INTERFACE s_axilite port=size +#pragma HLS INTERFACE s_axilite port=num_iterations +#pragma HLS INTERFACE s_axilite port=neighbor_rank +#pragma HLS INTERFACE s_axilite port=communicator_addr +#pragma HLS INTERFACE s_axilite port=datapath_cfg +#pragma HLS INTERFACE axis port=cmd +#pragma HLS INTERFACE axis port=sts +#pragma HLS INTERFACE axis port=notify +#pragma HLS INTERFACE s_axilite port=return + + for (int i = 0; i < num_iterations; i++) { + #pragma HLS protocol fixed + schedule_send(size, neighbor_rank, communicator_addr, datapath_cfg, cmd, sts); + ap_wait(); + notify_word w = notify.read(); + } +} + +void send_stream(ap_uint<512>* read_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + STREAM &data_out) { +#pragma HLS INTERFACE m_axi port=read_buffer bundle=gmem_in +#pragma HLS INTERFACE s_axilite port=size +#pragma HLS 
INTERFACE s_axilite port=num_iterations +#pragma HLS INTERFACE axis port=data_out +#pragma HLS INTERFACE s_axilite port=return + + for (int i = 0; i < num_iterations; i++) { + write_data(read_buffer, size, data_out); + } +} + + +void loopback_reduce(STREAM & in0, STREAM & in1, STREAM & out) { +#pragma HLS INTERFACE axis register both port=in0 +#pragma HLS INTERFACE axis register both port=in1 +#pragma HLS INTERFACE axis register both port=out +#pragma HLS INTERFACE ap_ctrl_none port=return + +stream_word tmp; + +do{ +#pragma HLS PIPELINE II=1 + tmp = in0.read(); + tmp = in1.read(); + out.write(tmp); +} while(tmp.last == 0); + +} diff --git a/b_eff/src/host/execution.h b/b_eff/src/device/communication_PCIE.cl similarity index 55% rename from b_eff/src/host/execution.h rename to b_eff/src/device/communication_PCIE.cl index 195b97b1..af4f4f81 100644 --- a/b_eff/src/host/execution.h +++ b/b_eff/src/device/communication_PCIE.cl @@ -1,5 +1,5 @@ /* -Copyright (c) 2019 Marius Meyer +Copyright (c) 2022 Marius Meyer Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -19,34 +19,22 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifndef SRC_HOST_EXECUTION_H_ -#define SRC_HOST_EXECUTION_H_ -/* C++ standard library headers */ -#include -#include -#include - -/* External library headers */ #include "parameters.h" -#include "network_benchmark.hpp" - -namespace bm_execution { /** -The actual execution of the benchmark. -This method can be implemented in multiple *.cpp files. This header enables -simple exchange of the different calculation methods. 
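The streaming kernels above move the payload as 512-bit words, sixteen 32-bit values per beat, which is where the (size + 15) / 16 loop bounds in write_data and read_data come from; the host code later in this patch passes size as a count of 32-bit values. A small standalone check of that rounding:

#include <cstdint>
#include <iostream>

// Beats of a 512-bit AXI stream needed to carry `values` 32-bit words,
// mirroring the (size + 15) / 16 rounding used in write_data/read_data.
constexpr uint32_t beats_for_values(uint32_t values) { return (values + 15) / 16; }

int main() {
    std::cout << beats_for_values(16) << "\n";     // 64 B message  -> 1 beat
    std::cout << beats_for_values(17) << "\n";     // 68 B message  -> 2 beats
    std::cout << beats_for_values(262144) << "\n"; // 1 MiB message -> 16384 beats
}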
- -@param config struct that contains all necessary information to execute the kernel on the FPGA - - -@return The resulting matrix -*/ - std::shared_ptr - calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData); - -} // namespace bm_execution - -#endif // SRC_HOST_EXECUTION_H_ + * Minimal kernel only used to measure the startup latency of a kernel and to provide a + * memory buffe for Xilinx FPGAs to measure PCIe read and write performance + * + * @param output Output buffer that will be used to write the verification data into + * @param verification Verification value that will be written to the buffer + * @param messageSize size of the output buffer + */ +__kernel +__attribute__ ((max_global_work_dim(0))) +void dummyKernel(__global DEVICE_DATA_TYPE *output, DEVICE_DATA_TYPE verification, int messageSize) { + for (int m=0; m < messageSize; m++) { + output[m] = verification; + } +} diff --git a/b_eff/src/device/communication_bw520n_IEC.cl b/b_eff/src/device/communication_bw520n_IEC.cl index ce128d8c..26379080 100644 --- a/b_eff/src/device/communication_bw520n_IEC.cl +++ b/b_eff/src/device/communication_bw520n_IEC.cl @@ -49,17 +49,17 @@ typedef struct { /** * Definition of the external channels */ - // PY_CODE_GEN block_start [replace(local_variables=locals()) for r in range(num_replications)] -channel message_part ch_out_/*PY_CODE_GEN 2*r+1*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str(r % 4) + "\""*/))); -channel message_part ch_out_/*PY_CODE_GEN 2*r+2*/ __attribute((io(/*PY_CODE_GEN "\"kernel_output_ch" + str((r + 2) % 4) + "\""*/))); -channel message_part ch_in_/*PY_CODE_GEN 2*r+1*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str(r % 4) + "\""*/))); -channel message_part ch_in_/*PY_CODE_GEN 2*r+2*/ __attribute((io(/*PY_CODE_GEN "\"kernel_input_ch" + str((r + 2) % 4) + "\""*/))); -channel message_part ch_exchange/*PY_CODE_GEN 2*r+1*/; -channel message_part ch_exchange/*PY_CODE_GEN 2*r+2*/; -// PY_CODE_GEN block_end +{% for i in range(num_replications) %} +channel message_part ch_out_{{ 2*i + 1 }} __attribute((io("kernel_output_ch{{ i % 4 }}"))); +channel message_part ch_out_{{ 2*i + 2 }} __attribute((io("kernel_output_ch{{ (i + 2) % 4 }}"))); +channel message_part ch_in_{{ 2*i + 1 }} __attribute((io("kernel_input_ch{{ i % 4 }}"))); +channel message_part ch_in_{{ 2*i + 2 }} __attribute((io("kernel_input_ch{{ (i + 2) % 4 }}"))); +channel message_part ch_exchange{{ 2*i + 1 }}; +channel message_part ch_exchange{{ 2*i + 2 }}; +{% endfor %} -// PY_CODE_GEN block_start [replace(local_variables=locals()) for r in range(num_replications)] +{% for i in range(num_replications) %} /** * Send kernel that will send messages through two channels * @@ -68,7 +68,7 @@ channel message_part ch_exchange/*PY_CODE_GEN 2*r+2*/; */ __kernel __attribute__ ((max_global_work_dim(0))) -void send/*PY_CODE_GEN r*/(const unsigned data_size, +void send{{ i }}(const unsigned data_size, const unsigned repetitions) { const unsigned send_iterations = ((1 << data_size) + 2 * ITEMS_PER_CHANNEL - 1) / (2 * ITEMS_PER_CHANNEL); message_part send_part1; @@ -85,13 +85,13 @@ void send/*PY_CODE_GEN r*/(const unsigned data_size, for (unsigned i=0; i < repetitions; i++) { // Send a single message sent over two channels split into multiple chunks for (unsigned k=0; k < send_iterations; k++) { - write_channel_intel(ch_out_/*PY_CODE_GEN 2*r+1*/, send_part1); - write_channel_intel(ch_out_/*PY_CODE_GEN 2*r+2*/, send_part2); + 
write_channel_intel(ch_out_{{ 2*i+1 }}, send_part1); + write_channel_intel(ch_out_{{ 2*i+2 }}, send_part2); } #ifndef EMULATE // Introduce data dependency between loop iterations to prevent coalescing of loop - send_part1 = read_channel_intel(ch_exchange/*PY_CODE_GEN 2*r+1*/); - send_part2 = read_channel_intel(ch_exchange/*PY_CODE_GEN 2*r+2*/); + send_part1 = read_channel_intel(ch_exchange{{ 2*i+1 }}); + send_part2 = read_channel_intel(ch_exchange{{ 2*i+2 }}); #endif } } @@ -106,7 +106,7 @@ void send/*PY_CODE_GEN r*/(const unsigned data_size, */ __kernel __attribute__ ((max_global_work_dim(0))) -void recv/*PY_CODE_GEN r*/(__global DEVICE_DATA_TYPE* validation_buffer, +void recv{{ i }}(__global DEVICE_DATA_TYPE* validation_buffer, const unsigned data_size, const unsigned repetitions) { const unsigned send_iterations = ((1 << data_size) + 2 * ITEMS_PER_CHANNEL - 1) / (2 * ITEMS_PER_CHANNEL); @@ -117,26 +117,31 @@ void recv/*PY_CODE_GEN r*/(__global DEVICE_DATA_TYPE* validation_buffer, for (unsigned i=0; i < repetitions; i++) { // Receive a single message sent over two channels split into multiple chunks for (unsigned k=0; k < send_iterations; k++) { - recv_part1 = read_channel_intel(ch_in_/*PY_CODE_GEN 2*r+1*/); - recv_part2 = read_channel_intel(ch_in_/*PY_CODE_GEN 2*r+2*/); + recv_part1 = read_channel_intel(ch_in_{{ 2*i+1 }}); + recv_part2 = read_channel_intel(ch_in_{{ 2*i+2 }}); + + DEVICE_DATA_TYPE mem_buffer[2 * ITEMS_PER_CHANNEL]; + // Store the last received data chunks in global memory for later validation + __attribute__((opencl_unroll_hint(ITEMS_PER_CHANNEL))) + for (DEVICE_DATA_TYPE d = 0; d < ITEMS_PER_CHANNEL; d++) { + mem_buffer[d] = recv_part1.values[d]; + } + __attribute__((opencl_unroll_hint(ITEMS_PER_CHANNEL))) + for (DEVICE_DATA_TYPE d = 0; d < ITEMS_PER_CHANNEL; d++) { + mem_buffer[ITEMS_PER_CHANNEL + d] = recv_part2.values[d]; + } + __attribute__((opencl_unroll_hint(2*ITEMS_PER_CHANNEL))) + for (DEVICE_DATA_TYPE d = 0; d < 2*ITEMS_PER_CHANNEL; d++) { + validation_buffer[k * (2 * ITEMS_PER_CHANNEL) + d] = mem_buffer[d]; + } } #ifndef EMULATE // Introduce data dependency between loop iterations to prevent coalescing of loop // by sending the data to the send kernel - write_channel_intel(ch_exchange/*PY_CODE_GEN 2*r+1*/, recv_part1); - write_channel_intel(ch_exchange/*PY_CODE_GEN 2*r+2*/, recv_part2); + write_channel_intel(ch_exchange{{ 2*i+1 }}, recv_part1); + write_channel_intel(ch_exchange{{ 2*i+2 }}, recv_part2); #endif } - - // Store the last received data chunks in global memory for later validation - __attribute__((opencl_unroll_hint(ITEMS_PER_CHANNEL))) - for (DEVICE_DATA_TYPE d = 0; d < ITEMS_PER_CHANNEL; d++) { - validation_buffer[d] = recv_part1.values[d]; - } - __attribute__((opencl_unroll_hint(ITEMS_PER_CHANNEL))) - for (DEVICE_DATA_TYPE d = 0; d < ITEMS_PER_CHANNEL; d++) { - validation_buffer[ITEMS_PER_CHANNEL + d] = recv_part2.values[d]; - } } -//PY_CODE_GEN block_end +{% endfor %} diff --git a/b_eff/src/host/CMakeLists.txt b/b_eff/src/host/CMakeLists.txt index fb08281f..28e92c94 100755 --- a/b_eff/src/host/CMakeLists.txt +++ b/b_eff/src/host/CMakeLists.txt @@ -14,6 +14,32 @@ if (INTELFPGAOPENCL_FOUND) target_link_libraries(${LIB_NAME}_intel hpcc_fpga_base) target_link_libraries(${HOST_EXE_NAME}_intel ${LIB_NAME}_intel) target_compile_definitions(${LIB_NAME}_intel PRIVATE -DINTEL_FPGA -D_USE_MPI_) + target_compile_definitions(${HOST_EXE_NAME}_intel PRIVATE -DINTEL_FPGA) target_compile_options(${LIB_NAME}_intel PRIVATE "${OpenMP_CXX_FLAGS}") add_test(NAME 
test_intel_host_executable COMMAND $ -h) endif() +if (Vitis_FOUND) +if (USE_ACCL) + set(CMAKE_SKIP_BUILD_RPATH No) + set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) + list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) + list(APPEND HOST_SOURCE ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl_stream.cpp + ${CMAKE_SOURCE_DIR}/src/device/communication_ACCL_pl.cpp) +endif() + add_library(${LIB_NAME}_xilinx STATIC ${HOST_SOURCE}) + target_include_directories(${LIB_NAME}_xilinx PRIVATE ${HPCCBaseLibrary_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/src/common ${Vitis_INCLUDE_DIRS}) + target_include_directories(${LIB_NAME}_xilinx PUBLIC ${CMAKE_SOURCE_DIR}/src/host) + add_executable(${HOST_EXE_NAME}_xilinx main.cpp) + target_link_libraries(${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") + target_link_libraries(${LIB_NAME}_xilinx hpcc_fpga_base) +if (USE_ACCL) + target_link_libraries(${LIB_NAME}_xilinx accl) + target_include_directories(${LIB_NAME}_xilinx PRIVATE ${ACCL_INCLUDE_PATH}) + target_link_libraries(${HOST_EXE_NAME}_xilinx zmqpp) +endif() + target_link_libraries(${HOST_EXE_NAME}_xilinx ${LIB_NAME}_xilinx) + target_compile_definitions(${LIB_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_definitions(${HOST_EXE_NAME}_xilinx PRIVATE -DXILINX_FPGA) + target_compile_options(${LIB_NAME}_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + add_test(NAME test_xilinx_host_executable COMMAND $ -h) +endif() diff --git a/b_eff/src/host/execution_types/execution.hpp b/b_eff/src/host/execution_types/execution.hpp index df630838..0cd828bc 100644 --- a/b_eff/src/host/execution_types/execution.hpp +++ b/b_eff/src/host/execution_types/execution.hpp @@ -21,5 +21,15 @@ SOFTWARE. */ #include "execution_types/execution_cpu.hpp" +#ifndef USE_ACCL #include "execution_types/execution_pcie.hpp" -#include "execution_types/execution_iec.hpp" \ No newline at end of file +#include "execution_types/execution_pcie_reverse.hpp" +#ifdef INTEL_FPGA +#include "execution_types/execution_iec.hpp" +#endif +#else +#include "execution_types/execution_accl.hpp" +#include "execution_types/execution_accl_stream.hpp" +#include "execution_types/execution_accl_pl.hpp" +#include "execution_types/execution_accl_pl_stream.hpp" +#endif diff --git a/b_eff/src/host/execution_types/execution_accl.hpp b/b_eff/src/host/execution_types/execution_accl.hpp new file mode 100644 index 00000000..3d5f41e5 --- /dev/null +++ b/b_eff/src/host/execution_types/execution_accl.hpp @@ -0,0 +1,130 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" +#include "accl.hpp" + +/* Project's headers */ + +namespace network::execution_types::accl { + + /* + Implementation for the single kernel. + @copydoc bm_execution::calculate() + */ + template + network::ExecutionTimings + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + size_t size_in_bytes = std::max((1 << messageSize), 4); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + dummyBufferContents.clear(); + recvBufferContents.clear(); + acclSendBuffers.clear(); + acclRecvBuffers.clear(); + int size_in_values = (size_in_bytes + 3) / 4; + // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, 1)); + acclSendBuffers.back()->sync_to_device(); + acclRecvBuffers.back()->sync_to_device(); + } + + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int l = 0; l < looplength; l++) { +#ifndef NDEBUG + std::cout << "Send " << size_in_bytes << " bytes to " + << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; +#endif + config.context->accl->send(*acclSendBuffers[i], size_in_values, + (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + 0, ACCL::GLOBAL_COMM, true); +#ifndef NDEBUG + std::cout << "Recv " << size_in_bytes << " bytes from " + << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; +#endif + config.context->accl->recv(*acclRecvBuffers[i], size_in_values, + (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + 0, ACCL::GLOBAL_COMM, true); +#ifndef NDEBUG + std::cout << "Done" << std::endl; +#endif + } + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + 
calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + acclRecvBuffers[r]->sync_from_device(); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); + } + return network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }; + } + +} // namespace bm_execution + +#endif diff --git a/b_eff/src/host/execution_types/execution_accl_pl.hpp b/b_eff/src/host/execution_types/execution_accl_pl.hpp new file mode 100644 index 00000000..9135ec84 --- /dev/null +++ b/b_eff/src/host/execution_types/execution_accl_pl.hpp @@ -0,0 +1,145 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_PL_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_PL_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" +#include "accl.hpp" +#include "cclo_bfm.h" +#include "accl_hls.h" + +/* Project's headers */ + +extern void send_recv(ap_uint<64> read_buffer,ap_uint<64> write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &cmd, STREAM &sts); + +namespace network::execution_types::accl_pl { + + + /* + Implementation for the single kernel. 
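The partner expression used for the send and recv calls above recurs in every execution type of this patch and is easiest to check in isolation: the replication index i alternates the pairing between the two neighboring ranks, and the pairing is only symmetric when the number of ranks is even. A standalone evaluation for four ranks:

#include <iostream>

// Same partner formula as used in the execution types of this patch.
int partner(int rank, int i, int size) {
    return (rank - 1 + 2 * ((rank + i) % 2) + size) % size;
}

int main() {
    const int size = 4;
    for (int i = 0; i < 2; i++) {
        for (int rank = 0; rank < size; rank++) {
            std::cout << "i=" << i << ": rank " << rank
                      << " <-> rank " << partner(rank, i, size) << "\n";
        }
    }
    // i=0 pairs (0,3) and (1,2); i=1 pairs (0,1) and (2,3).
}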
+ @copydoc bm_execution::calculate() + */ + template + network::ExecutionTimings + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + cl_uint size_in_bytes = (1 << messageSize); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); + hlslib::Stream cmd("cmd"), sts("sts"); + + std::vector dest = {0}; + std::unique_ptr cclo; + if (config.programSettings->useAcclEmulation) { + cclo = std::make_unique(6000, current_rank, current_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + cclo->run(); + } + MPI_Barrier(MPI_COMM_WORLD); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + acclSendBuffers.clear(); + acclRecvBuffers.clear(); + int size_in_values = (size_in_bytes + 3) / 4; + // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 1)); + acclSendBuffers.back()->sync_to_device(); + acclRecvBuffers.back()->sync_to_device(); + } + + xrt::kernel sendrecvKernel; + if (!config.programSettings->useAcclEmulation) { + sendrecvKernel = xrt::kernel(*config.device, *config.program, "send_recv"); + } + + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + if (!config.programSettings->useAcclEmulation) { + auto run = sendrecvKernel(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->physical_address(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32})); + run.wait(); + } else { + send_recv(acclSendBuffers[i]->physical_address(), acclRecvBuffers[i]->physical_address(), size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), + cmd, sts); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + 
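As in the other execution types, messageSize is the log2 of the message size in bytes: the buffers are allocated as (1 << messageSize) bytes and handed to the kernel as a count of 32-bit values rounded upwards (execution_accl.hpp above additionally clamps the allocation to at least 4 bytes). A short illustration of that conversion:

#include <cstdint>
#include <iostream>

int main() {
    for (uint32_t messageSize : {0u, 6u, 20u}) {
        uint32_t size_in_bytes = 1u << messageSize;        // message size is given as an exponent
        uint32_t size_in_values = (size_in_bytes + 3) / 4; // rounded up to whole 32-bit values
        std::cout << "2^" << messageSize << " = " << size_in_bytes
                  << " bytes -> " << size_in_values << " values\n";
    }
}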
+ if (config.programSettings->useAcclEmulation) { + cclo->stop(); + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + acclRecvBuffers[r]->sync_from_device(); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); + } + return network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }; + } + +} // namespace bm_execution + +#endif diff --git a/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp new file mode 100644 index 00000000..2b12b6d1 --- /dev/null +++ b/b_eff/src/host/execution_types/execution_accl_pl_stream.hpp @@ -0,0 +1,182 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_PL_STREAM_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_PL_STREAM_HPP + +/* C++ standard library headers */ +#include +#include +#include +#include + +/* External library headers */ +#include "mpi.h" +#include "accl.hpp" +#include "cclo_bfm.h" +#include "accl_hls.h" + +/* Project's headers */ +typedef ap_axiu<1, 0, 0, 0> notify_word; + +extern void send_stream(ap_uint<512>* read_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + STREAM &data_out); + +extern void recv_stream(ap_uint<512>* write_buffer, ap_uint<32> size, ap_uint<32> num_iterations, + STREAM &data_in, STREAM ¬ify); + +extern void schedule_stream(ap_uint<32> size, ap_uint<32> num_iterations, int enable, + ap_uint<32> neighbor_rank, ap_uint<32> communicator_addr, ap_uint<32> datapath_cfg, + STREAM &cmd, STREAM &sts, STREAM ¬ify); + +namespace network::execution_types::accl_pl_stream { + + + /* + Implementation for the single kernel. 
+ @copydoc bm_execution::calculate() + */ + template + network::ExecutionTimings + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + cl_uint size_in_bytes = (1 << messageSize); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + hlslib::Stream cclo2krnl("cclo2krnl"), krnl2cclo("krnl2cclo"); + hlslib::Stream cmd("cmd"), sts("sts"); + hlslib::Stream notify("notify"); + + std::vector dest = {0}; + std::unique_ptr cclo; + if (config.programSettings->useAcclEmulation) { + cclo = std::make_unique(6000, current_rank, current_size, dest, cmd, sts, cclo2krnl, krnl2cclo); + cclo->run(); + } + MPI_Barrier(MPI_COMM_WORLD); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + acclSendBuffers.clear(); + acclRecvBuffers.clear(); + dummyBufferContents.clear(); + recvBufferContents.clear(); + int size_in_values = (size_in_bytes + 3) / 4; + + xrt::kernel sendKernel; + xrt::kernel recvKernel; + xrt::kernel scheduleKernel; + if (!config.programSettings->useAcclEmulation) { + sendKernel = xrt::kernel(*config.device, *config.program, "send_stream"); + recvKernel = xrt::kernel(*config.device, *config.program, "recv_stream"); + scheduleKernel = xrt::kernel(*config.device, *config.program, "schedule_stream"); + } + // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); + if (config.programSettings->useAcclEmulation) { + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 0)); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, 1)); + } + else { + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, sendKernel.group_id(0))); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::int32, recvKernel.group_id(0))); + } + acclSendBuffers.back()->sync_to_device(); + acclRecvBuffers.back()->sync_to_device(); + } + + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + if (!config.programSettings->useAcclEmulation) { + auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength, 1); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + MPI_Barrier(MPI_COMM_WORLD); + auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength); + startCalculation = std::chrono::high_resolution_clock::now(); + auto run_schedule = scheduleKernel(size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, 
ACCL::dataType::int32})); + run_send.wait(); + run_recv.wait(); + run_schedule.wait(); + } else { + std::thread run_send(send_stream, reinterpret_cast*>(acclSendBuffers[i]->buffer()), size_in_values, looplength, + std::ref(krnl2cclo)); + std::thread run_recv(recv_stream, reinterpret_cast*>(acclRecvBuffers[i]->buffer()), size_in_values, looplength, 1, + std::ref(cclo2krnl), std::ref(notify)); + std::thread run_schedule(schedule_stream,size_in_values, looplength, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + config.context->accl->get_communicator_addr(), config.context->accl->get_arithmetic_config_addr({ACCL::dataType::int32, ACCL::dataType::int32}), + std::ref(cmd), std::ref(sts), std::ref(notify)); + run_send.join(); + run_recv.join(); + run_schedule.join(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + + if (config.programSettings->useAcclEmulation) { + cclo->stop(); + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + if (!config.programSettings->useAcclEmulation) { + acclRecvBuffers[r]->sync_from_device(); + } + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); + } + return network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }; + } + +} // namespace bm_execution + +#endif diff --git a/b_eff/src/host/execution_types/execution_accl_stream.hpp b/b_eff/src/host/execution_types/execution_accl_stream.hpp new file mode 100644 index 00000000..797b8ca7 --- /dev/null +++ b/b_eff/src/host/execution_types/execution_accl_stream.hpp @@ -0,0 +1,134 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
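In the emulation branch above, the three HLS functions (send_stream, recv_stream, schedule_stream) run concurrently as host threads and exchange data through the hlslib stream objects, mirroring the dataflow between the compute units on the device. The underlying pattern is a blocking producer/consumer queue; a generic standalone illustration of that idea (deliberately not the hlslib API):

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

// Tiny blocking queue standing in for a stream between two concurrently running stages.
template <typename T> class BlockingQueue {
    std::queue<T> q;
    std::mutex m;
    std::condition_variable cv;
public:
    void write(T v) {
        { std::lock_guard<std::mutex> l(m); q.push(std::move(v)); }
        cv.notify_one();
    }
    T read() {
        std::unique_lock<std::mutex> l(m);
        cv.wait(l, [this] { return !q.empty(); });
        T v = std::move(q.front());
        q.pop();
        return v;
    }
};

int main() {
    BlockingQueue<int> stream;
    std::thread producer([&] { for (int i = 0; i < 4; i++) stream.write(i); });
    std::thread consumer([&] { for (int i = 0; i < 4; i++) std::cout << stream.read() << "\n"; });
    producer.join();
    consumer.join();
}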
+*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_STREAM_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_ACCL_STREAM_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" +#include "accl.hpp" + +/* Project's headers */ + +namespace network::execution_types::accl_stream { + + /* + Implementation for the single kernel. + @copydoc bm_execution::calculate() + */ + template + network::ExecutionTimings + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector> dummyBufferContents; + std::vector> recvBufferContents; + std::vector>> acclSendBuffers; + std::vector>> acclRecvBuffers; + size_t size_in_bytes = std::max((1 << messageSize), 4); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + dummyBufferContents.clear(); + recvBufferContents.clear(); + acclSendBuffers.clear(); + acclRecvBuffers.clear(); + int size_in_values = (size_in_bytes + 3) / 4; + xrt::kernel sendKernel; + xrt::kernel recvKernel; + xrt::kernel scheduleKernel; + sendKernel = xrt::kernel(*config.device, *config.program, "send_stream"); + recvKernel = xrt::kernel(*config.device, *config.program, "recv_stream"); + scheduleKernel = xrt::kernel(*config.device, *config.program, "schedule_stream"); + // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + recvBufferContents.emplace_back(size_in_bytes, static_cast(0)); + acclSendBuffers.push_back(config.context->accl->create_buffer(dummyBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, sendKernel.group_id(0))); + acclRecvBuffers.push_back(config.context->accl->create_buffer(recvBufferContents.back().data(), size_in_bytes, ACCL::dataType::float32, recvKernel.group_id(0))); + acclSendBuffers.back()->sync_to_device(); + acclRecvBuffers.back()->sync_to_device(); + } + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto run_recv = recvKernel(*acclRecvBuffers[i]->bo(), size_in_values, looplength, 0); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + MPI_Barrier(MPI_COMM_WORLD); + auto run_send = sendKernel(*acclSendBuffers[i]->bo(), size_in_values, looplength); + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int l = 0; l < looplength; l++) { +#ifndef NDEBUG + std::cout << "Stream " << size_in_bytes << " bytes to " + << ((current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size) << std::endl; +#endif + config.context->accl->stream_put(ACCL::dataType::float32, size_in_values, + (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, + 0); +#ifndef NDEBUG + std::cout << "Done" << std::endl; +#endif + } + run_send.wait(); + run_recv.wait(); + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << 
"," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + acclRecvBuffers[r]->sync_from_device(); + std::copy(recvBufferContents[r].begin(), recvBufferContents[r].end(), &validationData.data()[size_in_bytes * r]); + } + return network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }; + } + +} // namespace bm_execution + +#endif diff --git a/b_eff/src/host/execution_types/execution_cpu.hpp b/b_eff/src/host/execution_types/execution_cpu.hpp index 778dc2f1..81ace311 100644 --- a/b_eff/src/host/execution_types/execution_cpu.hpp +++ b/b_eff/src/host/execution_types/execution_cpu.hpp @@ -38,16 +38,16 @@ namespace network::execution_types::cpu { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::shared_ptr - calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + template + network::ExecutionTimings + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { int err; - std::vector sendQueues; - std::vector dummyBuffers; - std::vector> dummyBufferContents; + std::vector> dummyBufferReadContents; + std::vector> dummyBufferWriteContents; - cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + cl_uint size_in_bytes = (1 << messageSize); int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); @@ -57,32 +57,20 @@ namespace network::execution_types::cpu { std::vector calculationTimings; for (uint r =0; r < config.programSettings->numRepetitions; r++) { - sendQueues.clear(); - dummyBuffers.clear(); - dummyBufferContents.clear(); + dummyBufferReadContents.clear(); + dummyBufferWriteContents.clear(); // Create all kernels and buffers. 
The kernel pairs are generated twice to utilize all channels for (int r = 0; r < config.programSettings->kernelReplications; r++) { - - dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); - ASSERT_CL(err) - - dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); - - cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); - ASSERT_CL(err) - - sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data()); - - sendQueues.push_back(sendQueue); - + dummyBufferReadContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + dummyBufferWriteContents.emplace_back(size_in_bytes, static_cast(0)); } double calculationTime = 0.0; for (int i = 0; i < config.programSettings->kernelReplications; i++) { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); for (int l = 0; l < looplength; l++) { - MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, - dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Sendrecv(dummyBufferReadContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, + dummyBufferWriteContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); @@ -102,15 +90,14 @@ namespace network::execution_types::cpu { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { - err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); - ASSERT_CL(err); + std::copy(dummyBufferWriteContents[r].begin(),dummyBufferWriteContents[r].end(), + &validationData.data()[r * size_in_bytes]); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution diff --git a/b_eff/src/host/execution_types/execution_iec.hpp b/b_eff/src/host/execution_types/execution_iec.hpp index 2ec348e5..471a3547 100644 --- a/b_eff/src/host/execution_types/execution_iec.hpp +++ b/b_eff/src/host/execution_types/execution_iec.hpp @@ -39,8 +39,9 @@ namespace network::execution_types::iec { Implementation for the single kernel. 
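For comparison, the CPU execution type above reduces to a timed loop of MPI_Sendrecv calls between paired ranks, now using separate host vectors for the send and receive side. A self-contained version with the pairing fixed to the i = 0 case (intended to be run with an even number of ranks):

#include <cstdio>
#include <vector>
#include "mpi.h"

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int bytes = 1 << 10;   // illustrative message size
    const int looplength = 100;  // illustrative repetition count
    std::vector<char> sendbuf(bytes, 1), recvbuf(bytes, 0);
    int partner = (rank - 1 + 2 * (rank % 2) + size) % size;

    double start = MPI_Wtime();
    for (int l = 0; l < looplength; l++) {
        MPI_Sendrecv(sendbuf.data(), bytes, MPI_CHAR, partner, 0,
                     recvbuf.data(), bytes, MPI_CHAR, partner, 0,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    double elapsed = MPI_Wtime() - start;
    if (rank == 0) std::printf("%d x %d bytes in %f s\n", looplength, bytes, elapsed);
    MPI_Finalize();
    return 0;
}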
@copydoc bm_execution::calculate() */ - std::shared_ptr - calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + template + network::ExecutionTimings + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { int err; @@ -161,15 +162,14 @@ namespace network::execution_types::iec { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { - err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); + err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * (1 << messageSize), &validationData.data()[r * (1 << messageSize)]); ASSERT_CL(err); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution diff --git a/b_eff/src/host/execution_types/execution_pcie.hpp b/b_eff/src/host/execution_types/execution_pcie.hpp index 73156b7e..244da3e2 100644 --- a/b_eff/src/host/execution_types/execution_pcie.hpp +++ b/b_eff/src/host/execution_types/execution_pcie.hpp @@ -38,16 +38,18 @@ namespace network::execution_types::pcie { Implementation for the single kernel. @copydoc bm_execution::calculate() */ - std::shared_ptr - calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + template + network::ExecutionTimings + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, cl::vector &validationData) { int err; std::vector sendQueues; std::vector dummyBuffers; + std::vector dummyKernels; std::vector> dummyBufferContents; - cl_uint size_in_bytes = std::max(static_cast(validationData.size()), (1 << messageSize)); + cl_uint size_in_bytes = (1 << messageSize); int current_rank; MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); @@ -66,6 +68,16 @@ namespace network::execution_types::pcie { dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); ASSERT_CL(err) + dummyKernels.push_back(cl::Kernel(*config.program, + "dummyKernel", &err)); + + err = dummyKernels[r].setArg(0, dummyBuffers[r]); + ASSERT_CL(err); + err = dummyKernels[r].setArg(1, (HOST_DATA_TYPE)(messageSize & 255)); + ASSERT_CL(err); + err = dummyKernels[r].setArg(2, 1); + ASSERT_CL(err); + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); @@ -81,14 +93,20 @@ namespace network::execution_types::pcie { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); for (int l = 0; l < looplength; l++) { - + if(config.programSettings->pcie_reverse_execute_kernel) { + sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1)); + } sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + sendQueues[i].finish(); MPI_Sendrecv(dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * 
((current_rank + i) % 2) + current_size) % current_size, 0, dummyBufferContents[i].data(), size_in_bytes, MPI_CHAR, (current_rank - 1 + 2 * ((current_rank + i) % 2) + current_size) % current_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); - + if(config.programSettings->pcie_reverse_execute_kernel) { + sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1)); + } + sendQueues[i].finish(); } auto endCalculation = std::chrono::high_resolution_clock::now(); calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); @@ -108,15 +126,14 @@ namespace network::execution_types::pcie { // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! for (int r = 0; r < config.programSettings->kernelReplications; r++) { - err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / config.programSettings->kernelReplications, &validationData.data()[r * validationData.size() / config.programSettings->kernelReplications]); + err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, &validationData.data()[r * size_in_bytes]); ASSERT_CL(err); } - std::shared_ptr result(new network::ExecutionTimings{ + return network::ExecutionTimings{ looplength, messageSize, calculationTimings - }); - return result; + }; } } // namespace bm_execution diff --git a/b_eff/src/host/execution_types/execution_pcie_reverse.hpp b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp new file mode 100644 index 00000000..e3c15a5c --- /dev/null +++ b/b_eff/src/host/execution_types/execution_pcie_reverse.hpp @@ -0,0 +1,154 @@ +/* +Copyright (c) 2019 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_EXECUTION_TYPES_EXECUTION_PCIE_REVERSE_HPP +#define SRC_HOST_EXECUTION_TYPES_EXECUTION_PCIE_REVERSE_HPP + +/* C++ standard library headers */ +#include +#include +#include + +/* External library headers */ +#include "mpi.h" + +/* Project's headers */ + +namespace network::execution_types::pcie_reverse { + + /* + Implementation for the single kernel. 
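The PCIe execution type above brackets the MPI exchange with a blocking device-to-host read and a host-to-device write, so each timed iteration contains two PCIe transfers plus, when kernel latency is measured, the optional dummyKernel launches. A reduced sketch of that transfer pattern, using the OpenCL C++ wrapper as in the execution types above and assuming the command queue and buffer already exist:

#include <CL/cl2.hpp>
#include <chrono>
#include <vector>

// Times one blocking read plus one blocking write of `bytes` over PCIe,
// mirroring the enqueueReadBuffer/enqueueWriteBuffer pair used above.
double pcie_round_trip_seconds(cl::CommandQueue &queue, cl::Buffer &buffer, size_t bytes) {
    std::vector<char> host(bytes);
    auto start = std::chrono::high_resolution_clock::now();
    queue.enqueueReadBuffer(buffer, CL_TRUE, 0, bytes, host.data());
    queue.enqueueWriteBuffer(buffer, CL_TRUE, 0, bytes, host.data());
    queue.finish();
    auto end = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double>(end - start).count();
}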
+ @copydoc bm_execution::calculate() + */ + template + network::ExecutionTimings + calculate(hpcc_base::ExecutionSettings const& config, cl_uint messageSize, cl_uint looplength, + cl::vector &validationData) { + + int err; + std::vector sendQueues; + std::vector dummyBuffers; + std::vector> dummyBufferContents; + std::vector dummyKernels; + + cl_uint size_in_bytes = (1 << messageSize); + + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + std::vector calculationTimings; + for (uint r =0; r < config.programSettings->numRepetitions; r++) { + sendQueues.clear(); + dummyBuffers.clear(); + dummyBufferContents.clear(); + dummyKernels.clear(); + + // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + + dummyBuffers.push_back(cl::Buffer(*config.context, CL_MEM_READ_WRITE, sizeof(HOST_DATA_TYPE) * size_in_bytes,0,&err)); + ASSERT_CL(err) + + dummyKernels.push_back(cl::Kernel(*config.program, + "dummyKernel", &err)); + + err = dummyKernels[r].setArg(0, dummyBuffers[r]); + ASSERT_CL(err); + err = dummyKernels[r].setArg(1, (HOST_DATA_TYPE)(messageSize & 255)); + ASSERT_CL(err); + err = dummyKernels[r].setArg(2, 1); + ASSERT_CL(err); + + dummyBufferContents.emplace_back(size_in_bytes, static_cast(messageSize & (255))); + + cl::CommandQueue sendQueue(*config.context, *config.device, 0, &err); + ASSERT_CL(err) + + sendQueue.enqueueWriteBuffer(dummyBuffers.back(), CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents.back().data()); + + sendQueues.push_back(sendQueue); + + } + double calculationTime = 0.0; + for (int i = 0; i < config.programSettings->kernelReplications; i++) { + MPI_Barrier(MPI_COMM_WORLD); + auto startCalculation = std::chrono::high_resolution_clock::now(); + for (int l = 0; l < looplength; l++) { + if (config.programSettings->pcie_reverse_write_pcie) { + sendQueues[i].enqueueWriteBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + if (!config.programSettings->pcie_reverse_batch) { + sendQueues[i].finish(); + } + } + if (config.programSettings->pcie_reverse_execute_kernel) { + sendQueues[i].enqueueNDRangeKernel(dummyKernels[i], cl::NullRange, cl::NDRange(1), cl::NDRange(1)); + if (!config.programSettings->pcie_reverse_batch) { + sendQueues[i].finish(); + } + } + if (config.programSettings->pcie_reverse_read_pcie) { + sendQueues[i].enqueueReadBuffer(dummyBuffers[i], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[i].data()); + if (!config.programSettings->pcie_reverse_batch) { + sendQueues[i].finish(); + } + } + } + if (config.programSettings->pcie_reverse_batch) { + sendQueues[i].finish(); + } + auto endCalculation = std::chrono::high_resolution_clock::now(); + calculationTime += std::chrono::duration_cast>(endCalculation - startCalculation).count(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif + } + calculationTimings.push_back(calculationTime); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif + } + // Read validation data from FPGA will be placed sequentially in buffer for all replications + // The data order should not matter, 
because every byte should have the same value! + for (int r = 0; r < config.programSettings->kernelReplications; r++) { + if (!config.programSettings->pcie_reverse_read_pcie) { + err = sendQueues[r].enqueueReadBuffer(dummyBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * size_in_bytes, dummyBufferContents[r].data()); + err = sendQueues[r].finish(); + ASSERT_CL(err) + } + std::copy(dummyBufferContents[r].begin(), dummyBufferContents[r].end(), &validationData.data()[r * size_in_bytes]); + } + return network::ExecutionTimings{ + looplength, + messageSize, + calculationTimings + }; + } + +} // namespace bm_execution + +#endif diff --git a/b_eff/src/host/network_benchmark.cpp b/b_eff/src/host/network_benchmark.cpp index 7bf728a2..4058c527 100644 --- a/b_eff/src/host/network_benchmark.cpp +++ b/b_eff/src/host/network_benchmark.cpp @@ -36,7 +36,16 @@ SOFTWARE. network::NetworkProgramSettings::NetworkProgramSettings(cxxopts::ParseResult &results) : hpcc_base::BaseSettings(results), maxLoopLength(results["u"].as()), minLoopLength(results["l"].as()), maxMessageSize(results["m"].as()), - minMessageSize(results["min-size"].as()), llOffset(results["o"].as()), llDecrease(results["d"].as()) { + minMessageSize(results["min-size"].as()), stepSize(results["step-size"].as()), llOffset(results["o"].as()), + llDecrease(results["d"].as()), pcie_reverse_write_pcie(results["pcie-read"].count()), + pcie_reverse_read_pcie(results["pcie-write"].count()), pcie_reverse_execute_kernel(results["kernel-latency"].count()), + pcie_reverse_batch(results["pcie-batch"].count()), pcie_reverse(results["pcie-reverse"].count()) +#ifdef USE_ACCL + , accl_from_programable_logic(results["accl-pl"].count()), accl_axi_stream(results["accl-stream"].count()) +#endif +{ + + pcie_reverse = pcie_reverse_execute_kernel | pcie_reverse_read_pcie | pcie_reverse_write_pcie; } @@ -48,8 +57,8 @@ network::NetworkProgramSettings::getSettingsMap() { return map; } -network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize, unsigned int _loopLength) : messageSize(_messageSize), loopLength(_loopLength), - validationBuffer(CHANNEL_WIDTH * 2 * 2, 0) { +network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize, unsigned int _loopLength, unsigned int replications) : messageSize(_messageSize), loopLength(_loopLength), + validationBuffer((1 << _messageSize) * replications, 0) { // TODO: fix the validation buffer size to use the variable number of kernel replications and channels // Validation data buffer should be big enough to fit the data of two channels // for every repetition. The number of kernel replications is fixed to 2, which @@ -57,13 +66,13 @@ network::NetworkData::NetworkDataItem::NetworkDataItem(unsigned int _messageSize } network::NetworkData::NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize, - unsigned int offset, unsigned int decrease) { + unsigned int stepsize, unsigned int offset, unsigned int decrease, unsigned int replications) { uint decreasePerStep = (max_looplength - min_looplength) / decrease; - for (uint i = min_messagesize; i <= max_messagesize; i++) { + for (uint i = min_messagesize; i <= max_messagesize; i += stepsize) { uint messageSizeDivOffset = (i > offset) ? i - offset : 0u; uint newLooplength = (max_looplength > messageSizeDivOffset * decreasePerStep) ? 
            max_looplength - messageSizeDivOffset * decreasePerStep : 0u;
         uint looplength = std::max(newLooplength, min_looplength);
-        this->items.push_back(NetworkDataItem(i, looplength));
+        this->items.push_back(NetworkDataItem(i, looplength, replications));
     }
 }
@@ -83,13 +92,24 @@ network::NetworkBenchmark::addAdditionalParseOptions(cxxopts::Options &options)
         ("min-size", "Minimum Message Size",
             cxxopts::value()->default_value(std::to_string(0)))
         ("m", "Maximum message size",
             cxxopts::value()->default_value(std::to_string(DEFAULT_MAX_MESSAGE_SIZE)))
+        ("step-size", "Step size to generate message sizes in the specified range",
+            cxxopts::value()->default_value(std::to_string(1)))
         ("o", "Offset used before reducing repetitions",
             cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_OFFSET)))
         ("d", "Number of steps the repetitions are decreased to its minimum",
-            cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE)));
+            cxxopts::value()->default_value(std::to_string(DEFAULT_LOOP_LENGTH_DECREASE)))
+#ifdef USE_ACCL
+        ("accl-pl", "Use second ACCL command kernel to schedule sends and receives from PL")
+        ("accl-stream", "Send and receive data to AXI streams instead of global memory")
+#endif
+        ("pcie-read", "Use reverse PCIe experiment and measure PCIe read performance from device")
+        ("pcie-write", "Use reverse PCIe experiment and measure PCIe write performance from device")
+        ("kernel-latency", "Use reverse PCIe experiment and measure kernel execution latency")
+        ("pcie-batch", "Execute the reverse PCIe experiments in batch mode to make use of the queues of the schedulers")
+        ("pcie-reverse", "Execute the reverse PCIe experiments");
 }
-std::unique_ptr
+void
 network::NetworkBenchmark::executeKernel(NetworkData &data) {
     // Get the number of processes
     int world_size;
@@ -99,32 +119,55 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) {
     int world_rank;
     MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
-    std::vector> timing_results;
+    std::vector timing_results;
     for (auto& run : data.items) {
         if (world_rank == 0) {
-            std::cout << "Measure for " << (1 << run.messageSize) << " Byte" << std::endl;
+            std::cout << std::dec << "Measure for " << (1 << run.messageSize) << " Byte" << std::endl;
         }
-        std::shared_ptr timing;
+        network::ExecutionTimings timing;
         switch (executionSettings->programSettings->communicationType) {
             case hpcc_base::CommunicationType::cpu_only: timing = execution_types::cpu::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
-            case hpcc_base::CommunicationType::pcie_mpi: timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
-            case hpcc_base::CommunicationType::intel_external_channels: timing = execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break;
-            default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType));
+#ifndef USE_ACCL
+            case hpcc_base::CommunicationType::pcie_mpi:
+                if (executionSettings->programSettings->pcie_reverse) {
+                    timing = execution_types::pcie_reverse::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                } else {
+                    timing = execution_types::pcie::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer);
+                }
+                break;
+#ifdef INTEL_FPGA
+            case hpcc_base::CommunicationType::intel_external_channels: timing =
execution_types::iec::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); break; +#endif +#else + case hpcc_base::CommunicationType::accl: if (!executionSettings->programSettings->accl_from_programable_logic) { + if (!executionSettings->programSettings->accl_axi_stream) { + timing = execution_types::accl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); + }else { + timing = execution_types::accl_stream::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); + } + } else { + if (!executionSettings->programSettings->accl_axi_stream) { + timing = execution_types::accl_pl::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); + } + else { + timing = execution_types::accl_pl_stream::calculate(*executionSettings, run.messageSize, run.loopLength, run.validationBuffer); + }} break; +#endif + default: throw std::runtime_error("Selected Communication type not supported: " + hpcc_base::commToString(executionSettings->programSettings->communicationType)); } timing_results.push_back(timing); } - std::unique_ptr collected_results = std::unique_ptr (new network::NetworkExecutionTimings()); if (world_rank > 0) { for (const auto& t : timing_results) { - MPI_Send(&(t->messageSize), + MPI_Send(&(t.messageSize), 1, MPI_UNSIGNED, 0, 0, MPI_COMM_WORLD); - MPI_Send(&(t->looplength), + MPI_Send(&(t.looplength), 1, MPI_UNSIGNED, 0, 1, MPI_COMM_WORLD); - MPI_Send(&(t->calculationTimings.front()), + MPI_Send(&(t.calculationTimings.front()), executionSettings->programSettings->numRepetitions, MPI_DOUBLE, 0, 2, MPI_COMM_WORLD); } @@ -132,84 +175,86 @@ network::NetworkBenchmark::executeKernel(NetworkData &data) { std::cout << "Collect results over MPI."; int k = 0; for (auto& run : data.items) { - std::vector> tmp_timings; + std::vector tmp_timings; std::cout << "."; for (int i=1; i < world_size; i++) { - auto execution_result = std::shared_ptr( new network::ExecutionTimings { + auto execution_result = network::ExecutionTimings{ 0,0,std::vector(executionSettings->programSettings->numRepetitions) - }); + }; MPI_Status status; - MPI_Recv(&(execution_result->messageSize), + MPI_Recv(&(execution_result.messageSize), 1, MPI_UNSIGNED, i, 0, MPI_COMM_WORLD, &status); - MPI_Recv(&(execution_result->looplength), + MPI_Recv(&(execution_result.looplength), 1, MPI_UNSIGNED, i, 1, MPI_COMM_WORLD, &status); - MPI_Recv(&(execution_result->calculationTimings.front()), + MPI_Recv(&(execution_result.calculationTimings.front()), executionSettings->programSettings->numRepetitions, MPI_DOUBLE, i, 2, MPI_COMM_WORLD, &status); tmp_timings.push_back(execution_result); - if (execution_result->messageSize != run.messageSize) { - std::cerr << "Wrong message size: " << execution_result->messageSize << " != " << run.messageSize << " from rank " << i << std::endl; + if (execution_result.messageSize != run.messageSize) { + std::cerr << "Wrong message size: " << execution_result.messageSize << " != " << run.messageSize << " from rank " << i << std::endl; throw std::runtime_error("Wrong message size received! Something went wrong in the MPI communication"); } } tmp_timings.push_back(timing_results[k]); k++; - collected_results->timings.emplace(run.messageSize, std::make_shared>>(tmp_timings)); + collected_timings.emplace(run.messageSize, network::ExecutionResult{tmp_timings, 0.0, 0.0}); } std::cout << " done!" 
<< std::endl; } - - return collected_results; + return; } void -network::NetworkBenchmark::collectAndPrintResults(const network::NetworkExecutionTimings &output) { +network::NetworkBenchmark::collectResults() { std::vector maxBandwidths; if (mpi_comm_rank == 0) { - std::cout << std::setw(ENTRY_SPACE) << "MSize" << " " - << std::setw(ENTRY_SPACE) << "looplength" << " " - << std::setw(ENTRY_SPACE) << "transfer" << " " - << std::setw(ENTRY_SPACE) << "B/s" << std::endl; - std::vector totalMaxMinCalculationTime; - for (long unsigned int i =0; i < output.timings.size(); i++) { - totalMaxMinCalculationTime.push_back(0.0); - } int i = 0; - for (const auto& msgSizeResults : output.timings) { - for (const auto& r : *msgSizeResults.second) { - double localMinCalculationTime = *min_element(r->calculationTimings.begin(), r->calculationTimings.end()); - totalMaxMinCalculationTime[i] = std::max(totalMaxMinCalculationTime[i], localMinCalculationTime); + for (auto& timing : collected_timings) { + for (auto& r : timing.second.execution_timings) { + double localMinCalculationTime = *min_element(r.calculationTimings.begin(), r.calculationTimings.end()); + timing.second.maxMinCalculationTime = std::max(timing.second.maxMinCalculationTime, localMinCalculationTime); } i++; } i = 0; - for (const auto& msgSizeResults : output.timings) { - int looplength = msgSizeResults.second->at(0)->looplength; + for (auto& timing : collected_timings) { + int looplength = timing.second.execution_timings.at(0).looplength; + int messageSize = timing.first; + int num_timings = timing.second.execution_timings.size(); // The total sent data in bytes will be: - // #Nodes * message_size * looplength * 2 - // the * 2 is because we have two kernels per bitstream that will send and receive simultaneously. + // #Nodes * message_size * looplength * kernel_replications + // the * kernel_replications is because we have multiple replications per bitstream that will send and receive simultaneously. // This will be divided by half of the maximum of the minimum measured runtime over all ranks. 
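The hunk above rewrites the effective-bandwidth computation so that the transferred volume scales with the configured number of kernel replications instead of the previously hard-coded factor of two. A minimal sketch of the resulting arithmetic, not part of the patch, assuming num_timings ranks reported timings and maxMinCalculationTime already holds the largest per-rank minimum runtime (the helper name is illustrative):

    // Illustrative only: effective bandwidth for one message size, following the
    // formula used in collectResults():
    //   bytes = num_timings * kernel_replications * 2^messageSize * looplength
    //   bandwidth = bytes / maxMinCalculationTime
    double effectiveBandwidth(int num_timings, int kernel_replications,
                              unsigned message_size_log2, unsigned looplength,
                              double max_min_calculation_time) {
        double total_bytes = static_cast<double>(num_timings) * kernel_replications
                             * (1u << message_size_log2) * looplength;
        return total_bytes / max_min_calculation_time; // B/s
    }

b_eff itself is then the arithmetic mean of these per-message-size bandwidths, as computed a few lines further down.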
- double maxCalcBW = static_cast(msgSizeResults.second->size() * 2 * (1 << msgSizeResults.first) * looplength) - / (totalMaxMinCalculationTime[i]); + timing.second.maxCalcBW = static_cast( num_timings * executionSettings->programSettings->kernelReplications + * (1 << messageSize) * looplength) / timing.second.maxMinCalculationTime; - maxBandwidths.push_back(maxCalcBW); + maxBandwidths.push_back(timing.second.maxCalcBW); - std::cout << std::setw(ENTRY_SPACE) << (1 << msgSizeResults.first) << " " - << std::setw(ENTRY_SPACE) << looplength << " " - << std::setw(ENTRY_SPACE) << totalMaxMinCalculationTime[i] << " " - << std::setw(ENTRY_SPACE) << maxCalcBW - << std::endl; i++; } + results.emplace("b_eff", hpcc_base::HpccResult(accumulate(maxBandwidths.begin(), maxBandwidths.end(), 0.0) / static_cast(maxBandwidths.size()), "B/s")); + } +} - double b_eff = accumulate(maxBandwidths.begin(), maxBandwidths.end(), 0.0) / static_cast(maxBandwidths.size()); - - std::cout << std::endl << "b_eff = " << b_eff << " B/s" << std::endl; +void network::NetworkBenchmark::printResults() { + std::cout << std::setw(ENTRY_SPACE) << "MSize" << " " + << std::setw(ENTRY_SPACE) << "looplength" << " " + << std::setw(ENTRY_SPACE) << "time [s]" << " " + << std::setw(ENTRY_SPACE) << "B/s" << std::endl; + + for (const auto& timing : collected_timings) { + std::cout << std::setw(ENTRY_SPACE) << (1 << timing.first) << " " + << std::setw(ENTRY_SPACE) << timing.second.execution_timings.at(0).looplength << " " + << std::setw(ENTRY_SPACE) << timing.second.maxMinCalculationTime << " " + << std::setw(ENTRY_SPACE) << timing.second.maxCalcBW + << std::endl; } + + std::cout << std::endl << "b_eff = " << results.at("b_eff") << std::endl; } std::unique_ptr @@ -227,35 +272,43 @@ network::NetworkBenchmark::generateInputData() { executionSettings->programSettings->minLoopLength, executionSettings->programSettings->minMessageSize, executionSettings->programSettings->maxMessageSize, + executionSettings->programSettings->stepSize, executionSettings->programSettings->llOffset, - executionSettings->programSettings->llDecrease)); + executionSettings->programSettings->llDecrease, + executionSettings->programSettings->kernelReplications)); return d; } bool -network::NetworkBenchmark::validateOutputAndPrintError(network::NetworkData &data) { +network::NetworkBenchmark::validateOutput(network::NetworkData &data) { unsigned total_error = 0; // For every data size in the data set for (const auto& item : data.items) { // check if the validation buffer contains the expected data HOST_DATA_TYPE expected_value = static_cast(item.messageSize & 255u); - unsigned errors = 0; + unsigned error_count = 0; HOST_DATA_TYPE failing_entry = 0; for (const auto& v: item.validationBuffer) { if (v != expected_value) { - errors++; + error_count++; failing_entry = v; } } - total_error += errors; - if (errors > 0) { - std::cerr << "Validation data invalid for message size " << (1 << item.messageSize) << " in " << errors << " cases! Expected: " - << static_cast(expected_value) << ", Value: " << static_cast(failing_entry) << std::endl; + if (error_count > 0) { + errors.emplace(std::to_string(item.messageSize), error_count); } + total_error += error_count; } // success only, if no error occured return total_error == 0; } +void +network::NetworkBenchmark::printError() { + for (const auto& error: errors) { + std::cerr << "Validation data invalid for message size " << (1 << stoi(error.first)) << " in " << int(error.second) << " cases!" 
<< std::endl; + } +} + diff --git a/b_eff/src/host/network_benchmark.hpp b/b_eff/src/host/network_benchmark.hpp index 0fdf8064..52e2a479 100644 --- a/b_eff/src/host/network_benchmark.hpp +++ b/b_eff/src/host/network_benchmark.hpp @@ -31,6 +31,33 @@ SOFTWARE. #include "hpcc_benchmark.hpp" #include "parameters.h" +#ifdef USE_DEPRECATED_HPP_HEADER +template +struct aligned_allocator { + + // typedefs + typedef T value_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + + pointer allocate(size_t pCount, const_pointer = 0){ + T* mem = 0; + if (posix_memalign(reinterpret_cast(&mem), 4096, sizeof(T) * pCount) != 0) { + throw std::bad_alloc(); + } + return mem; + } + + void deallocate(pointer pPtr, size_t pCount) { + free(pPtr); + } +}; + +namespace cl { + template using vector = std::vector>; +} +#endif + /** * @brief Contains all classes and methods needed by the Network benchmark * @@ -64,11 +91,26 @@ namespace network { std::vector calculationTimings; }; + struct ExecutionResult { + std::vector execution_timings; + /** + * @brief maximum of minimum calculation time, filled by collectResults + * + */ + double maxMinCalculationTime; + + /** + * @brief maximum of calculated bandwidths, filled by collectResults + * + */ + double maxCalcBW; + }; + /** * @brief The data structure used to store all measurement results * */ - typedef std::map>>> CollectedResultMap; + typedef std::map CollectedTimingsMap; /** * @brief The Network benchmark specific program settings @@ -101,6 +143,12 @@ class NetworkProgramSettings : public hpcc_base::BaseSettings { */ uint minMessageSize; + /** + * @brief Step size for tested message sizes + * + */ + uint stepSize; + /** * @brief Offset that is used before the loop length will be reduced for higher message sizes * @@ -113,6 +161,51 @@ class NetworkProgramSettings : public hpcc_base::BaseSettings { */ uint llDecrease; + /** + * @brief Use the second command kernel to schedule sends and receives directly from PL + * + */ + bool accl_from_programable_logic; + + /** + * @brief Forward data to AXI stream instead of global memory to further reduce latency + */ + bool accl_axi_stream; + + /** + * @brief his is automatically set to true if one of pcie_reverse_write_pcie, pcie_reverse_read_pcie, + * or pcie_reverse_execute_kernel is set to true. The reverse PCIe experiment will be executed in that case. + * + */ + bool pcie_reverse; + + /** + * @brief If true, the benchmark will execute the reverse PCIe benchmark instead. It will write data to the FPGA. + * The other pcie_reverse flags can be set to do additional operations within the measurement. + * + */ + bool pcie_reverse_write_pcie; + + /** + * @brief If true, the benchmark will execute the reverse PCIe benchmark instead. It will execute an empty kernel. + * The other pcie_reverse flags can be set to do additional operations within the measurement. + * + */ + bool pcie_reverse_execute_kernel; + + /** + * @brief If true, the benchmark will execute the reverse PCIe benchmark instead. It will read data from the FPGA. + * The other pcie_reverse flags can be set to do additional operations within the measurement. 
+ * + */ + bool pcie_reverse_read_pcie; + + /** + * @brief If true, the reverse experiments are executed in batch mode per looplength to make use of the scheduling queues + * + */ + bool pcie_reverse_batch; + /** * @brief Construct a new Network Program Settings object * @@ -169,8 +262,9 @@ class NetworkData { * * @param messageSize The message size in bytes * @param loopLength The number of repetitions in the kernel + * @param replications The number of kernel replications */ - NetworkDataItem(unsigned int messageSize, unsigned int loopLength); + NetworkDataItem(unsigned int messageSize, unsigned int loopLength, unsigned int replications); }; @@ -187,35 +281,39 @@ class NetworkData { * @param min_looplength The minimum number of iterations that should be done for a message size * @param max_messagesize The minimum message size * @param max_messagesize The maximum message size + * @param stepSize Step size used to generate tested message sizes * @param offset The used offset to scale the loop length. The higher the offset, the later the loop lenght will be decreased * @param decrease Number of steps the looplength will be decreased to the minimum + * @param replications The number of kernel replications */ - NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize, unsigned int offset, unsigned int decrease); + NetworkData(unsigned int max_looplength, unsigned int min_looplength, unsigned int min_messagesize, unsigned int max_messagesize, + unsigned int stepSize, unsigned int offset, unsigned int decrease, unsigned int replications); }; /** - * @brief Measured execution timing from the kernel execution + * @brief Implementation of the Network benchmark * */ -class NetworkExecutionTimings { -public: +class NetworkBenchmark : +#ifdef USE_OCL_HOST + public hpcc_base::HpccFpgaBenchmark +#endif +#ifdef USE_XRT_HOST +#ifdef USE_ACCL + public hpcc_base::HpccFpgaBenchmark +#else + public hpcc_base::HpccFpgaBenchmark +#endif +#endif + { + protected: /** - * @brief A vector containing the timings for all repetitions for the kernel execution + * @brief Data structure used to store the number of errors for each message size * */ - CollectedResultMap timings; - -}; - -/** - * @brief Implementation of the Network benchmark - * - */ -class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark { - -protected: + std::map errors; /** * @brief Additional input parameters of the Network benchmark @@ -227,6 +325,38 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark timings_json; + for (const auto& execution_timing: timing.second.execution_timings) { + json single_timing_json; + single_timing_json["looplength"] = execution_timing.looplength; + single_timing_json["messageSize"] = execution_timing.messageSize; + std::vector calculation_timings; + for (const auto& timing: execution_timing.calculationTimings) { + json j; + j["unit"] = "s"; + j["value"] = timing; + calculation_timings.push_back(j); + } + single_timing_json["timings"] = calculation_timings; + timings_json.push_back(single_timing_json); + } + timing_json["timings"] = timings_json; + + j[std::to_string(timing.first)] = timing_json; + } + return j; + } + /** * @brief Network specific implementation of the data generation * @@ -241,8 +371,8 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark Measured runtimes of the kernel execution */ - std::unique_ptr - executeKernel(NetworkData &data) override; + void + executeKernel(network::NetworkData &data) override; 
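The header additions above feed dumpConfigurationAndResults() with a per-message-size timings JSON. As a rough illustration, not part of the patch, of the shape that the JsonDump unit test further down checks ("maxCalcBW", "maxMinCalculationTime", "timings"); the nlohmann::json library is assumed to be the json type used by the host code, and all values below are made up:

    // Illustrative only: approximate layout of one entry of the timings JSON.
    #include <nlohmann/json.hpp>
    using json = nlohmann::json;

    json exampleTimingsEntry() {
        json calculation_timing;                      // one repetition
        calculation_timing["unit"] = "s";
        calculation_timing["value"] = 0.001;          // made-up runtime

        json single_timing;                           // one kernel run
        single_timing["looplength"] = 4;
        single_timing["messageSize"] = 8;
        single_timing["timings"] = json::array({calculation_timing});

        json per_message_size;                        // aggregated per message size
        per_message_size["maxCalcBW"] = 1.0e9;        // made-up bandwidth
        per_message_size["maxMinCalculationTime"] = 0.001;
        per_message_size["timings"] = json::array({single_timing});

        json all;
        all["8"] = per_message_size;                  // keyed by std::to_string(messageSize)
        return all;
    }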
/** * @brief Network specific implementation of the execution validation @@ -251,15 +381,29 @@ class NetworkBenchmark : public hpcc_base::HpccFpgaBenchmark -struct NetworkKernelTest : testing::TestWithParam { +struct NetworkKernelTest : testing::Test { std::unique_ptr bm; std::unique_ptr data; unsigned numberOfChannels = 4; @@ -22,7 +22,6 @@ struct NetworkKernelTest : testing::TestWithParam void SetUp() override { bm = std::unique_ptr(new network::NetworkBenchmark(global_argc, global_argv)); bm->getExecutionSettings().programSettings->numRepetitions = 1; - bm->getExecutionSettings().programSettings->communicationType = GetParam(); data = bm->generateInputData(); createChannelFilesAndSymbolicLinks(); } @@ -48,32 +47,36 @@ struct NetworkKernelTest : testing::TestWithParam /** * Tests if calculate returns the correct execution results */ -TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { +TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor111) { + if (bm->getExecutionSettings().programSettings->communicationType == hpcc_base::CommunicationType::intel_external_channels) { + // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files + GTEST_SKIP() << "Intel external channel needs at least message size of 64 byte to fill channel!"; + } data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(1,1)); - auto result = bm->executeKernel(*data); - EXPECT_NE(result->timings.end(), result->timings.find(1)); - EXPECT_EQ(1, result->timings.find(1)->second->at(0)->looplength); - EXPECT_EQ(1, result->timings.find(1)->second->at(0)->calculationTimings.size()); + data->items.push_back(network::NetworkData::NetworkDataItem(1,1, bm->getExecutionSettings().programSettings->kernelReplications)); + bm->executeKernel(*data); + EXPECT_NE(bm->collected_timings.end(), bm->collected_timings.find(1)); + EXPECT_EQ(1, bm->collected_timings.find(1)->second.execution_timings.at(0).looplength); + EXPECT_EQ(1, bm->collected_timings.find(1)->second.execution_timings.at(0).calculationTimings.size()); } /** * Tests if calculate returns the correct execution results for multiple repetitions */ -TEST_P(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { +TEST_F(NetworkKernelTest, CalculateReturnsCorrectExecutionResultFor842) { bm->getExecutionSettings().programSettings->numRepetitions = 2; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(8,4)); - auto result = bm->executeKernel(*data); - EXPECT_NE(result->timings.end(), result->timings.find(8)); - EXPECT_EQ(4, result->timings.find(8)->second->at(0)->looplength); - EXPECT_EQ(2, result->timings.find(8)->second->at(0)->calculationTimings.size()); + data->items.push_back(network::NetworkData::NetworkDataItem(8,4, bm->getExecutionSettings().programSettings->kernelReplications)); + bm->executeKernel(*data); + EXPECT_NE(bm->collected_timings.end(), bm->collected_timings.find(8)); + EXPECT_EQ(4, bm->collected_timings.find(8)->second.execution_timings.at(0).looplength); + EXPECT_EQ(2, bm->collected_timings.find(8)->second.execution_timings.at(0).calculationTimings.size()); } /** * Tests if data is written to the channels for small message sizes */ -TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { +TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) { if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { 
// Skip this test if no IEC are used, because they are specific to the IEC emulation based on files GTEST_SKIP(); @@ -81,8 +84,8 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) const unsigned messageSize = std::log2(CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -101,7 +104,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingOneChannel) /** * Tests if data is written to the channels for small message sizes filling two channels */ -TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { +TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels) { if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files GTEST_SKIP(); @@ -109,8 +112,8 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize, looplength)); - auto result = bm->executeKernel(*data); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize, looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -126,7 +129,7 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingTwoChannels /** * Tests if data is written to the channels for message sizes filling more than two channels */ -TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { +TEST_F(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwoChannels) { if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files GTEST_SKIP(); @@ -134,8 +137,8 @@ TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwo const unsigned messageSize = std::log2(8 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[(1 << messageSize) * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -151,7 +154,7 @@ 
TEST_P(NetworkKernelTest, DataIsWrittenToChannelForMessageSizeFillingMoreThanTwo /** * Tests if correct data is written to the channels */ -TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { +TEST_F(NetworkKernelTest, CorrectDataIsWrittenToChannel) { if (bm->getExecutionSettings().programSettings->communicationType != hpcc_base::CommunicationType::intel_external_channels) { // Skip this test if no IEC are used, because they are specific to the IEC emulation based on files GTEST_SKIP(); @@ -159,8 +162,8 @@ TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + bm->executeKernel(*data); HOST_DATA_TYPE* buffer = new HOST_DATA_TYPE[messageSize * looplength * 2]; for (int i=0; i < numberOfChannels; i++) { std::string ifname = channelOutName + std::to_string(i); @@ -175,12 +178,12 @@ TEST_P(NetworkKernelTest, CorrectDataIsWrittenToChannel) { delete [] buffer; } -TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { +TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + bm->executeKernel(*data); HOST_DATA_TYPE cvalue = static_cast(messageSize & 255); EXPECT_EQ(cvalue, data->items[0].validationBuffer[0]); bool all_same = true; @@ -190,12 +193,15 @@ TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForTwoChannels) { EXPECT_TRUE(all_same); } -TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { +TEST_F(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { + if (bm->getExecutionSettings().programSettings->communicationType == hpcc_base::CommunicationType::intel_external_channels) { + GTEST_SKIP() << "Intel external channel needs at least message size of 64 byte to fill channel!"; + } const unsigned messageSize = 0; const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + bm->executeKernel(*data); HOST_DATA_TYPE cvalue = static_cast(messageSize & 255); EXPECT_EQ(cvalue, data->items[0].validationBuffer[0]); bool all_same = true; @@ -205,100 +211,144 @@ TEST_P(NetworkKernelTest, ValidationDataIsStoredCorrectlyForSmallMessageSize) { EXPECT_TRUE(all_same); } -TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { +TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength4) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); + bm->getExecutionSettings().programSettings->kernelReplications = 1; const unsigned looplength = 4; + const unsigned 
replications = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); - EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); + EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } -TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { +TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForLoopLength1) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); + bm->getExecutionSettings().programSettings->kernelReplications = 1; const unsigned looplength = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); - EXPECT_EQ(CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 1)); + EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } -TEST_P(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { +TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForDifferentMessageSize) { const unsigned messageSize = 0; + bm->getExecutionSettings().programSettings->kernelReplications = 1; const unsigned looplength = 1; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); - EXPECT_EQ(looplength * CHANNEL_WIDTH * 2 * 2, data->items[0].validationBuffer.size()); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + EXPECT_EQ((1 << messageSize), data->items[0].validationBuffer.size()); } -TEST_P(NetworkKernelTest, ValidationDataSingleItemWrongCheckFails) { - const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); +TEST_F(NetworkKernelTest, ValidationDataHasCorrectSizeForReplication2) { + const unsigned messageSize = 4; + const unsigned looplength = 2; + bm->getExecutionSettings().programSettings->kernelReplications = 2; + data->items.clear(); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, 2)); + EXPECT_EQ((1 << messageSize) * 2, data->items[0].validationBuffer.size()); +} + +TEST_F(NetworkKernelTest, ValidationDataSingleItemWrongCheckFails) { + const unsigned messageSize = 4; const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data;}); data->items[0].validationBuffer[looplength] = expected_data + 1; - EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); + EXPECT_FALSE(bm->validateOutput(*data)); + bm->printError(); } -TEST_P(NetworkKernelTest, ValidationDataWrongCheckFails) { +TEST_F(NetworkKernelTest, ValidationDataWrongCheckFails) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned 
looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data - 1;}); - EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); + EXPECT_FALSE(bm->validateOutput(*data)); + bm->printError(); } -TEST_P(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { +TEST_F(NetworkKernelTest, ValidationDataCorrectCheckSuccessful) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const HOST_DATA_TYPE expected_data = static_cast(messageSize & 255); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); std::for_each(data->items[0].validationBuffer.begin(), data->items[0].validationBuffer.end(), [expected_data](HOST_DATA_TYPE& d){d = expected_data;}); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } -TEST_P(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { +TEST_F(NetworkKernelTest, ValidationDataCorrectOneMessageSizeAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - auto result = bm->executeKernel(*data); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + bm->executeKernel(*data); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } // This test is disabled because it does not work with the current implementation of the // external channels in software emulation. The different kernel executions will read // the old data from the channel file, which will lead to a failing validation! 
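The updated tests encode the new sizing rule for the validation buffer: instead of the fixed CHANNEL_WIDTH * 2 * 2 bytes, a NetworkDataItem now allocates one message-sized block per kernel replication, and validation expects every byte to equal the low byte of the message size exponent. A small sketch of that rule, not part of the patch, with illustrative helper names and std::uint8_t standing in for HOST_DATA_TYPE:

    // Illustrative only: sizing and expected fill value used by NetworkDataItem
    // and validateOutput in this patch.
    #include <cstddef>
    #include <cstdint>

    std::size_t validationBufferSize(unsigned messageSizeLog2, unsigned replications) {
        return (std::size_t{1} << messageSizeLog2) * replications;  // (1 << messageSize) * replications
    }

    std::uint8_t expectedFillValue(unsigned messageSizeLog2) {
        return static_cast<std::uint8_t>(messageSizeLog2 & 255u);   // messageSize & 255
    }

This matches the EXPECT_EQ((1 << messageSize) * 2, ...) check for two replications in ValidationDataHasCorrectSizeForReplication2.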
-TEST_P(NetworkKernelTest, DISABLED_ValidationDataCorrectTwoMessageSizesAfterExecution) { +TEST_F(NetworkKernelTest, ValidationDataCorrectTwoMessageSizesAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); - auto result = bm->executeKernel(*data); - EXPECT_TRUE(bm->validateOutputAndPrintError(*data)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + bm->executeKernel(*data); + EXPECT_TRUE(bm->validateOutput(*data)); + bm->printError(); } -TEST_P(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { +TEST_F(NetworkKernelTest, ValidationDataWrongTwoMessageSizesAfterExecution) { const unsigned messageSize = std::log2(2 * CHANNEL_WIDTH / sizeof(HOST_DATA_TYPE)); const unsigned looplength = 4; data->items.clear(); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength)); - data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength)); - auto result = bm->executeKernel(*data); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + data->items.push_back(network::NetworkData::NetworkDataItem(messageSize + 1,looplength, bm->getExecutionSettings().programSettings->kernelReplications)); + bm->executeKernel(*data); data->items[1].validationBuffer[0] = static_cast(0); - EXPECT_FALSE(bm->validateOutputAndPrintError(*data)); + EXPECT_FALSE(bm->validateOutput(*data)); + bm->printError(); } - - -INSTANTIATE_TEST_CASE_P( - NetworkKernelParametrizedTests, - NetworkKernelTest, - ::testing::Values(hpcc_base::CommunicationType::intel_external_channels,hpcc_base::CommunicationType::cpu_only, hpcc_base::CommunicationType::pcie_mpi)); +TEST_F(NetworkKernelTest, JsonDump) { + data->items.clear(); + data->items.push_back(network::NetworkData::NetworkDataItem(8,4, bm->getExecutionSettings().programSettings->kernelReplications)); + bm->executeKernel(*data); + bm->collectResults(); + bm->dumpConfigurationAndResults("b_eff.json"); + std::FILE *f = std::fopen("b_eff.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + json j = json::parse(f); + EXPECT_TRUE(j.contains("timings")); + if (j.contains("timings")) { + EXPECT_TRUE(j["timings"].size() > 0); + if (j["timings"].size() > 0) { + for (const auto& timing: j["timings"].items()) { + EXPECT_TRUE(timing.value().contains("maxCalcBW")); + EXPECT_TRUE(timing.value().contains("maxMinCalculationTime")); + EXPECT_TRUE(timing.value().contains("timings")); + if (timing.value().contains("timings")) { + for (const auto& timing: timing.value()["timings"]) { + EXPECT_TRUE(timing.contains("looplength")); + EXPECT_TRUE(timing.contains("messageSize")); + EXPECT_TRUE(timing.contains("timings")); + } + } + } + } + } + EXPECT_TRUE(j.contains("results")); + if (j.contains("results")) { + EXPECT_TRUE(j["results"].contains("b_eff")); + } + } +} diff --git a/cmake/accl.cmake b/cmake/accl.cmake new file mode 100644 index 00000000..dd00a8b4 --- /dev/null +++ b/cmake/accl.cmake @@ -0,0 
+1,162 @@
+
+# General definitions
+set(ACCL_STACK_TYPE "UDP" CACHE STRING "Network layer used in ACCL")
+set(ACCL_UDP_ETH_IFS 1 CACHE STRING "Number of Ethernet interfaces to synthesize for UDP stack")
+set(ACCL_DEVICE_NAME "xcu280-fsvh2892-2L-e" CACHE STRING "Name of the FPGA used on the target platform")
+set(DEFAULT_ACCL_BUFFER_SIZE 8192 CACHE STRING "Size of ACCL buffers in KB")
+set(DEFAULT_ACCL_BUFFER_COUNT 16 CACHE STRING "Number of ACCL buffers")
+set(DEFAULT_ACCL_BUFFER_BANK 0 CACHE STRING "Default memory bank for ACCL buffers")
+set(DEFAULT_ACCL_RECV_BUFFER_BANKS 1 CACHE STRING "Memory banks to allocate receive buffers (can be comma-separated list)")
+set(ACCL_HARDWARE_DIR ${extern_accl_SOURCE_DIR}/test/hardware)
+set(ACCL_CCLO_ADDITIONAL_BUILD_ARGS "" CACHE STRING "Add additional build arguments that will be passed to the CCLO makefile")
+set(ACCL_CCLO_BUILD_ARGS ${ACCL_CCLO_ADDITIONAL_BUILD_ARGS})
+# UDP related definitions
+set(ACCL_VNX_DIR ${extern_accl_udp_SOURCE_DIR})
+set(ACCL_NETLAYER_HLS ${ACCL_VNX_DIR}/NetLayers/100G-fpga-network-stack-core)
+set(ACCL_UDP_NET_XO ${ACCL_VNX_DIR}/NetLayers/_x.${FPGA_BOARD_NAME}/networklayer.xo)
+set(ACCL_HLS_IP_FOLDER ${ACCL_NETLAYER_HLS}/synthesis_results_HBM)
+if (ACCL_STACK_TYPE STREQUAL "UDP")
+    list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_VNX_DIR}/Ethernet/post_sys_link.tcl)
+    list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_HLS_IP_FOLDER})
+    list(APPEND ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE})
+endif()
+
+list(APPEND XILINX_ADDITIONAL_COMPILE_FLAGS "-I${extern_accl_SOURCE_DIR}/driver/hls" "-DACCL_SYNTHESIS")
+
+set(ACCL_UDP_MAC_XOS "")
+
+math(EXPR loopend "${ACCL_UDP_ETH_IFS} - 1")
+foreach(i RANGE ${loopend})
+    set(CURRENT_MAC_XO ${ACCL_VNX_DIR}/Ethernet/_x.${FPGA_BOARD_NAME}/cmac_${i}.xo)
+    add_custom_command(
+        OUTPUT ${CURRENT_MAC_XO}
+        COMMAND make -C ${ACCL_VNX_DIR}/Ethernet DEVICE=${FPGA_BOARD_NAME} INTERFACE=${i} all
+        WORKING_DIRECTORY ${ACCL_HARDWARE_DIR})
+    list(APPEND ACCL_UDP_MAC_XOS ${CURRENT_MAC_XO})
+endforeach()
+
+add_custom_command(
+    OUTPUT ${ACCL_UDP_NET_XO}
+    COMMAND make -C ${ACCL_VNX_DIR}/NetLayers DEVICE=${FPGA_BOARD_NAME} all
+    WORKING_DIRECTORY ${ACCL_HARDWARE_DIR})
+
+add_custom_target(
+    accl_udp_stack
+    DEPENDS ${ACCL_UDP_MAC_XOS} ${ACCL_UDP_NET_XO})
+
+# TCP related definitions
+set(ACCL_TCP_BASE_DIR ${extern_accl_tcp_SOURCE_DIR})
+set(ACCL_TCP_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/network_krnl.xo)
+set(ACCL_TCP_CMAC_XO ${ACCL_TCP_BASE_DIR}/_x.hw.${FPGA_BOARD_NAME}/cmac_krnl.xo)
+if (ACCL_STACK_TYPE STREQUAL "TCP")
+    list(APPEND ACCL_LINK_CONFIG --advanced.param compiler.userPostSysLinkOverlayTcl=${ACCL_TCP_BASE_DIR}/scripts/post_sys_link.tcl)
+    list(APPEND ACCL_LINK_CONFIG --user_ip_repo_paths ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo)
+    list(APPEND ACCL_CCLO_BUILD_ARGS STACK_TYPE=${ACCL_STACK_TYPE} EN_FANIN=1)
+endif()
+
+# TODO: This is very specific to the Xilinx build system, because a
+# different Vivado version is required to build these IPs
+add_custom_command(
+    OUTPUT ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo
+    COMMAND mkdir build && cd build && cmake ..
-DFDEV_NAME=u280 + -DVIVADO_HLS_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 + -DVIVADO_ROOT_DIR=/proj/xbuilds/2020.1_released/installs/lin64/Vivado/2020.1 + -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=1 -DTCP_STACK_WINDOW_SCALING_EN=0 && + make installip + WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR}) + +add_custom_command( + OUTPUT ${ACCL_TCP_CMAC_XO} + COMMAND make cmac_krnl DEVICE=${FPGA_BOARD_NAME} XSA=${FPGA_BOARD_NAME} TEMP_DIR=_x.hw.${FPGA_BOARD_NAME}/ + WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR} + DEPENDS ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo) + +add_custom_command( + OUTPUT ${ACCL_TCP_XO} + COMMAND make network_krnl DEVICE=${FPGA_BOARD_NAME} XSA=${FPGA_BOARD_NAME} TEMP_DIR=_x.hw.${FPGA_BOARD_NAME}/ + WORKING_DIRECTORY ${ACCL_TCP_BASE_DIR} + DEPENDS ${ACCL_TCP_BASE_DIR}/build/fpga-network-stack/iprepo) + +add_custom_target( + accl_tcp_stack + DEPENDS ${ACCL_TCP_XO} ${ACCL_TCP_CMAC_XO}) + + +# Build CCLO +set(ACCL_CCLO_KERNEL_DIR ${extern_accl_SOURCE_DIR}/kernels/cclo/) +set(ACCL_CCLO_KERNEL_XO ccl_offload.xo) + +add_custom_command( + OUTPUT ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} + COMMAND make ${ACCL_CCLO_BUILD_ARGS} PLATFORM=${FPGA_BOARD_NAME} + WORKING_DIRECTORY ${ACCL_CCLO_KERNEL_DIR}) + +add_custom_target( + accl_cclo + DEPENDS ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO}) + +# Build the ACCL Plugins +set(ACCL_PLUGINS_DIR ${extern_accl_SOURCE_DIR}/kernels/plugins) +set(ACCL_PLUGINS_HOSTCTRL ${ACCL_PLUGINS_DIR}/hostctrl/hostctrl.xo) +set(ACCL_PLUGINS_SUM ${ACCL_PLUGINS_DIR}/reduce_ops/reduce_ops.xo) +set(ACCL_PLUGINS_COMPRESSION ${ACCL_PLUGINS_DIR}/hp_compression/hp_compression.xo) +set(ACCL_PLUGINS_LOOPBACK ${ACCL_PLUGINS_DIR}/loopback/loopback.xo) +set(ACCL_PLUGINS_ARBITER ${ACCL_PLUGINS_DIR}/client_arbiter/client_arbiter.xo) + +add_custom_command( + OUTPUT ${ACCL_PLUGINS_HOSTCTRL} + COMMAND vitis_hls build_hostctrl.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/hostctrl ) +add_custom_command( + OUTPUT ${ACCL_PLUGINS_SUM} + COMMAND vitis_hls build.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/reduce_ops ) +add_custom_command( + OUTPUT ${ACCL_PLUGINS_COMPRESSION} + COMMAND vitis_hls build.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/hp_compression ) +add_custom_command( + OUTPUT ${ACCL_PLUGINS_LOOPBACK} + COMMAND vitis_hls build_loopback.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/loopback ) +add_custom_command( + OUTPUT ${ACCL_PLUGINS_ARBITER} + COMMAND vitis_hls build_client_arbiter.tcl -tclargs ip ${ACCL_DEVICE_NAME} + WORKING_DIRECTORY ${ACCL_PLUGINS_DIR}/client_arbiter ) + + +add_custom_target( + accl_plugins + DEPENDS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} + ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_ARBITER}) + +set(ACCL_UDP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} + ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_UDP_MAC_XOS} ${ACCL_UDP_NET_XO} CACHE INTERNAL "Object files required for ACCL with UDP") + +set(ACCL_TCP_XOS ${ACCL_PLUGINS_LOOPBACK} ${ACCL_PLUGINS_COMPRESSION} ${ACCL_PLUGINS_SUM} ${ACCL_PLUGINS_HOSTCTRL} + ${ACCL_CCLO_KERNEL_DIR}/${ACCL_CCLO_KERNEL_XO} ${ACCL_TCP_CMAC_XO} ${ACCL_TCP_XO} CACHE INTERNAL "Object files required for ACCL with TCP") + +if (DEFINED USE_ACCL_CLIENT_ARBITER) + list(APPEND ACCL_UDP_XOS ${ACCL_PLUGINS_ARBITER}) + list(APPEND ACCL_TCP_XOS ${ACCL_PLUGINS_ARBITER}) +endif() +if 
(ACCL_STACK_TYPE STREQUAL "UDP") + set(ACCL_XOS ${ACCL_UDP_XOS} CACHE INTERNAL "Object files required for ACCL") +else() + set(ACCL_XOS ${ACCL_TCP_XOS} CACHE INTERNAL "Object files required for ACCL") +endif() + +add_custom_target( + accl_udp) +add_dependencies(accl_udp accl_udp_stack accl_cclo accl_plugins) + +add_custom_target( + accl_tcp) +add_dependencies(accl_tcp accl_tcp_stack accl_cclo accl_plugins) + +add_custom_target(accl_device) +if (ACCL_STACK_TYPE STREQUAL "UDP") + add_dependencies(accl_device accl_udp) +else() + add_dependencies(accl_device accl_tcp) +endif() diff --git a/cmake/customKernelTargets.cmake b/cmake/customKernelTargets.cmake index 82ac811f..4657ba53 100644 --- a/cmake/customKernelTargets.cmake +++ b/cmake/customKernelTargets.cmake @@ -9,7 +9,7 @@ include(${CMAKE_SOURCE_DIR}/../cmake/kernelTargets.cmake) file(GLOB custom_kernel_files RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} - "*.cl" + "*.cl" "*.cpp" ) set(custom_kernel_targets "") diff --git a/cmake/general_benchmark_build_setup.cmake b/cmake/general_benchmark_build_setup.cmake index 64aa8d0a..427aaab4 100644 --- a/cmake/general_benchmark_build_setup.cmake +++ b/cmake/general_benchmark_build_setup.cmake @@ -1,10 +1,7 @@ cmake_policy(VERSION 3.13) INCLUDE (CheckTypeSize) -set (CMAKE_CXX_STANDARD 11) - -# Download build dependencies -add_subdirectory(${CMAKE_SOURCE_DIR}/../extern ${CMAKE_BINARY_DIR}/extern) +set (CMAKE_CXX_STANDARD 14) if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) enable_testing() @@ -30,6 +27,8 @@ set(USE_OPENMP ${USE_OPENMP} CACHE BOOL "Use OpenMP in the host code") set(USE_MPI ${USE_MPI} CACHE BOOL "Compile the host code with MPI support. This has to be supported by the host code.") set(USE_SVM No CACHE BOOL "Use SVM pointers instead of creating buffers on the board and transferring the data there before execution.") set(USE_HBM No CACHE BOOL "Use host code specific to HBM FPGAs") +set(USE_ACCL No CACHE BOOL "Use ACCL for communication") +set(USE_OCL_HOST Yes CACHE BOOL "Use OpenCL host code implementation") set(USE_CUSTOM_KERNEL_TARGETS No CACHE BOOL "Enable build targets for custom kernels") set(USE_DEPRECATED_HPP_HEADER ${header_default} CACHE BOOL "Flag that indicates if the old C++ wrapper header should be used (cl.hpp) or the newer version (cl2.hpp or opencl.hpp)") set(HPCC_FPGA_CONFIG ${HPCC_FPGA_CONFIG} CACHE FILEPATH "Configuration file that is used to overwrite the default configuration") @@ -43,12 +42,19 @@ if (NOT KERNEL_REPLICATION_ENABLED) unset(NUM_REPLICATIONS) endif() - if (HPCC_FPGA_CONFIG) message(STATUS "HPCC FPGA configuration defined. 
Overwrite default values with configuration: ${HPCC_FPGA_CONFIG}") include(${HPCC_FPGA_CONFIG}) endif() +# Download build dependencies +add_subdirectory(${CMAKE_SOURCE_DIR}/../extern ${CMAKE_BINARY_DIR}/extern) + +# Enable ACCL if required +if (USE_ACCL) + include(${CMAKE_SOURCE_DIR}/../cmake/accl.cmake) +endif() + # Set the used data type if (NOT DATA_TYPE) set(DATA_TYPE float CACHE STRING "Data type used for calculation") @@ -86,6 +92,15 @@ if (USE_MPI) include_directories(${MPI_CXX_INCLUDE_PATH}) link_libraries(${MPI_LIBRARIES}) endif() +if (USE_ACCL) + add_definitions(-DUSE_ACCL) +endif() +if (USE_XRT_HOST) + add_definitions(-DUSE_XRT_HOST) +endif() +if (USE_OCL_HOST) + add_definitions(-DUSE_OCL_HOST) +endif() # Add configuration time to build string(TIMESTAMP CONFIG_TIME "%a %b %d %H:%M:%S UTC %Y" UTC) @@ -149,6 +164,10 @@ list(APPEND CMAKE_EXTRA_INCLUDE_FILES "CL/opencl.h") check_type_size("${HOST_DATA_TYPE}" DATA_TYPE_SIZE) # Configure the header file with definitions used by the host code +configure_file( + "${CMAKE_SOURCE_DIR}/../shared/include/base_parameters.h.in" + "${CMAKE_BINARY_DIR}/src/common/base_parameters.h" +) configure_file( "${CMAKE_SOURCE_DIR}/src/common/parameters.h.in" "${CMAKE_BINARY_DIR}/src/common/parameters.h" diff --git a/cmake/kernelTargets.cmake b/cmake/kernelTargets.cmake index 1d7e667f..2dddc89b 100644 --- a/cmake/kernelTargets.cmake +++ b/cmake/kernelTargets.cmake @@ -9,26 +9,52 @@ else() set(VPP_FLAGS "-O3") endif() +set(file_endings "cl" "cpp" ) + ## # This function will create build targets for the kernels for emulationand synthesis for xilinx. ## function(generate_kernel_targets_xilinx) foreach (kernel_file_name ${ARGN}) string(REGEX MATCH "^custom_.*" is_custom_kernel ${kernel_file_name}) - if (is_custom_kernel) + if (is_custom_kernel) string(REPLACE "custom_" "" base_file_name ${kernel_file_name}) set(base_file_part "src/device/custom/${base_file_name}") else() set(base_file_part "src/device/${kernel_file_name}") endif() - set(base_file "${CMAKE_SOURCE_DIR}/${base_file_part}.cl") + string(REGEX MATCH ".*_ACCL.*" is_accl_kernel ${kernel_file_name}) + if (is_accl_kernel AND NOT USE_ACCL) + continue() + endif() + set(file_exists No) + if (DEFINED FORCE_FILE_ENDING) + set(file_endings ${FORCE_FILE_ENDING}) + endif() + foreach (ending ${file_endings}) + set(search_file_name "${CMAKE_SOURCE_DIR}/${base_file_part}.${ending}") + if (NOT file_exists AND EXISTS ${search_file_name}) + set(file_exists Yes) + set(selected_file_ending ${ending}) + set(base_file "${search_file_name}") + endif() + endforeach() if (KERNEL_REPLICATION_ENABLED) - set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_replicated_xilinx.cl") + set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_replicated_xilinx.${selected_file_ending}") else() - set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_copied_xilinx.cl") + set(source_f "${CMAKE_BINARY_DIR}/${base_file_part}_copied_xilinx.${selected_file_ending}") + endif() + if (DEFINED XILINX_KERNEL_NAMES) + set(bitstream_compile "") + set(bitstream_compile_emulate "") + foreach (kernel ${XILINX_KERNEL_NAMES}) + list(APPEND bitstream_compile xilinx_tmp_compile/${kernel_file_name}/${kernel}.xo) + list(APPEND bitstream_compile_emulate xilinx_tmp_compile/${kernel_file_name}/${kernel}_emulate.xo) + endforeach() + else() + set(bitstream_compile xilinx_tmp_compile/${kernel_file_name}.xo) + set(bitstream_compile_emulate xilinx_tmp_compile/${kernel_file_name}_emulate.xo) endif() - set(bitstream_compile xilinx_tmp_compile/${kernel_file_name}.xo) - 
set(bitstream_compile_emulate xilinx_tmp_compile/${kernel_file_name}_emulate.xo) set(bitstream_emulate_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_emulate.xclbin) set(bitstream_f ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}.xclbin) @@ -40,16 +66,22 @@ function(generate_kernel_targets_xilinx) set(gen_xilinx_link_settings ${XILINX_LINK_SETTINGS_FILE}) set(xilinx_link_settings ${CMAKE_BINARY_DIR}/settings/settings.link.xilinx.${kernel_file_name}.ini) endif() + if (USE_ACCL AND is_accl_kernel) + list(APPEND additional_xos ${ACCL_XOS}) + endif() set(xilinx_report_folder "${EXECUTABLE_OUTPUT_PATH}/xilinx_reports") - set(local_CLFLAGS ${CLFLAGS} -DXILINX_FPGA) + set(local_CLFLAGS -DXILINX_FPGA) list(APPEND local_CLFLAGS --report_dir=${xilinx_report_folder} --log_dir=${xilinx_report_folder}/logs) - + if (is_accl_kernel) + list(APPEND local_harware_only_flags ${ACCL_LINK_CONFIG}) + endif() string(REGEX MATCH "^.+\.tcl" is_tcl_script ${XILINX_COMPILE_SETTINGS_FILE}) if (is_tcl_script) set(CLFLAGS --hls.pre_tcl ${XILINX_COMPILE_SETTINGS_FILE}) else() set(CLFLAGS --config ${XILINX_COMPILE_SETTINGS_FILE}) endif() + list(APPEND local_CLFLAGS ${CLFLAGS}) # build emulation config for device add_custom_command(OUTPUT ${EXECUTABLE_OUTPUT_PATH}/emconfig.json @@ -57,7 +89,7 @@ function(generate_kernel_targets_xilinx) ) if (XILINX_GENERATE_LINK_SETTINGS) add_custom_command(OUTPUT ${xilinx_link_settings} - COMMAND ${Python3_EXECUTABLE} ${CODE_GENERATOR} -o ${xilinx_link_settings} -p num_replications=${NUM_REPLICATIONS} --comment "\"#\"" --comment-ml-start "\"$$\"" --comment-ml-end "\"$$\"" ${gen_xilinx_link_settings} + COMMAND ${Python3_EXECUTABLE} ${CODE_GENERATOR} -o ${xilinx_link_settings} -p num_replications=${NUM_REPLICATIONS} ${gen_xilinx_link_settings} MAIN_DEPENDENCY ${gen_xilinx_link_settings} ) else() @@ -79,37 +111,52 @@ function(generate_kernel_targets_xilinx) ) endif() - add_custom_command(OUTPUT ${bitstream_compile_emulate} - COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -DEMULATE -t sw_emu ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} -f ${FPGA_BOARD_NAME} -g -c ${XILINX_COMPILE_FLAGS} -o ${bitstream_compile_emulate} ${source_f} - MAIN_DEPENDENCY ${source_f} - DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} - ) + foreach (kernel ${bitstream_compile_emulate}) + if (DEFINED XILINX_KERNEL_NAMES) + string(REGEX MATCH ".+/(.+)_emulate\.xo" kernel_name ${kernel}) + set(kernel_name_flag -k ${CMAKE_MATCH_1}) + endif() + add_custom_command(OUTPUT ${kernel} + COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -DKERNEL_${CMAKE_MATCH_1} -DEMULATE -t sw_emu ${kernel_name_flag} ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} -f ${FPGA_BOARD_NAME} -c ${XILINX_COMPILE_FLAGS} -o ${kernel} ${source_f} + MAIN_DEPENDENCY ${source_f} + DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} + ) + endforeach() add_custom_command(OUTPUT ${bitstream_emulate_f} - COMMAND ${Vitis_COMPILER} ${local_CL_FLAGS} ${VPP_FLAGS} -DEMULATE -t sw_emu ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} -f ${FPGA_BOARD_NAME} -g -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_emulate_f} ${bitstream_compile_emulate} - MAIN_DEPENDENCY ${bitstream_compile_emulate} + COMMAND ${Vitis_COMPILER} ${local_CL_FLAGS} ${VPP_FLAGS} -DEMULATE -t sw_emu ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} -f ${FPGA_BOARD_NAME} -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_emulate_f} ${bitstream_compile_emulate} + DEPENDS ${bitstream_compile_emulate} DEPENDS ${xilinx_link_settings} 
) - add_custom_command(OUTPUT ${bitstream_compile} - COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -c ${XILINX_COMPILE_FLAGS} -o ${bitstream_compile} ${source_f} - MAIN_DEPENDENCY ${source_f} - DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} - ) + foreach (kernel ${bitstream_compile}) + if (DEFINED XILINX_KERNEL_NAMES) + string(REGEX MATCH ".+/(.+)\.xo" kernel_name ${kernel}) + set(kernel_name_flag -k ${CMAKE_MATCH_1}) + endif() + add_custom_command(OUTPUT ${kernel} + COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw -DKERNEL_${CMAKE_MATCH_1} ${kernel_name_flag} ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_COMPILE_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -c ${XILINX_COMPILE_FLAGS} -o ${kernel} ${source_f} + MAIN_DEPENDENCY ${source_f} + DEPENDS ${XILINX_COMPILE_SETTINGS_FILE} + ) + endforeach() add_custom_command(OUTPUT ${bitstream_f} - COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} ${XILINX_COMPILE_FLAGS} -o ${bitstream_f} ${bitstream_compile} - MAIN_DEPENDENCY ${bitstream_compile} + COMMAND ${Vitis_COMPILER} ${local_CLFLAGS} ${VPP_FLAGS} ${local_harware_only_flags} -t hw ${COMPILER_INCLUDES} ${XILINX_ADDITIONAL_LINK_FLAGS} --platform ${FPGA_BOARD_NAME} -R2 -l --config ${xilinx_link_settings} -o ${bitstream_f} ${additional_xos} ${bitstream_compile} + DEPENDS ${bitstream_compile} DEPENDS ${xilinx_link_settings} ) - add_custom_target(${kernel_file_name}_emulate_xilinx - DEPENDS ${bitstream_emulate_f} + add_custom_target(${kernel_file_name}_emulate_xilinx + DEPENDS ${bitstream_emulate_f} DEPENDS ${source_f} ${CMAKE_BINARY_DIR}/src/common/parameters.h ${EXECUTABLE_OUTPUT_PATH}/emconfig.json) add_custom_target(${kernel_file_name}_xilinx - DEPENDS ${bitstream_f} + DEPENDS ${bitstream_f} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) add_custom_target(${kernel_file_name}_report_xilinx - DEPENDS ${bitstream_compile} + DEPENDS ${bitstream_compile} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) + if(USE_ACCL AND is_accl_kernel) + add_dependencies(${kernel_file_name}_xilinx accl_device) + endif() list(APPEND kernel_emulation_targets_xilinx ${kernel_file_name}_emulate_xilinx) set(kernel_emulation_targets_xilinx ${kernel_emulation_targets_xilinx} CACHE INTERNAL "Kernel emulation targets used to define dependencies for the tests for Xilinx devices") endforeach () @@ -123,7 +170,7 @@ endfunction() function(generate_kernel_targets_intel) foreach (kernel_file_name ${ARGN}) string(REGEX MATCH "^custom_.*" is_custom_kernel ${kernel_file_name}) - if (is_custom_kernel) + if (is_custom_kernel) string(REPLACE "custom_" "" base_file_name ${kernel_file_name}) set(base_file_part "src/device/custom/${base_file_name}") else() @@ -162,7 +209,7 @@ function(generate_kernel_targets_intel) DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${report_f} ) add_custom_command(OUTPUT ${EXECUTABLE_OUTPUT_PATH}/${bitstream_f} - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bitstream_f} ${EXECUTABLE_OUTPUT_PATH}/${bitstream_f} + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bitstream_f} ${EXECUTABLE_OUTPUT_PATH}/${bitstream_f} COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_BINARY_DIR}/${kernel_file_name}/reports ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_synth_reports COMMAND ${CMAKE_COMMAND} -E copy 
${CMAKE_CURRENT_BINARY_DIR}/${kernel_file_name}/acl_quartus_report.txt ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_synth_reports/acl_quartus_report.txt COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${kernel_file_name}/quartus_sh_compile.log ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_synth_reports/quartus_sh_compile.log @@ -187,11 +234,11 @@ function(generate_kernel_targets_intel) MAIN_DEPENDENCY ${source_f} DEPENDS ${CMAKE_BINARY_DIR}/src/common/parameters.h ) - add_custom_target(${kernel_file_name}_report_intel + add_custom_target(${kernel_file_name}_report_intel DEPENDS ${EXECUTABLE_OUTPUT_PATH}/${kernel_file_name}_reports/report.html) - add_custom_target(${kernel_file_name}_intel + add_custom_target(${kernel_file_name}_intel DEPENDS ${EXECUTABLE_OUTPUT_PATH}/${bitstream_f}) - add_custom_target(${kernel_file_name}_emulate_intel + add_custom_target(${kernel_file_name}_emulate_intel DEPENDS ${EXECUTABLE_OUTPUT_PATH}/${bitstream_emulate_f}) list(APPEND kernel_emulation_targets_intel ${kernel_file_name}_emulate_intel) set(kernel_emulation_targets_intel ${kernel_emulation_targets_intel} CACHE INTERNAL "Kernel emulation targets used to define dependencies for the tests for intel devices") diff --git a/cmake/unitTestTargets.cmake b/cmake/unitTestTargets.cmake index 2597017b..4e949a9d 100644 --- a/cmake/unitTestTargets.cmake +++ b/cmake/unitTestTargets.cmake @@ -21,12 +21,20 @@ endif() if (Vitis_FOUND) include_directories(SYSTEM ${Vitis_INCLUDE_DIRS}) + if (USE_ACCL) + set(CMAKE_SKIP_BUILD_RPATH No) + set(CMAKE_BUILD_WITH_INSTALL_RPATH Yes) + list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_BINARY_DIR}/lib/accl/lib) + endif() add_executable(${HOST_EXE_NAME}_test_xilinx ${TEST_SOURCES} ${PROJECT_SOURCES}) target_link_libraries(${HOST_EXE_NAME}_test_xilinx gtest gmock ${LIB_NAME}_xilinx ${Vitis_LIBRARIES} "${OpenMP_CXX_FLAGS}") target_link_libraries(${HOST_EXE_NAME}_test_xilinx hpcc_fpga_base_test) add_dependencies(${HOST_EXE_NAME}_test_xilinx ${kernel_emulation_targets_xilinx}) target_compile_definitions(${HOST_EXE_NAME}_test_xilinx PRIVATE -DXILINX_FPGA) target_compile_options(${HOST_EXE_NAME}_test_xilinx PRIVATE "${OpenMP_CXX_FLAGS}") + if (USE_ACCL) + target_link_libraries(${HOST_EXE_NAME}_xilinx zmqpp) + endif() foreach (kernel_target ${kernel_emulation_targets_xilinx}) string(REPLACE "_xilinx" ".xclbin" kernel_name ${kernel_target}) add_test(NAME test_unit_${kernel_target} COMMAND $ -f ${kernel_name} ${TEST_HOST_FLAGS} WORKING_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}) diff --git a/docs/requirements.txt b/docs/requirements.txt index c675a279..f705e859 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,2 +1,2 @@ -Sphinx==3.0.3 -sphinx-rtd-theme==0.5.0 +Sphinx==4.0.0 +sphinx-rtd-theme==1.1.1 diff --git a/docs/source/FFT/index.rst b/docs/source/FFT/index.rst index 2fda355a..353691bd 100644 --- a/docs/source/FFT/index.rst +++ b/docs/source/FFT/index.rst @@ -1,7 +1,8 @@ .. _fft: -====== + +====== FFT -====== +====== This section contains all information related to the FFT benchmark. The benchmark executes a batched calculation of 1d FFTs on a single FPGA. 
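The updated result tables that follow (for example `fft-1-1.csv` below) remain plain CSV files with the metric names in the first column and one column per measured configuration, which is why the docs embed them with `:stub-columns: 1`. As a rough, non-authoritative sketch of how such a table can be inspected outside of Sphinx — assuming pandas, which the evaluation scripts already require — one could transpose it so that every configuration becomes a row:

```python
# Sketch only: load one of the documentation result CSVs with pandas.
# The file has no header row; the first column holds the metric names.
import pandas as pd

table = pd.read_csv("docs/source/FFT/results/fft-1-1.csv", index_col=0, header=None)
configs = table.T  # one row per benchmarked configuration
print(configs[["FPGA board", "Frequency", "GFLOPs"]])
```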
diff --git a/docs/source/FFT/results/fft-1-1.csv b/docs/source/FFT/results/fft-1-1.csv index c98312bc..7099394e 100644 --- a/docs/source/FFT/results/fft-1-1.csv +++ b/docs/source/FFT/results/fft-1-1.csv @@ -1,21 +1,22 @@ -FPGA board,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,HBM2,SVM -SDK,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua `_,,, -LOG_FFT_SIZE,17,9,5,17 -NUM_REPLICATIONS,2,1,15,1 -LUT,276676,83494,602125,192189 -LUT percent,36.0,7.39,54.13,22.0 -Register,724790,168150,941404,480285 -Register percent,36.0,7.19,42.18,22.0 -BRAM,4177,39,405,2147 -BRAM percent,36.0,2.28,22.35,18.0 -DSP,1414,672,5280,707 -DSP percent,25.0,7.46,58.58,12.0 -Frequency,413.34,248.00,254.00,348.00 -GFLOPs,349.45,78.26,576.00,119.66 -GBs,65.78,27.83,368.77,22.54 -Error,7.1e-1,3.9e-1,5.4e-1,7.1e-1 +Version,1.4,1.1,1.1,1.1,1.1 +FPGA board,BittWare 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,HBM2,SVM +SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,20.4.0,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +LOG_FFT_SIZE,12,17,9,5,17 +NUM_REPLICATIONS,2,2,1,15,1 +LUT,280105,276676,83494,602125,192189 +LUT percent,30,36.0,7.39,54.13,22.0 +Register,611446,724790,168150,941404,480285 +Register percent,,36.0,7.19,42.18,22.0 +BRAM,1811,4177,39,405,2147 +BRAM percent,15,36.0,2.28,22.35,18.0 +DSP,1560,1414,672,5280,707 +DSP percent,27,25.0,7.46,58.58,12.0 +Frequency,402.41,413.34,248.00,254.00,348.00 +GFLOPs,239.598,349.45,78.26,576.00,119.66 +GBs,,65.78,27.83,368.77,22.54 +Error,3.00463e-1,7.1e-1,3.9e-1,5.4e-1,7.1e-1 \ No newline at end of file diff --git a/docs/source/FFT/results/index.rst b/docs/source/FFT/results/index.rst index e2f705db..8672be27 100644 --- a/docs/source/FFT/results/index.rst +++ b/docs/source/FFT/results/index.rst @@ -9,7 +9,7 @@ The measurements were executed 10 times and the best result is published. The results and the used configuration is given in :numref:`tbl_fft_1_1_results` and are also available as :download:`CSV `. .. _tbl_fft_1_1_results: -.. csv-table:: FFT FPGA Benchmark Results for version 1.1 +.. csv-table:: FFT FPGA Benchmark Results :file: fft-1-1.csv :stub-columns: 1 diff --git a/docs/source/GEMM/index.rst b/docs/source/GEMM/index.rst index 14f597ed..df3899ed 100644 --- a/docs/source/GEMM/index.rst +++ b/docs/source/GEMM/index.rst @@ -1,4 +1,5 @@ .. 
_gemm: + ====== GEMM ====== diff --git a/docs/source/GEMM/results/gemm-1-0.csv b/docs/source/GEMM/results/gemm-1-0.csv index 6b36ebc3..211d4e9f 100644 --- a/docs/source/GEMM/results/gemm-1-0.csv +++ b/docs/source/GEMM/results/gemm-1-0.csv @@ -1,24 +1,25 @@ -FPGA board,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,HBM2,SVM -SDK,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua `_,,, -BLOCK_SIZE,512,256,256,512 -GEMM_SIZE,8,8,8,8 -GLOBAL_MEM_UNROLL,16,16,16,16 -DATA_TYPE,float,float,float,float -NUM_REPLICATIONS,5,3,3,5 -LUT,275754,568558,499002,299427 -LUT percent,36.0,51.87,42.64,33.0 -Register,861277,441602,920127,829802 -Register percent,36.0,19.43,38.7,33.0 -BRAM,8860,666,666,9041 -BRAM percent,76.0,43.11,36.71,77.0 -DSP,3398,7683,7683,3398 -DSP percent,59.0,85.23,85.18,59.0 -Frequency,160.42,100.00,236.00,225.00 -GFLOPs,708.95,266.91,603.86,739.59 -GFLOPs norm,88.39,85.29,88.97,65.74 -Error,6.0e-7,2.0e-6,2.0e-6,6.0e-7 +Version,1.4,1.0,1.0,1.0,1.0 +FPGA board,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,HBM2,SVM +SDK,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_, +BLOCK_SIZE,512,512,256,256,512 +GEMM_SIZE,8,8,8,8,8 +GLOBAL_MEM_UNROLL,8,16,16,16,16 +DATA_TYPE,float,float,float,float,float +NUM_REPLICATIONS,5,5,3,3,5 +LUT,310564,275754,568558,499002,299427 +LUT percent,33,36.0,51.87,42.64,33.0 +Register,793535,861277,441602,920127,829802 +Register percent,,36.0,19.43,38.7,33.0 +BRAM,8321,8860,666,666,9041 +BRAM percent,71,76.0,43.11,36.71,77.0 +DSP,3318,3398,7683,7683,3398 +DSP percent,58,59.0,85.23,85.18,59.0 +Frequency,273.07,160.42,100.00,236.00,225.00 +GFLOPs,1232.50,708.95,266.91,603.86,739.59 +GFLOPs norm,90.27,88.39,85.29,88.97,65.74 +Error,9.15527e-5,6.0e-7,2.0e-6,2.0e-6,6.0e-7 \ No newline at end of file diff --git a/docs/source/GEMM/results/index.rst b/docs/source/GEMM/results/index.rst index 923b78d2..7e08adb0 100644 --- a/docs/source/GEMM/results/index.rst +++ b/docs/source/GEMM/results/index.rst @@ -10,7 +10,7 @@ The measurements were executed 10 times and the best result is published. The results and the used configuration is given in :numref:`tbl_gemm_1_0_results` and are also available as :download:`CSV `. .. _tbl_gemm_1_0_results: -.. csv-table:: GEMM FPGA Benchmark Results for version 1.0 +.. csv-table:: GEMM FPGA Benchmark Results :file: gemm-1-0.csv :stub-columns: 1 diff --git a/docs/source/LINPACK/index.rst b/docs/source/LINPACK/index.rst index 7ce28dd4..440616bd 100644 --- a/docs/source/LINPACK/index.rst +++ b/docs/source/LINPACK/index.rst @@ -1,4 +1,5 @@ .. _hpl: + ======= LINPACK ======= diff --git a/docs/source/PTRANS/index.rst b/docs/source/PTRANS/index.rst index b5a9c93d..07bf00c2 100644 --- a/docs/source/PTRANS/index.rst +++ b/docs/source/PTRANS/index.rst @@ -1,4 +1,5 @@ .. 
_ptrans: + ====== PTRANS ====== diff --git a/docs/source/RandomAccess/index.rst b/docs/source/RandomAccess/index.rst index 607b311a..02b510d4 100644 --- a/docs/source/RandomAccess/index.rst +++ b/docs/source/RandomAccess/index.rst @@ -1,4 +1,5 @@ .. _randomaccess: + ============ RandomAccess ============ diff --git a/docs/source/RandomAccess/results/index.rst b/docs/source/RandomAccess/results/index.rst index 52dd983d..a4330c56 100644 --- a/docs/source/RandomAccess/results/index.rst +++ b/docs/source/RandomAccess/results/index.rst @@ -9,7 +9,7 @@ The measurements were executed 10 times and the best result is published. The results and the used configuration is given in :numref:`tbl_randomaccess_2_2_results` and are also available as :download:`CSV `. .. _tbl_randomaccess_2_2_results: -.. csv-table:: RandomAccess FPGA Benchmark Results for version 2.2 +.. csv-table:: RandomAccess FPGA Benchmark Results :file: randomaccess-2-2.csv :stub-columns: 1 diff --git a/docs/source/RandomAccess/results/randomaccess-2-2.csv b/docs/source/RandomAccess/results/randomaccess-2-2.csv index 68969c49..b101cbc6 100644 --- a/docs/source/RandomAccess/results/randomaccess-2-2.csv +++ b/docs/source/RandomAccess/results/randomaccess-2-2.csv @@ -1,20 +1,21 @@ -FPGA board,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,HBM2,SVM -SDK,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua `_,,, -DEVICE_BUFFER_SIZE,1,1024,1024,1024 -NUM_REPLICATIONS,4,2,32,1 -LUT,115743,7256,116096,103397 -LUT percent,18.0,0.65,10.68,12.0 -Register,253578,11716,187456,225293 -Register percent,18.0,0.5,8.76,12.0 -BRAM,489,38,608,535 -BRAM percent,4.0,2.23,33.55,5.0 -DSP,14,14,224,0 -DSP percent,1.0,0.16,2.48,0.0 -Frequency,329.17,446.0,450.0,322.0 -MUOPs,245.0,40.3,128.1,0.5 -Error,0.0099,0.0106,0.0106,0.0106 +Version,2.5,2.5,2.2,2.2,2.2,2.2 +FPGA board,Alveo U280,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Xilinx XCU280,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,DDR,HBM2,SVM +SDK,2019.2,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,2019.2.3,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +DEVICE_BUFFER_SIZE,1024,1,1,1024,1024,1024 +NUM_REPLICATIONS,2,4,4,2,32,1 +LUT,184888,222405,115743,7256,116096,103397 +LUT percent,14.19,24,18.0,0.65,10.68,12.0 +Register,288566,434090,253578,11716,187456,225293 +Register percent,11.08,24,18.0,0.5,8.76,12.0 +BRAM,349.5,602,489,38,608,535 +BRAM percent,17.34,5,4.0,2.23,33.55,5.0 +DSP,24,14,14,14,224,0 +DSP percent,0.27,< 1.0,< 1.0,0.16,2.48,0.0 +Frequency,411.015198,326.05,329.17,446.0,450.0,322.0 +MUOPs,39.7888,185.633,245.0,40.3,128.1,0.5 +Error,0.00662282,0.0689179,0.0099,0.0106,0.0106,0.0106 \ No newline at end of file diff --git a/docs/source/STREAM/index.rst b/docs/source/STREAM/index.rst index 7b4f41ff..26dbc1c8 100644 --- a/docs/source/STREAM/index.rst +++ b/docs/source/STREAM/index.rst @@ -1,4 +1,5 @@ .. 
_stream: + ======= STREAM ======= diff --git a/docs/source/STREAM/results/index.rst b/docs/source/STREAM/results/index.rst index 4b0d8d4a..b529fcee 100644 --- a/docs/source/STREAM/results/index.rst +++ b/docs/source/STREAM/results/index.rst @@ -18,7 +18,7 @@ The results and the used configuration is given in :numref:`tbl_stream_2_3_resul .. _tbl_stream_2_3_results: -.. csv-table:: STREAM FPGA Benchmark Results for version 2.3 +.. csv-table:: STREAM FPGA Benchmark Results :file: stream-2-3.csv :stub-columns: 1 diff --git a/docs/source/STREAM/results/stream-2-3.csv b/docs/source/STREAM/results/stream-2-3.csv index aa9a49ee..bf12dedf 100644 --- a/docs/source/STREAM/results/stream-2-3.csv +++ b/docs/source/STREAM/results/stream-2-3.csv @@ -1,27 +1,28 @@ -FPGA board,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 -FPGA,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX -Memory Type,DDR,DDR,HBM2,SVM -SDK,19.4.0,2019.2,2019.2,19.4.0 -BSP/Shell,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm -CPU,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 -System,`Noctua `_,,, -DATA_TYPE,float,float,float,float -VECTOR_COUNT,16,16,16,16 -GLOBAL_MEM_UNROLL,1,1,1,1 -DEVICE_BUFFER_SIZE,4096,16384,2048,1 -NUM_REPLICATIONS,4,2,32,1 -LUT,176396,20832,331904,103628 -LUT percent,25.0,1.9,20.69,12.0 -Register,449231,39002,574976,244354 -Register percent,25.0,1.39,27.24,12.0 -BRAM,4029,558,1408,548 -BRAM percent,34.0,34.19,77.7,5.0 -DSP,128,160,2560,32 -DSP percent,2.0,1.78,28.38,1.0 -Frequency,316.67,300.0,370.0,346.0 -Copy,67.01,33.94,377.42,20.15 -Scale,67.24,33.92,365.8,20.04 -Add,68.9,34.58,374.03,15.04 -Triad,68.9,34.57,378.88,15.12 -PCIe Read,6.41,5.68,6.66,inf -PCIe Write,6.32,5.47,6.03,inf +Version,2.6,2.6,2.3,2.3,2.3,2.3 +FPGA board,Alveo U280,Bittware 520N,BittWare 520N,Alveo U280,Alveo U280,PAC D5005 +FPGA,Xilinx XCU280,Intel Stratix 10 GX2800,Intel Stratix 10 GX2800,Xilinx XCU280,Xilinx XCU280,Intel Stratix 10 SX +Memory Type,DDR,DDR,DDR,DDR,HBM2,SVM +SDK,2019.2,21.2.0,19.4.0,2019.2,2019.2,19.4.0 +BSP/Shell,2019.2.3,20.4.0_hpc,19.2.0_hpc,2019.2.3,2019.2.3,18.1.2_svm +CPU,AMD EPYC Milan 7763,AMD EPYC Milan 7763,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148,Intel Xeon Gold 6148 +System,`Noctua 2 `_,`Noctua 2 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_,`Noctua 1 `_ +DATA_TYPE,float,float,float,float,float,float +VECTOR_COUNT,16,16,16,16,16,16 +GLOBAL_MEM_UNROLL,1,1,1,1,1,1 +DEVICE_BUFFER_SIZE,16384,32768,4096,16384,2048,1 +NUM_REPLICATIONS,2,4,4,2,32,1 +LUT,188124,178268,176396,20832,331904,103628 +LUT percent,14.44,19,25.0,1.9,20.69,12.0 +Register,298365,297342,449231,39002,574976,244354 +Register percent,11.45,,25.0,1.39,27.24,12.0 +BRAM,853.5,3926,4029,558,1408,548 +BRAM percent,42.43,33,34.0,34.19,77.7,5.0 +DSP,170,128,128,160,2560,32 +DSP percent,1.88,2,2.0,1.78,28.38,1.0 +Frequency,411.015198,342.23,316.67,300.0,370.0,346.0 +Copy (GB/s),32.98,65.63,67.01,33.94,377.42,20.15 +Scale (GB/s),32.98,65.63,67.24,33.92,365.8,20.04 +Add (GB/s),33.88,67.78,68.9,34.58,374.03,15.04 +Triad,33.89,67.80,68.9,34.57,378.88,15.12 +PCIe Read,6.35,6.28,6.41,5.68,6.66,inf +PCIe Write,4.00,5.87,6.32,5.47,6.03,inf \ No newline at end of file diff --git a/docs/source/b_eff/index.rst b/docs/source/b_eff/index.rst index f8cc2f18..030bee78 100644 --- a/docs/source/b_eff/index.rst +++ b/docs/source/b_eff/index.rst @@ -1,4 +1,5 @@ .. 
_beff: + ======= b_eff ======= diff --git a/docs/source/index.rst index 96fd71bc..8f3cd6bb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -25,6 +25,17 @@ The pages collected under **Benchmark Descriptions** contain information about t **Technical Support** tackles selected topics of configuration, build, and execution of the benchmarks. **Benchmark Results** for the base implementations of the benchmarks are listed at the bottom of this page. They are reported together with the used CPU and other relevant infrastructure, as well as the configuration and resource utilization of the bitstreams. +The scalability and performance of applications executed over multiple FPGAs is not least dependent on the communication capabilities of these devices. The benchmark suite supports the implementation of different communication strategies to compare their impact on the overall benchmark performance. This is only available to the benchmarks which rely on communication: b_eff, PTRANS and LINPACK. + +The first and most obvious strategy is host-to-host communication using PCIe and MPI. This strategy requires, in most cases, no additional hardware or software and only relies on moving data between the host and FPGA. +The data is then exchanged via the existing CPU network, which makes it broadly applicable in the HPC context. +As a consequence, this approach is used for the base implementations in this benchmark suite. +For comparison, the suite can be extended with different communication types. +Intel provides external channels for direct communication between the FPGAs. +This approach is based on point-to-point connections between FPGAs and requires manual routing of data through the network. + +Further optimized implementations that use such device-specific communication approaches will be added to the suite in the future. + .. toctree:: @@ -53,7 +64,12 @@ The pages collected under **Benchmark Descriptions** contain information about t :glob: */results/index - +.. toctree:: + :maxdepth: 1 + :caption: Benchmark Results: + :glob: + ---------- References diff --git a/docs/source/technical_support/Basic Setup/index.rst b/docs/source/technical_support/Basic Setup/index.rst index ed80740a..7308fc23 100644 --- a/docs/source/technical_support/Basic Setup/index.rst +++ b/docs/source/technical_support/Basic Setup/index.rst @@ -103,20 +103,22 @@ You can always get an overview of the available targets by executing the followi BENCHMARK_VENDOR, "Builds the host application " BENCHMARK_test_VENDOR, "Compile the tests and its dependencies " -Moreover, there are additional targets to generate kernel reports and bitstreams. +Moreover, there are additional targets to generate device reports and bitstreams. + The kernel targets are: .. csv-table:: Device code build targets :header: "Target","Description" :widths: 10, 30 - BENCHMARK_VENDOR , Synthesizes the kernel (takes several hours!) - BENCHMARK_report_VENDOR , Just compile the kernel and create logs and reports - BENCHMARK_emulate_VENDOR , Create an emulation kernel + BASENAME_{COMM_}VENDOR , Synthesizes the device kernels (takes several hours!) + BASENAME_{COMM_}report_VENDOR , Just compile the kernels and create logs and reports + BASENAME_{COMM_}emulate_VENDOR , Creates the emulation kernels `VENDOR` is either `intel` or `xilinx` depending if the Intel SDK or Xilinx Vitis should be used. -`BENCHMARK` is the kernel name.
-A benchmark can provide multiple kernels and thus, these targets will be generated for every kernel file. +`BASENAME` is the name of the file containing the device code. +A benchmark can provide multiple kernel implementations and thus, these targets will be generated for every file containing kernel code. +For all benchmarks using communication between FPGAs the different communication types are encoded into the device code file name and are therefore part of the target name. These are b_eff, PTRANS and LINPACK. ------------------------------------------------------ Configure and Build STREAM for a fictional Xilinx FPGA diff --git a/docs/source/technical_support/Host Input Parameters/index.rst b/docs/source/technical_support/Host Input Parameters/index.rst index 46abe6f3..c201524e 100644 --- a/docs/source/technical_support/Host Input Parameters/index.rst +++ b/docs/source/technical_support/Host Input Parameters/index.rst @@ -1,3 +1,5 @@ +.. _execution: + ======================== Execution of a Benchmark ======================== @@ -26,10 +28,16 @@ Input parameters (or options) can be appended to the host execution call like th The number of repetitions can be given with this parameter as a positive integer. The benchmark experiment will be repeated the given number of times. The benchmark will show the aggregated results for all runs, but only validate the output of the last run. +``-i``: + Use `Intel memory interleaving `_. + ``--platform INT``: Also an integer. It can be used to specify the index of the OpenCL platform that should be used for execution. By default, it is set to -1. This will make the host code ask you to select a platform if multiple platforms are available. This option can become handy if you want to automize the execution of your benchmark. +``--platform_str arg``: + A string which can be used to specify the desired platform independently of the index. The exact platform name needs to be specified. When given, the value of the platform index specified by the flag above will be ignored. + ``--device INT``: Also an integer. It can be used to specify the index of the OpenCL device that should be used for execution. By default, it is set to -1. This will make the host code ask you to select a device if multiple devices are available. This option can become handy if you want to automize the execution of your benchmark. @@ -43,9 +51,15 @@ Input parameters (or options) can be appended to the host execution call like th Please note, that the benchmark will always fail with this option since it assumes the validation failed, so it will return a non-zero exit code! For reported measurements, the validation has to be enabled and the host should return with an exit code 0. +``--comm-type COMM``: + This parameter chooses the communication strategy which will be used. Current options are "IEC" for using the Intel External Channel, "PCIE" for using the host-to-host communication and "CPU" for calculating on the CPU. + +``--dump-json PATH``: + This parameter enables dumping the benchmark configuration, settings, timings and results in machine-readable JSON format. It specifies the path of the JSON file to which the dump will be written (see the example below). If no path is given, no dump will be created. + ``--test``: This option will also skip the execution of the benchmark. It can be used to test different data generation schemes or the benchmark summary before the actual execution. Please note, that the host will exit with a non-zero exit code, because it will not be able to validate the output.
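To illustrate the ``--dump-json`` option mentioned above: the dump is an ordinary JSON file, so it can be post-processed with a few lines of Python. The sketch below is only an illustration and assumes the layout documented in the JSON Output section (a ``results`` map whose entries carry ``value`` and ``unit`` keys); the file name ``results.json`` is an arbitrary example.

```python
# Illustrative sketch: read a dump created with --dump-json and print results.
# Assumes the documented layout: "results" -> metric -> {"value": ..., "unit": ...}.
import json

with open("results.json") as f:   # path that was passed to --dump-json
    dump = json.load(f)

print(dump["name"], dump["version"], dump["device"])
for metric, entry in dump["results"].items():
    print(f"{metric}: {entry['value']} {entry['unit']}")
```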
Additionally, every benchmark will have several options to define the size and type of the used input data. -These options vary between the benchmarks. An easy way to find out more about these options is to use the ``-h`` option with the host. \ No newline at end of file +These options vary between the benchmarks. An easy way to find out more about these options is to use the ``-h`` option with the host. diff --git a/docs/source/technical_support/json_output/index.rst b/docs/source/technical_support/json_output/index.rst new file mode 100644 index 00000000..08ca9ab7 --- /dev/null +++ b/docs/source/technical_support/json_output/index.rst @@ -0,0 +1,214 @@ +=========== +JSON Output +=========== + +The output of the configuration, settings, timings and results in machine-readable json-format can be enabled as described in :ref:`Execution of a Benchmark ` + +When enabled, this creates a json file which will have some information for all benchmarks. In the following example the different informations are left out, so these are the same for all benchmarks. + +.. code-block:: javascript + + { + "config_time": "Mon Dec 05 15:09:08 UTC 2022", + "device": "Intel(R) FPGA Emulation Device", + "environment": { + "LD_LIBRARY_PATH": "/opt/software/pc2/EB-SW/software/Python/3.9.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libffi/3.3-GCCcore-10.3.0/lib64:/opt/software/pc2/EB-SW/software/GMP/6.2.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/SQLite/3.35.4-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/Tcl/8.6.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libreadline/8.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libarchive/3.5.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/cURL/7.76.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/bzip2/1.0.8-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ncurses/6.2-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/ScaLAPACK/2.1.0-gompi-2021a-fb/lib:/opt/software/pc2/EB-SW/software/FFTW/3.3.9-gompi-2021a/lib:/opt/software/pc2/EB-SW/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenBLAS/0.3.15-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenMPI/4.1.1-GCC-10.3.0/lib:/opt/software/pc2/EB-SW/software/PMIx/3.2.3-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libfabric/1.12.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/UCX/1.10.0-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libevent/2.1.12-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/OpenSSL/1.1/lib:/opt/software/pc2/EB-SW/software/hwloc/2.4.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libpciaccess/0.16-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/libxml2/2.9.10-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/XZ/5.2.5-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/numactl/2.0.14-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/binutils/2.36.1-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/zlib/1.2.11-GCCcore-10.3.0/lib:/opt/software/pc2/EB-SW/software/GCCcore/10.3.0/lib64:/opt/software/slurm/21.08.6/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/21.2.0/hld/host/linux64/lib:/opt/software/FPGA/IntelFPGA/opencl_sdk/20.4.0/hld/board/bittware_pcie/s10/linux64/lib" + }, + "git_commit": "c7f3890-dirty", + "mpi": { + "subversion": 1, + "version": 3 + }, + "name": "effective bandwidth", + "results": { + }, + "settings": { + "Communication Type": "IEC", + "Kernel File": "./communication_bw520n_IEC_emulate.aocx", + "Kernel Replications": 2, + "MPI Ranks": 1, + "Repetitions": 10, + "Test Mode": 
"No" + }, + "timings": { + }, + "version": "1.3" + } + +If a benchmark has more settings, they will be added to the settings-key. Every benchmark can track different categories of timings, different results and errors. To see a full example and which keys are available have a look at the README.md of the single benchmarks in the [git repositoy](https://git.uni-paderborn.de/pc2/HPCC_FPGA). + +The results and timings are in a special format, which consists of the value and the unit. + +.. code-block:: javascript + + { + "results": { + "b_eff": { + "unit": "B/s", + "value": 14806691.755972749 + } + } + } + +The timings are a vector of all the timings which were measured, expect for b_eff, where a special format is used. For every message size used in the benchmark the interim results are saved in the following way. + +.. code-block:: javascript + + { + "6": { + "maxCalcBW": 9225059.007945802, + "maxMinCalculationTime": 5.5501e-05, + "timings": [ + { + "looplength": 4, + "messageSize": 6, + "timings": [ + { + "unit": "s", + "value": 0.008889638 + }, + { + "unit": "s", + "value": 0.000115271 + }, + { + "unit": "s", + "value": 0.000149272 + }, + { + "unit": "s", + "value": 0.000163372 + }, + { + "unit": "s", + "value": 7.5731e-05 + }, + { + "unit": "s", + "value": 5.5501e-05 + }, + { + "unit": "s", + "value": 0.000162132 + }, + { + "unit": "s", + "value": 8.2091e-05 + }, + { + "unit": "s", + "value": 6.7621e-05 + }, + { + "unit": "s", + "value": 0.000126891 + } + ] + } + ] + }, + "7": { + "maxCalcBW": 12222341.581026724, + "maxMinCalculationTime": 8.3781e-05, + "timings": [ + { + "looplength": 4, + "messageSize": 7, + "timings": [ + { + "unit": "s", + "value": 0.000296573 + }, + { + "unit": "s", + "value": 0.000136292 + }, + { + "unit": "s", + "value": 0.000320834 + }, + { + "unit": "s", + "value": 0.000130881 + }, + { + "unit": "s", + "value": 8.3781e-05 + }, + { + "unit": "s", + "value": 0.000247252 + }, + { + "unit": "s", + "value": 0.000430356 + }, + { + "unit": "s", + "value": 0.000281403 + }, + { + "unit": "s", + "value": 0.000421565 + }, + { + "unit": "s", + "value": 0.000266754 + } + ] + } + ] + }, + "8": { + "maxCalcBW": 38030862.93662141, + "maxMinCalculationTime": 5.3851e-05, + "timings": [ + { + "looplength": 4, + "messageSize": 8, + "timings": [ + { + "unit": "s", + "value": 0.000157722 + }, + { + "unit": "s", + "value": 0.000121611 + }, + { + "unit": "s", + "value": 0.000217192 + }, + { + "unit": "s", + "value": 9.7101e-05 + }, + { + "unit": "s", + "value": 6.6931e-05 + }, + { + "unit": "s", + "value": 8.6791e-05 + }, + { + "unit": "s", + "value": 0.000145572 + }, + { + "unit": "s", + "value": 0.000143042 + }, + { + "unit": "s", + "value": 8.5281e-05 + }, + { + "unit": "s", + "value": 5.3851e-05 + } + ] + } + ] + } + } diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 75025b7c..0e8bed30 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -28,7 +28,7 @@ FetchContent_Declare( # unfortunately they do not use releases, so the latest commit was used GIT_REPOSITORY https://github.com/definelicht/hlslib.git - GIT_TAG v1.2.1) + GIT_TAG master) FetchContent_GetProperties(extern_hlslib) if(NOT extern_hlslib_POPULATED) @@ -54,3 +54,71 @@ if(NOT extern_cxxopts_POPULATED) ${extern_cxxopts_BINARY_DIR} EXCLUDE_FROM_ALL) endif() + +if (USE_ACCL) +message(STATUS "ACCL was selected. 
Fetch ACCL dependencies") +# ------------------------------------------------------------------------------- +# ACCL Library +FetchContent_Declare( + extern_accl + + GIT_REPOSITORY https://github.com/Xilinx/ACCL.git + GIT_TAG dev) + +FetchContent_GetProperties(extern_accl) +if(NOT extern_accl_POPULATED) + message(STATUS "Fetching mandatory build dependency ACCL") + FetchContent_Populate(extern_accl) + set(extern_accl_SOURCE_DIR ${extern_accl_SOURCE_DIR} PARENT_SCOPE) +endif() + +# ------------------------------------------------------------------------------- +# UDP Library +FetchContent_Declare( + extern_accl_udp + + GIT_REPOSITORY https://github.com/Xilinx/xup_vitis_network_example.git + GIT_TAG master) + +FetchContent_GetProperties(extern_accl_udp) +if(NOT extern_accl_udp_POPULATED) + message(STATUS "Fetching mandatory build dependency ACCL UDP stack") + FetchContent_Populate(extern_accl_udp) + set(extern_accl_udp_SOURCE_DIR ${extern_accl_udp_SOURCE_DIR} PARENT_SCOPE) +endif() + +# ------------------------------------------------------------------------------- +# TCP Library +FetchContent_Declare( + extern_accl_tcp + + GIT_REPOSITORY https://github.com/fpgasystems/Vitis_with_100Gbps_TCP-IP.git + GIT_TAG vitis_2022_1) + +FetchContent_GetProperties(extern_accl_tcp) +if(NOT extern_accl_tcp_POPULATED) + message(STATUS "Fetching mandatory build dependency ACCL TCP stack") + FetchContent_Populate(extern_accl_tcp) + set(extern_accl_tcp_SOURCE_DIR ${extern_accl_tcp_SOURCE_DIR} PARENT_SCOPE) +endif() +endif() +# ------------------------------------------------------------------------------ +# A header only library for handling json +FetchContent_Declare( + extern_json + + URL https://github.com/nlohmann/json/releases/download/v3.11.2/json.tar.xz + URL_HASH SHA256=8c4b26bf4b422252e13f332bc5e388ec0ab5c3443d24399acb675e68278d341f) + +FetchContent_GetProperties(extern_json) +if(NOT extern_json_POPULATED) + message(STATUS "Fetching mandatory build dependency json") + FetchContent_Populate(extern_json) + add_subdirectory( + ${extern_json_SOURCE_DIR} + ${extern_json_BINARY_DIR} + EXCLUDE_FROM_ALL) + set(extern_json_SOURCE_DIR ${extern_json_SOURCE_DIR} PARENT_SCOPE) +endif() + + diff --git a/scripts/code_generator/README.md b/scripts/code_generator/README.md index 20730682..9ff4d8ab 100644 --- a/scripts/code_generator/README.md +++ b/scripts/code_generator/README.md @@ -4,85 +4,57 @@ This is a small and highly extendable Python script for Code generation. The main application area is the generation of OpenCL code, but the generator works independently of the used programming language. It can be seen as an extension of the usually used preprocessors to adapt the code before compilation. With this code it is also possible to replicate code sections and do more complex modifications while keeping the code readable. -This is done using inline scripting in code comments. -A generator code line always starts with `PY_CODE_GEN`. +This is done using the [jinja templating engine](https://jinja.palletsprojects.com/en/3.1.x/). ## Execution -The script needs Python3 to run. +The script needs Python3 with the module "jinja2" to run. It will be used by the CMake build system to generate source code and settings for some of the benchmarks. 
A short summary of the usage of the script that can also be printed by running `./generator.py -h`: - usage: generator.py [-h] [-o OUTPUT_FILE] [--comment COMMENT_SYMBOL] - [--comment-ml-start COMMENT_SYMBOL_ML_START] - [--comment-ml-end COMMENT_SYMBOL_ML_END] [-p PARAMS] - CODE_FILE + usage: generator.py [-h] [-o OUTPUT_FILE] [-p PARAMS] CODE_FILE - Preprocessor for code replication and advanced code modification. + Preprocessor for code replication and advanced code modification using jinja. positional arguments: - CODE_FILE Path to the file that is used as input + CODE_FILE Path to the file that is used as input optional arguments: - -h, --help show this help message and exit - -o OUTPUT_FILE Path to the output file. If not given, output will - printed to stdout. - --comment COMMENT_SYMBOL - Symbols that are used to comment out lines in the - target language. Default='//' - --comment-ml-start COMMENT_SYMBOL_ML_START - Symbols that are used to start a multi line comment in - the target language. Default='/*' - --comment-ml-end COMMENT_SYMBOL_ML_END - Symbols that are used to end a multi line comment in - the target language. Default='*/' - -p PARAMS Python statement that is parsed before modifying the - files. Can be used to define global variables. - + -h, --help show this help message and exit + -o OUTPUT_FILE Path to the output file. If not given, output will printed + to stdout. + -p PARAMS Python statement that is parsed before modifying the files. + Can be used to define global variables. + usage: generator.py [-h] [-o OUTPUT_FILE] [--comment COMMENT_SYMBOL] + [--comment-ml-start COMMENT_SYMBOL_ML_START] + [--comment-ml-end COMMENT_SYMBOL_ML_END] [-p PARAMS] + CODE_FILE ## Code Examples -The generator takes arbitrary code files as input and only applies changes when specific comment patterns are found. +The generator takes arbitrary code files as input and only applies changes when the specific jinja templating syntax is used. The code insertions have the following syntax: - // PY_CODE_GEN [block_start STATEMENT|block_end|STATEMENT] - -it is also possible to write multiple lines of code: - - /* PY_CODE_GEN - STATEMENT1 - STATEMENT2 - ... - */ - -Where `STATEMENT`is an arbitrary python statement. -The input file will be parsed from the beginning to the end and generation statements will be executed immediately. Example for the definition of a global variable: - PY_CODE_GEN replicate=4 + {% set replicate = 4 %} This variable can then be used within the next pragmas to further modify the code. E.g. the defined variable can be used to modifiy a code block: - // PY_CODE_GEN block_start CODE.replace("$R", str(replicate)) - int i = $R; - printf("i should be $R"); - // PY_CODE_GEN block_end - -`CODE` is a global variable containing the code within the recent block. It can be modified like every other Python string. -In most cases it is recommended to use the build-in function `replace()` for replacing variables, but it might be used for more advanced code modifications. -The result of the given Python statement will then be printed in the modified file. + int i = {{ replicate }}; + printf("i should be {{ replicate }}"); This is functionality, which would also be possible using the standard preprocessor. A case, where this script becomes handy is code replication. -This can easily be doe using list comprehension. +This can easily be done using the for-syntax similiar to list comprehension. 
As an example the dynamic construction of a switch statement: switch(i) { - // PY_CODE_GEN block_start [replace(local_variables=locals()) for i in range(replicate)] - case /*PY_CODE_GEN i*/: return /*PY_CODE_GEN i+1*/; break; - // PY_CODE_GEN block_end + {% for i in range(replicate) %} + case {{ i }}: return {{ i + 1 }}; break; + {% endfor %} } would result in: @@ -94,25 +66,20 @@ would result in: case 3: return 4; break; } -Note, that the variables that have to be replaced are written in inline comments `/*PY_CODE_GEN i*/`. +Note that the variables that have to be replaced are written in double curly braces `{{ i }}`. The given statement will be evaluated and the comment will be replaced by the result. Thus, it is also possible to call functions or do arithmetic. ## Built-In Functions -The generator can easily be extended by including additional file with the `use_file(FILENAME)` command. - - PY_CODE_GEN use_file(helpers.py) - -This will read the file and make all functions and global variables available within following blocks. +It is possible to insert variables or function definitions with the -p parameter, but they need to be defined explicitly in the script itself to be available in the template engine. -`replace()` makes it easier to replace global variables within the code: +For accessing functions the globals variable of the template needs to be updated. - // PY_CODE_GEN block_start replace(local_variables={"test": 2}) - int var = /*PY_CODE_GEN test*/ - // PY_CODE_GEN block_end + template.globals.update({'function': function}) + +The variables need to be passed in the render step. -will generate the code `int var = 2`. + template.render(variable=variable) -It is easily possible to add other helper functions and extend the functionality of the generator using the `use_file` method -or by declaring functions in multi line comments. +This is very inflexible compared to the previous generation of this script. Further evaluation is needed to find out whether an automatic merge of the globals of the script with the globals of the template is possible. \ No newline at end of file diff --git a/scripts/code_generator/generator.py b/scripts/code_generator/generator.py old mode 100755 new mode 100644 index 7b27ee93..f8b1da3a --- a/scripts/code_generator/generator.py +++ b/scripts/code_generator/generator.py @@ -1,49 +1,33 @@ -#!/usr/bin/env python3 -# -# Copyright (c) 2019 Marius Meyer -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of -# this software and associated documentation files (the "Software"), to deal in -# the Software without restriction, including without limitation the rights to -# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -# of the Software, and to permit persons to whom the Software is furnished to do -# so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE.
-## - import argparse -import itertools import sys import logging -import re - - -comment_symbol = "//" -ml_comment_symbol_start = "/*" -ml_comment_symbol_end = "*/" -pycodegen_cmd = "PY_CODE_GEN" -pragma_cmd = comment_symbol +"\\s*"+ pycodegen_cmd +from jinja2 import Environment, PackageLoader, BaseLoader, TemplateNotFound, select_autoescape +from os.path import join, exists, getmtime -parser = argparse.ArgumentParser(description='Preprocessor for code replication and advanced code modification.') +parser = argparse.ArgumentParser(description='Preprocessor for code replication and advanced code modification using jinja.') parser.add_argument('file', metavar='CODE_FILE', type=str, help='Path to the file that is used as input') parser.add_argument("-o", dest="output_file", default=None, help="Path to the output file. If not given, output will printed to stdout.") -parser.add_argument("--comment", dest="comment_symbol", default=comment_symbol, help="Symbols that are used to comment out lines in the target language. Default='%s'" % comment_symbol) -parser.add_argument("--comment-ml-start", dest="comment_symbol_ml_start", default=ml_comment_symbol_start, help="Symbols that are used to start a multi line comment in the target language. Default='%s'" % ml_comment_symbol_start) -parser.add_argument("--comment-ml-end", dest="comment_symbol_ml_end", default=ml_comment_symbol_end, help="Symbols that are used to end a multi line comment in the target language. Default='%s'" % ml_comment_symbol_end) parser.add_argument("-p", dest="params", default=[], action="append", help="Python statement that is parsed before modifying the files. Can be used to define global variables.") -CODE = "" +# create a simple loader to load templates from the file system +class SimpleLoader(BaseLoader): + def __init__(self, path): + self.path = path + + def get_source(self, environment, template): + path = join(self.path, template) + if not exists(path): + raise TemplateNotFound(template) + mtime = getmtime(path) + with open(path) as f: + source = f.read() + return source, path, lambda: mtime == getmtime(path) + +env = Environment( + loader=SimpleLoader("./"), + autoescape=select_autoescape() +) def use_file(file_name): """ @@ -67,124 +51,43 @@ def use_file(file_name): print("Error while parsing external file. See logs for more information.",file=sys.stderr) exit(1) +def create_list(content, count): + return [content for i in range(count)] -def replace(code_block=None, local_variables=None): - """ - Evaluate or execute inline code and replace the code with the result. 
- - @param code_block The input code block that will be parsed and modified - @param local_variables A dictionary containing local variables that should also be considered (like locals()) +if __name__ == '__main__': + args = parser.parse_args() - @return the modified code - """ - global CODE - if not code_block: - code_block = CODE - if local_variables is not None: - variables = {**globals(), **local_variables} + if args.output_file: + log_file_name = args.output_file + ".log" else: - variables = globals() - matches = itertools.chain(re.finditer("%s\\s*%s\\s+(?P(.|\n)+?)%s" % (ml_comment_symbol_start, pycodegen_cmd, ml_comment_symbol_end), code_block, flags=0), - re.finditer("%s\\s+(?!block_start\\s+)(?!block_end\\s+)(?P(.)+?)\n" % (pragma_cmd), code_block, flags=0)) - for res_ml in matches: - res_ml_code = res_ml.group(0) - try: - evaluated = str(eval(res_ml.groupdict()["code"], variables)) - code_block = code_block.replace(res_ml_code, evaluated) - logging.debug("Evaluated '%s' to '%s'" % (res_ml.groupdict()["code"], evaluated)) - continue - except Exception as e: - logging.debug("Failed to evaluate inline code") - try: - exec(res_ml.groupdict()["code"], globals()) - code_block = code_block.replace(res_ml_code, "") - logging.debug("Executed in global space: '%s'" % res_ml.groupdict()["code"]) - except Exception as e: - logging.warning("Could not execute inline code:\n\tCommand: '''\n%s\n'''\n\tError: %s" % (res_ml.groupdict()["code"], e)) - return code_block - - -def modify_block(code_block, cmd_str, out): - global CODE - CODE = code_block - if cmd_str == "": - cmd_str = "None" - try: - mod_code = eval(cmd_str, {**globals(), **locals()}) - except Exception as e: - logging.error("Block: %s \n %s" % (code_block, e)) - logging.error("Global variables: %s" % globals()) - print( "Block: %s \n %s" % (code_block, e),file=sys.stderr) - exit(1) - if type(mod_code) is list: - mod_code = "".join(mod_code) - elif mod_code is None: - mod_code = "" - elif type(mod_code) is not str: - logging.warning("%s is not a string. Automatic convert to string!" % mod_code) - mod_code = str(mod_code) - return mod_code - #logging.debug("Start parsing of modified sub-block") - #parse_string(mod_code, out) - #logging.debug("Finished parsing of modified sub-block") - - -def parse_string(code_string, out): - try: - code_string = replace(code_string) - for res in re.finditer("%s\\s+block_start\\s+(?P.*)\n(?P(.|\n)+?)%s\\s+block_end\\s*\n" % (pragma_cmd, pragma_cmd), code_string, flags=0): - logging.debug("Found block match!") - d = res.groupdict() - code_block = d["code"] - logging.debug("Modify the block!") - code_block = modify_block(code_block, d["cmd"], out) - code_string = code_string.replace(res.group(0), code_block) - logging.debug("Parsing complete. Write result to file.") - output.write(code_string) - except Exception as e: - logging.error("Block: %s \n %s" % (code_string, e)) - logging.error("Global variables: %s" % globals()) - logging.error("Local variables: %s" % locals()) - print( "Error while parsing code block: %s \n %s" % (e),file=sys.stderr) + log_file_name = "generator.log" + logging.basicConfig(filename=log_file_name, filemode='w', level=logging.DEBUG) + if not args.file: + logging.debug('no input file given') + exit(1) + for p in args.params: + logging.debug("Parse statement: %s" % p) + exec(p, globals()) -def parse_file(file_name, out): - """ - Opens a single source code file and applies the changes to it. 
+ template = env.get_template(args.file) - The function will output the modified source code into the given output stream. + template.globals.update({'create_list': create_list}) - @param file_name The psth to the source code file relative to the current working directory - @param out Output stream that is used to output the modified source code - """ try: - with open(file_name) as f: - parse_string(f.read(), out) - except Exception as e: - logging.error("Error when opening and parsing file %s: %s" % (file_name, e)) - print("Error occurred when parsing file. See logs for more details.",file=sys.stderr) - + template.globals.update({"generate_attributes": generate_attributes}) + except: + pass + if not 'num_replications' in globals(): + num_replications = 1 + if not 'num_total_replications' in globals(): + num_total_replications = 1 -if __name__=="__main__": - args = parser.parse_args() - if args.output_file: - log_file_name = args.output_file + ".log" - else: - log_file_name = "generator.log" - logging.basicConfig(filename=log_file_name, filemode='w', level=logging.DEBUG) - output = sys.stdout - for p in args.params: - logging.debug("Parse statement: %s" % p) - exec(p, globals()) - if args.output_file: - logging.debug("Use output file: %s" % args.output_file) - output = open(args.output_file, 'w') - comment_symbol = re.escape(args.comment_symbol) - ml_comment_symbol_start = re.escape(args.comment_symbol_ml_start) - ml_comment_symbol_end = re.escape(args.comment_symbol_ml_end) - pragma_cmd = comment_symbol +"\\s*"+ pycodegen_cmd - logging.debug("Use pragma command: %s", pragma_cmd) - logging.debug("Start parsing file: %s" % args.file) - parse_file(args.file, output) + rendered_template = template.render(num_replications=num_replications, num_total_replications=num_total_replications) + try: + with open(args.output_file, 'w') as f: + f.write(rendered_template) + except: + sys.stdout.write(rendered_template) diff --git a/scripts/code_generator/requirements.txt b/scripts/code_generator/requirements.txt new file mode 100644 index 00000000..ea18cd6f --- /dev/null +++ b/scripts/code_generator/requirements.txt @@ -0,0 +1 @@ +jinja2==2.11.3 diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index 03dfa2f4..743b5410 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -9,13 +9,65 @@ import sys # Regular expressions for the raw output of all -fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\n(.*\n)FFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -trans_regex = 
"Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+total\\s\\[s\\]\\s+transfer\\s\\[s\\]\\s+calc\\s\\[s\\]\\s+calc\\s+FLOPS\\s+Mem\\s+\\[B/s\\]\\s+PCIe\\s+\\[B/s\\]\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e|inf)+)" -stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(\\s*\n)\\s+GEFA\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(\\s*\n)\\s+GESL\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" - +fft_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Batch\\sSize\\s+(?P\d+)\n" + "(.*\n)FFT\\sSize\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n" + "\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+\\s+avg\\s+best\\s+\n" + "\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\n" + "\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\sGFLOP") + +gemm_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Matrix\\sSize\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+\\s+norm\.\\sresidual\\s+res\.\\serror\\s+mach\.\\seps\n" + "\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+\\s+best\\s+mean\\s+GFLOPS\\s+\n" + "(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+(?P(\d|\.|\+|-|e)+)\\s+GFLOP") + +ra_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+Kernel\\sReplications\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+best\\s+mean\\s+GUOPS\\s+\n" + "(?P(\d|\.|\+|-|e)+)\\s.\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\sGUOP") + +#TODO +trans_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Matrix\\sSize\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)" + "(.*\n)+\\s+total\\stime\\s+transfer\\stime\\s+calc\\s+time\\s+calc\\sFLOPS\\s+Memory\\sBandwidth\\s+PCIe\\sBandwidth\\s+\n" + 
"\\s+avg:\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e|inf)+)\\s.+\\s+\n" + "\\s+best:\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e|inf)+)\\s.+\\s.\n") + +stream_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)" + "(.*\n)+Data\\sType\\s+(?P.+)\n" + "(.*\n)+Kernel\\sReplications\\s+(?P\d+)" + "(.*\n)+Kernel\\sType\\s+(?P.+)\n" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+Function\\s+Best\\sRate\\s+Avg\\stime\\s+Min\\stime\\s+Max\\stime\\s+\n" + "PCI_write\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "PCI_read\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+\n" + "Copy\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s+.+\\s+\n" + "Scale\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "Add\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "Triad\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n") + +linpack_regex = ("Version:\\s+(?P.+)\n" + "(.*\n)+Matrix\\sSize\\s+(?P\d+)" + "(.*\n)+Device\\s+(?P.+)\n" + "(.*\n)+\\s+norm\.\\sresidual\\s+res\.\\serror\\s+mach\.\\seps\n" + "\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)\n" + "(.*\n)+\\sMethod\\s+best\\s+mean\\s+GFLOPS\\s+\n" + "\\stotal\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "\\sGEFA\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n" + "\\sGESL\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+(?P(\d|\.|\+|-|e)+)\\s.+\\s+\n") + def parse_network(file_content): ''' diff --git a/scripts/evaluation/requirements.txt b/scripts/evaluation/requirements.txt index f9ccbaa9..efd4927b 100644 --- a/scripts/evaluation/requirements.txt +++ b/scripts/evaluation/requirements.txt @@ -1 +1 @@ -pandas==0.23.3 +pandas==1.4.3 diff --git a/scripts/test_all.sh b/scripts/test_all.sh index 081e8474..b1e9de9e 100755 --- a/scripts/test_all.sh +++ b/scripts/test_all.sh @@ -21,8 +21,8 @@ TEST_DIR=${PROJECT_ROOT}/build/test BUILD_LOG_FILE=${TEST_DIR}/lastbuild.log TEST_LOG_FILE=${TEST_DIR}/lasttests.log -BENCHMARKS=("b_eff" "FFT" "GEMM" "LINPACK" "PTRANS" "RandomAccess" "STREAM") - +BENCHMARKS=("b_eff" "LINPACK" "PTRANS") +#BENCHMARKS=("b_eff" "FFT" "GEMM" "LINPACK" "PTRANS" "RandomAccess" "STREAM") if [ "$1" != "inc" ]; then echo "Clean build directory, use option 'inc' to prevent this!" 
rm -rf ${TEST_DIR} @@ -49,7 +49,7 @@ for bm in ${BENCHMARKS[@]}; do mkdir -p $bm ret=0 cd $bm - cmake ${PROJECT_ROOT}/$bm -DDEFAULT_DEVICE=0 -DDEFAULT_PLATFORM=0 -DBLOCK_SIZE=32 &>> $BUILD_LOG_FILE + cmake ${PROJECT_ROOT}/$bm -DUSE_OCL_HOST=Yes -DUSE_DEPRECATED_HPP_HEADER=Yes -DDEFAULT_DEVICE=0 -DDEFAULT_PLATFORM=0 -DBLOCK_SIZE=32 &>> $BUILD_LOG_FILE ret=$(($ret + $?)) make -j 40 VERBOSE=1 all &>> $BUILD_LOG_FILE ret=$(($ret + $?)) diff --git a/shared/CMakeLists.txt b/shared/CMakeLists.txt index 89a18117..d9e73c59 100644 --- a/shared/CMakeLists.txt +++ b/shared/CMakeLists.txt @@ -1,6 +1,29 @@ project(HPCCBaseLibrary VERSION 1.0.1) -add_library(hpcc_fpga_base STATIC ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp) +set(HPCC_BASE_SOURCES "") + +if (USE_ACCL) + add_subdirectory(${extern_accl_SOURCE_DIR}/driver/xrt ${CMAKE_BINARY_DIR}/lib/accl) + list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_accl.cpp ${extern_accl_SOURCE_DIR}/test/model/bfm/cclo_bfm.cpp) + if (CMAKE_BUILD_TYPE EQUAL "Debug") + set(ACCL_DEBUG Yes) + endif() +endif() +if (USE_XRT_HOST) + list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup_xrt.cpp) +endif() +list(APPEND HPCC_BASE_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/setup/fpga_setup.cpp ${CMAKE_CURRENT_SOURCE_DIR}/hpcc_settings.cpp) +add_library(hpcc_fpga_base STATIC ${HPCC_BASE_SOURCES}) +if (USE_ACCL) + add_subdirectory(${extern_accl_SOURCE_DIR}/test/hardware/xup_vitis_network_example/xrt_host_api + ${CMAKE_BINARY_DIR}/lib/xrt_host_api) + target_include_directories(hpcc_fpga_base PUBLIC ${VNX_INCLUDE_PATH} ${ACCL_INCLUDE_PATH} ${extern_accl_SOURCE_DIR}/test/model/bfm ${extern_accl_SOURCE_DIR}/driver/hls ${extern_hlslib_SOURCE_DIR}/include/hlslib/xilinx) + target_link_libraries(hpcc_fpga_base accl vnx) +endif() +if (USE_XRT_HOST) + target_link_directories(hpcc_fpga_base PUBLIC ${XRT_SEARCH_PATH}) + target_link_libraries(hpcc_fpga_base xrt_coreutil xrt_core) +endif() find_package(OpenCL QUIET) @@ -15,6 +38,6 @@ else() endif() target_include_directories(hpcc_fpga_base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) -target_link_libraries(hpcc_fpga_base cxxopts) +target_link_libraries(hpcc_fpga_base cxxopts nlohmann_json::nlohmann_json) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tests) diff --git a/shared/hpcc_settings.cpp b/shared/hpcc_settings.cpp new file mode 100644 index 00000000..e621ffe6 --- /dev/null +++ b/shared/hpcc_settings.cpp @@ -0,0 +1,78 @@ +#include "hpcc_settings.hpp" + +#ifdef USE_ACCL +#include "setup/fpga_setup_accl.hpp" +#endif + + /** + * @brief Construct a new Base Settings object + * + * @param results The resulting map from parsing the program input parameters + */ +hpcc_base::BaseSettings::BaseSettings(cxxopts::ParseResult &results) : numRepetitions(results["n"].as()), +#ifdef INTEL_FPGA + useMemoryInterleaving(static_cast(results.count("i"))), +#else + useMemoryInterleaving(true), +#endif + skipValidation(static_cast(results.count("skip-validation"))), + defaultPlatform(results["platform"].as()), + defaultDevice(results["device"].as()), + kernelFileName(results["f"].as()), + dumpfilePath(results["dump-json"].as()), +#ifdef NUM_REPLICATIONS + kernelReplications(results.count("r") > 0 ? results["r"].as() : NUM_REPLICATIONS), +#else + kernelReplications(results.count("r") > 0 ? 
results["r"].as() : 1), +#endif +#ifdef USE_ACCL + useAcclEmulation(static_cast(results.count("accl-emulation"))), + acclProtocol(fpga_setup::acclProtocolStringToEnum(results["accl-protocol"].as())), + acclBufferSize(results["accl-buffer-size"].as() * 1024), + acclBufferCount(results["accl-buffer-count"].as()), + acclRecvBufferMemBanks(results["accl-recv-banks"].as>()), + acclDefaultBank(results["accl-default-bank"].as()), +#endif +#ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED + communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())), +#else + communicationType(retrieveCommunicationType("UNSUPPORTED", results["f"].as())), +#endif + testOnly(static_cast(results.count("test"))) {} + + +/** + * @brief Get a map of the settings. This map will be used to print the final configuration. + * Derived classes should override it to add additional configuration options + * + * @return std::map + */ +std::map +hpcc_base::BaseSettings::getSettingsMap() { + int mpi_size = 0; +#ifdef _USE_MPI_ + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); +#endif + std::string str_mpi_ranks = "None"; + if (mpi_size > 0) { + str_mpi_ranks = std::to_string(mpi_size); + } +#ifdef USE_ACCL + std::stringstream accl_recv_banks; + for (auto& b: acclRecvBufferMemBanks) { + accl_recv_banks << b << ","; + } +#endif + return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)}, + {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? "Yes" : "No"}, + {"Communication Type", commToString(communicationType)} +#ifdef USE_ACCL + ,{"ACCL Protocol", fpga_setup::acclEnumToProtocolString(acclProtocol)}, + {"ACCL Recv. Banks", accl_recv_banks.str()}, + {"ACCL Default Bank", std::to_string(acclDefaultBank)}, + {"ACCL Buffer Size", std::to_string(acclBufferSize) + "KB"}, + {"ACCL Buffer Count", std::to_string(acclBufferCount)}, + {"ACCL Emulation", useAcclEmulation ? 
"Yes" : "No"} +#endif + }; +} diff --git a/shared/include/base_parameters.h.in b/shared/include/base_parameters.h.in new file mode 100644 index 00000000..6915a14c --- /dev/null +++ b/shared/include/base_parameters.h.in @@ -0,0 +1,24 @@ +#ifndef BASE_PARAMETERS_H +#define BASE_PARAMETERS_H + +#define VERSION "@PROJECT_VERSION@" +#define DEFAULT_REPETITIONS @DEFAULT_REPETITIONS@ +#define DEFAULT_PLATFORM @DEFAULT_PLATFORM@ +#define DEFAULT_DEVICE @DEFAULT_DEVICE@ +#cmakedefine NUM_REPLICATIONS @NUM_REPLICATIONS@ +#define HOST_DATA_TYPE @HOST_DATA_TYPE@ +#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ + +#cmakedefine HOST_EMULATION_REORDER +#cmakedefine DEFAULT_ACCL_BUFFER_SIZE @DEFAULT_ACCL_BUFFER_SIZE@ +#cmakedefine DEFAULT_ACCL_BUFFER_COUNT @DEFAULT_ACCL_BUFFER_COUNT@ +#cmakedefine ACCL_STACK_TYPE "@ACCL_STACK_TYPE@" +#cmakedefine DEFAULT_ACCL_RECV_BUFFER_BANKS "@DEFAULT_ACCL_RECV_BUFFER_BANKS@" +#cmakedefine DEFAULT_ACCL_BUFFER_BANK @DEFAULT_ACCL_BUFFER_BANK@ + +/** +Output separator +*/ +#define HLINE "-------------------------------------------------------------\n" + +#endif \ No newline at end of file diff --git a/shared/include/communication_types.hpp b/shared/include/communication_types.hpp index bb46bb8d..3f7d9751 100644 --- a/shared/include/communication_types.hpp +++ b/shared/include/communication_types.hpp @@ -47,10 +47,9 @@ typedef enum _CommunicationType { pcie_mpi, /** - * @brief Communcation using the Streaming Message Interface - * - */ - smi, + * @brief Communication using ACCL + */ + accl, /** * @brief Calculate the benchmark on CPU instead of FPGA @@ -75,12 +74,11 @@ typedef enum _CommunicationType { static const std::map comm_to_str_map{ {"IEC", CommunicationType::intel_external_channels}, {"PCIE", CommunicationType::pcie_mpi}, - {"SMI", CommunicationType::smi}, + {"ACCL", CommunicationType::accl}, {"CPU", CommunicationType::cpu_only}, - {"UNSUPPORTED", CommunicationType::unsupported}, {"AUTO", CommunicationType::automatic} }; - + /** * @brief Serializes a enum of type CommunicationType into a string. The resulting string can be used with the function retrieveCommunicationType to get back the enum. * @@ -121,4 +119,4 @@ static CommunicationType retrieveCommunicationType(std::string comm_name, std::s } } -#endif \ No newline at end of file +#endif diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index 17e17bb9..4a1c79e0 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -23,6 +23,7 @@ SOFTWARE. #define SHARED_HPCC_BENCHMARK_HPP_ #include +#include /* External library headers */ #ifdef USE_DEPRECATED_HPP_HEADER @@ -35,15 +36,27 @@ SOFTWARE. #endif /* Project's headers */ +#ifdef USE_ACCL +#include "setup/fpga_setup_accl.hpp" +#endif +#ifdef USE_XRT_HOST +#include "setup/fpga_setup_xrt.hpp" +#endif #include "setup/fpga_setup.hpp" #include "cxxopts.hpp" +#include "nlohmann/json.hpp" #include "parameters.h" #include "communication_types.hpp" +#include "hpcc_settings.hpp" #define STR_EXPAND(tok) #tok #define STR(tok) STR_EXPAND(tok) -#define ENTRY_SPACE 15 +#define VALUE_SPACE 11 +#define UNIT_SPACE 8 +#define ENTRY_SPACE (VALUE_SPACE + UNIT_SPACE + 1) + +using json = nlohmann::json; /** * @brief Contains all classes and functions that are used as basis @@ -52,183 +65,24 @@ SOFTWARE. */ namespace hpcc_base { -/** - * @brief This class should be derived and extended for every benchmark. - * It is a pure data object containing the benchmark settings that are - * used to execute the benchmark kernel. 
- * - */ -class BaseSettings { - +class HpccResult { public: + double value; + std::string unit; - /** - * @brief Number of times the kernel execution will be repeated - * - */ - uint numRepetitions; - - /** - * @brief Boolean showing if memory interleaving is used that is - * triggered from the host side (Intel specific) - * - */ - bool useMemoryInterleaving; - - /** - * @brief Boolean showing if the output data of the benchmark kernel - * should be validated or not - * - */ - bool skipValidation; - - /** - * @brief The default platform that should be used for execution. - * A number representing the index in the list of available platforms - * - */ - int defaultPlatform; - - /** - * @brief The default device that should be used for execution. - * A number representing the index in the list of available devices - * - */ - int defaultDevice; - - /** - * @brief Path to the kernel file that is used for execution - * - */ - std::string kernelFileName; - - /** - * @brief Number of times the kernel is replicated - * - */ - uint kernelReplications; - - /** - * @brief Only test the given configuration. Do not execute the benchmarks - * - */ - bool testOnly; - - /** - * @brief Type of inter-FPGA communication used - * - */ - CommunicationType communicationType; - - /** - * @brief Construct a new Base Settings object - * - * @param results The resulting map from parsing the program input parameters - */ - BaseSettings(cxxopts::ParseResult &results) : numRepetitions(results["n"].as()), -#ifdef INTEL_FPGA - useMemoryInterleaving(static_cast(results.count("i"))), -#else - useMemoryInterleaving(true), -#endif - skipValidation(static_cast(results.count("skip-validation"))), - defaultPlatform(results["platform"].as()), - defaultDevice(results["device"].as()), - kernelFileName(results["f"].as()), -#ifdef NUM_REPLICATIONS - kernelReplications(results.count("r") > 0 ? results["r"].as() : NUM_REPLICATIONS), -#else - kernelReplications(results.count("r") > 0 ? results["r"].as() : 1), -#endif -#ifdef COMMUNICATION_TYPE_SUPPORT_ENABLED - communicationType(retrieveCommunicationType(results["comm-type"].as(), results["f"].as())), -#else - communicationType(retrieveCommunicationType("UNSUPPORTED", results["f"].as())), -#endif - testOnly(static_cast(results.count("test"))) {} - - /** - * @brief Get a map of the settings. This map will be used to print the final configuration. - * Derived classes should override it to add additional configuration options - * - * @return std::map - */ - virtual std::map getSettingsMap() { - int mpi_size = 0; -#ifdef _USE_MPI_ - MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); -#endif - std::string str_mpi_ranks = "None"; - if (mpi_size > 0) { - str_mpi_ranks = std::to_string(mpi_size); - } - return {{"Repetitions", std::to_string(numRepetitions)}, {"Kernel Replications", std::to_string(kernelReplications)}, - {"Kernel File", kernelFileName}, {"MPI Ranks", str_mpi_ranks}, {"Test Mode", testOnly ? 
"Yes" : "No"}, - {"Communication Type", commToString(communicationType)}}; + HpccResult(double value, std::string unit): value(value), unit(unit) {} + + friend std::ostream &operator<<(std::ostream &os, const HpccResult &result) { + os << std::setw(VALUE_SPACE) << result.value << " " << std::left << std::setw(UNIT_SPACE) << result.unit << std::right; + return os; } -}; - - -/** - * @brief Settings class that is containing the program settings together with - * additional information about the OpenCL runtime - * - * @tparam TSettings The program settings class that should be used (Must derive from BaseSettings) - */ -template -class ExecutionSettings { -public: - - /** - * @brief Pointer to the additional program settings - * - */ - std::unique_ptr programSettings; - - /** - * @brief The OpenCL device that should be used for execution - * - */ - std::unique_ptr device; - - /** - * @brief The OpenCL context that should be used for execution - * - */ - std::unique_ptr context; - - /** - * @brief The OpenCL program that contains the benchmark kernel - * - */ - std::unique_ptr program; - - /** - * @brief Construct a new Execution Settings object - * - * @param programSettings_ Pointer to an existing program settings object that is derived from BaseSettings - * @param device_ Used OpenCL device - * @param context_ Used OpenCL context - * @param program_ Used OpenCL program - */ - ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_, - std::unique_ptr context_, std::unique_ptr program_): - programSettings(std::move(programSettings_)), device(std::move(device_)), - context(std::move(context_)), program(std::move(program_)) {} - - /** - * @brief Destroy the Execution Settings object. Used to specify the order the contained objects are destroyed - * to prevent segmentation faults during exit. - * - */ - ~ExecutionSettings() { - program = nullptr; - context = nullptr; - device = nullptr; - programSettings = nullptr; + std::string to_string() const { + std::ostringstream oss; + oss << *this; + return oss.str(); } - + // TODO: to_json function }; /** @@ -238,7 +92,8 @@ class ExecutionSettings { * @tparam TData Class used to represent the benchmark input and output data * @tparam TOutput Class representing the measurements like timings etc */ -template +template ::value>::type> class HpccFpgaBenchmark { private: @@ -258,7 +113,7 @@ class HpccFpgaBenchmark { * It should be laos used by all other methods to read the current benchmark settings. * */ - std::unique_ptr> executionSettings; + std::unique_ptr> executionSettings; /** * @brief Add additional options to the program parameter parser @@ -287,6 +142,34 @@ class HpccFpgaBenchmark { * */ bool mpi_external_init = true; + + /** + * + * @brief map containing the benchmark timings + * + */ + std::map> timings; + + /** + * + * @brief map containing the benchmark results + * + */ + std::map results; + + /** + * + * @brief map containing the errors of the benchmark + * + */ + std::map errors; + + /** + * @brief This flag indicates whether the validation was successful + * + */ + bool validated = false; + public: @@ -304,7 +187,7 @@ class HpccFpgaBenchmark { * @param data The initialized data for the kernel. 
It will be replaced by the kernel output for validation * @return std::unique_ptr A data class containing the measurement results of the execution */ - virtual std::unique_ptr + virtual void executeKernel(TData &data) = 0; /** @@ -315,7 +198,13 @@ class HpccFpgaBenchmark { * @return false If the validation failed */ virtual bool - validateOutputAndPrintError(TData &data) = 0; + validateOutput(TData &data) = 0; + + /** + * @brief Print the error after validating output + */ + virtual void + printError() = 0; /** * @brief Collects the measurment results from all MPI ranks and @@ -324,7 +213,10 @@ class HpccFpgaBenchmark { * @param output The measurement data of the kernel execution */ virtual void - collectAndPrintResults(const TOutput &output) = 0; + collectResults() = 0; + + virtual void + printResults() = 0; /** * @brief Method that can be overwritten by inheriting classes to check the validity of input parameters. @@ -371,6 +263,19 @@ class HpccFpgaBenchmark { cxxopts::value()->default_value(std::to_string(DEFAULT_REPETITIONS))) #ifdef INTEL_FPGA ("i", "Use memory Interleaving") +#endif +#ifdef USE_ACCL + ("accl-emulation", "Use the accl emulation instead of hardware execution") + ("accl-protocol", "Specify the network protocol that should be used with ACCL.", + cxxopts::value()->default_value(ACCL_STACK_TYPE)) + ("accl-buffer-size", "Specify the size of the ACCL buffers in KB", + cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_SIZE))) + ("accl-buffer-count", "Specify the number of ACCL buffers used within the benchmark", + cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_COUNT))) + ("accl-default-bank", "Default memory bank used by ACCL to create new FPGA buffers", + cxxopts::value()->default_value(std::to_string(DEFAULT_ACCL_BUFFER_BANK))) + ("accl-recv-banks", "Memory banks used by ACCL for receive buffers", + cxxopts::value>()->default_value(DEFAULT_ACCL_RECV_BUFFER_BANKS)) #endif ("skip-validation", "Skip the validation of the output data. This will speed up execution and helps when working with special data types.") ("device", "Index of the device that has to be used. 
If not given you "\ @@ -380,6 +285,7 @@ class HpccFpgaBenchmark { "you will be asked which platform to use if there are multiple "\ "platforms available.", cxxopts::value()->default_value(std::to_string(DEFAULT_PLATFORM))) + ("platform_str", "Name of the platform that has to be used", cxxopts::value()->default_value(std::string())) #ifdef NUM_REPLICATIONS ("r", "Number of used kernel replications", cxxopts::value()->default_value(std::to_string(NUM_REPLICATIONS))) @@ -388,6 +294,7 @@ class HpccFpgaBenchmark { ("comm-type", "Used communication type for inter-FPGA communication", cxxopts::value()->default_value(DEFAULT_COMM_TYPE)) #endif + ("dump-json", "dump benchmark configuration and results to this file in json format", cxxopts::value()->default_value(std::string(""))) ("test", "Only test given configuration and skip execution and validation") ("h,help", "Print this help"); @@ -440,6 +347,178 @@ class HpccFpgaBenchmark { std::cout << "Summary:" << std::endl; std::cout << *executionSettings << std::endl; } + + /* + * @brief Returns the map of the timings + * + * @return The timings map + */ + std::map> + getTimingsMap() { + return timings; + } + + /* + * @brief adds a timing to the timings map + * + * @param key The key + */ + void + addTimings(std::string key, std::vector value) { + timings.emplace(key, value); + } + + /* + * @brief Returns the timings map as json + * + * @return The json object + * + * It should be overwritten for benchmarks with special timings format, like b_eff + */ + virtual json getTimingsJson() { + json j; + for (auto const &key: timings) { + std::vector timings_list; + for (auto const &timing: key.second) { + json j; + j["unit"] = "s"; + j["value"] = timing; + timings_list.push_back(j); + } + j[key.first] = timings_list; + } + return j; + } + + /** + * @brief Returns the results map as json + * + * @return The return object + * + */ + std::map getResultsJson() { + std::map results_string; + for (auto const &result: results) { + json j; + j["unit"] = result.second.unit; + j["value"] = result.second.value; + results_string[result.first] = j; + } + return results_string; + } + + /** + * @brief Returns the map of the dumped environment variables + * + * @param The environment map + * + * Can be extended as needed + */ + std::map + getEnvironmentMap() { + std::map env; + env["LD_LIBRARY_PATH"] = std::string(std::getenv("LD_LIBRARY_PATH")); + return env; + } + /** + * @brief Format the FPGA Torus setting string + * + * @param The setting string + * + * @return The parsed json object + * + */ + json + parseFPGATorusString(std::string str) { + json j; + size_t space = str.find(" "); + std::string p_str = str.substr(0, space); + std::string q_str = str.substr(space, str.size()); + j["P"] = stoi(p_str.substr(p_str.find("=") + 1, p_str.find(","))); + j["Q"] = stoi(q_str.substr(q_str.find("=") + 1, q_str.size())); + return j; + } + + /** + * @brief Get current time as string + * + * @return The time string + * + * Has the same format as CONFIG_TIME + */ + std::string + getCurrentTime() { + time_t time = std::time(0); + const tm *utc_time = std::gmtime(&time); + std::ostringstream oss; + oss << std::put_time(utc_time, "%a %b %d %T UTC %Y"); + return oss.str(); + } + + /** + * @brief Convert the settings map to json + * + * @param settings_map The settings map + * + * @return the json object + * + * This function checks for settings which are not strings and converts them + */ + std::map + jsonifySettingsMap(std::map settings_map) { + json j; + for (const auto& item: 
settings_map) { + std::string key = item.first; + std::string value = item.second; + try { + int value_int = stoi(value); + j[key] = value_int; + } catch (std::invalid_argument const &ex) { + if (key == "FPGA Torus") { + j[key] = parseFPGATorusString(value); + } else if (key == "Emulate" || key == "Test Mode" || key == "Memory Interleaving" || key == "Replicate Inputs" || key == "Inverse" || key == "Diagonally Dominant" || key == "Dist. Buffers") { + j[key] = value == "Yes"; + } else { + j[key] = value; + } + } + } + return j; + } + + /** + * @brief Dumps the benchmark configuration and results to a json file + * + * @param file_path Path where the json will be saved + * + */ + void + dumpConfigurationAndResults(std::string file_path) { + std::fstream fs; + fs.open(file_path, std::ios_base::out); + if (!fs.is_open()) { + std::cout << "Unable to open file for dumping configuration and results" << std::endl; + } else { + json dump; + dump["name"] = PROGRAM_NAME; +#ifdef _USE_MPI_ + dump["mpi"] ={{"version", MPI_VERSION}, {"subversion", MPI_SUBVERSION}}; +#endif + dump["config_time"] = CONFIG_TIME; + dump["execution_time"] = getCurrentTime(); + dump["git_commit"] = GIT_COMMIT_HASH; + dump["version"] = VERSION; + dump["device"] = executionSettings->getDeviceName(); + dump["settings"] = jsonifySettingsMap(executionSettings->programSettings->getSettingsMap()); + dump["timings"] = getTimingsJson(); + dump["results"] = getResultsJson(); + dump["errors"] = errors; + dump["validated"] = validated; + dump["environment"] = getEnvironmentMap(); + + fs << dump; + } + } /** * @brief Selects and prepares the target device and prints the final configuration. @@ -472,21 +551,47 @@ class HpccFpgaBenchmark { std::unique_ptr programSettings = parseProgramParameters(tmp_argc, tmp_argv); - std::unique_ptr context; - std::unique_ptr program; - std::unique_ptr usedDevice; + std::unique_ptr context; + std::unique_ptr program; + std::unique_ptr usedDevice; if (!programSettings->testOnly) { +#ifdef USE_XRT_HOST + usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultDevice); +#ifndef USE_ACCL + context = std::unique_ptr(new bool(false)); +#endif +#ifdef USE_ACCL + if (!programSettings->useAcclEmulation) { +#endif + program = fpga_setup::fpgaSetup(*usedDevice, programSettings->kernelFileName); +#ifdef USE_ACCL + } +#endif +#endif +#ifdef USE_OCL_HOST usedDevice = fpga_setup::selectFPGADevice(programSettings->defaultPlatform, - programSettings->defaultDevice); + programSettings->defaultDevice, + programSettings->platformString); context = std::unique_ptr(new cl::Context(*usedDevice)); program = fpga_setup::fpgaSetup(context.get(), {*usedDevice}, &programSettings->kernelFileName); +#endif +#ifdef USE_ACCL + if (programSettings->communicationType == CommunicationType::accl) { + context = std::unique_ptr(new fpga_setup::ACCLContext( + fpga_setup::fpgaSetupACCL(*usedDevice, *program, *programSettings))); + } + else { + context = std::unique_ptr(new fpga_setup::ACCLContext()); + } +#endif } - executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice), - std::move(context), std::move(program))); + executionSettings = std::unique_ptr>(new ExecutionSettings(std::move(programSettings), std::move(usedDevice), + std::move(context), std::move(program) + )); if (mpi_comm_rank == 0) { if (!checkInputParameters()) { std::cerr << "ERROR: Input parameter check failed!"
<< std::endl; @@ -552,9 +657,8 @@ class HpccFpgaBenchmark { << HLINE; } - bool validateSuccess = false; auto exe_start = std::chrono::high_resolution_clock::now(); - std::unique_ptr output = executeKernel(*data); + executeKernel(*data); #ifdef _USE_MPI_ MPI_Barrier(MPI_COMM_WORLD); @@ -570,25 +674,35 @@ class HpccFpgaBenchmark { if (!executionSettings->programSettings->skipValidation) { auto eval_start = std::chrono::high_resolution_clock::now(); - validateSuccess = validateOutputAndPrintError(*data); + validated = validateOutput(*data); + if (mpi_comm_rank == 0) { + printError(); + } std::chrono::duration eval_time = std::chrono::high_resolution_clock::now() - eval_start; if (mpi_comm_rank == 0) { std::cout << "Validation Time: " << eval_time.count() << " s" << std::endl; } } - collectAndPrintResults(*output); - + std::cout << HLINE << "Collect results..." << std::endl << HLINE; + collectResults(); + if (mpi_comm_rank == 0) { - if (!validateSuccess) { - std::cerr << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" << std::endl; + if (executionSettings->programSettings->dumpfilePath.size() > 0) { + dumpConfigurationAndResults(executionSettings->programSettings->dumpfilePath); + } + + printResults(); + + if (!validated) { + std::cerr << HLINE << "ERROR: VALIDATION OF OUTPUT DATA FAILED!" << std::endl; } else { - std::cout << "Validation: SUCCESS!" << std::endl; + std::cout << HLINE << "Validation: SUCCESS!" << std::endl; } } - return validateSuccess; + return validated; } catch (const std::exception& e) { std::cerr << "An error occured while executing the benchmark: " << std::endl; @@ -602,7 +716,7 @@ class HpccFpgaBenchmark { * * @return ExecutionSettings& The execution settings object */ - ExecutionSettings& getExecutionSettings() { + ExecutionSettings& getExecutionSettings() { return *executionSettings; } @@ -649,6 +763,7 @@ class HpccFpgaBenchmark { }; + /** * @brief Prints the execution settings to an output stream * @@ -657,16 +772,10 @@ class HpccFpgaBenchmark { * @param printedExecutionSettings The execution settings that have to be printed to the stream * @return std::ostream& The output stream after the execution settings are piped in */ -template -std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){ - std::string device_name; +template +std::ostream& operator<<(std::ostream& os, ExecutionSettings const& printedExecutionSettings){ + std::string device_name = printedExecutionSettings.getDeviceName(); os << std::left; - if (!printedExecutionSettings.programSettings->testOnly) { - printedExecutionSettings.device->getInfo(CL_DEVICE_NAME, &device_name); - } - else { - device_name = "TEST RUN: Not selected!"; - } for (auto k : printedExecutionSettings.programSettings->getSettingsMap()) { os << std::setw(2 * ENTRY_SPACE) << k.first << k.second << std::endl; } diff --git a/shared/include/hpcc_settings.hpp b/shared/include/hpcc_settings.hpp new file mode 100644 index 00000000..71c7a290 --- /dev/null +++ b/shared/include/hpcc_settings.hpp @@ -0,0 +1,239 @@ +#ifndef HPCC_BASE_SETTINGS_H_ +#define HPCC_BASE_SETTINGS_H_ + +#ifdef USE_OCL_HOST +#ifdef USE_DEPRECATED_HPP_HEADER +#include "CL/cl.hpp" +#else +#include OPENCL_HPP_HEADER +#endif +#else +#include "xrt/xrt_device.h" +#endif +#include "cxxopts.hpp" +#include "parameters.h" +#include "communication_types.hpp" + +#ifdef _USE_MPI_ +#include "mpi.h" +#endif + +#ifdef USE_ACCL +#include "accl.hpp" +#endif + +/** + * @brief Contains all classes and functions that are used as basis + * for all benchmarks. 
+ * + */ +namespace hpcc_base { + +/** + * @brief This class should be derived and extended for every benchmark. + * It is a pure data object containing the benchmark settings that are + * used to execute the benchmark kernel. + * + */ +class BaseSettings { + +public: + + /** + * @brief Number of times the kernel execution will be repeated + * + */ + uint numRepetitions; + + /** + * @brief Boolean showing if memory interleaving is used that is + * triggered from the host side (Intel specific) + * + */ + bool useMemoryInterleaving; + + /** + * @brief Boolean showing if the output data of the benchmark kernel + * should be validated or not + * + */ + bool skipValidation; + + /** + * @brief The default platform that should be used for execution. + * A number representing the index in the list of available platforms + * + */ + int defaultPlatform; + + std::string platformString; + + /** + * @brief The default device that should be used for execution. + * A number representing the index in the list of available devices + * + */ + int defaultDevice; + + /** + * @brief Path to the kernel file that is used for execution + * + */ + std::string kernelFileName; + + /** + * @brief Number of times the kernel is replicated + * + */ + uint kernelReplications; + + /** + * @brief Only test the given configuration. Do not execute the benchmarks + * + */ + bool testOnly; + + std::string dumpfilePath; + + /** + * @brief Type of inter-FPGA communication used + * + */ + CommunicationType communicationType; + +#ifdef USE_ACCL + /** + * @brief Use ACCL emulation constructor instead of hardware execution + */ + bool useAcclEmulation; + + /** + * @brief Used ACCL network stack + * + */ + ACCL::networkProtocol acclProtocol; + + /** + * @brief Size of the ACCL buffers in bytes + * + */ + uint acclBufferSize; + + /** + * @brief Number of ACCL buffers to use + * + */ + uint acclBufferCount; + + /** + * @brief Memory banks used to create ACCL receive buffers + */ + std::vector acclRecvBufferMemBanks; + + /** + * @brief Default bank for memory buffer created with ACCL driver + */ + int acclDefaultBank; +#endif + + /** + * @brief Construct a new Base Settings object + * + * @param results The resulting map from parsing the program input parameters + */ + BaseSettings(cxxopts::ParseResult &results); + + /** + * @brief Get a map of the settings. This map will be used to print the final configuration. 
+ * Derived classes should override it to add additional configuration options + * + * @return std::map + */ + virtual std::map getSettingsMap(); + +}; + +/** + * @brief Settings class that is containing the program settings together with + * additional information about the OpenCL runtime + * + * @tparam TSettings The program settings class that should be used (Must derive from BaseSettings) + */ +template ::value>::type> +class ExecutionSettings { +public: + + /** + * @brief Pointer to the additional program settings + * + */ + std::unique_ptr programSettings; + + /** + * @brief The OpenCL device that should be used for execution + * + */ + std::unique_ptr device; + + /** + * @brief The OpenCL context that should be used for execution + * + */ + std::unique_ptr context; + + /** + * @brief The OpenCL program that contains the benchmark kernel + * + */ + std::unique_ptr program; + + std::string + getDeviceName() const { + std::string device_name; + if (!programSettings->testOnly) { +#ifdef USE_OCL_HOST + device->getInfo(CL_DEVICE_NAME, &device_name); +#endif +#ifdef USE_XRT_HOST + device_name = device->template get_info(); +#endif + } else { + device_name = "TEST RUN: Not selected!"; + } + return device_name; + } + + /** + * @brief Construct a new Execution Settings object + * + * @param programSettings_ Pointer to an existing program settings object that is derived from BaseSettings + * @param device_ Used OpenCL device + * @param context_ Used OpenCL context + * @param program_ Used OpenCL program + */ + ExecutionSettings(std::unique_ptr programSettings_, std::unique_ptr device_, + std::unique_ptr context_, std::unique_ptr program_ + + ): + programSettings(std::move(programSettings_)), device(std::move(device_)), + context(std::move(context_)), program(std::move(program_)) + {} + + /** + * @brief Destroy the Execution Settings object. Used to specify the order the contained objects are destroyed + * to prevent segmentation faults during exit. + * + */ + ~ExecutionSettings() { + program = nullptr; + context = nullptr; + device = nullptr; + programSettings = nullptr; + } + +}; + +} + +#endif \ No newline at end of file diff --git a/shared/include/setup/fpga_setup.hpp b/shared/include/setup/fpga_setup.hpp index 0799900c..19ff3436 100644 --- a/shared/include/setup/fpga_setup.hpp +++ b/shared/include/setup/fpga_setup.hpp @@ -30,13 +30,14 @@ SOFTWARE. #include #include +#ifdef USE_OCL_HOST /* External libraries */ #ifdef USE_DEPRECATED_HPP_HEADER #include "CL/cl.hpp" #else #include OPENCL_HPP_HEADER #endif - +#endif /** Makro to convert the error integer representation to its string representation @@ -74,6 +75,7 @@ class FpgaSetupException : public std::exception std::string error_message; }; +#ifdef USE_OCL_HOST /** * @brief Exception that is thrown if the ASSERT_CL failed * @@ -134,13 +136,6 @@ Sets up the given FPGA with the kernel in the provided file. fpgaSetup(const cl::Context *context, std::vector deviceList, const std::string *usedKernelFile); -/** -Sets up the C++ environment by configuring std::cout and checking the clock -granularity using bm_helper::checktick() -*/ - void - setupEnvironmentAndClocks(); - /** Searches an selects an FPGA device using the CL library functions. @@ -157,7 +152,16 @@ choose a device. 
@return A list containing a single selected device */ std::unique_ptr - selectFPGADevice(int defaultPlatform, int defaultDevice); + selectFPGADevice(int defaultPlatform, int defaultDevice, std::string platformString); + + +#endif +/** +Sets up the C++ environment by configuring std::cout and checking the clock +granularity using bm_helper::checktick() +*/ + void + setupEnvironmentAndClocks(); } // namespace fpga_setup #endif // SRC_HOST_FPGA_SETUP_H_ diff --git a/shared/include/setup/fpga_setup_accl.hpp b/shared/include/setup/fpga_setup_accl.hpp new file mode 100644 index 00000000..fb7d85b3 --- /dev/null +++ b/shared/include/setup/fpga_setup_accl.hpp @@ -0,0 +1,85 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_FPGA_SETUP_ACCL_H_ +#define SRC_HOST_FPGA_SETUP_ACCL_H_ + +#include +#include +#include +#include +#include +#include +#include + +/* External libraries */ +#include "accl.hpp" +#include "xrt/xrt_device.h" +#include "hpcc_settings.hpp" + +namespace fpga_setup { + + +struct ACCLContext { + std::unique_ptr accl; + std::unique_ptr tx_buf_network; + std::unique_ptr rx_buf_network; +}; + + +static const std::map acclProtocolMap = { + {"UDP", ACCL::networkProtocol::UDP}, + {"TCP", ACCL::networkProtocol::TCP} +}; + +static std::string acclEnumToProtocolString(ACCL::networkProtocol p) { + for (const auto& entry: acclProtocolMap) { + if (entry.second == p) { + return entry.first; + } + } + throw std::runtime_error("ACCL network protocol could not be parsed to string!"); + return ""; +} + +static ACCL::networkProtocol acclProtocolStringToEnum(std::string string_representation) { + if (acclProtocolMap.count(string_representation)) { + return acclProtocolMap.at(string_representation); + } + else { + throw std::runtime_error("ACCL network protocol could not be parsed from string: " + string_representation); + } + return ACCL::networkProtocol::UDP; +} + +/** +Sets up the given FPGA with the kernel in the provided file.
+ +@param device The device used for the program +@param program The program used to find the ACCL kernels for hardware execution +@param programSettings Pass current program settings to configure ACCL according to user specification +@return The ACCL instance used for communication +*/ +ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program, + hpcc_base::BaseSettings &programSettings); + +} // namespace fpga_setup +#endif // SRC_HOST_FPGA_SETUP_H_ diff --git a/shared/include/setup/fpga_setup_xrt.hpp b/shared/include/setup/fpga_setup_xrt.hpp new file mode 100644 index 00000000..61c74f72 --- /dev/null +++ b/shared/include/setup/fpga_setup_xrt.hpp @@ -0,0 +1,66 @@ +/* +Copyright (c) 2022 Marius Meyer + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ +#ifndef SRC_HOST_FPGA_SETUP_XRT_H_ +#define SRC_HOST_FPGA_SETUP_XRT_H_ +#include +#include +#include +#include +#include +#include +#include + +/* External libraries */ +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +namespace fpga_setup { + +/** +Sets up the given FPGA with the kernel in the provided file. + +@param device The device used for the program +@param usedKernelFile The path to the kernel file +@return The UUID of the loaded xclbin +*/ + std::unique_ptr + fpgaSetup(xrt::device &device, + const std::string &usedKernelFile); + + +/** +Searches and selects an FPGA device using the XRT library functions. +If multiple platforms or devices are given, the user will be prompted to +choose a device. + +@param defaultDevice The index of the device that has to be used. If a + value < 0 is given, the device can be chosen + interactively + +@return the selected device +*/ + std::unique_ptr + selectFPGADevice(int defaultDevice); + +} // namespace fpga_setup +#endif // SRC_HOST_FPGA_SETUP_H_ diff --git a/shared/setup/fpga_setup.cpp b/shared/setup/fpga_setup.cpp index dd1ddd28..15798f19 100644 --- a/shared/setup/fpga_setup.cpp +++ b/shared/setup/fpga_setup.cpp @@ -28,6 +28,9 @@ FpgaSetupException::what() const noexcept return error_message.c_str(); } + +#ifdef USE_OCL_HOST + OpenClException::OpenClException(std::string error_name) : FpgaSetupException("An OpenCL error occured: " + error_name) {} @@ -101,7 +104,7 @@ Converts the reveived OpenCL error to a string CL_ERR_TO_STR(CL_INVALID_DEVICE_PARTITION_COUNT); default: - return "UNKNOWN ERROR CODE"; + return "UNKNOWN ERROR CODE: " + std::to_string(err); } } @@ -135,7 +138,6 @@ Sets up the given FPGA with the kernel in the provided file.
#ifdef _USE_MPI_ MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); #endif - if (world_rank == 0) { std::cout << HLINE; std::cout << "FPGA Setup:" << usedKernelFile->c_str() << std::endl; @@ -178,37 +180,6 @@ Sets up the given FPGA with the kernel in the provided file. return std::unique_ptr(new cl::Program(program)); } -/** -Sets up the C++ environment by configuring std::cout and checking the clock -granularity using bm_helper::checktick() -*/ - void - setupEnvironmentAndClocks() { - std::cout << std::setprecision(5) << std::scientific; - - int world_rank = 0; - -#ifdef _USE_MPI_ - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); -#endif - - if (world_rank == 0) { - std::cout << HLINE; - std::cout << "General setup:" << std::endl; - - // Check clock granularity and output result - std::cout << "C++ high resolution clock is used." << std::endl; - std::cout << "The clock precision seems to be " - << static_cast - (std::chrono::high_resolution_clock::period::num) / - std::chrono::high_resolution_clock::period::den * 10e9 - << "ns" << std::endl; - - std::cout << HLINE; - } - } - - /** Searches an selects an FPGA device using the CL library functions. If multiple platforms or devices are given, the user will be prompted to @@ -220,11 +191,14 @@ choose a device. @param defaultDevice The index of the device that has to be used. If a value < 0 is given, the device can be chosen interactively +@param platformString The platform string which should be chosen. + If it is empty, it will be ignored. If it is not empty, + but the string is not found an exception is thrown. @return A list containing a single selected device */ std::unique_ptr - selectFPGADevice(int defaultPlatform, int defaultDevice) { + selectFPGADevice(int defaultPlatform, int defaultDevice, std::string platformString) { // Integer used to store return codes of OpenCL library calls int err; @@ -242,7 +216,22 @@ choose a device. // Choose the target platform long unsigned int chosenPlatformId = 0; - if (defaultPlatform >= 0) { + if (platformString.size() > 0) { + // Platform string has highest priority + bool found = false; + for (int i = 0; i < platformList.size(); i++) { + if (platformList[i].getInfo() == platformString) { + chosenPlatformId = i; + found = true; + break; + } + } + if (!found) { + throw FpgaSetupException("Invalid platform string specified: " + platformString); + } + } + else if (defaultPlatform >= 0) { + // Otherwise, select platform by index if (defaultPlatform < static_cast(platformList.size())) { chosenPlatformId = defaultPlatform; } else { @@ -322,4 +311,36 @@ choose a device. return std::unique_ptr(new cl::Device(deviceList[chosenDeviceId])); } -} // namespace fpga_setup \ No newline at end of file + +#endif +/** +Sets up the C++ environment by configuring std::cout and checking the clock +granularity using bm_helper::checktick() +*/ + void + setupEnvironmentAndClocks() { + std::cout << std::setprecision(5) << std::scientific; + + int world_rank = 0; + +#ifdef _USE_MPI_ + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); +#endif + + if (world_rank == 0) { + std::cout << HLINE; + std::cout << "General setup:" << std::endl; + + // Check clock granularity and output result + std::cout << "C++ high resolution clock is used." 
<< std::endl; + std::cout << "The clock precision seems to be " + << static_cast + (std::chrono::high_resolution_clock::period::num) / + std::chrono::high_resolution_clock::period::den * 10e9 + << "ns" << std::endl; + + std::cout << HLINE; + } + } + +} // namespace fpga_setup diff --git a/shared/setup/fpga_setup_accl.cpp b/shared/setup/fpga_setup_accl.cpp new file mode 100644 index 00000000..fdaeaf7f --- /dev/null +++ b/shared/setup/fpga_setup_accl.cpp @@ -0,0 +1,144 @@ +// +// Created by Marius Meyer on 04.12.19. +// + +#include "setup/fpga_setup_accl.hpp" + +#include +#include +#include +#include +#include +#include + +/* External libraries */ +#include "experimental/xrt_ip.h" +#include "parameters.h" +#include +#include +#include "xrt/xrt_kernel.h" +#ifdef _USE_MPI_ +#include "mpi.h" +#endif + +using namespace vnx; + +namespace fpga_setup { + +void configure_vnx(CMAC &cmac, Networklayer &network_layer, + std::vector &ranks, int rank) { + if (ranks.size() > max_sockets_size) { + throw std::runtime_error("Too many ranks. VNX supports up to " + + std::to_string(max_sockets_size) + " sockets."); + } + + const auto link_status = cmac.link_status(); + + if (link_status.at("rx_status")) { + std::cout << "Link successful!" << std::endl; + } else { + std::cout << "No link found." << std::endl; + } + + if (!link_status.at("rx_status")) { + // Give time for other ranks to setup link. + std::this_thread::sleep_for(std::chrono::seconds(3)); + exit(1); + } + + MPI_Barrier(MPI_COMM_WORLD); + + network_layer.update_ip_address(ranks[rank].ip); + for (size_t i = 0; i < ranks.size(); ++i) { + if (i == static_cast(rank)) { + continue; + } + + network_layer.configure_socket(i, ranks[i].ip, ranks[i].port, + ranks[rank].port, true); + } + + network_layer.populate_socket_table(); + + std::this_thread::sleep_for(std::chrono::seconds(4)); + network_layer.arp_discovery(); + std::this_thread::sleep_for(std::chrono::seconds(2)); + network_layer.arp_discovery(); +} + +void configure_tcp(ACCL::BaseBuffer &tx_buf_network, ACCL::BaseBuffer &rx_buf_network, + xrt::kernel &network_krnl, std::vector &ranks, + int rank) { + std::cout << "Configure TCP Network Kernel" << std::endl; + tx_buf_network.sync_to_device(); + rx_buf_network.sync_to_device(); + + uint local_fpga_ip = ACCL::ip_encode(ranks[rank].ip); + std::cout << "rank: " << rank << " FPGA IP: " << std::hex << local_fpga_ip + << std::endl; + + network_krnl(local_fpga_ip, static_cast(rank), local_fpga_ip, + *(tx_buf_network.bo()), *(rx_buf_network.bo())); +} + +ACCLContext fpgaSetupACCL(xrt::device &device, xrt::uuid &program, + hpcc_base::BaseSettings &programSettings) { + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, ¤t_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, ¤t_size); + + std::vector ranks = {}; + for (int i = 0; i < current_size; ++i) { + // TODO: Replace the ip addresses and ports here for execution of real hardware? + ACCL::rank_t new_rank = {"10.10.10." 
+ std::to_string(i), 6000 + i, i, programSettings.acclBufferSize}; + ranks.emplace_back(new_rank); + } + + ACCLContext accl; + + if (!programSettings.useAcclEmulation) { + std::cout << "Create cclo ip" << std::endl; + auto cclo_ip = xrt::ip(device, program, "ccl_offload:{ccl_offload_" + std::to_string(0) + "}"); + std::cout << "Create hostctrl" << std::endl; + auto hostctrl_ip = xrt::kernel(device, program, "hostctrl:{hostctrl_" + std::to_string(0) + "}", + xrt::kernel::cu_access_mode::exclusive); + if (programSettings.acclProtocol == ACCL::networkProtocol::UDP) { + std::cout << "Create CMAC" << std::endl; + auto cmac = CMAC(xrt::ip(device, program, "cmac_0:{cmac_0}")); + std::cout << "Create Network Layer" << std::endl; + auto network_layer = Networklayer( + xrt::ip(device, program, "networklayer:{networklayer_0}")); + std::cout << "Configure VNX" << std::endl; + configure_vnx(cmac, network_layer, ranks, current_rank); + } + if (programSettings.acclProtocol == ACCL::networkProtocol::TCP) { + auto network_krnl = xrt::kernel(device, program, "network_krnl:{network_krnl_0}", + xrt::kernel::cu_access_mode::exclusive); + accl.tx_buf_network = std::unique_ptr(new ACCL::FPGABuffer( + 64 * 1024 * 1024, ACCL::dataType::int8, device, network_krnl.group_id(3))); + accl.rx_buf_network = std::unique_ptr(new ACCL::FPGABuffer( + 64 * 1024 * 1024, ACCL::dataType::int8, device, network_krnl.group_id(4))); + configure_tcp(*accl.tx_buf_network, *accl.rx_buf_network, network_krnl, ranks, current_rank); + } + std::cout << "Create ACCL" << std::endl; + accl.accl = std::unique_ptr( + new ACCL::ACCL(ranks, current_rank, device, cclo_ip, hostctrl_ip, programSettings.acclDefaultBank, + programSettings.acclRecvBufferMemBanks, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize, programSettings.acclBufferSize)); + } else { + // TODO: Add start port here. Currenty hardcoded! + accl.accl = std::unique_ptr( + new ACCL::ACCL(ranks, current_rank, 6000, device, programSettings.acclProtocol, programSettings.acclBufferCount, programSettings.acclBufferSize)); + } + + if (programSettings.acclProtocol == ACCL::networkProtocol::TCP) { + MPI_Barrier(MPI_COMM_WORLD); + accl.accl->open_port(); + MPI_Barrier(MPI_COMM_WORLD); + accl.accl->open_con(); + } + return accl; +} + +} // namespace fpga_setup diff --git a/shared/setup/fpga_setup_xrt.cpp b/shared/setup/fpga_setup_xrt.cpp new file mode 100644 index 00000000..f5d7ef32 --- /dev/null +++ b/shared/setup/fpga_setup_xrt.cpp @@ -0,0 +1,49 @@ +// +// Created by Marius Meyer on 04.12.19. +// + +#include "setup/fpga_setup_xrt.hpp" + +#include +#include +#include +#include +#include +#include + +/* External libraries */ +#include "parameters.h" + +#include "xrt.h" +#ifdef _USE_MPI_ +#include "mpi.h" +#endif + +namespace fpga_setup { + + std::unique_ptr + fpgaSetup(xrt::device &device, + const std::string &kernelFileName) { + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + + int current_size; + MPI_Comm_size(MPI_COMM_WORLD, & current_size); + + return std::unique_ptr(new xrt::uuid(device.load_xclbin(kernelFileName))); + } + + std::unique_ptr + selectFPGADevice(int defaultDevice) { + int current_device; + MPI_Comm_rank(MPI_COMM_WORLD, & current_device); + if (defaultDevice >= 0) { + current_device = defaultDevice; + } else { + //TODO Use xrt::system::enumerate_devices() in "experimental/xrt_system.h" for future XRT versions + // instead of hardcoded number of devices. 
+ current_device = current_device % 3; + } + return std::unique_ptr(new xrt::device(current_device)); + } +} // namespace fpga_setup diff --git a/shared/tests/CMakeLists.txt b/shared/tests/CMakeLists.txt index a4ea0a4d..5d4c441b 100644 --- a/shared/tests/CMakeLists.txt +++ b/shared/tests/CMakeLists.txt @@ -1,6 +1,14 @@ set(HPCC_BASE_TEST_SOURCES main.cpp hpcc_base_benchmark_test.cpp) +if (USE_ACCL) + set(ACCL_EMULATOR_DIR ${CMAKE_BINARY_DIR}/lib/accl-emulator CACHE STRING "Directory of ACCL emulator") + add_subdirectory(${extern_accl_SOURCE_DIR}/test/model/emulator ${ACCL_EMULATOR_DIR}) + if (CMAKE_BUILD_TYPE EQUAL "Debug") + set(ACCL_DEBUG Yes) + endif() +endif() + add_library(hpcc_fpga_base_test STATIC ${HPCC_BASE_TEST_SOURCES}) target_link_libraries(hpcc_fpga_base_test gtest gmock hpcc_fpga_base) target_include_directories(hpcc_fpga_base_test PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) @@ -11,6 +19,9 @@ if (INTELFPGAOPENCL_FOUND) elseif(Vitis_FOUND) target_include_directories(hpcc_fpga_base_test PUBLIC ${Vitis_INCLUDE_DIRS}) target_link_libraries(hpcc_fpga_base_test ${Vitis_LIBRARIES}) +if (USE_ACCL) + add_dependencies(hpcc_fpga_base_test cclo_emu) +endif() else() message(ERROR "No OpenCL header found on system!") endif() diff --git a/shared/tests/hpcc_base_benchmark_test.cpp b/shared/tests/hpcc_base_benchmark_test.cpp index a93a2a69..7fafd340 100644 --- a/shared/tests/hpcc_base_benchmark_test.cpp +++ b/shared/tests/hpcc_base_benchmark_test.cpp @@ -8,6 +8,7 @@ #include "test_program_settings.h" #include "gmock/gmock.h" #include "hpcc_benchmark.hpp" +#include "nlohmann/json.hpp" // Dirty GoogleTest and static library hack @@ -16,7 +17,8 @@ // and enable the included tests void use_hpcc_base_lib() {} -class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark { +template +class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark::type, typename std::tuple_element<1, T>::type, typename std::tuple_element<2, T>::type, int> { protected: @@ -35,24 +37,30 @@ class MinimalBenchmark : public hpcc_base::HpccFpgaBenchmark generateInputData() override { return returnInputData ? std::unique_ptr(new int) : std::unique_ptr(nullptr);} - std::unique_ptr - executeKernel(int &data) override { return returnExecuteKernel ? 
std::unique_ptr(new int) : std::unique_ptr(nullptr);} + void + executeKernel(int &data) override { return;} bool - validateOutputAndPrintError(int &data) override { return returnValidate;} + validateOutput(int &data) override { return returnValidate;} + + void + printError() override {} bool checkInputParameters() override { return configurationCheckSucceeds;} void - collectAndPrintResults(const int &output) override {} + collectResults() override {} - MinimalBenchmark() : HpccFpgaBenchmark(0, { nullptr}) {} + void + printResults() override {} -}; + MinimalBenchmark() : hpcc_base::HpccFpgaBenchmark::type, typename std::tuple_element<1, T>::type, typename std::tuple_element<2, T>::type, int>(0, { nullptr}) {} +}; -class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark { +template +class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark { protected: @@ -80,21 +88,27 @@ class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark(new int);} - std::unique_ptr + void executeKernel(int &data) override { if (!returnExecuteKernel) { throw fpga_setup::FpgaSetupException("Test execute kernel failed"); } executeKernelcalled++; - return std::unique_ptr(new int);} + return;} bool - validateOutputAndPrintError(int &data) override { + validateOutput(int &data) override { validateOutputcalled++; return returnValidate;} + + void + printError() override {} void - collectAndPrintResults(const int &output) override {} + collectResults() override {} + + void + printResults() override {} bool checkInputParameters() override { @@ -102,29 +116,66 @@ class SuccessBenchmark : public hpcc_base::HpccFpgaBenchmark::checkInputParameters(); + return hpcc_base::HpccFpgaBenchmark::checkInputParameters(); } } - SuccessBenchmark() : HpccFpgaBenchmark(0, { nullptr}) {} + SuccessBenchmark() : hpcc_base::HpccFpgaBenchmark(0, { nullptr}) {} }; +template class BaseHpccBenchmarkTest :public ::testing::Test { +using TDevice = typename std::tuple_element<0,T>::type; +using TContext = typename std::tuple_element<1,T>::type; +using TProgram = typename std::tuple_element<2,T>::type; + public: - std::unique_ptr bm; + std::unique_ptr> bm; BaseHpccBenchmarkTest() { - bm = std::unique_ptr(new SuccessBenchmark()); + bm = std::unique_ptr>(new SuccessBenchmark()); bm->setupBenchmark(global_argc, global_argv); } }; - -TEST_F(BaseHpccBenchmarkTest, SetupSucceedsForBenchmarkTest) { - bool success = bm->setupBenchmark(global_argc, global_argv); +template +class SetupTest : public ::testing::Test {}; + +#ifdef USE_OCL_HOST +typedef ::testing::Types> cl_types; +TYPED_TEST_SUITE( + BaseHpccBenchmarkTest, + cl_types); +TYPED_TEST_SUITE( + SetupTest, + cl_types); +#endif +#ifdef USE_XRT_HOST +#ifndef USE_ACCL +typedef ::testing::Types> xrt_types; +TYPED_TEST_SUITE( + BaseHpccBenchmarkTest, + xrt_types); +TYPED_TEST_SUITE( + SetupTest, + xrt_types); +#else +typedef ::testing::Types> accl_types; +TYPED_TEST_SUITE( + BaseHpccBenchmarkTest, + accl_types); +TYPED_TEST_SUITE( + SetupTest, + accl_types); +#endif +#endif + + +TYPED_TEST(BaseHpccBenchmarkTest, SetupSucceedsForBenchmarkTest) { + bool success = this->bm->setupBenchmark(global_argc, global_argv); EXPECT_TRUE(success); } @@ -132,97 +183,115 @@ TEST_F(BaseHpccBenchmarkTest, SetupSucceedsForBenchmarkTest) { /** * Checks if the testing flag works as expected */ -TEST_F(BaseHpccBenchmarkTest, AllExecutedWhenNotTestOnly) { - bm->getExecutionSettings().programSettings->testOnly = false; - bm->executeBenchmark(); - EXPECT_EQ(bm->validateOutputcalled, 1); - EXPECT_EQ(bm->executeKernelcalled, 
1); - EXPECT_EQ(bm->generateInputDatacalled, 1); +TYPED_TEST(BaseHpccBenchmarkTest, AllExecutedWhenNotTestOnly) { + this->bm->getExecutionSettings().programSettings->testOnly = false; + this->bm->executeBenchmark(); + EXPECT_EQ(this->bm->validateOutputcalled, 1); + EXPECT_EQ(this->bm->executeKernelcalled, 1); + EXPECT_EQ(this->bm->generateInputDatacalled, 1); } -TEST_F(BaseHpccBenchmarkTest, NothingExecutedWhenTestOnly) { - bm->getExecutionSettings().programSettings->testOnly = true; - bm->executeBenchmark(); - EXPECT_EQ(bm->validateOutputcalled, 0); - EXPECT_EQ(bm->executeKernelcalled, 0); - EXPECT_EQ(bm->generateInputDatacalled, 0); +TYPED_TEST(BaseHpccBenchmarkTest, NothingExecutedWhenTestOnly) { + this->bm->getExecutionSettings().programSettings->testOnly = true; + this->bm->executeBenchmark(); + EXPECT_EQ(this->bm->validateOutputcalled, 0); + EXPECT_EQ(this->bm->executeKernelcalled, 0); + EXPECT_EQ(this->bm->generateInputDatacalled, 0); } -TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenNotTestOnly) { - bm->getExecutionSettings().programSettings->testOnly = false; - EXPECT_TRUE(bm->executeBenchmark()); +TYPED_TEST(BaseHpccBenchmarkTest, ExecutionSuccessWhenNotTestOnly) { + this->bm->getExecutionSettings().programSettings->testOnly = false; + EXPECT_TRUE(this->bm->executeBenchmark()); } -TEST_F(BaseHpccBenchmarkTest, ExecutionFailsWhenTestOnlyAndSetupFails) { - bm->getExecutionSettings().programSettings->testOnly = true; - bm->forceSetupFail = true; - bm->setupBenchmark(global_argc, global_argv); - EXPECT_FALSE(bm->executeBenchmark()); +TYPED_TEST(BaseHpccBenchmarkTest, ExecutionFailsWhenTestOnlyAndSetupFails) { + this->bm->getExecutionSettings().programSettings->testOnly = true; + this->bm->forceSetupFail = true; + this->bm->setupBenchmark(global_argc, global_argv); + EXPECT_FALSE(this->bm->executeBenchmark()); } -TEST_F(BaseHpccBenchmarkTest, ExecutionSuccessWhenTestOnlyAndSetupSuccess) { - bm->getExecutionSettings().programSettings->testOnly = true; - EXPECT_TRUE(bm->executeBenchmark()); +TYPED_TEST(BaseHpccBenchmarkTest, ExecutionSuccessWhenTestOnlyAndSetupSuccess) { + this->bm->getExecutionSettings().programSettings->testOnly = true; + EXPECT_TRUE(this->bm->executeBenchmark()); +} + +/** + * Checks if non existing device leads to an error + */ +TYPED_TEST(BaseHpccBenchmarkTest, FindNonExistingDevice) { +#ifdef USE_OCL_HOST + ASSERT_THROW(fpga_setup::selectFPGADevice(this->bm->getExecutionSettings().programSettings->defaultPlatform, 100, this->bm->getExecutionSettings().programSettings->platformString).get(), fpga_setup::FpgaSetupException); +#else + ASSERT_THROW(fpga_setup::selectFPGADevice(100).get(), fpga_setup::FpgaSetupException); +#endif } /** * Checks if using default platform and device is successful */ -TEST_F(BaseHpccBenchmarkTest, SuccessUseDefaultPlatform) { - EXPECT_NE(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, bm->getExecutionSettings().programSettings->defaultDevice).get(), nullptr); +TYPED_TEST(BaseHpccBenchmarkTest, SuccessUseDefaultPlatformandDevice) { +#ifdef USE_OCL_HOST + EXPECT_NE(fpga_setup::selectFPGADevice(this->bm->getExecutionSettings().programSettings->defaultPlatform, this->bm->getExecutionSettings().programSettings->defaultDevice, this->bm->getExecutionSettings().programSettings->platformString).get(), nullptr); +#else + EXPECT_NE(fpga_setup::selectFPGADevice(this->bm->getExecutionSettings().programSettings->defaultDevice).get(), nullptr); +#endif } +#ifdef USE_OCL_HOST /** * Checks if non existing 
platform leads to an error */ -TEST_F(BaseHpccBenchmarkTest, FindNonExistingPlatform) { - ASSERT_THROW(fpga_setup::selectFPGADevice(100, bm->getExecutionSettings().programSettings->defaultDevice).get(), fpga_setup::FpgaSetupException); +TYPED_TEST(BaseHpccBenchmarkTest, FindNonExistingPlatform) { + ASSERT_THROW(fpga_setup::selectFPGADevice(100, this->bm->getExecutionSettings().programSettings->defaultDevice, this->bm->getExecutionSettings().programSettings->platformString).get(), fpga_setup::FpgaSetupException); } -/** - * Checks if non existing device leads to an error +/* + * Check if wrong platform string leads to an error */ -TEST_F(BaseHpccBenchmarkTest, FindNonExistingDevice) { - ASSERT_THROW(fpga_setup::selectFPGADevice(bm->getExecutionSettings().programSettings->defaultPlatform, 100).get(), fpga_setup::FpgaSetupException); +TYPED_TEST(BaseHpccBenchmarkTest, FindNonExistingPlatformString) { + ASSERT_THROW(fpga_setup::selectFPGADevice(this->bm->getExecutionSettings().programSettings->defaultPlatform, this->bm->getExecutionSettings().programSettings->defaultDevice, "This is not a platform").get(), fpga_setup::FpgaSetupException); } +#endif + /** * Execute kernel and validation is success */ -TEST_F(BaseHpccBenchmarkTest, SuccessfulExeAndVal) { - EXPECT_TRUE(bm->executeBenchmark()); +TYPED_TEST(BaseHpccBenchmarkTest, SuccessfulExeAndVal) { + EXPECT_TRUE(this->bm->executeBenchmark()); } /** * Execute kernel is success, but validation fails */ -TEST_F(BaseHpccBenchmarkTest, SuccessfulExeFailedVal) { - bm->returnValidate = false; - EXPECT_FALSE(bm->executeBenchmark()); +TYPED_TEST(BaseHpccBenchmarkTest, SuccessfulExeFailedVal) { + this->bm->returnValidate = false; + EXPECT_FALSE(this->bm->executeBenchmark()); } /** * Execute kernel fails */ -TEST_F(BaseHpccBenchmarkTest, FailedExe) { - bm->returnExecuteKernel = false; - EXPECT_FALSE(bm->executeBenchmark()); +TYPED_TEST(BaseHpccBenchmarkTest, FailedExe) { + this->bm->returnExecuteKernel = false; + EXPECT_FALSE(this->bm->executeBenchmark()); } /** * Benchmark Setup is successful with default data */ -TEST(SetupTest, BenchmarkSetupIsSuccessful) { - std::unique_ptr<MinimalBenchmark> bm = std::unique_ptr<MinimalBenchmark>(new MinimalBenchmark()); +TYPED_TEST(SetupTest, BenchmarkSetupIsSuccessful) { + std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>()); EXPECT_TRUE(bm->setupBenchmark(global_argc, global_argv)); } /** * Benchmark Setup fails because of failing configuration check */ -TEST(SetupTest, BenchmarkConfigurationFailsSetup) { - std::unique_ptr<MinimalBenchmark> bm = std::unique_ptr<MinimalBenchmark>(new MinimalBenchmark()); +TYPED_TEST(SetupTest, BenchmarkConfigurationFailsSetup) { + std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>()); bm->configurationCheckSucceeds = false; EXPECT_FALSE(bm->setupBenchmark(global_argc, global_argv)); } @@ -230,8 +299,8 @@ TEST(SetupTest, BenchmarkConfigurationFailsSetup) { /** * Benchmark Execution fails if configuration check failed */ -TEST(SetupTest, BenchmarkConfigurationFailsExecution) { - std::unique_ptr<MinimalBenchmark> bm = std::unique_ptr<MinimalBenchmark>(new MinimalBenchmark()); +TYPED_TEST(SetupTest, BenchmarkConfigurationFailsExecution) { + std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>()); bm->configurationCheckSucceeds = false; bm->setupBenchmark(global_argc, global_argv); EXPECT_FALSE(bm->executeBenchmark()); } @@ -240,8 +309,8 @@ TEST(SetupTest, BenchmarkConfigurationFailsExecution) { /** * Benchmark Setup fails with empty data */ -TEST(SetupTest, BenchmarkSetupFails) { - std::unique_ptr<MinimalBenchmark> bm = std::unique_ptr<MinimalBenchmark>(new MinimalBenchmark()); +TYPED_TEST(SetupTest,
BenchmarkSetupFails) { + std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>()); char** tmp_argv = new char*[2]; char* name_str = new char[5]; strcpy(name_str, "name"); @@ -251,3 +320,32 @@ TEST(SetupTest, BenchmarkSetupFails) { delete [] tmp_argv; delete [] name_str; } + +using json = nlohmann::json; + +/** + * + * Check if dump-json flag produces valid json output + */ +TYPED_TEST(SetupTest, BenchmarkJsonDump) { + std::unique_ptr<MinimalBenchmark<TypeParam>> bm = std::unique_ptr<MinimalBenchmark<TypeParam>>(new MinimalBenchmark<TypeParam>()); + bm->setupBenchmark(global_argc, global_argv); + bm->getExecutionSettings().programSettings->dumpfilePath = "out.json"; + bm->executeBenchmark(); + std::FILE *f = std::fopen("out.json", "r"); + EXPECT_NE(f, nullptr); + if (f != nullptr) { + // json::parse would crash if f was a nullptr + json j = json::parse(f); + // check if the expected keys are there + EXPECT_TRUE(j.contains("config_time")); + EXPECT_TRUE(j.contains("device")); + EXPECT_TRUE(j.contains("environment")); + EXPECT_TRUE(j.contains("git_commit")); + EXPECT_TRUE(j.contains("results")); + EXPECT_TRUE(j.contains("settings")); + EXPECT_TRUE(j.contains("timings")); + EXPECT_TRUE(j.contains("version")); + } +} +
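The rewritten tests above rely on GoogleTest typed tests: each device/context/program combination is packed into a std::tuple, registered with TYPED_TEST_SUITE, and the fixture unpacks the individual types again with std::tuple_element. As a minimal stand-alone sketch of that pattern (the FakeDevice/FakeContext/FakeProgram types are placeholders for illustration only and are not part of the benchmark suite):

```cpp
#include <memory>
#include <tuple>
#include "gtest/gtest.h"

// Placeholder types standing in for cl::Device/cl::Context/cl::Program
// or their XRT counterparts; purely illustrative.
struct FakeDevice {};
struct FakeContext {};
struct FakeProgram {};

// Fixture templated on a std::tuple, mirroring BaseHpccBenchmarkTest:
// the individual types are recovered with std::tuple_element.
template <class T>
class TupleFixture : public ::testing::Test {
protected:
    using TDevice  = typename std::tuple_element<0, T>::type;
    using TContext = typename std::tuple_element<1, T>::type;
    using TProgram = typename std::tuple_element<2, T>::type;

    std::unique_ptr<TDevice> device = std::make_unique<TDevice>();
};

// Register the tuples the suite should be instantiated for.
typedef ::testing::Types<std::tuple<FakeDevice, FakeContext, FakeProgram>> test_types;
TYPED_TEST_SUITE(TupleFixture, test_types);

// Inside a TYPED_TEST, TypeParam is the current tuple and fixture
// members must be reached through this->.
TYPED_TEST(TupleFixture, FixtureProvidesDevice) {
    EXPECT_NE(this->device.get(), nullptr);
}
```

Guarding the type lists with USE_OCL_HOST/USE_XRT_HOST/USE_ACCL, as the patch does, then selects which tuple the shared test suite is compiled against.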