Skip to content

Coalesced Buffer Communication #3153

Coalesced Buffer Communication

Coalesced Buffer Communication #3153

Workflow file for this run

name: CI short
on: [pull_request]
# Cancel "duplicated" workflows triggered by pushes to internal
# branches with associated PRs.
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
cancel-in-progress: true
env:
CTEST_OUTPUT_ON_FAILURE: 1
CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
MACHINE_CFG: cmake/machinecfg/CI.cmake
OMPI_MCA_mpi_common_cuda_event_max: 1000
# CUDA IPC within docker repeated seem to cause issue on the CI machine
OMPI_MCA_btl_smcuda_use_cuda_ipc: 0
# https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
OMPI_MCA_btl_vader_single_copy_mechanism: none
jobs:
style:
runs-on: [self-hosted, A100]
container:
image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
# map to local user id on CI machine to allow writing to build cache
options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
steps:
- uses: actions/checkout@v3
with:
submodules: 'true'
- name: cpplint
run: python ./tst/style/cpplint.py --counting=detailed --recursive src example tst
- name: copyright
run: |
cmake -DCMAKE_CXX_FLAGS=-Werror -Bbuild-copyright-check
cmake --build build-copyright-check -t check-copyright
- uses: actions/upload-artifact@v3
with:
name: configure-log-style
path: build-copyright-check/CMakeFiles/CMakeOutput.log
retention-days: 3
unit:
strategy:
matrix:
device: ['cuda', 'host']
runs-on: [self-hosted, A100]
container:
image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
# map to local user id on CI machine to allow writing to build cache
options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
steps:
- uses: actions/checkout@v3
with:
submodules: 'true'
- name: Configure
run: |
cmake -B build \
-DCMAKE_BUILD_TYPE=Debug \
-DMACHINE_VARIANT=${{ matrix.device }}-mpi
- name: Build
run: cmake --build build
- name: Test
run: |
cd build
# Pick GPU with most available memory
export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
ctest -LE 'performance|regression'
- uses: actions/upload-artifact@v3
with:
name: configure-log-unit-${{ matrix.device }}
path: build/CMakeFiles/CMakeOutput.log
retention-days: 3
integration:
strategy:
matrix:
device: ['cuda', 'host']
runs-on: [self-hosted, A100]
container:
image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
# map to local user id on CI machine to allow writing to build cache
options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
steps:
- uses: actions/checkout@v3
with:
submodules: 'true'
- name: Configure
run: |
cmake -B build \
-DCMAKE_BUILD_TYPE=Release \
-DMACHINE_VARIANT=${{ matrix.device }}-mpi
# Test example with "variables" and output
- name: advection
run: |
cmake --build build -t advection-example
cd build
# Pick GPU with most available memory
export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
ctest -R regression_mpi_test:output_hdf5
# Test example with swarms
- name: particle-leapfrog
run: |
cmake --build build -t particle-leapfrog
cd build
# Pick GPU with most available memory
export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
ctest -R regression_mpi_test:particle_leapfrog
# Now testing if there are no hidden memcopies between host and device.
# Using a static grid (i.e., not AMR) as additional transfers are expected
# during loadbalance and refinement, but not for a static grid. We also need to
# turn off sparse, since sparse results in additional transfers.
# Also delaying start as there are explicit copies during initialization, e.g.,
# when the Variable caches are created.
- name: host-device-copy
if: ${{ matrix.device }} == 'cuda'
run: |
cd build
nsys profile --delay=5 --duration=5 --stats=true -s none -t cuda example/advection/advection-example \
-i ../tst/regression/test_suites/advection_performance/parthinput.advection_performance \
parthenon/mesh/nx1=128 parthenon/mesh/nx2=128 parthenon/mesh/nx3=128 \
parthenon/meshblock/nx1=64 parthenon/meshblock/nx2=64 parthenon/meshblock/nx3=64 \
parthenon/sparse/enable_sparse=false parthenon/time/nlim=200000 parthenon/time/tlim=200 2>&1 | tee profile.txt
if grep HtoD profile.txt; then exit 1; fi
if grep DtoH profile.txt; then exit 1; fi
- uses: actions/upload-artifact@v3
with:
name: configure-log-integration-${{ matrix.device }}
path: |
build/CMakeFiles/CMakeOutput.log
build/profile.txt
retention-days: 3
integration-amdgpu:
runs-on: [self-hosted, navi1030]
container:
image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
# Map to local user id on CI machine to allow writing to build cache and
# forward device handles to access AMD GPU within container
options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
env:
CMAKE_GENERATOR: Ninja
CMAKE_BUILD_PARALLEL_LEVEL: 8 # num threads for build
steps:
- uses: actions/checkout@v3
with:
submodules: 'true'
- name: Configure
run: |
cmake -B build \
-DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DMACHINE_VARIANT=hip-mpi \
-DCMAKE_CXX_COMPILER=hipcc
# Test example with "variables" and output
- name: advection
run: |
cmake --build build -t advection-example
cd build
ctest -R regression_mpi_test:output_hdf5
# Test example with swarms
- name: particle-leapfrog
run: |
cmake --build build -t particle-leapfrog
cd build
ctest -R regression_mpi_test:particle_leapfrog
- uses: actions/upload-artifact@v3
with:
name: configure-log-integration-amdgpu
path: |
build/CMakeFiles/CMakeOutput.log
retention-days: 3