Coalesced Buffer Communication #3153

Workflow file for this run

.github/workflows/ci-short.yml at c67a5fb

	# Short CI pipeline for pull requests: style checks, unit tests, and MPI
	# regression tests on the CUDA/host (A100) and HIP (navi1030) runners.
	name: CI short

	on: [pull_request]

	# Cancel "duplicated" workflows triggered by pushes to internal
	# branches with associated PRs.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
	  cancel-in-progress: true

	# Environment shared by every job (individual jobs may override entries).
	env:
	  CTEST_OUTPUT_ON_FAILURE: 1
	  CMAKE_BUILD_PARALLEL_LEVEL: 5  # num threads for build
	  MACHINE_CFG: cmake/machinecfg/CI.cmake
	  OMPI_MCA_mpi_common_cuda_event_max: 1000
	  # CUDA IPC within docker repeatedly seems to cause issues on the CI machine
	  OMPI_MCA_btl_smcuda_use_cuda_ipc: 0
	  # https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
	  OMPI_MCA_btl_vader_single_copy_mechanism: none

	jobs:
	  # Static checks: cpplint over src/example/tst and the copyright check target.
	  style:
	    runs-on: [self-hosted, A100]
	    container:
	      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
	      # map to local user id on CI machine to allow writing to build cache
	      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
	    steps:
	      - uses: actions/checkout@v3
	        with:
	          submodules: 'true'
	      - name: cpplint
	        run: python ./tst/style/cpplint.py --counting=detailed --recursive src example tst
	      - name: copyright
	        run: |
	          cmake -DCMAKE_CXX_FLAGS=-Werror -Bbuild-copyright-check
	          cmake --build build-copyright-check -t check-copyright
	      - uses: actions/upload-artifact@v3
	        with:
	          name: configure-log-style
	          path: build-copyright-check/CMakeFiles/CMakeOutput.log
	          retention-days: 3

	  # Debug build for each device variant, then every ctest test that is not
	  # labelled "performance" or "regression".
	  unit:
	    strategy:
	      matrix:
	        device: ['cuda', 'host']
	    runs-on: [self-hosted, A100]
	    container:
	      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
	      # map to local user id on CI machine to allow writing to build cache
	      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
	    steps:
	      - uses: actions/checkout@v3
	        with:
	          submodules: 'true'
	      - name: Configure
	        run: |
	          cmake -B build \
	            -DCMAKE_BUILD_TYPE=Debug \
	            -DMACHINE_VARIANT=${{ matrix.device }}-mpi
	      - name: Build
	        run: cmake --build build
	      - name: Test
	        run: |
	          cd build
	          # Pick GPU with most available memory
	          export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
	          ctest -LE 'performance|regression'
	      - uses: actions/upload-artifact@v3
	        with:
	          name: configure-log-unit-${{ matrix.device }}
	          path: build/CMakeFiles/CMakeOutput.log
	          retention-days: 3

	  # Release build for each device variant; runs selected MPI regression tests
	  # and, for the CUDA variant only, a check for hidden host/device copies.
	  integration:
	    strategy:
	      matrix:
	        device: ['cuda', 'host']
	    runs-on: [self-hosted, A100]
	    container:
	      image: ghcr.io/parthenon-hpc-lab/cuda11.6-mpi-hdf5-ascent
	      # map to local user id on CI machine to allow writing to build cache
	      options: --user 1001 --cap-add CAP_SYS_PTRACE --shm-size="8g" --ulimit memlock=134217728
	    steps:
	      - uses: actions/checkout@v3
	        with:
	          submodules: 'true'
	      - name: Configure
	        run: |
	          cmake -B build \
	            -DCMAKE_BUILD_TYPE=Release \
	            -DMACHINE_VARIANT=${{ matrix.device }}-mpi
	      # Test example with "variables" and output
	      - name: advection
	        run: |
	          cmake --build build -t advection-example
	          cd build
	          # Pick GPU with most available memory
	          export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
	          ctest -R regression_mpi_test:output_hdf5
	      # Test example with swarms
	      - name: particle-leapfrog
	        run: |
	          cmake --build build -t particle-leapfrog
	          cd build
	          # Pick GPU with most available memory
	          export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=memory.free,index --format=csv,nounits,noheader | sort -nr | head -1 | awk '{ print $NF }')
	          ctest -R regression_mpi_test:particle_leapfrog

	      # Now testing if there are no hidden memcopies between host and device.
	      # Using a static grid (i.e., not AMR) as additional transfers are expected
	      # during loadbalance and refinement, but not for a static grid. We also need to
	      # turn off sparse, since sparse results in additional transfers.
	      # Also delaying start as there are explicit copies during initialization, e.g.,
	      # when the Variable caches are created.
	      - name: host-device-copy
	        # The whole comparison must sit inside the expression. Writing
	        # `${{ matrix.device }} == 'cuda'` only interpolates the device name and
	        # leaves a non-empty string, which is always truthy, so the step would
	        # also run for the host variant.
	        if: ${{ matrix.device == 'cuda' }}
	        run: |
	          cd build
	          nsys profile --delay=5 --duration=5 --stats=true -s none -t cuda example/advection/advection-example \
	            -i ../tst/regression/test_suites/advection_performance/parthinput.advection_performance \
	            parthenon/mesh/nx1=128 parthenon/mesh/nx2=128 parthenon/mesh/nx3=128 \
	            parthenon/meshblock/nx1=64 parthenon/meshblock/nx2=64 parthenon/meshblock/nx3=64 \
	            parthenon/sparse/enable_sparse=false parthenon/time/nlim=200000 parthenon/time/tlim=200 2>&1 | tee profile.txt
	          # Any HtoD/DtoH entry in the captured profile output fails the step.
	          if grep HtoD profile.txt; then exit 1; fi
	          if grep DtoH profile.txt; then exit 1; fi

	      - uses: actions/upload-artifact@v3
	        with:
	          name: configure-log-integration-${{ matrix.device }}
	          path: |
	            build/CMakeFiles/CMakeOutput.log
	            build/profile.txt
	          retention-days: 3

	  # Release build of the hip-mpi variant with hipcc on the AMD GPU runner;
	  # runs the same two regression tests as the `integration` job.
	  integration-amdgpu:
	    runs-on: [self-hosted, navi1030]
	    container:
	      image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
	      # Map to local user id on CI machine to allow writing to build cache and
	      # forward device handles to access AMD GPU within container
	      options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
	    env:
	      CMAKE_GENERATOR: Ninja
	      CMAKE_BUILD_PARALLEL_LEVEL: 8  # num threads for build
	    steps:
	      - uses: actions/checkout@v3
	        with:
	          submodules: 'true'
	      - name: Configure
	        run: |
	          cmake -B build \
	            -DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \
	            -DCMAKE_BUILD_TYPE=Release \
	            -DMACHINE_VARIANT=hip-mpi \
	            -DCMAKE_CXX_COMPILER=hipcc
	      # Test example with "variables" and output
	      - name: advection
	        run: |
	          cmake --build build -t advection-example
	          cd build
	          ctest -R regression_mpi_test:output_hdf5
	      # Test example with swarms
	      - name: particle-leapfrog
	        run: |
	          cmake --build build -t particle-leapfrog
	          cd build
	          ctest -R regression_mpi_test:particle_leapfrog

	      - uses: actions/upload-artifact@v3
	        with:
	          name: configure-log-integration-amdgpu
	          path: |
	            build/CMakeFiles/CMakeOutput.log
	          retention-days: 3

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Coalesced Buffer Communication #3153

Workflow file

Coalesced Buffer Communication #3153

Jobs

Run details

Workflow file for this run