From 37a5f4efcc03761de494789f423616d86277726a Mon Sep 17 00:00:00 2001
From: Lauren Chilutti
Date: Tue, 24 Sep 2024 18:43:27 +0000
Subject: [PATCH] Adding Parallelworks CI runscripts to .github/.parallelworks

Updating workflows to use a new version of the Parallelworks CI runscripts.
Updating the cleanup workflow to clean up the weekly cron job CI in addition
to the PR CI rundirs.
---
 .github/.parallelworks/README.md               |  6 ++
 .github/.parallelworks/checkout.sh             | 59 +++++++++++
 .github/.parallelworks/compile.sh              | 84 ++++++++++++++++
 .github/.parallelworks/run_test.sh             | 79 +++++++++++++++
 .../workflows/SHiELD_parallelworks_intel.yml   | 98 ++++++++++---------
 .../workflows/daily_cleanup_parallelworks.yml  |  3 +-
 6 files changed, 282 insertions(+), 47 deletions(-)
 create mode 100644 .github/.parallelworks/README.md
 create mode 100755 .github/.parallelworks/checkout.sh
 create mode 100755 .github/.parallelworks/compile.sh
 create mode 100755 .github/.parallelworks/run_test.sh

diff --git a/.github/.parallelworks/README.md b/.github/.parallelworks/README.md
new file mode 100644
index 000000000..0d7524e17
--- /dev/null
+++ b/.github/.parallelworks/README.md
@@ -0,0 +1,6 @@
+# .parallelworks Directory
+
+The .parallelworks directory stores the CI scripts that reside on Parallelworks.
+These scripts are executed via the GitHub Actions Workflows in .github/workflows.
+
+On Parallelworks these scripts are installed at: /contrib/fv3/GFDL_atmos_cubed_sphere_CI
diff --git a/.github/.parallelworks/checkout.sh b/.github/.parallelworks/checkout.sh
new file mode 100755
index 000000000..6023defb1
--- /dev/null
+++ b/.github/.parallelworks/checkout.sh
@@ -0,0 +1,59 @@
+#!/bin/sh -xe
+
+##############################################################################
+## User set up variables
+## Root directory for CI
+dirRoot=/contrib/fv3
+## Intel version to be used
+intelVersion=2023.2.0
+##############################################################################
+## HPC-ME container
+container=/contrib/containers/noaa-intel-prototype_2023.09.25.sif
+container_env_script=/contrib/containers/load_spack_noaa-intel.sh
+##############################################################################
+
+#Parse Arguments
+branch=main
+commit=none
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    -b|--branch)
+      branch="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -h|--hash)
+      commit="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    *)
+      echo "unknown argument"
+      exit 1
+      ;;
+  esac
+done
+
+echo "branch is $branch"
+echo "commit is $commit"
+
+## Set up the directories
+testDir=${dirRoot}/${intelVersion}/GFDL_atmos_cubed_sphere/${branch}/${commit}
+logDir=${testDir}/log
+export MODULESHOME=/usr/share/lmod/lmod
+## create directories
+rm -rf ${testDir}
+mkdir -p ${logDir}
+# salloc commands to start up
+#2 tests layout 8,8 (16 nodes)
+#2 tests layout 4,8 (8 nodes)
+#9 tests layout 4,4 (18 nodes)
+#5 tests layout 4,1 (5 nodes)
+#17 tests layout 2,2 (17 nodes)
+#salloc --partition=p2 -N 64 -J ${branch} sleep 20m &
+
+## clone code
+cd ${testDir}
+git clone --recursive https://github.com/NOAA-GFDL/SHiELD_build.git && cd SHiELD_build && ./CHECKOUT_code |& tee ${logDir}/checkout.log
+## Check out the PR
+cd ${testDir}/SHiELD_SRC/GFDL_atmos_cubed_sphere && git fetch origin ${branch}:toMerge && git merge toMerge
diff --git a/.github/.parallelworks/compile.sh b/.github/.parallelworks/compile.sh
new file mode 100755
index 000000000..d9bca51a3
--- /dev/null
+++ b/.github/.parallelworks/compile.sh
@@ -0,0 +1,84 @@
+#!/bin/sh -xe
+
+##############################################################################
+## User set up variables
+## Root directory for CI
+dirRoot=/contrib/fv3
+## Intel version to be used
+intelVersion=2023.2.0
+##############################################################################
+## HPC-ME container
+container=/contrib/containers/noaa-intel-prototype_2023.09.25.sif
+container_env_script=/contrib/containers/load_spack_noaa-intel.sh
+##############################################################################
+
+#Parse Arguments
+branch=main
+commit=none
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    -b|--branch)
+      branch="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -h|--hash)
+      commit="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -c|--config)
+      config="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    --hydro)
+      hydro="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    --bit)
+      bit="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -m|--mode)
+      mode="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    *)
+      echo "unknown argument"
+      exit 1
+      ;;
+  esac
+done
+
+if [ -z $mode ] || [ -z $bit ] || [ -z $hydro ] || [ -z $config ]
+  then
+    echo "must specify config, hydro, bit, and mode options for compile"
+    exit 1
+fi
+
+echo "branch is $branch"
+echo "commit is $commit"
+echo "mode is $mode"
+echo "bit is $bit"
+echo "hydro is $hydro"
+echo "config is $config"
+
+if [ $hydro = "sw" ] && [ $config = "shield" ]
+  then
+    echo "this combination should not be tested"
+  else
+    ## Set up the directories
+    testDir=${dirRoot}/${intelVersion}/GFDL_atmos_cubed_sphere/${branch}/${commit}
+    logDir=${testDir}/log
+    # Set up build
+    cd ${testDir}/SHiELD_build/Build
+    #Define External Libs path
+    export EXTERNAL_LIBS=${dirRoot}/externallibs
+    # Build SHiELD
+    set -o pipefail
+    singularity exec -B /contrib ${container} ${container_env_script} "./COMPILE ${config} ${hydro} ${bit} ${mode} intel clean" |& tee ${logDir}/compile_${config}_${hydro}_${bit}_${mode}_intel.out
+fi
diff --git a/.github/.parallelworks/run_test.sh b/.github/.parallelworks/run_test.sh
new file mode 100755
index 000000000..b3037f1b0
--- /dev/null
+++ b/.github/.parallelworks/run_test.sh
@@ -0,0 +1,79 @@
+#!/bin/bash -xe
+ulimit -s unlimited
+##############################################################################
+## User set up variables
+## Root directory for CI
+dirRoot=/contrib/fv3
+## Intel version to be used
+intelVersion=2023.2.0
+##############################################################################
+## HPC-ME container
+container=/contrib/containers/noaa-intel-prototype_2023.09.25.sif
+container_env_script=/contrib/containers/load_spack_noaa-intel-mlong.sh
+
+#Parse Arguments
+branch=main
+commit=none
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    -b|--branch)
+      branch="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -h|--hash)
+      commit="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -t|--test)
+      testname="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    *)
+      echo "unknown argument"
+      exit 1
+      ;;
+  esac
+done
+
+if [ -z $testname ]
+  then
+    echo "must specify a test name with -t"
+    exit 1
+fi
+
+echo "branch is $branch"
+echo "commit is $commit"
+echo "test is $testname"
+
+## Set up the directories
+MODULESHOME=/usr/share/lmod/lmod
+testDir=${dirRoot}/${intelVersion}/GFDL_atmos_cubed_sphere/${branch}/${commit}
+logDir=${testDir}/log
+baselineDir=${dirRoot}/baselines/intel/${intelVersion}
+
+## Run the CI Test
+# Define the builddir, testscriptdir, and rundir; BUILDDIR is used by the test scripts
+# Set the BUILDDIR for the test script to use
+export BUILDDIR="${testDir}/SHiELD_build"
+testscriptDir=${BUILDDIR}/RTS/CI
+runDir=${BUILDDIR}/CI/BATCH-CI
+
+# Run CI test scripts
+cd ${testscriptDir}
+set -o pipefail
+# Execute the test piping output to log file
+./${testname} " --partition=compute --mpi=pmi2 --job-name=${commit}_${testname} singularity exec -B /contrib -B /apps ${container} ${container_env_script}" |& tee ${logDir}/run_${testname}.log
+
+## Compare Restarts to Baseline
+source $MODULESHOME/init/sh
+export MODULEPATH=/mnt/shared/manual_modules:/usr/share/modulefiles/Linux:/usr/share/modulefiles/Core:/usr/share/lmod/lmod/modulefiles/Core:/apps/modules/modulefiles:/apps/modules/modulefamilies/intel
+module load intel/2022.1.2
+module load netcdf
+module load nccmp
+for resFile in `ls ${baselineDir}/${testname}`
+do
+  nccmp -d ${baselineDir}/${testname}/${resFile} ${runDir}/${testname}/RESTART/${resFile}
+done
diff --git a/.github/workflows/SHiELD_parallelworks_intel.yml b/.github/workflows/SHiELD_parallelworks_intel.yml
index df1c603c6..77370bf1f 100644
--- a/.github/workflows/SHiELD_parallelworks_intel.yml
+++ b/.github/workflows/SHiELD_parallelworks_intel.yml
@@ -33,8 +33,8 @@ jobs:
       # so this salloc will prompt 46 nodes to startup and stay active for 20 min
       # this is enough nodes for the first 17 tests to run in parallel, and we
       # have 17 runners configured.
-      - run: salloc --partition=p2 -N 46 -J $GITHUB_SHA sleep 20m &
-      - run: /contrib/fv3/GFDL_atmos_cubed_sphere_CI/checkout.sh $GITHUB_REF $GITHUB_SHA
+      - run: salloc --partition=compute -N 46 -J $GITHUB_SHA sleep 20m &
+      - run: /contrib/fv3/GFDL_atmos_cubed_sphere_CI/checkout.sh -b $GITHUB_REF -h $GITHUB_SHA
 
   build:
     if: github.repository == 'NOAA-GFDL/GFDL_atmos_cubed_sphere'
@@ -43,15 +43,21 @@ jobs:
     needs: [checkout]
     strategy:
       fail-fast: true
-      max-parallel: 3
+      max-parallel: 17
       matrix:
-        runpath: [/contrib/fv3/GFDL_atmos_cubed_sphere_CI/]
-        runscript: [swcompile.sh, nhcompile.sh, hydrocompile.sh]
+        runscript: [/contrib/fv3/GFDL_atmos_cubed_sphere_CI/compile.sh]
+        config: [solo]
+        hydro: [sw, nh, hydro]
+        bit: [64bit]
+        mode: [repro]
     steps:
       - env:
-          RUNPATH: ${{ matrix.runpath }}
           RUNSCRIPT: ${{ matrix.runscript }}
-        run: $RUNPATH/$RUNSCRIPT $GITHUB_REF $GITHUB_SHA
+          CONFIG: ${{ matrix.config }}
+          HYDRO: ${{ matrix.hydro }}
+          BIT: ${{ matrix.bit }}
+          MODE: ${{ matrix.mode }}
+        run: $RUNSCRIPT -b $GITHUB_REF -h $GITHUB_SHA -c $CONFIG --hydro $HYDRO --bit $BIT -m $MODE
 
   test:
     if: github.repository == 'NOAA-GFDL/GFDL_atmos_cubed_sphere'
@@ -62,55 +68,55 @@ jobs:
       fail-fast: false
       max-parallel: 17
       matrix:
-        runpath: [/contrib/fv3/GFDL_atmos_cubed_sphere_CI/]
-        runscript:
+        runscript: [/contrib/fv3/GFDL_atmos_cubed_sphere_CI/run_test.sh]
+        argument:
           # These are placed in order of largest to smallest jobs
           #layout 8,8 needs 8 nodes on dvcimultiintel cluster
-          - C512r20.solo.superC.sh
-          - C768.sw.BTwave.sh
+          - C512r20.solo.superC
+          - C768.sw.BTwave
           #layout 4,8 needs 4 nodes on dvcimultiintel cluster
-          - C256r20.solo.superC.sh
-          - C384.sw.BLvortex.sh
+          - C256r20.solo.superC
+          - C384.sw.BLvortex
           #layout 4,4 needs 2 nodes on dvcimultiintel cluster
-          - C128r20.solo.superC.sh
-          - C128r3.solo.TC.d1.sh
-          - C128r3.solo.TC.h6.sh
-          - C128r3.solo.TC.sh
-          - C128r3.solo.TC.tr8.sh
-          - C192.sw.BLvortex.sh
-          - C192.sw.BTwave.sh
-          - C192.sw.modon.sh
-          - C384.sw.BTwave.sh
+          - C128r20.solo.superC
+          - C128r3.solo.TC.d1
+          - C128r3.solo.TC.h6
+          - C128r3.solo.TC
+          - C128r3.solo.TC.tr8
+          - C192.sw.BLvortex
+          - C192.sw.BTwave
+          - C192.sw.modon
+          - C384.sw.BTwave
           #layout 4,1 and 2,2 need 1 node on dvcimultiintel cluster
-          - C96.solo.BCdry.hyd.sh
-          - C96.solo.BCdry.sh
-          - C96.solo.BCmoist.hyd.d3.sh
-          - C96.solo.BCmoist.hyd.sh
-          - C96.solo.BCmoist.nhK.sh
-          - C96.solo.BCmoist.sh
-          - C96.solo.mtn_rest.hyd.diff2.sh
-          - C96.solo.mtn_rest.hyd.sh
-          - C96.solo.mtn_rest.nonmono.diff2.sh
-          - C96.solo.mtn_rest.sh
-          - C96.sw.BLvortex.sh
-          - C96.sw.BTwave.sh
-          - C96.sw.modon.sh
-          - C96.sw.RHwave.sh
-          - d96_1k.solo.mtn_rest_shear.olddamp.sh
-          - d96_1k.solo.mtn_rest_shear.sh
-          - d96_1k.solo.mtn_schar.mono.sh
-          - d96_1k.solo.mtn_schar.sh
-          - d96_2k.solo.bubble.n0.sh
-          - d96_2k.solo.bubble.nhK.sh
-          - d96_2k.solo.bubble.sh
-          - d96_500m.solo.mtn_schar.sh
+          - C96.solo.BCdry.hyd
+          - C96.solo.BCdry
+          - C96.solo.BCmoist.hyd.d3
+          - C96.solo.BCmoist.hyd
+          - C96.solo.BCmoist.nhK
+          - C96.solo.BCmoist
+          - C96.solo.mtn_rest.hyd.diff2
+          - C96.solo.mtn_rest.hyd
+          - C96.solo.mtn_rest.nonmono.diff2
+          - C96.solo.mtn_rest
+          - C96.sw.BLvortex
+          - C96.sw.BTwave
+          - C96.sw.modon
+          - C96.sw.RHwave
+          - d96_1k.solo.mtn_rest_shear.olddamp
+          - d96_1k.solo.mtn_rest_shear
+          - d96_1k.solo.mtn_schar.mono
+          - d96_1k.solo.mtn_schar
+          - d96_2k.solo.bubble.n0
+          - d96_2k.solo.bubble.nhK
+          - d96_2k.solo.bubble
+          - d96_500m.solo.mtn_schar
     steps:
       # This will end the slurm job started in the checkout job
       - run: scancel -n $GITHUB_SHA
       - env:
-          RUNPATH: ${{ matrix.runpath }}
           RUNSCRIPT: ${{ matrix.runscript }}
-        run: $RUNPATH/$RUNSCRIPT $GITHUB_REF $GITHUB_SHA
+          ARG1: ${{ matrix.argument }}
+        run: $RUNSCRIPT -t $ARG1 -b $GITHUB_REF -h $GITHUB_SHA
   shutdown:
     if: always() && github.repository == 'NOAA-GFDL/GFDL_atmos_cubed_sphere'
     runs-on: [gfdlacsciintel]
diff --git a/.github/workflows/daily_cleanup_parallelworks.yml b/.github/workflows/daily_cleanup_parallelworks.yml
index 82550789b..ba37ed71c 100644
--- a/.github/workflows/daily_cleanup_parallelworks.yml
+++ b/.github/workflows/daily_cleanup_parallelworks.yml
@@ -1,6 +1,6 @@
 name: Old Build Cleanup
 
-# This GitHub Action Workflow is runing on the GFDL_ACS_CIINTEL cluster
+# This GitHub Action Workflow is running on the gclustercigfdlacs cluster
 # This will delete all build directories older than 30 days
 # Build directories are on the cloud at /contrib/fv3/2023.2.0
 
@@ -16,3 +16,4 @@ jobs:
     name: Delete Builds
     steps:
      - run: find /contrib/fv3/2023.2.0/GFDL_atmos_cubed_sphere/refs/pull -maxdepth 1 -mindepth 1 -mtime +30 -type d -print -exec rm -rf "{}" \;
+      - run: find /contrib/fv3/2023.2.0/GFDL_atmos_cubed_sphere/refs/heads -maxdepth 1 -mindepth 1 -mtime +30 -type d -print -exec rm -rf "{}" \;
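
For reference, below is a minimal sketch of how the three Parallelworks scripts added in this patch might be driven by hand on the cluster, mirroring the calls the workflow jobs make above. The branch ref, commit hash, and chosen test case are placeholder values; the flag names, install path, and matrix values (solo, sw/nh/hydro, 64bit, repro) come from the scripts and workflow in this patch. Running the test step also assumes a Slurm allocation is available, which the workflow obtains with salloc in its checkout job.

  #!/bin/bash
  # Hypothetical values standing in for $GITHUB_REF and $GITHUB_SHA
  branch=refs/pull/123/merge
  commit=0123456789abcdef0123456789abcdef01234567
  scripts=/contrib/fv3/GFDL_atmos_cubed_sphere_CI

  # Clone SHiELD_build and merge the PR branch into GFDL_atmos_cubed_sphere
  ${scripts}/checkout.sh -b ${branch} -h ${commit}

  # Build the solo shallow-water configuration; --hydro also accepts nh or hydro
  ${scripts}/compile.sh -b ${branch} -h ${commit} -c solo --hydro sw --bit 64bit -m repro

  # Run one shallow-water CI case; the script then compares its restart files
  # against the stored baselines with nccmp
  ${scripts}/run_test.sh -b ${branch} -h ${commit} -t C96.sw.BTwave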