Merge branch 'ufs-community:develop' into develop_DTC

dtcenter · Nov 19, 2024 · 40ad7c3 · 40ad7c3
2 parents 9b1d127 + 39d1e5d
commit 40ad7c3
Show file tree

Hide file tree

Showing 434 changed files with 17,545 additions and 16,009 deletions.
diff --git a/.cicd/Jenkinsfile b/.cicd/Jenkinsfile
diff --git a/.cicd/scripts/disk_usage.sh b/.cicd/scripts/disk_usage.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
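+# Output a CSV report of disk usage on subdirs of some path
+# Usage:
+#    [JOB_NAME=<ci_job>] [BUILD_NUMBER=<n>] [SRW_COMPILER=<intel>] [SRW_PLATFORM=<machine>] disk_usage.sh path depth size outfile.csv
+#
+# args:
+#    directory=$1   directory to scan (default: current directory)
+#    depth=$2       depth passed to "du -d" (default: 1)
+#    size=$3        du size-unit flag letter, passed as -<size>, e.g. k or m (default: k)
+#    outfile=$4     CSV output file (default: ${workspace}-${SRW_COMPILER}-disk-usage${STAGE_NAME}.csv)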
+
+# Fall back to sensible defaults when the CI environment does not provide
+# these variables: current directory, short (or full) hostname, and the
+# literal placeholder "compiler".
+[[ -n ${WORKSPACE} ]] || WORKSPACE=$(pwd)
+[[ -n ${SRW_PLATFORM} ]] || SRW_PLATFORM=$(hostname -s 2>/dev/null) || SRW_PLATFORM=$(hostname 2>/dev/null)
+[[ -n ${SRW_COMPILER} ]] || SRW_COMPILER=compiler
+
+# Absolute path of the directory containing this script.
+script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)"
+
+# Use the Jenkins checkout at ${WORKSPACE}/${SRW_PLATFORM} when that directory
+# exists; otherwise fall back to the repository root relative to this script.
+# The test is -d rather than -n: the joined string always contains "/", so a
+# non-empty check would always succeed and the fallback could never run.
+declare workspace
+if [[ -d "${WORKSPACE}/${SRW_PLATFORM}" ]]; then
+    workspace="${WORKSPACE}/${SRW_PLATFORM}"
+else
+    workspace="$(cd -- "${script_dir}/../.." && pwd)"
+fi
+
+echo "STAGE_NAME=${STAGE_NAME}" # from pipeline
+# Report file: 4th argument if given, otherwise a name derived from the
+# workspace path, compiler and pipeline stage.
+outfile="${4:-${workspace}-${SRW_COMPILER}-disk-usage${STAGE_NAME}.csv}"
+
+# Print a CSV disk-usage report for a directory and its subdirectories.
+# Globals:   JOB_NAME, BUILD_NUMBER, SRW_PLATFORM, SRW_COMPILER (read)
+# Arguments: $1 - directory to scan (default: ${PWD})
+#            $2 - depth passed to "du -d" (default: 1)
+#            $3 - du size-unit flag letter, passed as -<size> (default: k)
+# Outputs:   a banner line, the CSV header, one CSV row per du entry sorted
+#            numerically from the inode-count column, then a blank line.
+# NOTE(review): per the GNU du manual, --time reports last-modification time,
+# although the header column is labelled "Access Time"; the label is left
+# unchanged so existing consumers of the CSV keep working.
+function disk_usage() {
+    local directory=${1:-${PWD}}
+    local depth=${2:-1}
+    local size=${3:-k}
+    echo "Disk usage: ${JOB_NAME:-ci}/${SRW_PLATFORM}/$(basename -- "${directory}")"
+    (
+    # Subshell keeps the caller's working directory untouched.
+    cd -- "${directory}" || exit 1
+    echo "Platform,Build,Owner,Group,Inodes,${size:-k}bytes,Access Time,Filename"
+    # du prints "<inode-count><TAB><path>". Split on the tab only and read
+    # with -r so paths containing spaces, globs or backslashes stay intact.
+    du -Px -d "${depth:-1}" --inode --exclude='./workspace' | \
+        while IFS=$'\t' read -r inode filename ; do
+            echo "${SRW_PLATFORM}-${SRW_COMPILER:-compiler},${JOB_NAME:-ci}/${BUILD_NUMBER:-0},$(stat -c '%U,%G' "${filename}"),${inode:-0},$(du -Px -s "-${size:-k}" --time "${filename}")" | tr '\t' ',' ;
+        done | sort -t, -k5 -n #-r
+    )
+    echo ""
+}
+
+# Quote the arguments so an empty or space-containing value keeps its position;
+# the function substitutes its own defaults for empty arguments.
+disk_usage "$1" "$2" "$3" | tee "${outfile}"
diff --git a/.cicd/scripts/qsub_srw_ftest.sh b/.cicd/scripts/qsub_srw_ftest.sh
@@ -9,7 +9,5 @@
 #PBS -l select=1:ncpus=24:mpiprocs=24:ompthreads=1
 #PBS -l walltime=00:30:00
 #PBS -V
-#PBS -o log_wrap.%j.log
-#PBS -e err_wrap.%j.err 
 
 bash ${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/srw_ftest.sh
diff --git a/.cicd/scripts/srw_build.sh b/.cicd/scripts/srw_build.sh
@@ -27,14 +27,15 @@ fi
 # Build and install
 cd ${workspace}/tests
 set +e
-./build.sh ${platform} ${SRW_COMPILER}
+/usr/bin/time -p -f '{\n  "cpu": "%P"\n, "memMax": "%M"\n, "mem": {"text": "%X", "data": "%D", "swaps": "%W", "context": "%c", "waits": "%w"}\n, "pagefaults": {"major": "%F", "minor": "%R"}\n, "filesystem": {"inputs": "%I", "outputs": "%O"}\n, "time": {"real": "%e", "user": "%U", "sys": "%S"}\n}' -o ${WORKSPACE}/${SRW_PLATFORM}-${SRW_COMPILER}-time-srw_build.json \
+    ./build.sh ${platform} ${SRW_COMPILER}
 build_exit=$?
 set -e
 cd -
 
 # Create combined log file for upload to s3
 build_dir="${workspace}/build_${SRW_COMPILER}"
 cat ${build_dir}/log.cmake ${build_dir}/log.make \
-    >${build_dir}/srw_build-${platform}-${SRW_COMPILER}.txt
+    >${build_dir}/srw_build-${SRW_PLATFORM}-${SRW_COMPILER}.txt
 
 exit $build_exit
diff --git a/.cicd/scripts/srw_ftest.sh b/.cicd/scripts/srw_ftest.sh
@@ -66,6 +66,9 @@ sed "s|^workflow:|workflow:\n  EXEC_SUBDIR: ${workspace}/install_${SRW_COMPILER}
 # Decrease forecast length since we are running all the steps
 sed "s|^  FCST_LEN_HRS: 12|  FCST_LEN_HRS: 6|g" -i ush/config.yaml
 
+# Update compiler 
+sed "s|^  COMPILER: intel|  COMPILER: ${SRW_COMPILER}|g" -i ush/config.yaml
+
 # DATA_LOCATION differs on each platform ... find it.
 export DATA_LOCATION=$(grep TEST_EXTRN_MDL_SOURCE_BASEDIR ${workspace}/ush/machine/${platform,,}.yaml | awk '{printf "%s", $2}')
 echo "DATA_LOCATION=${DATA_LOCATION}"
@@ -85,6 +88,8 @@ source etc/lmod-setup.sh ${platform,,}
 module use modulefiles
 module load build_${platform,,}_${SRW_COMPILER}
 module load wflow_${platform,,}
+# Deactivate conflicting conda env on GCP
+[[ "${SRW_PLATFORM}" =~ "gclusternoaa" ]] && conda deactivate
 
 [[ ${FORGIVE_CONDA} == true ]] && set +e +u    # Some platforms have incomplete python3 or conda support, but wouldn't necessarily block workflow tests
 conda activate srw_app

diff --git a/.cicd/scripts/srw_init.sh b/.cicd/scripts/srw_init.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+#
+# A unified init script for the SRW application. This script is expected to
+# fetch initial source for the SRW application for all supported platforms.
+#
+# Abort on errors and unset variables; trace every command.
+set -e -u -x
+
+# Absolute path of the directory containing this script.
+script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)"
+
+# Use the Jenkins checkout at ${WORKSPACE}/${SRW_PLATFORM} when that directory
+# exists; otherwise fall back to the repository root relative to this script.
+# The test is -d rather than -n: the joined string always contains "/", so a
+# non-empty check would always succeed and the fallback could never run.
+declare workspace
+if [[ -d "${WORKSPACE}/${SRW_PLATFORM}" ]]; then
+    workspace="${WORKSPACE}/${SRW_PLATFORM}"
+else
+    workspace="$(cd -- "${script_dir}/../.." && pwd)"
+fi
+
+# Map the Parallel Works cluster names (azclusternoaa*, gclusternoaa*,
+# pclusternoaa*) onto the single platform value 'noaacloud'; every other
+# platform name is used unchanged.
+declare platform
+case "${SRW_PLATFORM}" in
+    azclusternoaa*|gclusternoaa*|pclusternoaa*)
+        platform='noaacloud'
+        ;;
+    *)
+        platform="${SRW_PLATFORM}"
+        ;;
+esac
+
+# Fetch the external components from the repository root. Error-exit is
+# switched off around the checkout so its status can be captured and the
+# environment dump is still written; the status is returned at the very end.
+cd ${workspace}
+set +e
+# /usr/bin/time writes a JSON resource-usage record for the checkout to the -o file.
+/usr/bin/time -p -f '{\n  "cpu": "%P"\n, "memMax": "%M"\n, "mem": {"text": "%X", "data": "%D", "swaps": "%W", "context": "%c", "waits": "%w"}\n, "pagefaults": {"major": "%F", "minor": "%R"}\n, "filesystem": {"inputs": "%I", "outputs": "%O"}\n, "time": {"real": "%e", "user": "%U", "sys": "%S"}\n}' -o ${WORKSPACE}/${SRW_PLATFORM}-${SRW_COMPILER}-time-srw_init.json \
+    ./manage_externals/checkout_externals
+# Exit status of the timed checkout command line above.
+init_exit=$?
+echo "STAGE_NAME=${STAGE_NAME}"
+# Save a sorted snapshot of the environment next to the timing file.
+env | grep = | sort > ${WORKSPACE}/${SRW_PLATFORM}-${SRW_COMPILER}-env.txt
+set -e
+cd -
+
+# Propagate the checkout status to the caller.
+exit $init_exit
diff --git a/.cicd/scripts/srw_metric_example.sh → .cicd/scripts/srw_metric.sh b/.cicd/scripts/srw_metric_example.sh → .cicd/scripts/srw_metric.sh
@@ -56,17 +56,17 @@ else
 fi
 
 # Test directories
-we2e_experiment_base_dir="${workspace}/../expt_dirs/metric_test"
-we2e_test_dir="${workspace}/tests/WE2E"
+we2e_experiment_base_dir="${we2e_experiment_base_dir:=${workspace}/../expt_dirs/metric_test}"
+we2e_test_dir="${we2e_test_dir:=${workspace}/tests/WE2E}"
 we2e_test_name="grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0"
 
 pwd
 
 # Setup the build environment
 declare srw_compiler
 srw_compiler=${SRW_COMPILER} 
-source etc/lmod-setup.sh ${platform,,}
-module use modulefiles
+source ${workspace}/etc/lmod-setup.sh ${platform,,}
+module use ${workspace}/modulefiles
 module load build_${platform,,}_${srw_compiler}
 
 # Build srw
@@ -78,6 +78,8 @@ cd ${workspace}
 
 # Activate workflow environment
 module load wflow_${platform,,}
+# Deactivate conflicting conda env on GCP
+[[ "${SRW_PLATFORM}" =~ "gclusternoaa" ]] && conda deactivate
 
 [[ ${FORGIVE_CONDA} == true ]] && set +e +u    # Some platforms have incomplete python3 or conda support, but would not necessarily block workflow tests
 conda activate srw_app
@@ -98,17 +100,17 @@ if [[ ${RUN_STAT_ANLY_OPT} == true ]]; then
     # Clear out data
     rm -rf ${workspace}/Indy-Severe-Weather/
     # Check if metprd data exists locally otherwise get it from S3
-    TEST_EXTRN_MDL_SOURCE_BASEDIR=$(grep TEST_EXTRN_MDL_SOURCE_BASEDIR ${workspace}/ush/machine/${SRW_PLATFORM}.yaml | awk '{print $NF}')
-    if [[ ! -d $(dirname ${TEST_EXTRN_MDL_SOURCE_BASEDIR})/metprd/point_stat ]] ; then
+    TEST_EXTRN_MDL_SOURCE_BASEDIR=$(grep TEST_EXTRN_MDL_SOURCE_BASEDIR ${workspace}/ush/machine/${platform}.yaml | awk '{print $NF}')
+    if [[ -d $(dirname ${TEST_EXTRN_MDL_SOURCE_BASEDIR})/metprd/point_stat ]] ; then
         mkdir -p Indy-Severe-Weather/metprd/point_stat
         cp -rp $(dirname ${TEST_EXTRN_MDL_SOURCE_BASEDIR})/metprd/point_stat Indy-Severe-Weather/metprd
     elif [[ -f Indy-Severe-Weather.tgz ]]; then
         tar xvfz Indy-Severe-Weather.tgz 
     else
-        wget https://noaa-ufs-srw-pds.s3.amazonaws.com/sample_cases/release-public-v2.1.0/Indy-Severe-Weather.tgz
+        wget https://noaa-ufs-srw-pds.s3.amazonaws.com/experiment-user-cases/release-public-v2.1.0/METplus-vx-sample/Indy-Severe-Weather.tgz
         tar xvfz Indy-Severe-Weather.tgz
     fi
-    [[ -f skill-score.txt ]] && rm skill-score.txt
+    [[ -f ${SRW_PLATFORM,,}-${srw_compiler}-skill-score.txt ]] && rm ${SRW_PLATFORM,,}-${srw_compiler}-skill-score.txt
     # Skill score index is computed over several terms that are defined in parm/metplus/STATAnalysisConfig_skill_score. 
     # It is computed by aggregating the output from earlier runs of the Point-Stat and/or Grid-Stat tools over one or more cases.
     # In this example, skill score index is a weighted average of 4 skill scores of RMSE statistics for wind speed, dew point temperature, 
@@ -126,15 +128,15 @@ if [[ ${RUN_STAT_ANLY_OPT} == true ]]; then
        sed -i 's|--load("conda")|load("conda")|g' ${workspace}/modulefiles/tasks/${platform,,}/run_vx.local.lua
     fi
     # Run stat_analysis
-    stat_analysis -config parm/metplus/STATAnalysisConfig_skill_score -lookin ${workspace}/Indy-Severe-Weather/metprd/point_stat -v 2 -out skill-score.txt
+    stat_analysis -config parm/metplus/STATAnalysisConfig_skill_score -lookin ${workspace}/Indy-Severe-Weather/metprd/point_stat -v 2 -out ${SRW_PLATFORM,,}-${srw_compiler}-skill-score.txt
 
     # check the platform/compiler-specific skill-score output file
-    cat skill-score.txt
+    cat ${SRW_PLATFORM,,}-${srw_compiler}-skill-score.txt
 
     # get skill-score (SS_INDEX) and check if it is significantly smaller than 1.0
     # A value greater than 1.0 indicates that the forecast model outperforms the reference, 
     # while a value less than 1.0 indicates that the reference outperforms the forecast.
-    tmp_string=$( tail -2 skill-score.txt | head -1 )
+    tmp_string=$( tail -2 ${SRW_PLATFORM,,}-${srw_compiler}-skill-score.txt | head -1 )
     SS_INDEX=$(echo $tmp_string | awk -F " " '{print $NF}')
     echo "Skill Score: ${SS_INDEX}"
     if [[ ${SS_INDEX} < "0.700" ]]; then

diff --git a/.cicd/scripts/srw_test.sh b/.cicd/scripts/srw_test.sh
@@ -11,7 +11,7 @@ script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" > /dev/null 2>&1 && pwd)
 # Get repository root from Jenkins WORKSPACE variable if set, otherwise, set
 # relative to script directory.
 declare workspace
-if [[ -n "${WORKSPACE}/${SRW_PLATFORM}" ]]; then
+if [[ -d "${WORKSPACE}/${SRW_PLATFORM}" ]]; then
     workspace="${WORKSPACE}/${SRW_PLATFORM}"
 else
     workspace="$(cd -- "${script_dir}/../.." && pwd)"
@@ -26,21 +26,30 @@ else
 fi
 
 # Test directories
-we2e_experiment_base_dir="${workspace}/expt_dirs"
-we2e_test_dir="${workspace}/tests/WE2E"
+export we2e_experiment_base_dir="${workspace}/expt_dirs"
+export we2e_test_dir="${workspace}/tests/WE2E"
+
+# Clean any stale test logs
+rm -f ${workspace}/tests/WE2E/log.*
+rm -f ${we2e_experiment_base_dir}/*/log.generate_FV3LAM_wflow ${we2e_experiment_base_dir}/*/log/* WE2E_summary*txt
 
 # Run the end-to-end tests.
 if "${SRW_WE2E_COMPREHENSIVE_TESTS}"; then
-    test_type="comprehensive"
+    export test_type="comprehensive"
 else
-    test_type="coverage"
+    export test_type=${SRW_WE2E_SINGLE_TEST:-"coverage"}
+    if [[ "${test_type}" = skill-score ]]; then
+        export test_type="grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0"
+    fi
 fi
 
 cd ${we2e_test_dir}
 # Progress file
-progress_file="${workspace}/we2e_test_results-${platform}-${SRW_COMPILER}.txt"
-./setup_WE2E_tests.sh ${platform} ${SRW_PROJECT} ${SRW_COMPILER} ${test_type} \
-    --expt_basedir=${we2e_experiment_base_dir} | tee ${progress_file}
+progress_file="${workspace}/we2e_test_results-${SRW_PLATFORM}-${SRW_COMPILER}.txt"
+/usr/bin/time -p -f '{\n  "cpu": "%P"\n, "memMax": "%M"\n, "mem": {"text": "%X", "data": "%D", "swaps": "%W", "context": "%c", "waits": "%w"}\n, "pagefaults": {"major": "%F", "minor": "%R"}\n, "filesystem": {"inputs": "%I", "outputs": "%O"}\n, "time": {"real": "%e", "user": "%U", "sys": "%S"}\n}' -o ${WORKSPACE}/${SRW_PLATFORM}-${SRW_COMPILER}-time-srw_test.json \
+    ./setup_WE2E_tests.sh ${platform} ${SRW_PROJECT} ${SRW_COMPILER} ${test_type} \
+    --expt_basedir=${we2e_experiment_base_dir} | tee ${progress_file}; \
+    [[ -f ${we2e_experiment_base_dir}/grid_SUBCONUS_Ind_3km_ics_FV3GFS_lbcs_FV3GFS_suite_WoFS_v0/log.generate_FV3LAM_wflow ]] && ${workspace}/.cicd/scripts/srw_metric.sh run_stat_anly
 
 # Set exit code to number of failures
 set +e

diff --git a/.cicd/scripts/wrapper_srw_ftest.sh b/.cicd/scripts/wrapper_srw_ftest.sh
@@ -15,17 +15,17 @@ declare arg_1
 if [[ "${SRW_PLATFORM}" == cheyenne ]] || [[ "${SRW_PLATFORM}" == derecho ]]; then
     workflow_cmd=qsub
     arg_1=""
-    check_job="qstat -u ${USER} -r ${job_id}"
 else
     workflow_cmd=sbatch
     arg_1="--parsable"
-    check_job="squeue -u ${USER} -j ${job_id} --noheader"
 fi
 
 # Customize wrapper scripts
 if [[ "${SRW_PLATFORM}" == gaea ]]; then
     sed -i '15i #SBATCH --clusters=c5' ${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/${workflow_cmd}_srw_ftest.sh
     sed -i 's|qos=batch|qos=normal|g' ${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/${workflow_cmd}_srw_ftest.sh
+    sed -i 's|00:30:00|00:45:00|g' ${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/${workflow_cmd}_srw_ftest.sh
+    sed -i 's|${JOBSdir}/JREGIONAL_RUN_POST|$USHdir/load_modules_run_task.sh "gaea" "run_post" ${JOBSdir}/JREGIONAL_RUN_POST|g' ${WORKSPACE}/${SRW_PLATFORM}/ush/wrappers/run_post.sh
 fi
 
 if [[ "${SRW_PLATFORM}" == hera ]]; then
@@ -38,6 +38,10 @@ if [[ "${SRW_PLATFORM}" == jet ]]; then
     sed -i '15i #SBATCH --partition=xjet' ${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/${workflow_cmd}_srw_ftest.sh
 fi
 
+if [[ "${TASK_DEPTH}" == 0 ]] ; then
+    exit 0
+fi
+
 # Call job card and return job_id
 echo "Running: ${workflow_cmd} -A ${SRW_PROJECT} ${arg_1} ${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/${workflow_cmd}_srw_ftest.sh"
 job_id=$(${workflow_cmd} -A ${SRW_PROJECT} ${arg_1} ${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/${workflow_cmd}_srw_ftest.sh)
@@ -48,6 +52,11 @@ sleep 10
 # Check for job and exit when done
 while true
 do
+    # Poll with the scheduler that matches the submit command chosen earlier in
+    # this script: qsub (PBS) jobs are queried with qstat, sbatch (Slurm) jobs
+    # with squeue. Keying on workflow_cmd keeps every qsub platform consistent.
+    if [[ "${workflow_cmd}" == qsub ]]; then
+        check_job="qstat -u ${USER} -r ${job_id}"
+    else
+        check_job="squeue -u ${USER} -j ${job_id} --noheader"
+    fi
     job_id_info=$($check_job)
     if [ ! -z "$job_id_info" ]; then
         echo "Job is still running. Check again in two minutes"
@@ -58,7 +67,7 @@ do
         # Return exit code and check for results file first
         results_file="${WORKSPACE}/${SRW_PLATFORM}/functional_test_results_${SRW_PLATFORM}_${SRW_COMPILER}.txt"
         if [ ! -f "$results_file" ]; then
-            echo "Missing results file! \nexit 1"
+            echo -e "Missing results file! \nexit 1"
             exit 1
         fi
 

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -3,7 +3,7 @@
 
 # These owners will be the default owners for everything in the repo.
 #*       @defunkt
-*       @mkavulich @gsketefian @JeffBeck-NOAA @RatkoVasic-NOAA @BenjaminBlake-NOAA @ywangwof @chan-hoo @panll @christinaholtNOAA @christopherwharrop-noaa @danielabdi-noaa @mark-a-potts @jkbk2004 @willmayfield @dmwright526 @gspetro-NOAA @natalie-perlin @EdwardSnyder-NOAA @MichaelLueken
+*       @mkavulich @gsketefian @JeffBeck-NOAA @RatkoVasic-NOAA @BenjaminBlake-NOAA @ywangwof @chan-hoo @panll @christinaholtNOAA @christopherwharrop-noaa @danielabdi-noaa @mark-a-potts @jkbk2004 @willmayfield @dmwright526 @gspetro-NOAA @natalie-perlin @EdwardSnyder-NOAA @MichaelLueken @rickgrubin-noaa @BruceKropp-Raytheon
 
 # Order is important. The last matching pattern has the most precedence.
 # So if a pull request only touches javascript files, only these owners

diff --git a/.github/PULL_REQUEST_TEMPLATE b/.github/PULL_REQUEST_TEMPLATE
@@ -30,15 +30,13 @@
 <!-- Explicitly state what tests were run on these changes, or if any are still pending (for README or other text-only changes, just put "None required"). Make note of the compilers used, the platform/machine, and other relevant details as necessary. For more complicated changes, or those resulting in scientific changes, please be explicit! -->
 <!-- Add an X to check off a box. -->
 
-- [ ] hera.intel
-- [ ] orion.intel
-- [ ] hercules.intel
-- [ ] cheyenne.intel
-- [ ] cheyenne.gnu
 - [ ] derecho.intel
 - [ ] gaea.intel
-- [ ] gaeac5.intel
+- [ ] hera.gnu
+- [ ] hera.intel
+- [ ] hercules.intel
 - [ ] jet.intel
+- [ ] orion.intel
 - [ ] wcoss2.intel
 - [ ] NOAA Cloud (indicate which platform)
 - [ ] Jenkins

diff --git a/.github/scripts/check_tech_doc.sh b/.github/scripts/check_tech_doc.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# This script recreates technical documentation for the ush and tests/WE2E Python scripts
+# If the tech docs produced here do not match the branch's contents, the script will fail
+
+# Stop on the first failing command, including failures inside pipelines.
+set -eo pipefail
+
+# Install prerequisites
+pip install sphinx
+pip install sphinx-rtd-theme
+pip install sphinxcontrib-bibtex
+
+# Regenerate tech docs in ush and tests/WE2E based on current state of scripts in those directories.
+cd doc/TechDocs
+sphinx-apidoc -fM --remove-old -o ./ush ../../ush
+sphinx-apidoc -fM --remove-old -o ./tests/WE2E ../../tests/WE2E
+
+# Check for mismatch between what comes out of this action and what is in the PR.
+# Any output from "git status -s" means the regenerated files differ from the branch.
+status=$(git status -s)
+
+if [[ -n "${status}" ]]; then
+  # Quoted so each changed file stays on its own line and is not glob-expanded.
+  echo "${status}"
+  echo ""
+  echo "Please update your Technical Documentation RST files."
+  exit 1
+else
+  echo "Technical documentation is up-to-date."
+  exit 0
+fi
diff --git a/.github/workflows/doc_tests.yaml b/.github/workflows/doc_tests.yaml
@@ -0,0 +1,25 @@
+# Workflow that checks the generated technical documentation and builds the docs.
+name: Doc Tests
+# Runs on every push, on pull requests targeting develop or release/* branches,
+# and on manual dispatch.
+on:
+  push:
+  pull_request:
+    branches:
+      - develop
+      - 'release/*'
+  workflow_dispatch:
+
+# Every "run" step uses a login bash shell with -e and -o pipefail.
+defaults:
+  run:
+    shell: bash -leo pipefail {0}
+
+jobs:
+  doc_tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      # Runs the tech-doc check script from the repository.
+      - name: Check tech docs
+        run: .github/scripts/check_tech_doc.sh
+      # Builds the documentation with the "doc" make target in doc/.
+      - name: Build documentation
+        run: |
+          cd doc
+          make doc