Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[develop] update Gaea modulefile #836

Merged
merged 21 commits into from
Aug 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
cd3778f
update Gaea modulefile
Jun 15, 2023
ed68f32
Merge branch 'ufs-community:develop' into update-gaea-stack
natalie-perlin Jul 7, 2023
dda21d4
Merge branch 'ufs-community:develop' into update-gaea-stack
natalie-perlin Jul 10, 2023
61cce4b
updated modulefiles and scripts for Gaea c3,c4
Jul 11, 2023
d63cc30
merge with updated branch
Jul 11, 2023
e54c8f5
load alps module for Gaea
Jul 11, 2023
e65cba6
updated version of ncdiag/1.1.1 for GSI build
Jul 11, 2023
adec3be
Updating modulefiles for Gaea
Jul 22, 2023
41f3e73
Merge branch 'ufs-community:develop' into update-gaea-stack
natalie-perlin Jul 22, 2023
614c5f2
Update the update-gaea-stack branch with the recent changes in develop
Jul 22, 2023
24ffba5
updates for allowing use of met, metplus modules
Jul 22, 2023
88ed69a
update met, metplus versions
Jul 22, 2023
82a915f
Merge branch 'ufs-community:develop' into update-gaea-stack
natalie-perlin Aug 1, 2023
6231156
Updates for Gaea to enable MET verification tasks
Aug 4, 2023
5f81ccc
Machine file update for Gaea C4
Aug 10, 2023
e13627b
Updates from the recent develop branch merged into update-gaea-stack
Aug 10, 2023
a300700
Reactivate Gaea pipelines in Jenkinsfile and change from Jet to Jet-E…
MichaelLueken Aug 11, 2023
747890b
Re-enable Functional Workflow Task Tests for Gaea.
MichaelLueken Aug 11, 2023
8c9b560
unload standard python in wflow_gaea.lua on compute nodes
Aug 14, 2023
60fcdfa
Merge remote-tracking branch 'origin2/update-gaea-stack' into update-…
Aug 14, 2023
77aa6e3
Remove module unload python from .cicd/scripts/srw_ftest.sh.
MichaelLueken Aug 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 5 additions & 9 deletions .cicd/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,10 @@ pipeline {
parameters {
// Allow job runner to filter based on platform
// Use the line below to enable all PW clusters
// choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaea', 'hera', 'jet', 'orion', 'pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1'], description: 'Specify the platform(s) to use')
// choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaea', 'hera', 'jet-epic', 'orion', 'pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1'], description: 'Specify the platform(s) to use')
// Use the line below to enable the PW AWS cluster
// choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaea', 'hera', 'jet', 'orion', 'pclusternoaav2use1'], description: 'Specify the platform(s) to use')
// Use the line below to re-enable Gaea
// choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaea', 'hera', 'jet', 'orion'], description: 'Specify the platform(s) to use')
choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'hera', 'jet', 'orion'], description: 'Specify the platform(s) to use')
// choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaea', 'hera', 'jet-epic', 'orion', 'pclusternoaav2use1'], description: 'Specify the platform(s) to use')
choice(name: 'SRW_PLATFORM_FILTER', choices: ['all', 'cheyenne', 'gaea', 'hera', 'jet-epic', 'orion'], description: 'Specify the platform(s) to use')
// Allow job runner to filter based on compiler
choice(name: 'SRW_COMPILER_FILTER', choices: ['all', 'gnu', 'intel'], description: 'Specify the compiler(s) to use to build')
// Uncomment the following line to re-enable comprehensive tests
Expand Down Expand Up @@ -78,8 +76,7 @@ pipeline {
axes {
axis {
name 'SRW_PLATFORM'
// values 'cheyenne', 'gaea', 'hera', 'jet', 'orion' //, 'pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1'
values 'cheyenne', 'hera', 'jet', 'orion' //, 'pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1'
values 'cheyenne', 'gaea', 'hera', 'jet-epic', 'orion' //, 'pclusternoaav2use1', 'azclusternoaav2eus1', 'gclusternoaav2usc1'
}

axis {
Expand All @@ -93,8 +90,7 @@ pipeline {
exclude {
axis {
name 'SRW_PLATFORM'
// values 'gaea', 'jet', 'orion' //, 'pclusternoaav2use1' , 'azclusternoaav2eus1', 'gclusternoaav2usc1'
values 'jet', 'orion' //, 'pclusternoaav2use1' , 'azclusternoaav2eus1', 'gclusternoaav2usc1'
values 'gaea', 'jet-epic', 'orion' //, 'pclusternoaav2use1' , 'azclusternoaav2eus1', 'gclusternoaav2usc1'
}

axis {
Expand Down
4 changes: 4 additions & 0 deletions .cicd/scripts/srw_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ else
platform="${SRW_PLATFORM}"
fi

# SRW_PLATFORM may carry the value 'jet-epic'; in that case use the plain
# name 'jet' for ${platform} in the rest of this script.
# NOTE(review): presumably 'jet-epic' is only the CI node label and 'jet' is
# the machine name the build expects -- confirm against the Jenkinsfile.
if [[ "${SRW_PLATFORM}" = jet-epic ]]; then
platform='jet'
fi

# Build and install
cd ${workspace}/tests
set +e
Expand Down
25 changes: 12 additions & 13 deletions .cicd/scripts/srw_ftest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ else
platform="${SRW_PLATFORM}"
fi

# SRW_PLATFORM may carry the value 'jet-epic'; in that case use the plain
# name 'jet' for ${platform}. Later in this script ${platform,,} selects the
# ush/machine/<platform>.yaml file, so it must match a machine file name.
if [[ "${SRW_PLATFORM}" = jet-epic ]]; then
platform='jet'
fi

# Test directories
we2e_experiment_base_dir="${workspace}/expt_dirs"
we2e_test_dir="${workspace}/tests/WE2E"
Expand All @@ -64,7 +68,7 @@ sed "s|^workflow:|workflow:\n EXPT_BASEDIR: ${workspace}/expt_dirs|1" -i ush/co
sed "s|^workflow:|workflow:\n EXEC_SUBDIR: ${workspace}/install_${SRW_COMPILER}/exec|1" -i ush/config.yaml

# DATA_LOCATION differs on each platform ... find it.
export DATA_LOCATION=$(grep TEST_EXTRN_MDL_SOURCE_BASEDIR ${workspace}/ush/machine/${SRW_PLATFORM,,}.yaml | awk '{printf "%s", $2}')
export DATA_LOCATION=$(grep TEST_EXTRN_MDL_SOURCE_BASEDIR ${workspace}/ush/machine/${platform,,}.yaml | awk '{printf "%s", $2}')
echo "DATA_LOCATION=${DATA_LOCATION}"

# Configure a default test ...
Expand Down Expand Up @@ -129,18 +133,13 @@ rm -f ${results_file}
status=0

# Limit to machines that are fully ready
deny_machines=( gaea )
if [[ ${deny_machines[@]} =~ ${platform,,} ]] ; then
echo "# Deny ${platform} - incomplete configuration." | tee -a ${results_file}
else
echo "# Try ${platform} with the first few simple SRW tasks ..." | tee -a ${results_file}
for task in ${TASKS[@]:0:${TASK_DEPTH}} ; do
echo -n "./$task.sh ... "
./$task.sh > $task-log.txt 2>&1 && echo "COMPLETE" || echo "FAIL rc=$(( status+=$? ))"
# stop at the first sign of trouble ...
[[ 0 != ${status} ]] && echo "$task: FAIL" >> ${results_file} && break || echo "$task: COMPLETE" >> ${results_file}
done
fi
# Run the first TASK_DEPTH entries of TASKS in order, one script per task.
# Each task's stdout/stderr goes to <task>-log.txt; a one-line result per task
# is appended to ${results_file}.
echo "# Try ${platform} with the first few simple SRW tasks ..." | tee -a ${results_file}
for task in ${TASKS[@]:0:${TASK_DEPTH}} ; do
echo -n "./$task.sh ... "
# On failure the non-zero exit code is added to ${status} (which starts at 0).
./$task.sh > $task-log.txt 2>&1 && echo "COMPLETE" || echo "FAIL rc=$(( status+=$? ))"
# stop at the first sign of trouble ...
# A non-zero ${status} records FAIL and leaves the loop; otherwise COMPLETE is recorded.
[[ 0 != ${status} ]] && echo "$task: FAIL" >> ${results_file} && break || echo "$task: COMPLETE" >> ${results_file}
done

# Set exit code to number of failures
set +e
Expand Down
4 changes: 4 additions & 0 deletions .cicd/scripts/srw_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ else
platform="${SRW_PLATFORM}"
fi

# SRW_PLATFORM may carry the value 'jet-epic'; in that case use the plain
# name 'jet' for ${platform} in the rest of this script.
# NOTE(review): presumably 'jet-epic' is only the CI node label and 'jet' is
# the machine name the WE2E tests expect -- confirm against the Jenkinsfile.
if [[ "${SRW_PLATFORM}" = jet-epic ]]; then
platform='jet'
fi

# Test directories
we2e_experiment_base_dir="${workspace}/expt_dirs"
we2e_test_dir="${workspace}/tests/WE2E"
Expand Down
6 changes: 1 addition & 5 deletions etc/lmod-setup.csh
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,7 @@ else if ( "$L_MACHINE" == singularity ) then
module purge

else if ( "$L_MACHINE" == gaea ) then
set ENV="/lustre/f2/dev/role.epic/contrib/apps/lmod/lmod/init/csh"
source $ENV

setenv LMOD_SYSTEM_DEFAULT_MODULES "modules/3.2.11.4"
module --initial_load --no_redirect restore
source /lustre/f2/dev/role.epic/contrib/Lmod_init.csh

else if ( "$L_MACHINE" == odin ) then
module unload modules
Expand Down
20 changes: 13 additions & 7 deletions modulefiles/build_gaea_intel.lua
Original file line number Diff line number Diff line change
@@ -1,21 +1,27 @@
help([[
This module loads libraries for building the UFS SRW App on
the NOAA RDHPC machine Gaea using Intel-2022.1.2
the NOAA RDHPC machine Gaea using Intel-2022.0.2
]])

whatis([===[Loads libraries needed for building the UFS SRW App on Gaea ]===])

load(pathJoin("cmake", os.getenv("cmake_ver") or "3.20.1"))

prepend_path("MODULEPATH","/lustre/f2/dev/role.epic/contrib/hpc-stack/intel-2021.3.0_noarch/modulefiles/stack")
prepend_path("MODULEPATH","/lustre/f2/dev/role.epic/contrib/hpc-stack/intel-classic-2022.0.2/modulefiles/stack")
load(pathJoin("hpc", os.getenv("hpc_ver") or "1.2.0"))
load(pathJoin("intel", os.getenv("intel_ver") or "2021.3.0"))
load(pathJoin("hpc-intel", os.getenv("hpc_intel_ver") or "2021.3.0"))
load(pathJoin("hpc-cray-mpich", os.getenv("hpc_cray_mpich_ver") or "7.7.11"))
load(pathJoin("gcc", os.getenv("gcc_ver") or "8.3.0"))
load(pathJoin("libpng", os.getenv("libpng_ver") or "1.6.37"))
load(pathJoin("hpc-intel-classic", os.getenv("hpc_intel_classic_ver") or "2022.0.2"))
load(pathJoin("hpc-cray-mpich", os.getenv("hpc_cray_mpich_ver") or "7.7.20"))

load("srw_common")
-- Needed at runtime
load("alps")

local MKLROOT="/opt/intel/oneapi/mkl/2022.0.2/"
prepend_path("LD_LIBRARY_PATH",pathJoin(MKLROOT,"lib/intel64"))
pushenv("MKLROOT", MKLROOT)

pushenv("GSI_BINARY_SOURCE_DIR", "/lustre/f2/dev/role.epic/contrib/GSI_data/fix/20230601")
pushenv("CRAYPE_LINK_TYPE","dynamic")

setenv("CC","cc")
setenv("FC","ftn")
Expand Down
2 changes: 1 addition & 1 deletion modulefiles/srw_common.lua
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ load("sigio/2.3.2")
load("w3nco/2.4.1")
load("wrf_io/1.2.0")

load("ncdiag/1.0.0")
load("ncdiag/1.1.1")
load("ncio/1.1.2")
load("wgrib2/2.0.8")
2 changes: 1 addition & 1 deletion modulefiles/tasks/gaea/plot_allvars.local.lua
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
prepend_path("MODULEPATH","/lustre/f2/dev/role.epic/contrib/modulefiles")
prepend_path("MODULEPATH","/lustre/f2/dev/role.epic/contrib/miniconda3/modulefiles")
load(pathJoin("miniconda3", os.getenv("miniconda3_ver") or "4.12.0"))

setenv("SRW_ENV", "regional_workflow")
2 changes: 1 addition & 1 deletion modulefiles/tasks/gaea/python_srw.lua
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
prepend_path("MODULEPATH","/lustre/f2/dev/role.epic/contrib/modulefiles")
prepend_path("MODULEPATH","/lustre/f2/dev/role.epic/contrib/miniconda3/modulefiles")
load(pathJoin("miniconda3", os.getenv("miniconda3_ver") or "4.12.0"))

setenv("SRW_ENV", "workflow_tools")
8 changes: 6 additions & 2 deletions modulefiles/wflow_gaea.lua
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@ the NOAA RDHPC machine Gaea

whatis([===[Loads libraries needed for running the UFS SRW App on gaea ]===])

unload("python")
load("set_pythonpath")
prepend_path("MODULEPATH","/lustre/f2/dev/role.epic/contrib/modulefiles")
prepend_path("MODULEPATH","/lustre/f2/dev/role.epic/contrib/miniconda3/modulefiles")
load(pathJoin("miniconda3", os.getenv("miniconda3_ver") or "4.12.0"))
prepend_path("MODULEPATH","/lustre/f2/dev/role.epic/contrib/rocoto/modulefiles")
load("rocoto")
load("alps")

setenv("PROJ_LIB", "/lustre/f2/dev/role.epic/contrib/miniconda3/4.12.0/envs/regional_workflow/share/proj")
pushenv("MKLROOT", "/opt/intel/oneapi/mkl/2022.0.2/")
pushenv("GSI_BINARY_SOURCE_DIR", "/lustre/f2/dev/role.epic/contrib/GSI_data/fix/20230601")
setenv("PMI_NO_PREINITIALIZE","1")

if mode() == "load" then
LmodMsgRaw([===[Please do the following to activate conda:
Expand Down
30 changes: 22 additions & 8 deletions ush/machine/gaea.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,18 @@ platform:
MRMS_OBS_DIR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/obs_data/mrms/proc
NDAS_OBS_DIR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/obs_data/ndas/proc
DOMAIN_PREGEN_BASEDIR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/FV3LAM_pregen
QUEUE_DEFAULT: normal
QUEUE_FCST: normal
QUEUE_HPSS: normal
QUEUE_DEFAULT: windfall
QUEUE_FCST: windfall
QUEUE_HPSS: windfall
REMOVE_MEMORY: True
RUN_CMD_FCST: srun --export=ALL --mpi=pmi2 -n ${PE_MEMBER01}
RUN_CMD_POST: srun --export=ALL --mpi=pmi2 -n $nprocs
RUN_CMD_PRDGEN: srun --export=ALL --mpi=pmi2 -n $nprocs
PARTITION_HPSS: eslogin
RUN_CMD_FCST: srun --export=ALL -n ${PE_MEMBER01}
RUN_CMD_POST: srun --export=ALL -n $nprocs
RUN_CMD_PRDGEN: srun --export=ALL -n $nprocs
RUN_CMD_SERIAL: time
RUN_CMD_UTILS: srun --export=ALL --mpi=pmi2 -n $nprocs
SCHED_NATIVE_CMD: -M c3 --export=NONE
SCHED_NATIVE_CMD_HPSS: -M es --export=NONE
SCHED_NATIVE_CMD: --clusters=c4 --export=NONE
SCHED_NATIVE_CMD_HPSS: --clusters=es --export=NONE
PRE_TASK_CMDS: '{ ulimit -s unlimited; ulimit -a; }'
TEST_EXTRN_MDL_SOURCE_BASEDIR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data
TEST_PREGEN_BASEDIR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/FV3LAM_pregen
Expand All @@ -30,8 +31,21 @@ platform:
FIXorg: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/fix/fix_orog
FIXsfc: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/fix/fix_sfc_climo
FIXshp: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/NaturalEarth
EXTRN_MDL_DATA_STORES: aws
data:
ics_lbcs:
FV3GFS:
nemsio: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data/FV3GFS/nemsio/${yyyymmdd}${hh}
grib2: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data/FV3GFS/grib2/${yyyymmdd}${hh}
RAP: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data/RAP/${yyyymmdd}${hh}
HRRR: /lustre/f2/dev/role.epic/contrib/UFS_SRW_data/develop/input_model_data/HRRR/${yyyymmdd}${hh}
rocoto:
tasks:
metatask_run_ensemble:
task_run_fcst_mem#mem#:
cores: '{{ task_run_fcst.PE_MEMBER01 // 1 }}'
native: '--cpus-per-task {{ task_run_fcst.OMP_NUM_THREADS_RUN_FCST|int }} --exclusive {{ platform.SCHED_NATIVE_CMD }}'
nodes:
nnodes:
nodesize:
ppn:
Loading