Skip to content

Commit 3f5ff80

Browse files
authored
eCLM-ParFlowGPU fixes (#74)
* Model updates
  - https://github.com/HPSCTerrSys/eCLM/tree/beta-0.4
  - https://github.com/HPSCTerrSys/parflow/tree/258fb925f5a02c7f231507799fad0a1b867578e5
* NOTE: ParFlowGPU is only supported on `jsc.2025.gnu.openmpi`. `jsc.2025.intel.psmpi` does not work because no Hypre+CUDA package has been built with the `Intel/2024.2.0-CUDA-12` toolchain.
1 parent 5c804d2 commit 3f5ff80

File tree

8 files changed

+135
-59
lines changed

8 files changed

+135
-59
lines changed

.github/workflows/CI.yml

Lines changed: 82 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ jobs:
2222
parflow_dir: "parflow",
2323
model_opts: "--ICON --eCLM --ParFlow"
2424
}
25+
- {
26+
name: "eCLM-ParFlowGPU",
27+
use_oasis: "True",
28+
parflow_dir: "parflow",
29+
model_opts: "--eCLM --ParFlowGPU"
30+
}
2531
# TODO - Fix PDAF GNU errors
2632
# - {
2733
# name: "CLM3.5-PDAF",
@@ -55,27 +61,39 @@ jobs:
5561
MPI_HOME: /usr/lib/x86_64-linux-gnu/openmpi
5662
VER_NETCDF_C: 4.9.2
5763
VER_NETCDF_F90: 4.6.1
58-
VER_HYPRE: 2.26.0
64+
VER_HYPRE: 2.33.0
5965
VER_ECCODES: 2.40.0
6066
VER_OASIS: tsmp-patches-v0.1
6167

6268
steps:
6369
- uses: actions/checkout@v4
6470

65-
# These apt packages have post-install step which can't be triggered by GitHub cache.
71+
#
72+
# Install required TSMP2 dependencies
73+
#
6674
- name: Install TSMP2 dependencies on Ubuntu
6775
run: |
6876
sudo apt-get update
6977
sudo apt-get install gfortran openmpi-bin libopenmpi-dev libhdf5-openmpi-dev libhdf5-openmpi-hl-fortran-100t64 hdf5-helpers liblapack-dev libblas-dev
7078
71-
# These apt packages can be safely cached.
7279
- name: Install extra TSMP2 dependencies on Ubuntu
7380
uses: awalsh128/cache-apt-pkgs-action@latest
7481
with:
7582
packages: libxml++2.6-dev pylint wget cmake libpnetcdf-dev tcl-dev tk-dev liblzma-dev
7683
version: 1.0
7784
execute_install_scripts: true
7885

86+
#
87+
# Install CUDA dependencies on Ubuntu
88+
#
89+
- if: contains(matrix.config.name, 'ParFlowGPU')
90+
name: Install CUDA compiler and libraries
91+
uses: awalsh128/cache-apt-pkgs-action@latest
92+
with:
93+
packages: nvidia-cuda-toolkit nvidia-cuda-dev
94+
version: 1.0
95+
execute_install_scripts: true
96+
7997
- name: Initialize dependency directory and variables
8098
run: |
8199
TSMP2_ROOT=$(git rev-parse --show-toplevel)
@@ -156,38 +174,82 @@ jobs:
156174
key: ${{ matrix.config.name }}_netcdf-f90-${{ env.VER_NETCDF_F90 }}
157175

158176
#
159-
# Hypre
177+
# Hypre (OpenMP backend)
160178
#
161-
- if: contains(matrix.config.name, 'ParFlow')
162-
name: Restore cached hypre-${{ env.VER_HYPRE }}
179+
- if: contains(matrix.config.name, 'ParFlow') && !contains(matrix.config.name, 'GPU')
180+
name: Restore cached hypre-${{ env.VER_HYPRE }} (OpenMP backend)
163181
uses: actions/cache/restore@v4
164-
id: cache-hypre-restore
182+
id: cache-hypre-openmp-restore
165183
with:
166-
path: ${{ env.DEPENDENCIES_ROOT }}/hypre-${{ env.VER_HYPRE }}
167-
key: ${{ matrix.config.name }}_hypre-${{ env.VER_HYPRE }}
184+
path: ${{ env.DEPENDENCIES_ROOT }}/hypre-${{ env.VER_HYPRE }}-openmp
185+
key: hypre-${{ env.VER_HYPRE }}-openmp
168186

169-
- if: contains(matrix.config.name, 'ParFlow') && steps.cache-hypre-restore.outputs.cache-hit != 'true'
170-
name: Install hypre-${{ env.VER_HYPRE }}
187+
- if: contains(matrix.config.name, 'ParFlow') && !contains(matrix.config.name, 'GPU') && steps.cache-hypre-openmp-restore.outputs.cache-hit != 'true'
188+
name: Install hypre-${{ env.VER_HYPRE }} (OpenMP backend)
171189
working-directory: /tmp
172190
run: |
173191
# Download
174192
wget https://github.com/hypre-space/hypre/archive/v${VER_HYPRE}.tar.gz
175193
tar xf v${VER_HYPRE}.tar.gz
176-
cd hypre-${VER_HYPRE}/src
194+
cd hypre-${VER_HYPRE}
177195
178196
# Install
179-
./configure --prefix=${DEPENDENCIES_ROOT}/hypre-${VER_HYPRE}
180-
make -j4 install
197+
INSTALL_DIR="${DEPENDENCIES_ROOT}/hypre-${VER_HYPRE}-openmp"
198+
cmake -S src -B bld \
199+
-DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
200+
-DMPI_INCLUDE_DIR="${MPI_HOME}/include" \
201+
-DHYPRE_ENABLE_OPENMP="ON"
202+
cmake --build bld --parallel 4
203+
cmake --install bld
204+
205+
# Verify
206+
tree -L 2 ${INSTALL_DIR}
207+
208+
- if: contains(matrix.config.name, 'ParFlow') && !contains(matrix.config.name, 'GPU') && steps.cache-hypre-openmp-restore.outputs.cache-hit != 'true'
209+
name: Cache hypre-${{ env.VER_HYPRE }} (OpenMP backend)
210+
uses: actions/cache/save@v4
211+
with:
212+
path: ${{ env.DEPENDENCIES_ROOT }}/hypre-${{ env.VER_HYPRE }}-openmp
213+
key: hypre-${{ env.VER_HYPRE }}-openmp
214+
215+
#
216+
# Hypre (CUDA backend)
217+
#
218+
- if: contains(matrix.config.name, 'ParFlowGPU')
219+
name: Restore cached hypre-${{ env.VER_HYPRE }} (CUDA backend)
220+
uses: actions/cache/restore@v4
221+
id: cache-hypre-cuda-restore
222+
with:
223+
path: ${{ env.DEPENDENCIES_ROOT }}/hypre-${{ env.VER_HYPRE }}-cuda
224+
key: hypre-${{ env.VER_HYPRE }}-cuda
225+
226+
- if: contains(matrix.config.name, 'ParFlowGPU') && steps.cache-hypre-cuda-restore.outputs.cache-hit != 'true'
227+
name: Install hypre-${{ env.VER_HYPRE }} (CUDA backend)
228+
working-directory: /tmp
229+
run: |
230+
# Download
231+
wget https://github.com/hypre-space/hypre/archive/v${VER_HYPRE}.tar.gz
232+
tar xf v${VER_HYPRE}.tar.gz
233+
cd hypre-${VER_HYPRE}
234+
235+
# Install
236+
INSTALL_DIR="${DEPENDENCIES_ROOT}/hypre-${VER_HYPRE}-cuda"
237+
cmake -S src -B bld \
238+
-DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \
239+
-DMPI_INCLUDE_DIR="${MPI_HOME}/include" \
240+
-DHYPRE_ENABLE_CUDA="ON"
241+
cmake --build bld --parallel 4
242+
cmake --install bld
181243
182244
# Verify
183-
tree -L 2 ${DEPENDENCIES_ROOT}/hypre-${VER_HYPRE}
245+
tree -L 2 ${INSTALL_DIR}
184246
185-
- if: contains(matrix.config.name, 'ParFlow') && steps.cache-hypre-restore.outputs.cache-hit != 'true'
186-
name: Cache hypre-${{ env.VER_HYPRE }}
247+
- if: contains(matrix.config.name, 'ParFlowGPU') && steps.cache-hypre-cuda-restore.outputs.cache-hit != 'true'
248+
name: Cache hypre-${{ env.VER_HYPRE }} (CUDA backend)
187249
uses: actions/cache/save@v4
188250
with:
189-
path: ${{ env.DEPENDENCIES_ROOT }}/hypre-${{ env.VER_HYPRE }}
190-
key: ${{ matrix.config.name }}_hypre-${{ env.VER_HYPRE }}
251+
path: ${{ env.DEPENDENCIES_ROOT }}/hypre-${{ env.VER_HYPRE }}-cuda
252+
key: hypre-${{ env.VER_HYPRE }}-cuda
191253

192254
#
193255
# ecCodes
@@ -359,6 +421,7 @@ jobs:
359421
run: |
360422
tree -FUCh --du -L 2 $(pwd)
361423
CMAKE_PREFIX_PATH=$(ls -1 | xargs realpath | paste -sd ":" -)
424+
CMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}:${MPI_HOME}"
362425
echo "" && echo "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}"
363426
echo "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV
364427

.gitmodules

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
branch = icon-2024.07-public_coup-oas
55
[submodule "models/parflow"]
66
path = models/parflow
7-
url = https://github.com/HPSCTerrSys/parflow
8-
branch = openmp-flag-fix
7+
url = https://github.com/HPSCTerrSys/parflow.git
8+
branch = hypre-cuda-fixes
99
[submodule "models/parflow_pdaf"]
1010
path = models/parflow_pdaf
1111
url = https://github.com/HPSCTerrSys/parflow
@@ -17,7 +17,7 @@
1717
[submodule "models/eCLM"]
1818
path = models/eCLM
1919
url = https://github.com/HPSCTerrSys/eCLM.git
20-
branch = beta-0.3
20+
branch = beta-0.4
2121
[submodule "models/oasis3-mct"]
2222
path = models/oasis3-mct
2323
url = https://icg4geo.icg.kfa-juelich.de/ExternalReposPublic/oasis3-mct

cmake/BuildParFlow.cmake

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,14 @@ if (${ParFlowGPU})
2828
message(FATAL_ERROR "BuildParFlow: ParFlow GPU is enabled, but neither CUDA nor Kokkos was found.")
2929
endif()
3030
endif()
31-
32-
# TODO: Don't rely on env variables!
33-
list(APPEND PF_GPU_FLAGS -DRMM_ROOT=$ENV{RMM_ROOT})
3431
else()
3532
find_package(OpenMP)
3633
if (OpenMP_FOUND)
3734
set(PF_ACC_BACKEND "omp")
3835
else()
3936
set(PF_ACC_BACKEND "none")
4037
endif()
41-
#TODO: also support backends 'kokkos' and 'none'
38+
#TODO: Add support for 'kokkos' backend
4239
endif()
4340

4441
# Set compiler flags
@@ -84,7 +81,6 @@ ExternalProject_Add(ParFlow
8481
-DMPIEXEC_NUMPROC_FLAG=${MPIEXEC_NUMPROC_FLAG}
8582
-DPARFLOW_ENABLE_SLURM=${ENABLE_SLURM}
8683
${PF_CLM_FLAGS}
87-
${PF_GPU_FLAGS}
8884
DEPENDS ${MODEL_DEPENDENCIES}
8985
)
9086

env/jsc.2025.gnu.openmpi

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,15 +34,8 @@ if [[ "$1" == "--parflowgpu" ]]; then
3434
module load UCX-settings/RC-CUDA
3535
module load Hypre/2.31.0
3636

37-
#
38-
# Workaround to make ParFlowGPU work. RMM is installed separately, and here we simply expose the path to the RMM binaries.
39-
#
40-
# RMM install script is at /p/scratch/cslts/shared_data/TSMP2_S2025/RAPIDS_MemoryManager/install_rmm.sh
41-
#
42-
export RMM_ROOT="/p/scratch/cslts/shared_data/TSMP2_S2025/RAPIDS_MemoryManager/${SYSTEMNAME^^}_$STAGE"
43-
4437
# TODO: Verify these values
45-
if [[ $SYSTEMNAME == "jedi" ]]; then
38+
if [[ $SYSTEMNAME == "jedi" || $SYSTEMNAME == "jupiter" ]]; then
4639
export CUDAARCHS="90"
4740
else
4841
export CUDAARCHS="80"
@@ -76,7 +69,6 @@ echo " Fortran: $($FC --version | head -n 1)"
7669
if [[ "$1" == "--parflowgpu" ]]; then
7770
echo " nvcc: $(nvcc --version | tail -n 1 | cut -d" " -f2)"
7871
echo " CUDAARCHS: $CUDAARCHS"
79-
echo " RMM_ROOT: ${RMM_ROOT}"
8072
fi
8173
echo "=================================================="
8274
echo ""

env/jsc.2025.intel.psmpi

Lines changed: 30 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@ module use $OTHERSTAGES
1414
module load Stages/2025
1515

1616
# Primary compiler toolchain
17-
module load Intel
17+
if [[ "$1" == "--parflowgpu" ]]; then
18+
# TODO: Use a generic GPU switch instead of --parflowgpu
19+
module load CUDA
20+
module load Intel/2024.2.0-CUDA-12
21+
else
22+
module load Intel
23+
fi
1824
module load ParaStationMPI
1925

2026
# Basic scripting and build tools
@@ -30,19 +36,15 @@ module load PnetCDF
3036

3137
# ParFlow additional libraries
3238
if [[ "$1" == "--parflowgpu" ]]; then
33-
module load CUDA
34-
module load UCX-settings/RC-CUDA
35-
module load Hypre/2.31.0
36-
37-
#
38-
# Workaround to make ParFlowGPU work. RMM is installed separately, and here we simply expose the path to the RMM binaries.
39+
# WARNING:
3940
#
40-
# RMM install script is at /p/scratch/cslts/shared_data/TSMP2_S2025/RAPIDS_MemoryManager/install_rmm.sh
41-
#
42-
export RMM_ROOT="/p/scratch/cslts/shared_data/TSMP2_S2025/RAPIDS_MemoryManager/${SYSTEMNAME^^}_$STAGE"
41+
# Loading Hypre/2.31.0 under Intel will stubbornly load Hypre/2.31.0-cpu
42+
# since Hypre/2.31.0 is only available under the GCC toolchain. This wrong
43+
# Hypre will trigger a linker failure in ParFlow. Use jsc.2025.gnu.openmpi instead when
44+
# building ParFlowGPU.
4345

4446
# TODO: Verify these values
45-
if [[ $SYSTEMNAME == "jedi" ]]; then
47+
if [[ $SYSTEMNAME == "jedi" || $SYSTEMNAME == "jupiter" ]]; then
4648
export CUDAARCHS="90"
4749
else
4850
export CUDAARCHS="80"
@@ -57,6 +59,9 @@ module load Tcl
5759
module load ecCodes
5860

5961
# Set default MPI compilers
62+
export OMPI_CC=icx
63+
export OMPI_CXX=icpx
64+
export OMPI_FC=ifx
6065
export CC=mpicc
6166
export FC=mpif90
6267
export CXX=mpicxx
@@ -65,10 +70,19 @@ export MPI_HOME=$EBROOTPSMPI
6570
# Display compiler settings
6671
module list
6772
echo "====================================== COMPILER SETTINGS ======================================"
68-
echo " Machine: ${SYSTEMNAME} on Stages/$STAGE"
69-
echo " MPI lib: $(mpichversion | head -n 1 | tr -d =)"
70-
echo " C: $($CC --version | head -n 1)"
71-
echo " C++: $($CXX --version | head -n 1)"
72-
echo " Fortran: $($FC --version | head -n 1)"
73+
echo " Machine: ${SYSTEMNAME} on Stages/$STAGE"
74+
echo " MPI lib: $(mpichversion | head -n 1 | tr -d =)"
75+
echo " C: $($CC --version | head -n 1)"
76+
echo " C++: $($CXX --version | head -n 1)"
77+
echo " Fortran: $($FC --version | head -n 1)"
78+
if [[ "$1" == "--parflowgpu" ]]; then
79+
echo " nvcc: $(nvcc --version | tail -n 1 | cut -d" " -f2)"
80+
echo " CUDAARCHS: $CUDAARCHS"
81+
fi
7382
echo "==============================================================================================="
7483
echo ""
84+
if [[ "$1" == "--parflowgpu" ]]; then
85+
echo "WARNING: ParFlowGPU doesn't work on Intel toolchain due to missing Intel-built Hypre-CUDA library."
86+
echo " Load the 'jsc.2025.gnu.openmpi' env file instead to build ParFlowGPU."
87+
fi
88+
echo ""

env/ubuntu.gnu.openmpi

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,26 @@ export MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi
1313
export SYSTEMNAME="UBUNTU"
1414
export STAGE=$(lsb_release -sr)
1515

16+
# ParFlow additional libraries
17+
if [[ "$1" == "--parflowgpu" ]]; then
18+
export CUDA_HOME=/usr
19+
export CUCC=nvcc
20+
export CUDAARCHS="70" #TODO: Verify if this default value makes sense
21+
fi
22+
1623
# TODO: Find a way to pack all TSMP2 dependency info to CMAKE_PREFIX_PATH.
1724
# export CMAKE_PREFIX_PATH=$(cd ${DEPENDENCIES_ROOT} | ls -1 | xargs realpath | paste -sd ":" -)
1825

1926
# Display compiler settings
2027
echo "==================== COMPILER SETTINGS ====================="
21-
echo " Machine: ${SYSTEMNAME} $STAGE"
22-
echo " MPI lib: $(mpirun --version | head -n 1)"
23-
echo " C: $($CC --version | head -n 1)"
24-
echo " C++: $($CXX --version | head -n 1)"
25-
echo " Fortran: $($FC --version | head -n 1)"
28+
echo " Machine: ${SYSTEMNAME} $STAGE"
29+
echo " MPI lib: $(mpirun --version | head -n 1)"
30+
echo " C: $($CC --version | head -n 1)"
31+
echo " C++: $($CXX --version | head -n 1)"
32+
echo " Fortran: $($FC --version | head -n 1)"
33+
if [[ "$1" == "--parflowgpu" ]]; then
34+
echo " nvcc: $(nvcc --version | tail -n 1 | cut -d" " -f2)"
35+
echo " CUDAARCHS: $CUDAARCHS"
36+
fi
2637
echo "============================================================="
2738
echo ""

models/eCLM

0 commit comments

Comments
 (0)