diff --git a/components/eam/src/physics/rrtmgp/external b/components/eam/src/physics/rrtmgp/external index b24ca1f616e4..91d9076c7b18 160000 --- a/components/eam/src/physics/rrtmgp/external +++ b/components/eam/src/physics/rrtmgp/external @@ -1 +1 @@ -Subproject commit b24ca1f616e45659b334dbd7297017cb7927367e +Subproject commit 91d9076c7b18b0e67dbd03d208ef2f385efefadd diff --git a/components/eamxx/CMakeLists.txt b/components/eamxx/CMakeLists.txt index 1119a3f031f7..3462cef9a976 100644 --- a/components/eamxx/CMakeLists.txt +++ b/components/eamxx/CMakeLists.txt @@ -212,8 +212,8 @@ endif() # #cmakedefine RRTMGP_EXPENSIVE_CHECKS option (SCREAM_RRTMGP_DEBUG "Turn on extra debug checks in RRTMGP" ${SCREAM_DEBUG}) -option(SCREAM_RRTMGP_ENABLE_YAKL "Use YAKL under rrtmgp" TRUE) -option(SCREAM_RRTMGP_ENABLE_KOKKOS "Use Kokkos under rrtmgp" FALSE) +option(SCREAM_RRTMGP_ENABLE_YAKL "Use YAKL under rrtmgp" FALSE) +option(SCREAM_RRTMGP_ENABLE_KOKKOS "Use Kokkos under rrtmgp" TRUE) if (SCREAM_RRTMGP_ENABLE_YAKL) add_definitions("-DRRTMGP_ENABLE_YAKL") endif() diff --git a/components/eamxx/cime_config/namelist_defaults_scream.xml b/components/eamxx/cime_config/namelist_defaults_scream.xml index 9a06de10e1ec..c0dab4ac030b 100644 --- a/components/eamxx/cime_config/namelist_defaults_scream.xml +++ b/components/eamxx/cime_config/namelist_defaults_scream.xml @@ -531,6 +531,7 @@ be lost if SCREAM_HACK_XML is not enabled. true + 1.0 diff --git a/components/eamxx/cime_config/testdefs/testmods_dirs/scream/kokkos_rrtmgp/shell_commands b/components/eamxx/cime_config/testdefs/testmods_dirs/scream/kokkos_rrtmgp/shell_commands new file mode 100644 index 000000000000..817dcf42fef5 --- /dev/null +++ b/components/eamxx/cime_config/testdefs/testmods_dirs/scream/kokkos_rrtmgp/shell_commands @@ -0,0 +1,3 @@ +./xmlchange --append SCREAM_CMAKE_OPTIONS='SCREAM_RRTMGP_ENABLE_YAKL Off' +./xmlchange --append SCREAM_CMAKE_OPTIONS='SCREAM_RRTMGP_ENABLE_KOKKOS On' + diff --git a/components/eamxx/cime_config/testdefs/testmods_dirs/scream/yakl_rrtmgp/shell_commands b/components/eamxx/cime_config/testdefs/testmods_dirs/scream/yakl_rrtmgp/shell_commands new file mode 100644 index 000000000000..61d571c95974 --- /dev/null +++ b/components/eamxx/cime_config/testdefs/testmods_dirs/scream/yakl_rrtmgp/shell_commands @@ -0,0 +1,2 @@ +./xmlchange --append SCREAM_CMAKE_OPTIONS='SCREAM_RRTMGP_ENABLE_YAKL On' +./xmlchange --append SCREAM_CMAKE_OPTIONS='SCREAM_RRTMGP_ENABLE_KOKKOS Off' diff --git a/components/eamxx/src/physics/rrtmgp/CMakeLists.txt b/components/eamxx/src/physics/rrtmgp/CMakeLists.txt index a8e47384552a..d3d72a9a1f42 100644 --- a/components/eamxx/src/physics/rrtmgp/CMakeLists.txt +++ b/components/eamxx/src/physics/rrtmgp/CMakeLists.txt @@ -51,59 +51,60 @@ endmacro() ################################## # RRTMGP++ requires YAKL +if (SCREAM_RRTMGP_ENABLE_YAKL) + string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_ci) + if (TARGET yakl) + # Other E3SM components are building YAKL... + message ("It appears some other part of E3SM is building YAKL.\n" + "We will reuse that, but if this is a debug build we will\n" + "add the --fmad=false flag to the cuda flags used by YAKL\n") + else () + # Prepare CUDA/HIP flags for YAKL + if (CUDA_BUILD) + string(REPLACE ";" " " KOKKOS_CUDA_OPTIONS_STR "${KOKKOS_CUDA_OPTIONS}") + set(YAKL_ARCH "CUDA") + set(YAKL_CUDA_FLAGS "-DYAKL_ARCH_CUDA ${KOKKOS_CUDA_OPTIONS_STR} --expt-relaxed-constexpr -ccbin ${CMAKE_CXX_COMPILER}") + string (REPLACE " " ";" YAKL_CUDA_FLAGS_LIST ${YAKL_CUDA_FLAGS}) + endif() + if (HIP_BUILD) + set(YAKL_ARCH "HIP") + set(YAKL_HIP_FLAGS "-DYAKL_ARCH_HIP -O3 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip") + string (REPLACE " " ";" YAKL_HIP_FLAGS_LIST ${YAKL_HIP_FLAGS}) + endif() -string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_ci) -if (TARGET yakl) - # Other E3SM components are building YAKL... - message ("It appears some other part of E3SM is building YAKL.\n" - "We will reuse that, but if this is a debug build we will\n" - "add the --fmad=false flag to the cuda flags used by YAKL\n") -else () - # Prepare CUDA/HIP flags for YAKL - if (CUDA_BUILD) - string(REPLACE ";" " " KOKKOS_CUDA_OPTIONS_STR "${KOKKOS_CUDA_OPTIONS}") - set(YAKL_ARCH "CUDA") - set(YAKL_CUDA_FLAGS "-DYAKL_ARCH_CUDA ${KOKKOS_CUDA_OPTIONS_STR} --expt-relaxed-constexpr -ccbin ${CMAKE_CXX_COMPILER}") - string (REPLACE " " ";" YAKL_CUDA_FLAGS_LIST ${YAKL_CUDA_FLAGS}) - endif() - if (HIP_BUILD) - set(YAKL_ARCH "HIP") - set(YAKL_HIP_FLAGS "-DYAKL_ARCH_HIP -O3 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip") - string (REPLACE " " ";" YAKL_HIP_FLAGS_LIST ${YAKL_HIP_FLAGS}) - endif() + set (YAKL_SOURCE_DIR ${SCREAM_BASE_DIR}/../../externals/YAKL) + add_subdirectory(${YAKL_SOURCE_DIR} ${CMAKE_BINARY_DIR}/externals/YAKL) - set (YAKL_SOURCE_DIR ${SCREAM_BASE_DIR}/../../externals/YAKL) - add_subdirectory(${YAKL_SOURCE_DIR} ${CMAKE_BINARY_DIR}/externals/YAKL) + # Set some additional flag/cpp option on the yakl target - # Set some additional flag/cpp option on the yakl target + cmake_policy (SET CMP0079 NEW) # Allow to link to a tgt from a different directory - cmake_policy (SET CMP0079 NEW) # Allow to link to a tgt from a different directory + # EAMxx *requires* MPI, so simply look for it, then link against it + find_package(MPI REQUIRED COMPONENTS C) + target_link_libraries (yakl INTERFACE MPI::MPI_C) - # EAMxx *requires* MPI, so simply look for it, then link against it - find_package(MPI REQUIRED COMPONENTS C) - target_link_libraries (yakl INTERFACE MPI::MPI_C) + # For debug builds, set -DYAKL_DEBUG + if (CMAKE_BUILD_TYPE_ci STREQUAL "debug") + target_compile_definitions(yakl INTERFACE YAKL_DEBUG) + endif() + endif() - # For debug builds, set -DYAKL_DEBUG - if (CMAKE_BUILD_TYPE_ci STREQUAL "debug") - target_compile_definitions(yakl INTERFACE YAKL_DEBUG) + # See eamxx/src/dynamics/homme/CMakeLists.txt for an explanation of this + # workaround. + if ((SCREAM_MACHINE STREQUAL "ascent" OR SCREAM_MACHINE STREQUAL "pm-gpu") AND CMAKE_BUILD_TYPE_ci STREQUAL "debug") + SetCudaFlagsYakl(yakl CUDA_LANG FLAGS -UNDEBUG) + else() + SetCudaFlagsYakl(yakl CUDA_LANG) endif() -endif() -# See eamxx/src/dynamics/homme/CMakeLists.txt for an explanation of this -# workaround. -if ((SCREAM_MACHINE STREQUAL "ascent" OR SCREAM_MACHINE STREQUAL "pm-gpu") AND CMAKE_BUILD_TYPE_ci STREQUAL "debug") - SetCudaFlagsYakl(yakl CUDA_LANG FLAGS -UNDEBUG) -else() - SetCudaFlagsYakl(yakl CUDA_LANG) + list(APPEND CMAKE_MODULE_PATH ${YAKL_SOURCE_DIR}) + include (yakl_utils) endif() ################################## # RRTMGP # ################################## -list(APPEND CMAKE_MODULE_PATH ${YAKL_SOURCE_DIR}) -include (yakl_utils) - set(EAM_RRTMGP_DIR ${SCREAM_BASE_DIR}/../eam/src/physics/rrtmgp) # Build RRTMGP library; this builds the core RRTMGP external source as a library named "rrtmgp" # NOTE: The external RRTMGP build needs some fixes to work with CUDA in a library build, so for now we will build these ourselves @@ -122,7 +123,13 @@ set(EXTERNAL_SRC add_library(rrtmgp ${EXTERNAL_SRC}) target_compile_definitions(rrtmgp PUBLIC EAMXX_HAS_RRTMGP) EkatDisableAllWarning(rrtmgp) -yakl_process_target(rrtmgp) +if (SCREAM_RRTMGP_ENABLE_YAKL) + yakl_process_target(rrtmgp) +else() + if (CUDA_BUILD) + target_compile_options(rrtmgp PUBLIC $<$:--expt-relaxed-constexpr>) + endif() +endif() # NOTE: cannot use 'PUBLIC' in target_link_libraries, # since yakl_process_target already used it @@ -130,7 +137,11 @@ yakl_process_target(rrtmgp) if (NOT TARGET Kokkos::kokkos) find_package(Kokkos REQUIRED) endif () -target_link_libraries(rrtmgp yakl Kokkos::kokkos) +if (SCREAM_RRTMGP_ENABLE_YAKL) + target_link_libraries(rrtmgp yakl Kokkos::kokkos) +else() + target_link_libraries(rrtmgp Kokkos::kokkos) +endif() target_include_directories(rrtmgp PUBLIC ${SCREAM_BASE_DIR}/../../externals/YAKL ${EAM_RRTMGP_DIR}/external/cpp @@ -158,21 +169,23 @@ target_include_directories(rrtmgp PUBLIC # SCREAM_RRTMGP_YAKL # ################################## -set(SCREAM_RRTMGP_SOURCES_YAKL +set(SCREAM_RRTMGP_SOURCES_INTERFACE scream_rrtmgp_interface.cpp ) -add_library(scream_rrtmgp_yakl ${SCREAM_RRTMGP_SOURCES_YAKL}) -yakl_process_target(scream_rrtmgp_yakl) +add_library(scream_rrtmgp_interface ${SCREAM_RRTMGP_SOURCES_INTERFACE}) +if (SCREAM_RRTMGP_ENABLE_YAKL) + yakl_process_target(scream_rrtmgp_interface) +endif() # NOTE: cannot use 'PUBLIC' in target_link_libraries, # since yakl_process_target already used it # with the "plain" signature find_library(NETCDF_C netcdf HINTS ${NetCDF_C_PATH}/lib) -target_link_libraries(scream_rrtmgp_yakl ${NETCDF_C} rrtmgp scream_share Kokkos::kokkos) -target_include_directories(scream_rrtmgp_yakl PUBLIC +target_link_libraries(scream_rrtmgp_interface ${NETCDF_C} rrtmgp scream_share Kokkos::kokkos) +target_include_directories(scream_rrtmgp_interface PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) -target_include_directories(scream_rrtmgp_yakl SYSTEM PUBLIC +target_include_directories(scream_rrtmgp_interface SYSTEM PUBLIC ${NetCDF_C_PATH}/include ${EAM_RRTMGP_DIR}/external) @@ -186,7 +199,7 @@ set(SCREAM_RRTMGP_SOURCES ) add_library(scream_rrtmgp ${SCREAM_RRTMGP_SOURCES}) -target_link_libraries(scream_rrtmgp PUBLIC scream_share physics_share csm_share scream_rrtmgp_yakl Kokkos::kokkos) +target_link_libraries(scream_rrtmgp PUBLIC scream_share physics_share csm_share scream_rrtmgp_interface Kokkos::kokkos) set_target_properties(scream_rrtmgp PROPERTIES Fortran_MODULE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/modules ) @@ -199,9 +212,11 @@ target_include_directories(scream_rrtmgp PUBLIC # ${YAKL_${YAKL_ARCH}_FLAGS} flags to the CXX flags of scream_rrtmgp. # In particular, this will ensure that all the yakl macros # are correctly defined in YAKL headers, depending on the backend -if (YAKL_ARCH) - target_compile_options(scream_rrtmgp PUBLIC - "$<$:${YAKL_${YAKL_ARCH}_FLAGS_LIST}>") +if (SCREAM_RRTMGP_ENABLE_YAKL) + if (YAKL_ARCH) + target_compile_options(scream_rrtmgp PUBLIC + "$<$:${YAKL_${YAKL_ARCH}_FLAGS_LIST}>") + endif() endif() diff --git a/components/eamxx/src/physics/rrtmgp/eamxx_rrtmgp_process_interface.cpp b/components/eamxx/src/physics/rrtmgp/eamxx_rrtmgp_process_interface.cpp index 0121741963d9..ab61158152c1 100644 --- a/components/eamxx/src/physics/rrtmgp/eamxx_rrtmgp_process_interface.cpp +++ b/components/eamxx/src/physics/rrtmgp/eamxx_rrtmgp_process_interface.cpp @@ -636,6 +636,8 @@ void RRTMGPRadiation::initialize_impl(const RunType /* run_type */) { m_n2vmr = m_params.get("n2vmr", 0.7906); m_covmr = m_params.get("covmr", 1.0e-7); + const double multiplier = m_params.get("pool_size_multiplier", 1.0); + // Whether or not to do MCICA subcolumn sampling m_do_subcol_sampling = m_params.get("do_subcol_sampling",true); @@ -676,13 +678,14 @@ void RRTMGPRadiation::initialize_impl(const RunType /* run_type */) { m_gas_concs_k, coefficients_file_sw, coefficients_file_lw, cloud_optics_file_sw, cloud_optics_file_lw, - m_atm_logger + m_atm_logger, + multiplier ); VALIDATE_KOKKOS(m_gas_concs, m_gas_concs_k); - VALIDATE_KOKKOS(rrtmgp::k_dist_sw, interface_t::k_dist_sw_k); - VALIDATE_KOKKOS(rrtmgp::k_dist_lw, interface_t::k_dist_lw_k); - VALIDATE_KOKKOS(rrtmgp::cloud_optics_sw, interface_t::cloud_optics_sw_k); - VALIDATE_KOKKOS(rrtmgp::cloud_optics_lw, interface_t::cloud_optics_lw_k); + VALIDATE_KOKKOS(rrtmgp::k_dist_sw, *interface_t::k_dist_sw_k); + VALIDATE_KOKKOS(rrtmgp::k_dist_lw, *interface_t::k_dist_lw_k); + VALIDATE_KOKKOS(rrtmgp::cloud_optics_sw, *interface_t::cloud_optics_sw_k); + VALIDATE_KOKKOS(rrtmgp::cloud_optics_lw, *interface_t::cloud_optics_lw_k); #endif // Set property checks for fields in this process @@ -907,6 +910,7 @@ void RRTMGPRadiation::run_impl (const double dt) { ulrreal2dk d_dz = ulrreal2dk(m_buffer.d_dz.data(), m_col_chunk_size, m_nlay); auto d_mu0 = m_buffer.cosine_zenith; #ifdef RRTMGP_ENABLE_YAKL + TIMED_INLINE_KERNEL(init_views, // Create YAKL arrays. RRTMGP expects YAKL arrays with styleFortran, i.e., data has ncol // as the fastest index. For this reason we must copy the data. auto subview_1d = [&](const real1d v) -> real1d { @@ -976,9 +980,11 @@ void RRTMGPRadiation::run_impl (const double dt) { auto cld_tau_lw_bnd = subview_3d(m_buffer.cld_tau_lw_bnd); auto cld_tau_sw_gpt = subview_3d(m_buffer.cld_tau_sw_gpt); auto cld_tau_lw_gpt = subview_3d(m_buffer.cld_tau_lw_gpt); + ); #endif #ifdef RRTMGP_ENABLE_KOKKOS ConvertToRrtmgpSubview conv = {beg, ncol}; + TIMED_INLINE_KERNEL(init_views, // Note, ncol will not necessary be m_col_chunk_size because the number of cols // will not always be evenly divided by m_col_chunk_size. In most cases, the @@ -1039,6 +1045,7 @@ void RRTMGPRadiation::run_impl (const double dt) { auto cld_tau_lw_bnd_k = conv.subview3d(m_buffer.cld_tau_lw_bnd_k); auto cld_tau_sw_gpt_k = conv.subview3d(m_buffer.cld_tau_sw_gpt_k); auto cld_tau_lw_gpt_k = conv.subview3d(m_buffer.cld_tau_lw_gpt_k); + ); #endif // Set gas concs to "view" only the first ncol columns @@ -1072,6 +1079,7 @@ void RRTMGPRadiation::run_impl (const double dt) { Kokkos::deep_copy(d_mu0,h_mu0); const auto policy = ekat::ExeSpaceUtils::get_default_team_policy(ncol, m_nlay); + TIMED_KERNEL( Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const MemberType& team) { const int i = team.league_rank(); const int icol = i+beg; @@ -1215,6 +1223,7 @@ void RRTMGPRadiation::run_impl (const double dt) { } #endif }); + ); } Kokkos::fence(); #ifdef RRTMGP_ENABLE_KOKKOS @@ -1362,18 +1371,22 @@ void RRTMGPRadiation::run_impl (const double dt) { // Compute band-by-band surface_albedos. This is needed since // the AD passes broadband albedos, but rrtmgp require band-by-band. #ifdef RRTMGP_ENABLE_YAKL + TIMED_KERNEL( rrtmgp::compute_band_by_band_surface_albedos( ncol, nswbands, sfc_alb_dir_vis, sfc_alb_dir_nir, sfc_alb_dif_vis, sfc_alb_dif_nir, sfc_alb_dir, sfc_alb_dif); + ); #endif #ifdef RRTMGP_ENABLE_KOKKOS + TIMED_KERNEL( interface_t::compute_band_by_band_surface_albedos( ncol, nswbands, sfc_alb_dir_vis_k, sfc_alb_dir_nir_k, sfc_alb_dif_vis_k, sfc_alb_dif_nir_k, sfc_alb_dir_k, sfc_alb_dif_k); + ); COMPARE_ALL_WRAP(std::vector({sfc_alb_dir, sfc_alb_dif}), std::vector({sfc_alb_dir_k, sfc_alb_dif_k})); #endif @@ -1381,6 +1394,7 @@ void RRTMGPRadiation::run_impl (const double dt) { // Run RRTMGP driver #ifdef RRTMGP_ENABLE_YAKL + TIMED_KERNEL( rrtmgp::rrtmgp_main( ncol, m_nlay, p_lay, t_lay, p_lev, t_lev, @@ -1401,8 +1415,10 @@ void RRTMGPRadiation::run_impl (const double dt) { eccf, m_atm_logger, m_extra_clnclrsky_diag, m_extra_clnsky_diag ); + ); #endif #ifdef RRTMGP_ENABLE_KOKKOS + TIMED_KERNEL( interface_t::rrtmgp_main( ncol, m_nlay, p_lay_k, t_lay_k, p_lev_k, t_lev_k, @@ -1423,6 +1439,7 @@ void RRTMGPRadiation::run_impl (const double dt) { eccf, m_atm_logger, m_extra_clnclrsky_diag, m_extra_clnsky_diag ); + ); COMPARE_ALL_WRAP(std::vector({ sw_flux_up, sw_flux_dn, sw_flux_dn_dir, lw_flux_up, lw_flux_dn, sw_clnclrsky_flux_up, sw_clnclrsky_flux_dn, sw_clnclrsky_flux_dn_dir, @@ -1445,6 +1462,7 @@ void RRTMGPRadiation::run_impl (const double dt) { // Update heating tendency #ifdef RRTMGP_ENABLE_YAKL + TIMED_INLINE_KERNEL(heating_tendency, auto sw_heating = m_buffer.sw_heating; auto lw_heating = m_buffer.lw_heating; rrtmgp::compute_heating_rate( @@ -1466,8 +1484,10 @@ void RRTMGPRadiation::run_impl (const double dt) { }); } Kokkos::fence(); + ); #endif #ifdef RRTMGP_ENABLE_KOKKOS + TIMED_INLINE_KERNEL(heating_tendency, auto sw_heating_k = m_buffer.sw_heating_k; auto lw_heating_k = m_buffer.lw_heating_k; rrtmgp::compute_heating_rate( @@ -1489,6 +1509,7 @@ void RRTMGPRadiation::run_impl (const double dt) { }); } Kokkos::fence(); + ); COMPARE_ALL_WRAP(std::vector({sw_heating, lw_heating}), std::vector({sw_heating_k, lw_heating_k})); #endif @@ -1497,6 +1518,7 @@ void RRTMGPRadiation::run_impl (const double dt) { #ifdef RRTMGP_ENABLE_YAKL const int kbot = nlay+1; + TIMED_KERNEL( // Compute diffuse flux as difference between total and direct Kokkos::parallel_for(Kokkos::RangePolicy(0,nswbands*(nlay+1)*ncol), KOKKOS_LAMBDA (const int idx) { @@ -1513,10 +1535,12 @@ void RRTMGPRadiation::run_impl (const double dt) { sfc_flux_dir_vis, sfc_flux_dir_nir, sfc_flux_dif_vis, sfc_flux_dif_nir ); + ); #endif #ifdef RRTMGP_ENABLE_KOKKOS const int kbot_k = nlay; + TIMED_KERNEL( // Compute diffuse flux as difference between total and direct Kokkos::parallel_for(Kokkos::RangePolicy(0,nswbands*(nlay+1)*ncol), KOKKOS_LAMBDA (const int idx) { @@ -1533,12 +1557,14 @@ void RRTMGPRadiation::run_impl (const double dt) { sfc_flux_dir_vis_k, sfc_flux_dir_nir_k, sfc_flux_dif_vis_k, sfc_flux_dif_nir_k ); + ); COMPARE_ALL_WRAP(std::vector({sfc_flux_dir_vis, sfc_flux_dir_nir, sfc_flux_dif_vis, sfc_flux_dif_nir}), std::vector({sfc_flux_dir_vis_k, sfc_flux_dir_nir_k, sfc_flux_dif_vis_k, sfc_flux_dif_nir_k})); #endif // Compute diagnostic total cloud area (vertically-projected cloud cover) #ifdef RRTMGP_ENABLE_YAKL + TIMED_KERNEL( real1d cldlow ("cldlow", d_cldlow.data() + m_col_chunk_beg[ic], ncol); real1d cldmed ("cldmed", d_cldmed.data() + m_col_chunk_beg[ic], ncol); real1d cldhgh ("cldhgh", d_cldhgh.data() + m_col_chunk_beg[ic], ncol); @@ -1553,8 +1579,10 @@ void RRTMGPRadiation::run_impl (const double dt) { rrtmgp::compute_cloud_area(ncol, nlay, nlwgpts, 400e2, 700e2, p_lay, cld_tau_lw_gpt, cldmed); rrtmgp::compute_cloud_area(ncol, nlay, nlwgpts, 0, 400e2, p_lay, cld_tau_lw_gpt, cldhgh); rrtmgp::compute_cloud_area(ncol, nlay, nlwgpts, 0, std::numeric_limits::max(), p_lay, cld_tau_lw_gpt, cldtot); + ); #endif #ifdef RRTMGP_ENABLE_KOKKOS + TIMED_KERNEL( real1dk cldlow_k (d_cldlow.data() + m_col_chunk_beg[ic], ncol); real1dk cldmed_k (d_cldmed.data() + m_col_chunk_beg[ic], ncol); real1dk cldhgh_k (d_cldhgh.data() + m_col_chunk_beg[ic], ncol); @@ -1569,12 +1597,15 @@ void RRTMGPRadiation::run_impl (const double dt) { interface_t::compute_cloud_area(ncol, nlay, nlwgpts, 400e2, 700e2, p_lay_k, cld_tau_lw_gpt_k, cldmed_k); interface_t::compute_cloud_area(ncol, nlay, nlwgpts, 0, 400e2, p_lay_k, cld_tau_lw_gpt_k, cldhgh_k); interface_t::compute_cloud_area(ncol, nlay, nlwgpts, 0, std::numeric_limits::max(), p_lay_k, cld_tau_lw_gpt_k, cldtot_k); + ); COMPARE_ALL_WRAP(std::vector({cldlow, cldmed, cldhgh, cldtot}), std::vector({cldlow_k, cldmed_k, cldhgh_k, cldtot_k})); #endif // Compute cloud-top diagnostics following AeroCOM recommendation #ifdef RRTMGP_ENABLE_YAKL + TIMED_INLINE_KERNEL(cloud_top, + // Get visible 0.67 micron band for COSP auto idx_067 = rrtmgp::get_wavelength_index_sw(0.67e-6); // Get IR 10.5 micron band for COSP @@ -1595,8 +1626,10 @@ void RRTMGPRadiation::run_impl (const double dt) { nc, T_mid_at_cldtop, p_mid_at_cldtop, cldfrac_ice_at_cldtop, cldfrac_liq_at_cldtop, cldfrac_tot_at_cldtop, cdnc_at_cldtop, eff_radius_qc_at_cldtop, eff_radius_qi_at_cldtop); + ); #endif #ifdef RRTMGP_ENABLE_KOKKOS + TIMED_INLINE_KERNEL(cloud_top, // Get visible 0.67 micron band for COSP auto idx_067_k = interface_t::get_wavelength_index_sw_k(0.67e-6); // Get IR 10.5 micron band for COSP @@ -1616,6 +1649,7 @@ void RRTMGPRadiation::run_impl (const double dt) { nc_k, T_mid_at_cldtop_k, p_mid_at_cldtop_k, cldfrac_ice_at_cldtop_k, cldfrac_liq_at_cldtop_k, cldfrac_tot_at_cldtop_k, cdnc_at_cldtop_k, eff_radius_qc_at_cldtop_k, eff_radius_qi_at_cldtop_k); + ); COMPARE_ALL_WRAP(std::vector({ T_mid_at_cldtop, p_mid_at_cldtop, cldfrac_ice_at_cldtop, cldfrac_liq_at_cldtop, cldfrac_tot_at_cldtop, cdnc_at_cldtop, @@ -1629,6 +1663,7 @@ void RRTMGPRadiation::run_impl (const double dt) { // Copy output data back to FieldManager const auto policy = ekat::ExeSpaceUtils::get_default_team_policy(ncol, m_nlay); #ifdef RRTMGP_ENABLE_YAKL + TIMED_KERNEL( Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const MemberType& team) { const int i = team.league_rank(); const int icol = i + beg; @@ -1671,8 +1706,10 @@ void RRTMGPRadiation::run_impl (const double dt) { d_sunlit(icol) = 0.0; } }); + ); #endif #ifdef RRTMGP_ENABLE_KOKKOS + TIMED_KERNEL( Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const MemberType& team) { const int i = team.league_rank(); const int icol = i + beg; @@ -1714,6 +1751,7 @@ void RRTMGPRadiation::run_impl (const double dt) { d_sunlit(icol) = 0.0; } }); + ); #ifdef RRTMGP_ENABLE_YAKL // Sync back to gas_concs_k real3dk temp(gas_concs_k, std::make_pair(0, ncol), Kokkos::ALL, Kokkos::ALL); diff --git a/components/eamxx/src/physics/rrtmgp/rrtmgp_test_utils.hpp b/components/eamxx/src/physics/rrtmgp/rrtmgp_test_utils.hpp index 466467919fcf..b1cb1f424ec1 100644 --- a/components/eamxx/src/physics/rrtmgp/rrtmgp_test_utils.hpp +++ b/components/eamxx/src/physics/rrtmgp/rrtmgp_test_utils.hpp @@ -88,7 +88,7 @@ static void dummy_clouds( // put them in 2/3 of the columns since that's roughly the total cloudiness of earth. // Set sane values for liquid and ice water path. // NOTE: these "sane" values are in g/m2! - Kokkos::parallel_for( MDRP::template get<2>({nlay,ncol}) , KOKKOS_LAMBDA (int ilay, int icol) { + Kokkos::parallel_for( MDRP::template get<2>({ncol, nlay}) , KOKKOS_LAMBDA (int icol, int ilay) { cloud_mask(icol,ilay) = p_lay(icol,ilay) > 100. * 100. && p_lay(icol,ilay) < 900. * 100. && ((icol+1)%3) != 0; // Ice and liquid will overlap in a few layers lwp(icol,ilay) = conv::merge(10., 0., cloud_mask(icol,ilay) && t_lay(icol,ilay) > 263.); @@ -123,7 +123,7 @@ static void dummy_atmos( // needs the CloudOptics object only because it uses the min and max // valid values from the lookup tables for liquid and ice water path to // create a dummy atmosphere. - dummy_clouds(interface_t::cloud_optics_sw_k, p_lay, t_lay, lwp, iwp, rel, rei, cld); + dummy_clouds(*interface_t::cloud_optics_sw_k, p_lay, t_lay, lwp, iwp, rel, rei, cld); } static void read_fluxes( diff --git a/components/eamxx/src/physics/rrtmgp/rrtmgp_utils.hpp b/components/eamxx/src/physics/rrtmgp/rrtmgp_utils.hpp index aa6326c81811..863597a4fea3 100644 --- a/components/eamxx/src/physics/rrtmgp/rrtmgp_utils.hpp +++ b/components/eamxx/src/physics/rrtmgp/rrtmgp_utils.hpp @@ -40,12 +40,12 @@ void compute_heating_rate ( using physconst = scream::physics::Constants; auto ncol = flux_up.dimension[0]; auto nlay = flux_up.dimension[1]-1; - yakl::fortran::parallel_for(yakl::fortran::SimpleBounds<2>(nlay,ncol), YAKL_LAMBDA(int ilay, int icol) { + TIMED_KERNEL(yakl::fortran::parallel_for(yakl::fortran::SimpleBounds<2>(nlay,ncol), YAKL_LAMBDA(int ilay, int icol) { heating_rate(icol,ilay) = ( flux_up(icol,ilay+1) - flux_up(icol,ilay) - flux_dn(icol,ilay+1) + flux_dn(icol,ilay) ) * physconst::gravit / (physconst::Cpair * pdel(icol,ilay)); - }); + })); } #endif #ifdef RRTMGP_ENABLE_KOKKOS @@ -57,15 +57,15 @@ void compute_heating_rate ( View4 const &heating_rate) { using physconst = scream::physics::Constants; - using MDRP = typename conv::MDRP; - auto ncol = flux_up.extent(0); - auto nlay = flux_up.extent(1)-1; - Kokkos::parallel_for(MDRP::template get<2>({nlay,ncol}), KOKKOS_LAMBDA(int ilay, int icol) { + using LayoutT = typename View1::array_layout; + const int ncol = (int)flux_up.extent(0); + const int nlay = (int)flux_up.extent(1)-1; + TIMED_KERNEL(FLATTEN_MD_KERNEL2(ncol, nlay, icol, ilay, heating_rate(icol,ilay) = ( flux_up(icol,ilay+1) - flux_up(icol,ilay) - flux_dn(icol,ilay+1) + flux_dn(icol,ilay) ) * physconst::gravit / (physconst::Cpair * pdel(icol,ilay)); - }); + )); } #endif diff --git a/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.cpp b/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.cpp index 10281df8078c..9796ce49dbe0 100644 --- a/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.cpp +++ b/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.cpp @@ -122,11 +122,11 @@ OpticalProps2str get_subsampled_clouds( // randomly overlapped. auto cldfrac_rad = real2d("cldfrac_rad", ncol, nlay); memset(cldfrac_rad, 0.0); // Start with all zeros - parallel_for(SimpleBounds<3>(nbnd,nlay,ncol), YAKL_LAMBDA (int ibnd, int ilay, int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<3>(nbnd,nlay,ncol), YAKL_LAMBDA (int ibnd, int ilay, int icol) { if (cloud_optics.tau(icol,ilay,ibnd) > 0) { cldfrac_rad(icol,ilay) = cld(icol,ilay); } - }); + })); // Get subcolumn cloud mask; note that get_subcolumn_mask exposes overlap assumption as an option, // but the only currently supported options are 0 (trivial all-or-nothing cloud) or 1 (max-rand), // so overlap has not been exposed as an option beyond this subcolumn. In the future, we should @@ -136,14 +136,14 @@ OpticalProps2str get_subsampled_clouds( // Get unique seeds for each column that are reproducible across different MPI rank layouts; // use decimal part of pressure for this, consistent with the implementation in EAM auto seeds = int1d("seeds", ncol); - parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) { seeds(icol) = 1e9 * (p_lay(icol,nlay) - int(p_lay(icol,nlay))); - }); + })); auto cldmask = get_subcolumn_mask(ncol, nlay, ngpt, cldfrac_rad, overlap, seeds); // Assign optical properties to subcolumns (note this implements MCICA) auto gpoint_bands = kdist.get_gpoint_bands(); - parallel_for(SimpleBounds<3>(ngpt,nlay,ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<3>(ngpt,nlay,ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) { auto ibnd = gpoint_bands(igpt); if (cldmask(icol,ilay,igpt) == 1) { subsampled_optics.tau(icol,ilay,igpt) = cloud_optics.tau(icol,ilay,ibnd); @@ -154,7 +154,7 @@ OpticalProps2str get_subsampled_clouds( subsampled_optics.ssa(icol,ilay,igpt) = 0; subsampled_optics.g (icol,ilay,igpt) = 0; } - }); + })); return subsampled_optics; } @@ -174,31 +174,31 @@ OpticalProps1scl get_subsampled_clouds( // randomly overlapped. auto cldfrac_rad = real2d("cldfrac_rad", ncol, nlay); memset(cldfrac_rad, 0.0); // Start with all zeros - parallel_for(SimpleBounds<3>(nbnd,nlay,ncol), YAKL_LAMBDA (int ibnd, int ilay, int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<3>(nbnd,nlay,ncol), YAKL_LAMBDA (int ibnd, int ilay, int icol) { if (cloud_optics.tau(icol,ilay,ibnd) > 0) { cldfrac_rad(icol,ilay) = cld(icol,ilay); } - }); + })); // Get subcolumn cloud mask int overlap = 1; // Get unique seeds for each column that are reproducible across different MPI rank layouts; // use decimal part of pressure for this, consistent with the implementation in EAM; use different // seed values for longwave and shortwave auto seeds = int1d("seeds", ncol); - parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) { seeds(icol) = 1e9 * (p_lay(icol,nlay-1) - int(p_lay(icol,nlay-1))); - }); + })); auto cldmask = get_subcolumn_mask(ncol, nlay, ngpt, cldfrac_rad, overlap, seeds); // Assign optical properties to subcolumns (note this implements MCICA) auto gpoint_bands = kdist.get_gpoint_bands(); - parallel_for(SimpleBounds<3>(ngpt,nlay,ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<3>(ngpt,nlay,ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) { auto ibnd = gpoint_bands(igpt); if (cldmask(icol,ilay,igpt) == 1) { subsampled_optics.tau(icol,ilay,igpt) = cloud_optics.tau(icol,ilay,ibnd); } else { subsampled_optics.tau(icol,ilay,igpt) = 0; } - }); + })); return subsampled_optics; } @@ -260,7 +260,7 @@ void compute_band_by_band_surface_albedos( // Loop over bands, and determine for each band whether it is broadly in the // visible or infrared part of the spectrum (visible or "not visible") - parallel_for(SimpleBounds<2>(nswbands, ncol), YAKL_LAMBDA(const int ibnd, const int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<2>(nswbands, ncol), YAKL_LAMBDA(const int ibnd, const int icol) { // Threshold between visible and infrared is 0.7 micron, or 14286 cm^-1. const real visible_wavenumber_threshold = 14286; @@ -291,7 +291,7 @@ void compute_band_by_band_surface_albedos( sfc_alb_dif(icol,ibnd) = 0.5*(sfc_alb_dif_vis(icol) + sfc_alb_dif_nir(icol)); } - }); + })); } void compute_broadband_surface_fluxes( @@ -317,7 +317,7 @@ void compute_broadband_surface_fluxes( // Threshold between visible and infrared is 0.7 micron, or 14286 cm^-1. const real visible_wavenumber_threshold = 14286; auto wavenumber_limits = k_dist_sw.get_band_lims_wavenumber(); - parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(const int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(const int icol) { for (int ibnd = 1; ibnd <= nswbands; ++ibnd) { // Wavenumber is in the visible if it is above the visible wavenumber // threshold, and in the infrared if it is below the threshold @@ -346,7 +346,7 @@ void compute_broadband_surface_fluxes( sfc_flux_dif_nir(icol) += 0.5 * sw_bnd_flux_dif(icol,ktop,ibnd); } } - }); + })); } void rrtmgp_main( @@ -439,16 +439,16 @@ void rrtmgp_main( OpticalProps1scl aerosol_lw; aerosol_sw.init(k_dist_sw.get_band_lims_wavenumber()); aerosol_sw.alloc_2str(ncol, nlay); - parallel_for(SimpleBounds<3>(nswbands,nlay,ncol) , YAKL_LAMBDA (int ibnd, int ilay, int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<3>(nswbands,nlay,ncol) , YAKL_LAMBDA (int ibnd, int ilay, int icol) { aerosol_sw.tau(icol,ilay,ibnd) = aer_tau_sw(icol,ilay,ibnd); aerosol_sw.ssa(icol,ilay,ibnd) = aer_ssa_sw(icol,ilay,ibnd); aerosol_sw.g (icol,ilay,ibnd) = aer_asm_sw(icol,ilay,ibnd); - }); + })); aerosol_lw.init(k_dist_lw.get_band_lims_wavenumber()); aerosol_lw.alloc_1scl(ncol, nlay); - parallel_for(SimpleBounds<3>(nlwbands,nlay,ncol) , YAKL_LAMBDA (int ibnd, int ilay, int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<3>(nlwbands,nlay,ncol) , YAKL_LAMBDA (int ibnd, int ilay, int icol) { aerosol_lw.tau(icol,ilay,ibnd) = aer_tau_lw(icol,ilay,ibnd); - }); + })); #ifdef SCREAM_RRTMGP_DEBUG // Check aerosol optical properties @@ -477,12 +477,12 @@ void rrtmgp_main( // Copy cloud properties to outputs (is this needed, or can we just use pointers?) // Alternatively, just compute and output a subcolumn cloud mask - parallel_for(SimpleBounds<3>(nswgpts, nlay, ncol), YAKL_LAMBDA (int igpt, int ilay, int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<3>(nswgpts, nlay, ncol), YAKL_LAMBDA (int igpt, int ilay, int icol) { cld_tau_sw_gpt(icol,ilay,igpt) = clouds_sw_gpt.tau(icol,ilay,igpt); - }); - parallel_for(SimpleBounds<3>(nlwgpts, nlay, ncol), YAKL_LAMBDA (int igpt, int ilay, int icol) { + })); + TIMED_KERNEL(parallel_for(SimpleBounds<3>(nlwgpts, nlay, ncol), YAKL_LAMBDA (int igpt, int ilay, int icol) { cld_tau_lw_gpt(icol,ilay,igpt) = clouds_lw_gpt.tau(icol,ilay,igpt); - }); + })); #ifdef SCREAM_RRTMGP_DEBUG // Perform checks on optics; these would be caught by RRTMGP_EXPENSIVE_CHECKS in the RRTMGP code, @@ -542,16 +542,16 @@ int3d get_subcolumn_mask(const int ncol, const int nlay, const int ngpt, real2d // https://github.com/AER-RC/RRTMG_SW/blob/master/src/mcica_subcol_gen_sw.f90) // // First, fill cldx with random numbers. Need to use a unique seed for each column! - parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) { yakl::Random rand(seeds(icol)); for (int igpt = 1; igpt <= ngpt; igpt++) { for (int ilay = 1; ilay <= nlay; ilay++) { cldx(icol,ilay,igpt) = rand.genFP(); } } - }); + })); // Step down columns and apply algorithm from eq (14) - parallel_for(SimpleBounds<2>(ngpt,ncol), YAKL_LAMBDA(int igpt, int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<2>(ngpt,ncol), YAKL_LAMBDA(int igpt, int icol) { for (int ilay = 2; ilay <= nlay; ilay++) { // Check cldx in level above and see if it satisfies conditions to create a cloudy subcolumn if (cldx(icol,ilay-1,igpt) > 1.0 - cldf(icol,ilay-1)) { @@ -567,17 +567,17 @@ int3d get_subcolumn_mask(const int ncol, const int nlay, const int ngpt, real2d cldx(icol,ilay,igpt) = cldx(icol,ilay ,igpt) * (1.0 - cldf(icol,ilay-1)); } } - }); + })); } // Use cldx array to create subcolumn mask - parallel_for(SimpleBounds<3>(ngpt,nlay,ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<3>(ngpt,nlay,ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) { if (cldx(icol,ilay,igpt) > 1.0 - cldf(icol,ilay)) { subcolumn_mask(icol,ilay,igpt) = 1; } else { subcolumn_mask(icol,ilay,igpt) = 0; } - }); + })); return subcolumn_mask; } @@ -616,7 +616,7 @@ void rrtmgp_sw( auto &clnsky_flux_dn_dir = clnsky_fluxes.flux_dn_dir; // Reset fluxes to zero - parallel_for(SimpleBounds<2>(nlay+1,ncol), YAKL_LAMBDA(int ilev, int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay+1,ncol), YAKL_LAMBDA(int ilev, int icol) { flux_up (icol,ilev) = 0; flux_dn (icol,ilev) = 0; flux_dn_dir(icol,ilev) = 0; @@ -629,12 +629,12 @@ void rrtmgp_sw( clnsky_flux_up (icol,ilev) = 0; clnsky_flux_dn (icol,ilev) = 0; clnsky_flux_dn_dir(icol,ilev) = 0; - }); - parallel_for(SimpleBounds<3>(nbnd,nlay+1,ncol), YAKL_LAMBDA(int ibnd, int ilev, int icol) { + })); + TIMED_KERNEL(parallel_for(SimpleBounds<3>(nbnd,nlay+1,ncol), YAKL_LAMBDA(int ibnd, int ilev, int icol) { bnd_flux_up (icol,ilev,ibnd) = 0; bnd_flux_dn (icol,ilev,ibnd) = 0; bnd_flux_dn_dir(icol,ilev,ibnd) = 0; - }); + })); // Get daytime indices auto dayIndices = int1d("dayIndices", ncol); @@ -660,23 +660,23 @@ void rrtmgp_sw( // Subset mu0 auto mu0_day = real1d("mu0_day", nday); - parallel_for(SimpleBounds<1>(nday), YAKL_LAMBDA(int iday) { + TIMED_KERNEL(parallel_for(SimpleBounds<1>(nday), YAKL_LAMBDA(int iday) { mu0_day(iday) = mu0(dayIndices(iday)); - }); + })); // subset state variables auto p_lay_day = real2d("p_lay_day", nday, nlay); auto t_lay_day = real2d("t_lay_day", nday, nlay); - parallel_for(SimpleBounds<2>(nlay,nday), YAKL_LAMBDA(int ilay, int iday) { + TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay,nday), YAKL_LAMBDA(int ilay, int iday) { p_lay_day(iday,ilay) = p_lay(dayIndices(iday),ilay); t_lay_day(iday,ilay) = t_lay(dayIndices(iday),ilay); - }); + })); auto p_lev_day = real2d("p_lev_day", nday, nlay+1); auto t_lev_day = real2d("t_lev_day", nday, nlay+1); - parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) { + TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) { p_lev_day(iday,ilev) = p_lev(dayIndices(iday),ilev); t_lev_day(iday,ilev) = t_lev(dayIndices(iday),ilev); - }); + })); // Subset gases auto gas_names = gas_concs.get_gas_names(); @@ -686,9 +686,9 @@ void rrtmgp_sw( auto vmr_day = real2d("vmr_day", nday, nlay); auto vmr = real2d("vmr" , ncol, nlay); gas_concs.get_vmr(gas_names[igas], vmr); - parallel_for(SimpleBounds<2>(nlay,nday), YAKL_LAMBDA(int ilay, int iday) { + TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay,nday), YAKL_LAMBDA(int ilay, int iday) { vmr_day(iday,ilay) = vmr(dayIndices(iday),ilay); - }); + })); gas_concs_day.set_vmr(gas_names[igas], vmr_day); } @@ -696,32 +696,32 @@ void rrtmgp_sw( OpticalProps2str aerosol_day; aerosol_day.init(k_dist.get_band_lims_wavenumber()); aerosol_day.alloc_2str(nday, nlay); - parallel_for(SimpleBounds<3>(nbnd,nlay,nday), YAKL_LAMBDA(int ibnd, int ilay, int iday) { + TIMED_KERNEL(parallel_for(SimpleBounds<3>(nbnd,nlay,nday), YAKL_LAMBDA(int ibnd, int ilay, int iday) { aerosol_day.tau(iday,ilay,ibnd) = aerosol.tau(dayIndices(iday),ilay,ibnd); aerosol_day.ssa(iday,ilay,ibnd) = aerosol.ssa(dayIndices(iday),ilay,ibnd); aerosol_day.g (iday,ilay,ibnd) = aerosol.g (dayIndices(iday),ilay,ibnd); - }); + })); // Subset cloud optics // TODO: nbnd -> ngpt once we pass sub-sampled cloud state OpticalProps2str clouds_day; clouds_day.init(k_dist.get_band_lims_wavenumber(), k_dist.get_band_lims_gpoint()); clouds_day.alloc_2str(nday, nlay); - parallel_for(SimpleBounds<3>(ngpt,nlay,nday), YAKL_LAMBDA(int igpt, int ilay, int iday) { + TIMED_KERNEL(parallel_for(SimpleBounds<3>(ngpt,nlay,nday), YAKL_LAMBDA(int igpt, int ilay, int iday) { clouds_day.tau(iday,ilay,igpt) = clouds.tau(dayIndices(iday),ilay,igpt); clouds_day.ssa(iday,ilay,igpt) = clouds.ssa(dayIndices(iday),ilay,igpt); clouds_day.g (iday,ilay,igpt) = clouds.g (dayIndices(iday),ilay,igpt); - }); + })); // RRTMGP assumes surface albedos have a screwy dimension ordering // for some strange reason, so we need to transpose these; also do // daytime subsetting in the same kernel real2d sfc_alb_dir_T("sfc_alb_dir", nbnd, nday); real2d sfc_alb_dif_T("sfc_alb_dif", nbnd, nday); - parallel_for(SimpleBounds<2>(nbnd,nday), YAKL_LAMBDA(int ibnd, int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<2>(nbnd,nday), YAKL_LAMBDA(int ibnd, int icol) { sfc_alb_dir_T(ibnd,icol) = sfc_alb_dir(dayIndices(icol),ibnd); sfc_alb_dif_T(ibnd,icol) = sfc_alb_dif(dayIndices(icol),ibnd); - }); + })); // Temporaries we need for daytime-only fluxes auto flux_up_day = real2d("flux_up_day", nday, nlay+1); @@ -770,20 +770,20 @@ void rrtmgp_sw( #endif // Apply tsi_scaling - parallel_for(SimpleBounds<2>(ngpt,nday), YAKL_LAMBDA(int igpt, int iday) { + TIMED_KERNEL(parallel_for(SimpleBounds<2>(ngpt,nday), YAKL_LAMBDA(int igpt, int iday) { toa_flux(iday,igpt) = tsi_scaling * toa_flux(iday,igpt); - }); + })); if (extra_clnclrsky_diag) { // Compute clear-clean-sky (just gas) fluxes on daytime columns rte_sw(optics, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day); // Expand daytime fluxes to all columns - parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) { + TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) { int icol = dayIndices(iday); clnclrsky_flux_up (icol,ilev) = flux_up_day (iday,ilev); clnclrsky_flux_dn (icol,ilev) = flux_dn_day (iday,ilev); clnclrsky_flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev); - }); + })); } // Combine gas and aerosol optics @@ -794,12 +794,12 @@ void rrtmgp_sw( rte_sw(optics, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day); // Expand daytime fluxes to all columns - parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) { + TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) { int icol = dayIndices(iday); clrsky_flux_up (icol,ilev) = flux_up_day (iday,ilev); clrsky_flux_dn (icol,ilev) = flux_dn_day (iday,ilev); clrsky_flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev); - }); + })); // Now merge in cloud optics and do allsky calculations @@ -809,18 +809,18 @@ void rrtmgp_sw( // Compute fluxes on daytime columns rte_sw(optics, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day); // Expand daytime fluxes to all columns - parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) { + TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) { int icol = dayIndices(iday); flux_up (icol,ilev) = flux_up_day (iday,ilev); flux_dn (icol,ilev) = flux_dn_day (iday,ilev); flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev); - }); - parallel_for(SimpleBounds<3>(nbnd,nlay+1,nday), YAKL_LAMBDA(int ibnd, int ilev, int iday) { + })); + TIMED_KERNEL(parallel_for(SimpleBounds<3>(nbnd,nlay+1,nday), YAKL_LAMBDA(int ibnd, int ilev, int iday) { int icol = dayIndices(iday); bnd_flux_up (icol,ilev,ibnd) = bnd_flux_up_day (iday,ilev,ibnd); bnd_flux_dn (icol,ilev,ibnd) = bnd_flux_dn_day (iday,ilev,ibnd); bnd_flux_dn_dir(icol,ilev,ibnd) = bnd_flux_dn_dir_day(iday,ilev,ibnd); - }); + })); if (extra_clnsky_diag) { // First increment clouds in optics_no_aerosols @@ -828,12 +828,12 @@ void rrtmgp_sw( // Compute cleansky (gas + clouds) fluxes on daytime columns rte_sw(optics_no_aerosols, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day); // Expand daytime fluxes to all columns - parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) { + TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) { int icol = dayIndices(iday); clnsky_flux_up (icol,ilev) = flux_up_day (iday,ilev); clnsky_flux_dn (icol,ilev) = flux_dn_day (iday,ilev); clnsky_flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev); - }); + })); } } @@ -863,7 +863,7 @@ void rrtmgp_lw( auto &clnsky_flux_dn = clnsky_fluxes.flux_dn; // Reset fluxes to zero - parallel_for( + TIMED_KERNEL(parallel_for( SimpleBounds<2>(nlay + 1, ncol), YAKL_LAMBDA(int ilev, int icol) { flux_up(icol, ilev) = 0; flux_dn(icol, ilev) = 0; @@ -873,13 +873,13 @@ void rrtmgp_lw( clrsky_flux_dn(icol, ilev) = 0; clnsky_flux_up(icol, ilev) = 0; clnsky_flux_dn(icol, ilev) = 0; - }); - parallel_for( + })); + TIMED_KERNEL(parallel_for( SimpleBounds<3>(nbnd, nlay + 1, ncol), YAKL_LAMBDA(int ibnd, int ilev, int icol) { bnd_flux_up(icol, ilev, ibnd) = 0; bnd_flux_dn(icol, ilev, ibnd) = 0; - }); + })); // Allocate space for optical properties OpticalProps1scl optics; @@ -899,9 +899,9 @@ void rrtmgp_lw( // Surface temperature auto p_lay_host = p_lay.createHostCopy(); bool top_at_1 = p_lay_host(1, 1) < p_lay_host(1, nlay); - parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) { + TIMED_KERNEL(parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) { t_sfc(icol) = t_lev(icol, merge(nlay+1, 1, top_at_1)); - }); + })); memset(emis_sfc , 0.98_wp); // Get Gaussian quadrature weights @@ -977,22 +977,22 @@ void compute_cloud_area( // then 2d subcol mask is 1, otherwise it is 0 auto subcol_mask = real2d("subcol_mask", ncol, ngpt); memset(subcol_mask, 0); - yakl::fortran::parallel_for(SimpleBounds<3>(ngpt, nlay, ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) { + TIMED_KERNEL(yakl::fortran::parallel_for(SimpleBounds<3>(ngpt, nlay, ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) { // NOTE: using plev would need to assume level ordering (top to bottom or bottom to top), but // using play/pmid does not if (cld_tau_gpt(icol,ilay,igpt) > 0 && pmid(icol,ilay) >= pmin && pmid(icol,ilay) < pmax) { subcol_mask(icol,igpt) = 1; } - }); + })); // Compute average over subcols to get cloud area auto ngpt_inv = 1.0 / ngpt; memset(cld_area, 0); - yakl::fortran::parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) { + TIMED_KERNEL(yakl::fortran::parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) { // This loop needs to be serial because of the atomic reduction for (int igpt = 1; igpt <= ngpt; ++igpt) { cld_area(icol) += subcol_mask(icol,igpt) * ngpt_inv; } - }); + })); } int get_wavelength_index_sw(double wavelength) { return get_wavelength_index(k_dist_sw, wavelength); } @@ -1008,7 +1008,7 @@ int get_wavelength_index(OpticalProps &kdist, double wavelength) { // in units of meters, we need a conversion factor of 10^2 int nbnds = kdist.get_nband(); yakl::ScalarLiveOut band_index(-1); - yakl::fortran::parallel_for(SimpleBounds<1>(nbnds), YAKL_LAMBDA(int ibnd) { + TIMED_KERNEL(yakl::fortran::parallel_for(SimpleBounds<1>(nbnds), YAKL_LAMBDA(int ibnd) { if (wavelength_bounds(1,ibnd) < wavelength_bounds(2,ibnd)) { if (wavelength_bounds(1,ibnd) <= wavelength * 1e2 && wavelength * 1e2 <= wavelength_bounds(2,ibnd)) { band_index = ibnd; @@ -1018,7 +1018,7 @@ int get_wavelength_index(OpticalProps &kdist, double wavelength) { band_index = ibnd; } } - }); + })); return band_index.hostRead(); } @@ -1056,7 +1056,7 @@ void compute_aerocom_cloudtop( // TODO: move tunable constant to namelist constexpr real cldfrac_tot_threshold = 0.001; // BAD_CONSTANT! // Loop over all columns in parallel - yakl::fortran::parallel_for( + TIMED_KERNEL(yakl::fortran::parallel_for( SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) { // Loop over all layers in serial (due to accumulative // product), starting at 2 (second highest) layer because the @@ -1116,7 +1116,7 @@ void compute_aerocom_cloudtop( // aerocom_clr is the result of accumulative probabilities // (their products) cldfrac_tot_at_cldtop(icol) = 1.0 - aerocom_clr(icol); - }); + })); } } // namespace rrtmgp diff --git a/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.hpp b/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.hpp index e807643b05a9..44cd16ac2314 100644 --- a/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.hpp +++ b/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.hpp @@ -130,7 +130,7 @@ void mixing_ratio_to_cloud_mass( int ncol = mixing_ratio.dimension[0]; int nlay = mixing_ratio.dimension[1]; using physconst = scream::physics::Constants; - yakl::fortran::parallel_for(yakl::fortran::SimpleBounds<2>(nlay, ncol), YAKL_LAMBDA(int ilay, int icol) { + TIMED_KERNEL(yakl::fortran::parallel_for(yakl::fortran::SimpleBounds<2>(nlay, ncol), YAKL_LAMBDA(int ilay, int icol) { // Compute in-cloud mixing ratio (mixing ratio of the cloudy part of the layer) // NOTE: these thresholds (from E3SM) seem arbitrary, but included here for consistency // This limits in-cloud mixing ratio to 0.005 kg/kg. According to note in cloud_diagnostics @@ -142,14 +142,14 @@ void mixing_ratio_to_cloud_mass( } else { cloud_mass(icol,ilay) = 0; } - }); + })); } template void limit_to_bounds(S const &arr_in, T const lower, T const upper, S &arr_out) { - yakl::c::parallel_for(arr_in.totElems(), YAKL_LAMBDA(int i) { + TIMED_KERNEL(yakl::c::parallel_for(arr_in.totElems(), YAKL_LAMBDA(int i) { arr_out.data()[i] = std::min(std::max(arr_in.data()[i], lower), upper); - }); + })); } int get_wavelength_index(OpticalProps &kdist, double wavelength); @@ -170,7 +170,7 @@ using view_t = Kokkos::View; template using hview_t = Kokkos::View; -using pool_t = conv::MemPoolSingleton; +using pool_t = conv::MemPoolSingleton; using real1dk = view_t; using real2dk = view_t; @@ -179,6 +179,7 @@ using creal1dk = view_t; using creal2dk = view_t; using creal3dk = view_t; using int1dk = view_t; +using int2dk = view_t; using int3dk = view_t; using gas_optics_t = GasOpticsRRTMGPK; @@ -196,16 +197,16 @@ using source_func_t = SourceFuncLWK; * once and then persist throughout the life of the program, so we * declare them here within the rrtmgp namespace. */ -static inline gas_optics_t k_dist_sw_k; -static inline gas_optics_t k_dist_lw_k; +static inline std::unique_ptr k_dist_sw_k; +static inline std::unique_ptr k_dist_lw_k; /* * Objects containing cloud optical property look-up table information. * We want to initialize these once and use throughout the life of the * program, so declare here and read data in during rrtmgp_initialize(). */ -static inline cloud_optics_t cloud_optics_sw_k; -static inline cloud_optics_t cloud_optics_lw_k; +static inline std::unique_ptr cloud_optics_sw_k; +static inline std::unique_ptr cloud_optics_lw_k; /* * Flag to indicate whether or not we have initialized RRTMGP @@ -213,13 +214,14 @@ static inline cloud_optics_t cloud_optics_lw_k; static inline bool initialized_k = false; /* - * Initialize data for RRTMGP driver + * Initialize data for RRTMGP driver. Increase multiplier to allocate more pool space. */ static void rrtmgp_initialize( const gas_concs_t &gas_concs, const std::string& coefficients_file_sw, const std::string& coefficients_file_lw, const std::string& cloud_optics_file_sw, const std::string& cloud_optics_file_lw, - const std::shared_ptr& logger) + const std::shared_ptr& logger, + const double multiplier = 1.0) { // If we've already initialized, just exit if (initialized_k) { @@ -231,21 +233,27 @@ static void rrtmgp_initialize( // Initialize Kokkos if (!Kokkos::is_initialized()) { Kokkos::initialize(); } + // Create objects for static ptrs + k_dist_sw_k = std::make_unique(); + k_dist_lw_k = std::make_unique(); + cloud_optics_sw_k = std::make_unique(); + cloud_optics_lw_k = std::make_unique(); + // Load and initialize absorption coefficient data - load_and_init(k_dist_sw_k, coefficients_file_sw, gas_concs); - load_and_init(k_dist_lw_k, coefficients_file_lw, gas_concs); + load_and_init(*k_dist_sw_k, coefficients_file_sw, gas_concs); + load_and_init(*k_dist_lw_k, coefficients_file_lw, gas_concs); // Load and initialize cloud optical property look-up table information - load_cld_lutcoeff(cloud_optics_sw_k, cloud_optics_file_sw); - load_cld_lutcoeff(cloud_optics_lw_k, cloud_optics_file_lw); + load_cld_lutcoeff(*cloud_optics_sw_k, cloud_optics_file_sw); + load_cld_lutcoeff(*cloud_optics_lw_k, cloud_optics_file_lw); // initialize kokkos rrtmgp pool allocator - const size_t base_ref = 80000; + const size_t base_ref = 40000; const size_t ncol = gas_concs.ncol; const size_t nlay = gas_concs.nlay; const size_t nlev = SCREAM_NUM_VERTICAL_LEV; const size_t my_size_ref = ncol * nlay * nlev; - pool_t::init(2e6 * (float(my_size_ref) / base_ref)); + pool_t::init(2e6 * (float(my_size_ref) / base_ref) * multiplier); // We are now initialized! initialized_k = true; @@ -261,7 +269,7 @@ static void compute_band_by_band_surface_albedos( const real2dk &sfc_alb_dir, const real2dk &sfc_alb_dif) { EKAT_ASSERT_MSG(initialized_k, "Error! rrtmgp_initialize must be called before GasOpticsRRTMGP object can be used."); - auto wavenumber_limits = k_dist_sw_k.get_band_lims_wavenumber(); + auto wavenumber_limits = k_dist_sw_k->get_band_lims_wavenumber(); EKAT_ASSERT_MSG(wavenumber_limits.extent(0) == 2, "Error! 1st dimension for wavenumber_limits should be 2. It's " << wavenumber_limits.extent(0)); @@ -270,8 +278,7 @@ static void compute_band_by_band_surface_albedos( // Loop over bands, and determine for each band whether it is broadly in the // visible or infrared part of the spectrum (visible or "not visible") - Kokkos::parallel_for(MDRP::template get<2>({nswbands, ncol}), KOKKOS_LAMBDA(const int ibnd, const int icol) { - + TIMED_KERNEL(FLATTEN_MD_KERNEL2(ncol, nswbands, icol, ibnd, // Threshold between visible and infrared is 0.7 micron, or 14286 cm^-1. const RealT visible_wavenumber_threshold = 14286; @@ -297,7 +304,7 @@ static void compute_band_by_band_surface_albedos( sfc_alb_dir(icol,ibnd) = 0.5*(sfc_alb_dir_vis(icol) + sfc_alb_dir_nir(icol)); sfc_alb_dif(icol,ibnd) = 0.5*(sfc_alb_dif_vis(icol) + sfc_alb_dif_nir(icol)); } - }); + )); } /* @@ -326,8 +333,8 @@ static void compute_broadband_surface_fluxes( // Threshold between visible and infrared is 0.7 micron, or 14286 cm^-1. const RealT visible_wavenumber_threshold = 14286; - auto wavenumber_limits = k_dist_sw_k.get_band_lims_wavenumber(); - Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(const int icol) { + auto wavenumber_limits = k_dist_sw_k->get_band_lims_wavenumber(); + TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(const int icol) { for (int ibnd = 0; ibnd < nswbands; ++ibnd) { // Wavenumber is in the visible if it is above the visible wavenumber // threshold, and in the infrared if it is below the threshold @@ -353,7 +360,7 @@ static void compute_broadband_surface_fluxes( sfc_flux_dif_nir(icol) += 0.5 * sw_bnd_flux_dif(icol,ktop,ibnd); } } - }); + })); } /* @@ -384,12 +391,17 @@ static void rrtmgp_main( const std::shared_ptr& logger, const bool extra_clnclrsky_diag = false, const bool extra_clnsky_diag = false) { + const int sw_nband = k_dist_sw_k->get_nband(); + const int lw_nband = k_dist_lw_k->get_nband(); + const int sw_ngpt = k_dist_sw_k->get_ngpt(); + const int lw_ngpt = k_dist_lw_k->get_ngpt(); + #ifdef SCREAM_RRTMGP_DEBUG // Sanity check inputs, and possibly repair - check_range_k(t_lay , k_dist_sw_k.get_temp_min(), k_dist_sw_k.get_temp_max(), "rrtmgp_main::t_lay"); - check_range_k(t_lev , k_dist_sw_k.get_temp_min(), k_dist_sw_k.get_temp_max(), "rrtmgp_main::t_lev"); - check_range_k(p_lay , k_dist_sw_k.get_press_min(), k_dist_sw_k.get_press_max(), "rrtmgp_main::p_lay"); - check_range_k(p_lev , k_dist_sw_k.get_press_min(), k_dist_sw_k.get_press_max(), "rrtmgp_main::p_lev"); + check_range_k(t_lay , k_dist_sw_k->get_temp_min(), k_dist_sw_k->get_temp_max(), "rrtmgp_main::t_lay"); + check_range_k(t_lev , k_dist_sw_k->get_temp_min(), k_dist_sw_k->get_temp_max(), "rrtmgp_main::t_lev"); + check_range_k(p_lay , k_dist_sw_k->get_press_min(), k_dist_sw_k->get_press_max(), "rrtmgp_main::p_lay"); + check_range_k(p_lev , k_dist_sw_k->get_press_min(), k_dist_sw_k->get_press_max(), "rrtmgp_main::p_lev"); check_range_k(sfc_alb_dir, 0, 1, "rrtmgp_main::sfc_alb_dir"); check_range_k(sfc_alb_dif, 0, 1, "rrtmgp_main::sfc_alb_dif"); check_range_k(mu0 , 0, 1, "rrtmgp_main::mu0"); @@ -399,6 +411,36 @@ static void rrtmgp_main( check_range_k(rei , 0, std::numeric_limits::max(), "rrtmgp_main::rei"); #endif + auto sw_band2gpt_mem = pool_t::template alloc(2, sw_nband); + auto sw_gpt2band_mem = pool_t::template alloc( sw_nband); + auto lw_band2gpt_mem = pool_t::template alloc(2, lw_nband); + auto lw_gpt2band_mem = pool_t::template alloc( lw_nband); + + auto sw_cloud_band2gpt_mem = pool_t::template alloc(2, sw_nband); + auto sw_cloud_gpt2band_mem = pool_t::template alloc( sw_nband); + auto lw_cloud_band2gpt_mem = pool_t::template alloc(2, lw_nband); + auto lw_cloud_gpt2band_mem = pool_t::template alloc( lw_nband); + + auto sw_subcloud_band2gpt_mem = pool_t::template alloc(2, sw_nband); + auto sw_subcloud_gpt2band_mem = pool_t::template alloc( sw_ngpt); + auto lw_subcloud_band2gpt_mem = pool_t::template alloc(2, lw_nband); + auto lw_subcloud_gpt2band_mem = pool_t::template alloc( lw_ngpt); + + auto sw_tau_mem = pool_t::template alloc(ncol, nlay, sw_nband); + auto sw_ssa_mem = pool_t::template alloc(ncol, nlay, sw_nband); + auto sw_g_mem = pool_t::template alloc(ncol, nlay, sw_nband); + auto lw_tau_mem = pool_t::template alloc(ncol, nlay, lw_nband); + + auto sw_cloud_tau_mem = pool_t::template alloc(ncol, nlay, sw_nband); + auto sw_cloud_ssa_mem = pool_t::template alloc(ncol, nlay, sw_nband); + auto sw_cloud_g_mem = pool_t::template alloc(ncol, nlay, sw_nband); + auto lw_cloud_tau_mem = pool_t::template alloc(ncol, nlay, lw_nband); + + auto sw_subcloud_tau_mem = pool_t::template alloc(ncol, nlay, sw_ngpt); + auto sw_subcloud_ssa_mem = pool_t::template alloc(ncol, nlay, sw_ngpt); + auto sw_subcloud_g_mem = pool_t::template alloc(ncol, nlay, sw_ngpt); + auto lw_subcloud_tau_mem = pool_t::template alloc(ncol, nlay, lw_ngpt); + // Setup pointers to RRTMGP SW fluxes fluxes_t fluxes_sw; fluxes_sw.flux_up = sw_flux_up; @@ -442,24 +484,24 @@ static void rrtmgp_main( clnsky_fluxes_lw.flux_up = lw_clnsky_flux_up; clnsky_fluxes_lw.flux_dn = lw_clnsky_flux_dn; - auto nswbands = k_dist_sw_k.get_nband(); - auto nlwbands = k_dist_lw_k.get_nband(); + auto nswbands = k_dist_sw_k->get_nband(); + auto nlwbands = k_dist_lw_k->get_nband(); // Setup aerosol optical properties optical_props2_t aerosol_sw; optical_props1_t aerosol_lw; - aerosol_sw.init(k_dist_sw_k.get_band_lims_wavenumber()); - aerosol_sw.alloc_2str(ncol, nlay); - Kokkos::parallel_for(MDRP::template get<3>({nswbands,nlay,ncol}) , KOKKOS_LAMBDA (int ibnd, int ilay, int icol) { + aerosol_sw.init_no_alloc(k_dist_sw_k->get_band_lims_wavenumber(), sw_band2gpt_mem, sw_gpt2band_mem); + aerosol_sw.alloc_2str_no_alloc(ncol, nlay, sw_tau_mem, sw_ssa_mem, sw_g_mem); + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol,nlay,nswbands, icol, ilay, ibnd, aerosol_sw.tau(icol,ilay,ibnd) = aer_tau_sw(icol,ilay,ibnd); aerosol_sw.ssa(icol,ilay,ibnd) = aer_ssa_sw(icol,ilay,ibnd); aerosol_sw.g (icol,ilay,ibnd) = aer_asm_sw(icol,ilay,ibnd); - }); - aerosol_lw.init(k_dist_lw_k.get_band_lims_wavenumber()); - aerosol_lw.alloc_1scl(ncol, nlay); - Kokkos::parallel_for(MDRP::template get<3>({nlwbands,nlay,ncol}) , KOKKOS_LAMBDA (int ibnd, int ilay, int icol) { + )); + aerosol_lw.init_no_alloc(k_dist_lw_k->get_band_lims_wavenumber(), lw_band2gpt_mem, lw_gpt2band_mem); + aerosol_lw.alloc_1scl_no_alloc(ncol, nlay, lw_tau_mem); + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol,nlay,nlwbands, icol, ilay, ibnd, aerosol_lw.tau(icol,ilay,ibnd) = aer_tau_lw(icol,ilay,ibnd); - }); + )); #ifdef SCREAM_RRTMGP_DEBUG // Check aerosol optical properties @@ -472,28 +514,29 @@ static void rrtmgp_main( #endif // Convert cloud physical properties to optical properties for input to RRTMGP - optical_props2_t clouds_sw = get_cloud_optics_sw(ncol, nlay, cloud_optics_sw_k, k_dist_sw_k, lwp, iwp, rel, rei); - optical_props1_t clouds_lw = get_cloud_optics_lw(ncol, nlay, cloud_optics_lw_k, k_dist_lw_k, lwp, iwp, rel, rei); + optical_props2_t clouds_sw = get_cloud_optics_sw(ncol, nlay, *cloud_optics_sw_k, *k_dist_sw_k, lwp, iwp, rel, rei, sw_cloud_band2gpt_mem, sw_cloud_gpt2band_mem, sw_cloud_tau_mem, sw_cloud_ssa_mem, sw_cloud_g_mem); + optical_props1_t clouds_lw = get_cloud_optics_lw(ncol, nlay, *cloud_optics_lw_k, *k_dist_lw_k, lwp, iwp, rel, rei, lw_cloud_band2gpt_mem, lw_cloud_gpt2band_mem, lw_cloud_tau_mem); Kokkos::deep_copy(cld_tau_sw_bnd, clouds_sw.tau); Kokkos::deep_copy(cld_tau_lw_bnd, clouds_lw.tau); // Do subcolumn sampling to map bands -> gpoints based on cloud fraction and overlap assumption; // This implements the Monte Carlo Independing Column Approximation by mapping only a single // subcolumn (cloud state) to each gpoint. - auto nswgpts = k_dist_sw_k.get_ngpt(); - auto clouds_sw_gpt = get_subsampled_clouds(ncol, nlay, nswbands, nswgpts, clouds_sw, k_dist_sw_k, cldfrac, p_lay); + auto nswgpts = k_dist_sw_k->get_ngpt(); + auto clouds_sw_gpt = get_subsampled_clouds(ncol, nlay, nswbands, nswgpts, clouds_sw, *k_dist_sw_k, cldfrac, p_lay, sw_subcloud_band2gpt_mem, sw_subcloud_gpt2band_mem, sw_subcloud_tau_mem, sw_subcloud_ssa_mem, sw_subcloud_g_mem); + // Longwave - auto nlwgpts = k_dist_lw_k.get_ngpt(); - auto clouds_lw_gpt = get_subsampled_clouds(ncol, nlay, nlwbands, nlwgpts, clouds_lw, k_dist_lw_k, cldfrac, p_lay); + auto nlwgpts = k_dist_lw_k->get_ngpt(); + auto clouds_lw_gpt = get_subsampled_clouds(ncol, nlay, nlwbands, nlwgpts, clouds_lw, *k_dist_lw_k, cldfrac, p_lay, lw_subcloud_band2gpt_mem, lw_subcloud_gpt2band_mem, lw_subcloud_tau_mem); // Copy cloud properties to outputs (is this needed, or can we just use pointers?) // Alternatively, just compute and output a subcolumn cloud mask - Kokkos::parallel_for(MDRP::template get<3>({nswgpts, nlay, ncol}), KOKKOS_LAMBDA (int igpt, int ilay, int icol) { + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay, nswgpts, icol, ilay, igpt, cld_tau_sw_gpt(icol,ilay,igpt) = clouds_sw_gpt.tau(icol,ilay,igpt); - }); - Kokkos::parallel_for(MDRP::template get<3>({nlwgpts, nlay, ncol}), KOKKOS_LAMBDA (int igpt, int ilay, int icol) { + )); + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay, nlwgpts, icol, ilay, igpt, cld_tau_lw_gpt(icol,ilay,igpt) = clouds_lw_gpt.tau(icol,ilay,igpt); - }); + )); #ifdef SCREAM_RRTMGP_DEBUG // Perform checks on optics; these would be caught by RRTMGP_EXPENSIVE_CHECKS in the RRTMGP code, @@ -511,7 +554,7 @@ static void rrtmgp_main( // Do shortwave rrtmgp_sw( ncol, nlay, - k_dist_sw_k, p_lay, t_lay, p_lev, t_lev, gas_concs, + *k_dist_sw_k, p_lay, t_lay, p_lev, t_lev, gas_concs, sfc_alb_dir, sfc_alb_dif, mu0, aerosol_sw, clouds_sw_gpt, fluxes_sw, clnclrsky_fluxes_sw, clrsky_fluxes_sw, clnsky_fluxes_sw, tsi_scaling, logger, @@ -521,12 +564,41 @@ static void rrtmgp_main( // Do longwave rrtmgp_lw( ncol, nlay, - k_dist_lw_k, p_lay, t_lay, p_lev, t_lev, gas_concs, + *k_dist_lw_k, p_lay, t_lay, p_lev, t_lev, gas_concs, aerosol_lw, clouds_lw_gpt, fluxes_lw, clnclrsky_fluxes_lw, clrsky_fluxes_lw, clnsky_fluxes_lw, extra_clnclrsky_diag, extra_clnsky_diag ); + pool_t::dealloc(sw_band2gpt_mem); + pool_t::dealloc(sw_gpt2band_mem); + pool_t::dealloc(lw_band2gpt_mem); + pool_t::dealloc(lw_gpt2band_mem); + + pool_t::dealloc(sw_cloud_band2gpt_mem); + pool_t::dealloc(sw_cloud_gpt2band_mem); + pool_t::dealloc(lw_cloud_band2gpt_mem); + pool_t::dealloc(lw_cloud_gpt2band_mem); + + pool_t::dealloc(sw_subcloud_band2gpt_mem); + pool_t::dealloc(sw_subcloud_gpt2band_mem); + pool_t::dealloc(lw_subcloud_band2gpt_mem); + pool_t::dealloc(lw_subcloud_gpt2band_mem); + + pool_t::dealloc(sw_tau_mem); + pool_t::dealloc(sw_ssa_mem); + pool_t::dealloc(sw_g_mem); + pool_t::dealloc(lw_tau_mem); + + pool_t::dealloc(sw_cloud_tau_mem); + pool_t::dealloc(sw_cloud_ssa_mem); + pool_t::dealloc(sw_cloud_g_mem); + pool_t::dealloc(lw_cloud_tau_mem); + + pool_t::dealloc(sw_subcloud_tau_mem); + pool_t::dealloc(sw_subcloud_ssa_mem); + pool_t::dealloc(sw_subcloud_g_mem); + pool_t::dealloc(lw_subcloud_tau_mem); } /* @@ -535,10 +607,14 @@ static void rrtmgp_main( static void rrtmgp_finalize() { initialized_k = false; - k_dist_sw_k.finalize(); - k_dist_lw_k.finalize(); - cloud_optics_sw_k.finalize(); //~CloudOptics(); - cloud_optics_lw_k.finalize(); //~CloudOptics(); + k_dist_sw_k->finalize(); + k_dist_lw_k->finalize(); + cloud_optics_sw_k->finalize(); //~CloudOptics(); + cloud_optics_lw_k->finalize(); //~CloudOptics(); + k_dist_sw_k = nullptr; + k_dist_lw_k = nullptr; + cloud_optics_sw_k = nullptr; + cloud_optics_lw_k = nullptr; pool_t::finalize(); } @@ -580,7 +656,7 @@ static void rrtmgp_sw( auto &clnsky_flux_dn_dir = clnsky_fluxes.flux_dn_dir; // Reset fluxes to zero - Kokkos::parallel_for(MDRP::template get<2>({nlay+1,ncol}), KOKKOS_LAMBDA(int ilev, int icol) { + TIMED_KERNEL(FLATTEN_MD_KERNEL2(ncol, nlay+1, icol, ilev, flux_up (icol,ilev) = 0; flux_dn (icol,ilev) = 0; flux_dn_dir(icol,ilev) = 0; @@ -593,15 +669,15 @@ static void rrtmgp_sw( clnsky_flux_up (icol,ilev) = 0; clnsky_flux_dn (icol,ilev) = 0; clnsky_flux_dn_dir(icol,ilev) = 0; - }); - Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay+1,ncol}), KOKKOS_LAMBDA(int ibnd, int ilev, int icol) { + )); + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay+1, nbnd, icol, ilev, ibnd, bnd_flux_up (icol,ilev,ibnd) = 0; bnd_flux_dn (icol,ilev,ibnd) = 0; bnd_flux_dn_dir(icol,ilev,ibnd) = 0; - }); + )); // Get daytime indices - auto dayIndices = pool_t::template alloc_and_init(ncol); + auto dayIndices = pool_t::template alloc(ncol); Kokkos::deep_copy(dayIndices, -1); int nday = 0; @@ -621,99 +697,113 @@ static void rrtmgp_sw( } // Allocate temporaries from pool - const int size1 = nday; - const int size2 = nday*nlay; // 4 - const int size3 = nday*(nlay+1); // 5 - const int size4 = ncol*nlay; - const int size5 = nbnd*nday; //2 - const int size6 = nday*ngpt; - const int size7 = nday*(nlay+1)*nbnd; // 3 - const int size8 = ncol*nlay*(k_dist.get_ngas()+1); - const int total_size = size1 + size2*4 + size3*5 + size4 + size5*2 + size6 + size7*3 + size8; - auto data = pool_t::template alloc_and_init(total_size); RealT* dcurr = data.data(); + auto mu0_day = pool_t::template alloc(nday); - auto mu0_day = view_t (dcurr, nday); dcurr += size1; + auto p_lay_day = pool_t::template alloc(nday, nlay); + auto t_lay_day = pool_t::template alloc(nday, nlay); + auto vmr_day = pool_t::template alloc(nday, nlay); + auto t_lay_limited = pool_t::template alloc(nday, nlay); - auto p_lay_day = view_t (dcurr, nday, nlay); dcurr += size2; - auto t_lay_day = view_t (dcurr, nday, nlay); dcurr += size2; - auto vmr_day = view_t (dcurr, nday, nlay); dcurr += size2; - auto t_lay_limited = view_t (dcurr, nday, nlay); dcurr += size2; + auto p_lev_day = pool_t::template alloc(nday, nlay+1); + auto t_lev_day = pool_t::template alloc(nday, nlay+1); + auto flux_up_day = pool_t::template alloc(nday, nlay+1); + auto flux_dn_day = pool_t::template alloc(nday, nlay+1); + auto flux_dn_dir_day = pool_t::template alloc(nday, nlay+1); - auto p_lev_day = view_t (dcurr, nday, nlay+1); dcurr += size3; - auto t_lev_day = view_t (dcurr, nday, nlay+1); dcurr += size3; - auto flux_up_day = view_t (dcurr, nday, nlay+1); dcurr += size3; - auto flux_dn_day = view_t (dcurr, nday, nlay+1); dcurr += size3; - auto flux_dn_dir_day = view_t (dcurr, nday, nlay+1); dcurr += size3; + auto vmr = pool_t::template alloc(ncol, nlay); - auto vmr = view_t (dcurr, ncol, nlay); dcurr += size4; + auto sfc_alb_dir_T = pool_t::template alloc(nbnd, nday); + auto sfc_alb_dif_T = pool_t::template alloc(nbnd, nday); - auto sfc_alb_dir_T = view_t (dcurr, nbnd, nday); dcurr += size5; - auto sfc_alb_dif_T = view_t (dcurr, nbnd, nday); dcurr += size5; + auto toa_flux = pool_t::template alloc(nday, ngpt); - auto toa_flux = view_t (dcurr, nday, ngpt); dcurr += size6; + auto bnd_flux_up_day = pool_t::template alloc(nday, nlay+1, nbnd); + auto bnd_flux_dn_day = pool_t::template alloc(nday, nlay+1, nbnd); + auto bnd_flux_dn_dir_day = pool_t::template alloc(nday, nlay+1, nbnd); - auto bnd_flux_up_day = view_t(dcurr, nday, nlay+1, nbnd); dcurr += size7; - auto bnd_flux_dn_day = view_t(dcurr, nday, nlay+1, nbnd); dcurr += size7; - auto bnd_flux_dn_dir_day = view_t(dcurr, nday, nlay+1, nbnd); dcurr += size7; + auto col_gas = pool_t::template alloc(ncol, nlay, k_dist.get_ngas()+1); - auto col_gas = view_t(dcurr, ncol, nlay, k_dist.get_ngas()+1); dcurr += size8; + auto concs_mem = pool_t::template alloc(nday, nlay, ngas); + + auto sw_aero_tau_mem = pool_t::template alloc(nday, nlay, nbnd); + auto sw_aero_ssa_mem = pool_t::template alloc(nday, nlay, nbnd); + auto sw_aero_g_mem = pool_t::template alloc(nday, nlay, nbnd); + + auto sw_cloud_tau_mem = pool_t::template alloc(nday, nlay, ngpt); + auto sw_cloud_ssa_mem = pool_t::template alloc(nday, nlay, ngpt); + auto sw_cloud_g_mem = pool_t::template alloc(nday, nlay, ngpt); + auto sw_optics_tau_mem = pool_t::template alloc(nday, nlay, ngpt); + auto sw_optics_ssa_mem = pool_t::template alloc(nday, nlay, ngpt); + auto sw_optics_g_mem = pool_t::template alloc(nday, nlay, ngpt); + auto sw_noaero_tau_mem = pool_t::template alloc(nday, nlay, ngpt); + auto sw_noaero_ssa_mem = pool_t::template alloc(nday, nlay, ngpt); + auto sw_noaero_g_mem = pool_t::template alloc(nday, nlay, ngpt); + + auto sw_aero_band2gpt_mem = pool_t::template alloc(2, nbnd); + auto sw_aero_gpt2band_mem = pool_t::template alloc( nbnd); + auto sw_cloud_band2gpt_mem = pool_t::template alloc(2, nbnd); + auto sw_cloud_gpt2band_mem = pool_t::template alloc( ngpt); + auto sw_optics_band2gpt_mem = pool_t::template alloc(2, nbnd); + auto sw_optics_gpt2band_mem = pool_t::template alloc( ngpt); + auto sw_noaero_band2gpt_mem = pool_t::template alloc(2, nbnd); + auto sw_noaero_gpt2band_mem = pool_t::template alloc( ngpt); // Subset mu0 - Kokkos::parallel_for(nday, KOKKOS_LAMBDA(int iday) { + TIMED_KERNEL(Kokkos::parallel_for(nday, KOKKOS_LAMBDA(int iday) { mu0_day(iday) = mu0(dayIndices(iday)); - }); + })); // subset state variables - Kokkos::parallel_for(MDRP::template get<2>({nlay,nday}), KOKKOS_LAMBDA(int ilay, int iday) { + TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay, iday, ilay, p_lay_day(iday,ilay) = p_lay(dayIndices(iday),ilay); t_lay_day(iday,ilay) = t_lay(dayIndices(iday),ilay); - }); - Kokkos::parallel_for(MDRP::template get<2>({nlay+1,nday}), KOKKOS_LAMBDA(int ilev, int iday) { + )); + TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay+1, iday, ilev, p_lev_day(iday,ilev) = p_lev(dayIndices(iday),ilev); t_lev_day(iday,ilev) = t_lev(dayIndices(iday),ilev); - }); + )); // Subset gases auto gas_names = gas_concs.get_gas_names(); gas_concs_t gas_concs_day; - gas_concs_day.init(gas_names, nday, nlay); + gas_concs_day.init_no_alloc(gas_names, nday, nlay, concs_mem); for (int igas = 0; igas < ngas; igas++) { gas_concs.get_vmr(gas_names[igas], vmr); - Kokkos::parallel_for(MDRP::template get<2>({nlay,nday}), KOKKOS_LAMBDA(int ilay, int iday) { + TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay, iday, ilay, vmr_day(iday,ilay) = vmr(dayIndices(iday),ilay); - }); + )); gas_concs_day.set_vmr(gas_names[igas], vmr_day); } // Subset aerosol optics optical_props2_t aerosol_day; - aerosol_day.init(k_dist.get_band_lims_wavenumber()); - aerosol_day.alloc_2str(nday, nlay); - Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay,nday}), KOKKOS_LAMBDA(int ibnd, int ilay, int iday) { + aerosol_day.init_no_alloc(k_dist.get_band_lims_wavenumber(), sw_aero_band2gpt_mem, sw_aero_gpt2band_mem); + aerosol_day.alloc_2str_no_alloc(nday, nlay, sw_aero_tau_mem, sw_aero_ssa_mem, sw_aero_g_mem); + TIMED_KERNEL(FLATTEN_MD_KERNEL3(nday,nlay,nbnd, iday, ilay, ibnd, aerosol_day.tau(iday,ilay,ibnd) = aerosol.tau(dayIndices(iday),ilay,ibnd); aerosol_day.ssa(iday,ilay,ibnd) = aerosol.ssa(dayIndices(iday),ilay,ibnd); aerosol_day.g (iday,ilay,ibnd) = aerosol.g (dayIndices(iday),ilay,ibnd); - }); + )); // Subset cloud optics // TODO: nbnd -> ngpt once we pass sub-sampled cloud state optical_props2_t clouds_day; - clouds_day.init(k_dist.get_band_lims_wavenumber(), k_dist.get_band_lims_gpoint()); - clouds_day.alloc_2str(nday, nlay); - Kokkos::parallel_for(MDRP::template get<3>({ngpt,nlay,nday}), KOKKOS_LAMBDA(int igpt, int ilay, int iday) { + clouds_day.init_no_alloc(k_dist.get_band_lims_wavenumber(), k_dist.get_band_lims_gpoint(), sw_cloud_band2gpt_mem, sw_cloud_gpt2band_mem); + clouds_day.alloc_2str_no_alloc(nday, nlay, sw_cloud_tau_mem, sw_cloud_ssa_mem, sw_cloud_g_mem); + TIMED_KERNEL(FLATTEN_MD_KERNEL3(nday,nlay,ngpt, iday, ilay, igpt, clouds_day.tau(iday,ilay,igpt) = clouds.tau(dayIndices(iday),ilay,igpt); clouds_day.ssa(iday,ilay,igpt) = clouds.ssa(dayIndices(iday),ilay,igpt); clouds_day.g (iday,ilay,igpt) = clouds.g (dayIndices(iday),ilay,igpt); - }); + )); // RRTMGP assumes surface albedos have a screwy dimension ordering // for some strange reason, so we need to transpose these; also do // daytime subsetting in the same kernel - Kokkos::parallel_for(MDRP::template get<2>({nbnd,nday}), KOKKOS_LAMBDA(int ibnd, int icol) { + TIMED_KERNEL(FLATTEN_MD_KERNEL2(nbnd,nday, ibnd, icol, sfc_alb_dir_T(ibnd,icol) = sfc_alb_dir(dayIndices(icol),ibnd); sfc_alb_dif_T(ibnd,icol) = sfc_alb_dif(dayIndices(icol),ibnd); - }); + )); // Temporaries we need for daytime-only fluxes fluxes_t fluxes_day; @@ -726,16 +816,16 @@ static void rrtmgp_sw( // Allocate space for optical properties optical_props2_t optics; - optics.alloc_2str(nday, nlay, k_dist); + optics.alloc_2str_no_alloc(nday, nlay, k_dist, sw_optics_band2gpt_mem, sw_optics_gpt2band_mem, sw_optics_tau_mem, sw_optics_ssa_mem, sw_optics_g_mem); optical_props2_t optics_no_aerosols; if (extra_clnsky_diag) { // Allocate space for optical properties (no aerosols) - optics_no_aerosols.alloc_2str(nday, nlay, k_dist); + optics_no_aerosols.alloc_2str_no_alloc(nday, nlay, k_dist, sw_noaero_band2gpt_mem, sw_noaero_gpt2band_mem, sw_noaero_tau_mem, sw_noaero_ssa_mem, sw_noaero_g_mem); } // Limit temperatures for gas optics look-up tables - limit_to_bounds_k(t_lay_day, k_dist_sw_k.get_temp_min(), k_dist_sw_k.get_temp_max(), t_lay_limited); + limit_to_bounds_k(t_lay_day, k_dist_sw_k->get_temp_min(), k_dist_sw_k->get_temp_max(), t_lay_limited); // Do gas optics bool top_at_1 = false; @@ -756,20 +846,20 @@ static void rrtmgp_sw( #endif // Apply tsi_scaling - Kokkos::parallel_for(MDRP::template get<2>({ngpt,nday}), KOKKOS_LAMBDA(int igpt, int iday) { + TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, ngpt, iday, igpt, toa_flux(iday,igpt) = tsi_scaling * toa_flux(iday,igpt); - }); + )); if (extra_clnclrsky_diag) { // Compute clear-clean-sky (just gas) fluxes on daytime columns rte_sw(optics, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day); // Expand daytime fluxes to all columns - Kokkos::parallel_for(MDRP::template get<2>({nlay+1,nday}), KOKKOS_LAMBDA(int ilev, int iday) { + TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay+1, iday, ilev, const int icol = dayIndices(iday); clnclrsky_flux_up (icol,ilev) = flux_up_day (iday,ilev); clnclrsky_flux_dn (icol,ilev) = flux_dn_day (iday,ilev); clnclrsky_flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev); - }); + )); } // Combine gas and aerosol optics @@ -780,12 +870,12 @@ static void rrtmgp_sw( rte_sw(optics, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day); // Expand daytime fluxes to all columns - Kokkos::parallel_for(MDRP::template get<2>({nlay+1,nday}), KOKKOS_LAMBDA(int ilev, int iday) { + TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay+1, iday, ilev, const int icol = dayIndices(iday); clrsky_flux_up (icol,ilev) = flux_up_day (iday,ilev); clrsky_flux_dn (icol,ilev) = flux_dn_day (iday,ilev); clrsky_flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev); - }); + )); // Now merge in cloud optics and do allsky calculations @@ -795,18 +885,18 @@ static void rrtmgp_sw( // Compute fluxes on daytime columns rte_sw(optics, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day); // Expand daytime fluxes to all columns - Kokkos::parallel_for(MDRP::template get<2>({nlay+1,nday}), KOKKOS_LAMBDA(int ilev, int iday) { + TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay+1, iday, ilev, const int icol = dayIndices(iday); flux_up (icol,ilev) = flux_up_day (iday,ilev); flux_dn (icol,ilev) = flux_dn_day (iday,ilev); flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev); - }); - Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay+1,nday}), KOKKOS_LAMBDA(int ibnd, int ilev, int iday) { + )); + TIMED_KERNEL(FLATTEN_MD_KERNEL3(nday, nlay+1, nbnd, iday, ilev, ibnd, const int icol = dayIndices(iday); bnd_flux_up (icol,ilev,ibnd) = bnd_flux_up_day (iday,ilev,ibnd); bnd_flux_dn (icol,ilev,ibnd) = bnd_flux_dn_day (iday,ilev,ibnd); bnd_flux_dn_dir(icol,ilev,ibnd) = bnd_flux_dn_dir_day(iday,ilev,ibnd); - }); + )); if (extra_clnsky_diag) { // First increment clouds in optics_no_aerosols @@ -814,16 +904,66 @@ static void rrtmgp_sw( // Compute cleansky (gas + clouds) fluxes on daytime columns rte_sw(optics_no_aerosols, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day); // Expand daytime fluxes to all columns - Kokkos::parallel_for(MDRP::template get<2>({nlay+1,nday}), KOKKOS_LAMBDA(int ilev, int iday) { + TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay+1, iday, ilev, const int icol = dayIndices(iday); clnsky_flux_up (icol,ilev) = flux_up_day (iday,ilev); clnsky_flux_dn (icol,ilev) = flux_dn_day (iday,ilev); clnsky_flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev); - }); + )); } - pool_t::dealloc(data); pool_t::dealloc(dayIndices); + + pool_t::dealloc(mu0_day); + + pool_t::dealloc(p_lay_day); + pool_t::dealloc(t_lay_day); + pool_t::dealloc(vmr_day); + pool_t::dealloc(t_lay_limited); + + pool_t::dealloc(p_lev_day); + pool_t::dealloc(t_lev_day); + pool_t::dealloc(flux_up_day); + pool_t::dealloc(flux_dn_day); + pool_t::dealloc(flux_dn_dir_day); + + pool_t::dealloc(vmr); + + pool_t::dealloc(sfc_alb_dir_T); + pool_t::dealloc(sfc_alb_dif_T); + + pool_t::dealloc(toa_flux); + + pool_t::dealloc(bnd_flux_up_day); + pool_t::dealloc(bnd_flux_dn_day); + pool_t::dealloc(bnd_flux_dn_dir_day); + + pool_t::dealloc(col_gas); + + pool_t::dealloc(concs_mem); + + pool_t::dealloc(sw_aero_tau_mem); + pool_t::dealloc(sw_aero_ssa_mem); + pool_t::dealloc(sw_aero_g_mem); + + pool_t::dealloc(sw_cloud_tau_mem); + pool_t::dealloc(sw_cloud_ssa_mem); + pool_t::dealloc(sw_cloud_g_mem); + pool_t::dealloc(sw_optics_tau_mem); + pool_t::dealloc(sw_optics_ssa_mem); + pool_t::dealloc(sw_optics_g_mem); + pool_t::dealloc(sw_noaero_tau_mem); + pool_t::dealloc(sw_noaero_ssa_mem); + pool_t::dealloc(sw_noaero_g_mem); + + pool_t::dealloc(sw_aero_band2gpt_mem); + pool_t::dealloc(sw_aero_gpt2band_mem); + pool_t::dealloc(sw_cloud_band2gpt_mem); + pool_t::dealloc(sw_cloud_gpt2band_mem); + pool_t::dealloc(sw_optics_band2gpt_mem); + pool_t::dealloc(sw_optics_gpt2band_mem); + pool_t::dealloc(sw_noaero_band2gpt_mem); + pool_t::dealloc(sw_noaero_gpt2band_mem); } /* @@ -841,24 +981,28 @@ static void rrtmgp_lw( // Problem size int nbnd = k_dist.get_nband(); int constexpr max_gauss_pts = 4; + const int ngpt = k_dist.get_ngpt(); - const int size1 = ncol; - const int size2 = nbnd*ncol; - const int size3 = max_gauss_pts*max_gauss_pts; - const int size4 = ncol*nlay; - const int size5 = ncol*(nlay+1); - const int size6 = ncol*nlay*(k_dist.get_ngas()+1); - - const int total_size = size1 + size2 + size3*2 + size4 + size5 + size6; - auto data = pool_t::template alloc_and_init(total_size); RealT *dcurr = data.data(); - - view_t t_sfc (dcurr, ncol); dcurr += size1; - view_t emis_sfc (dcurr, nbnd,ncol); dcurr += size2; - view_t gauss_Ds (dcurr, max_gauss_pts,max_gauss_pts); dcurr += size3; - view_t gauss_wts (dcurr, max_gauss_pts,max_gauss_pts); dcurr += size3; - view_t t_lay_limited(dcurr, ncol, nlay); dcurr += size4; - view_t t_lev_limited(dcurr, ncol, nlay+1); dcurr += size5; - view_t col_gas (dcurr, ncol, nlay, k_dist.get_ngas()+1); dcurr += size6; + auto t_sfc = pool_t::template alloc(ncol); + auto emis_sfc = pool_t::template alloc(nbnd, ncol); + auto gauss_Ds = pool_t::template alloc(max_gauss_pts, max_gauss_pts); + auto gauss_wts = pool_t::template alloc(max_gauss_pts, max_gauss_pts); + auto t_lay_limited = pool_t::template alloc(ncol, nlay); + auto t_lev_limited = pool_t::template alloc(ncol, nlay+1); + auto col_gas = pool_t::template alloc(ncol, nlay, k_dist.get_ngas()+1); + auto lw_optics_tau_mem = pool_t::template alloc(ncol, nlay, ngpt); + auto lw_noaero_tau_mem = pool_t::template alloc(ncol, nlay, ngpt); + auto lay_source_mem = pool_t::template alloc(ncol, nlay, ngpt); + auto lev_source_inc_mem = pool_t::template alloc(ncol, nlay, ngpt); + auto lev_source_dec_mem = pool_t::template alloc(ncol, nlay, ngpt); + auto sfc_source_mem = pool_t::template alloc(ncol, ngpt); + + auto lw_optics_band2gpt_mem = pool_t::template alloc(2, nbnd); + auto lw_optics_gpt2band_mem = pool_t::template alloc( ngpt); + auto lw_noaero_band2gpt_mem = pool_t::template alloc(2, nbnd); + auto lw_noaero_gpt2band_mem = pool_t::template alloc( ngpt); + auto lw_source_band2gpt_mem = pool_t::template alloc(2, nbnd); + auto lw_source_gpt2band_mem = pool_t::template alloc( ngpt); // Associate local pointers for fluxes auto &flux_up = fluxes.flux_up; @@ -873,8 +1017,7 @@ static void rrtmgp_lw( auto &clnsky_flux_dn = clnsky_fluxes.flux_dn; // Reset fluxes to zero - Kokkos::parallel_for( - MDRP::template get<2>({nlay + 1, ncol}), KOKKOS_LAMBDA(int ilev, int icol) { + TIMED_KERNEL(FLATTEN_MD_KERNEL2(ncol, nlay + 1, icol, ilev, flux_up(icol, ilev) = 0; flux_dn(icol, ilev) = 0; clnclrsky_flux_up(icol, ilev) = 0; @@ -883,26 +1026,25 @@ static void rrtmgp_lw( clrsky_flux_dn(icol, ilev) = 0; clnsky_flux_up(icol, ilev) = 0; clnsky_flux_dn(icol, ilev) = 0; - }); - Kokkos::parallel_for( - MDRP::template get<3>({nbnd, nlay + 1, ncol}), - KOKKOS_LAMBDA(int ibnd, int ilev, int icol) { + )); + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay + 1, nbnd, icol, ilev, ibnd, bnd_flux_up(icol, ilev, ibnd) = 0; bnd_flux_dn(icol, ilev, ibnd) = 0; - }); + )); // Allocate space for optical properties optical_props1_t optics; - optics.alloc_1scl(ncol, nlay, k_dist); + optics.alloc_1scl_no_alloc(ncol, nlay, k_dist, lw_optics_band2gpt_mem, lw_optics_gpt2band_mem, lw_optics_tau_mem); + optical_props1_t optics_no_aerosols; if (extra_clnsky_diag) { // Allocate space for optical properties (no aerosols) - optics_no_aerosols.alloc_1scl(ncol, nlay, k_dist); + optics_no_aerosols.alloc_1scl_no_alloc(ncol, nlay, k_dist, lw_noaero_band2gpt_mem, lw_noaero_gpt2band_mem, lw_noaero_tau_mem); } // Boundary conditions source_func_t lw_sources; - lw_sources.alloc(ncol, nlay, k_dist); + lw_sources.alloc_no_alloc(ncol, nlay, k_dist, lw_source_band2gpt_mem, lw_source_gpt2band_mem, sfc_source_mem, lay_source_mem, lev_source_inc_mem, lev_source_dec_mem); bool top_at_1 = false; Kokkos::parallel_reduce(1, KOKKOS_LAMBDA(int, bool& val) { @@ -910,9 +1052,9 @@ static void rrtmgp_lw( }, Kokkos::LOr(top_at_1)); // Surface temperature - Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { + TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { t_sfc(icol) = t_lev(icol, conv::merge(nlay, 0, top_at_1)); - }); + })); Kokkos::deep_copy(emis_sfc , 0.98); // Get Gaussian quadrature weights @@ -941,8 +1083,8 @@ static void rrtmgp_lw( Kokkos::deep_copy(gauss_wts, gauss_wts_host); // Limit temperatures for gas optics look-up tables - limit_to_bounds_k(t_lay, k_dist_lw_k.get_temp_min(), k_dist_lw_k.get_temp_max(), t_lay_limited); - limit_to_bounds_k(t_lev, k_dist_lw_k.get_temp_min(), k_dist_lw_k.get_temp_max(), t_lev_limited); + limit_to_bounds_k(t_lay, k_dist_lw_k->get_temp_min(), k_dist_lw_k->get_temp_max(), t_lay_limited); + limit_to_bounds_k(t_lev, k_dist_lw_k->get_temp_min(), k_dist_lw_k->get_temp_max(), t_lev_limited); // Do gas optics k_dist.gas_optics(ncol, nlay, top_at_1, p_lay, p_lev, t_lay_limited, t_sfc, gas_concs, col_gas, optics, lw_sources, view_t(), t_lev_limited); @@ -979,7 +1121,26 @@ static void rrtmgp_lw( rte_lw(max_gauss_pts, gauss_Ds, gauss_wts, optics_no_aerosols, top_at_1, lw_sources, emis_sfc, clnsky_fluxes); } - pool_t::dealloc(data); + pool_t::dealloc(t_sfc); + pool_t::dealloc(emis_sfc); + pool_t::dealloc(gauss_Ds); + pool_t::dealloc(gauss_wts); + pool_t::dealloc(t_lay_limited); + pool_t::dealloc(t_lev_limited); + pool_t::dealloc(col_gas); + pool_t::dealloc(lw_optics_tau_mem); + pool_t::dealloc(lw_noaero_tau_mem); + pool_t::dealloc(lay_source_mem); + pool_t::dealloc(lev_source_inc_mem); + pool_t::dealloc(lev_source_dec_mem); + pool_t::dealloc(sfc_source_mem); + + pool_t::dealloc(lw_optics_band2gpt_mem); + pool_t::dealloc(lw_optics_gpt2band_mem); + pool_t::dealloc(lw_noaero_band2gpt_mem); + pool_t::dealloc(lw_noaero_gpt2band_mem); + pool_t::dealloc(lw_source_band2gpt_mem); + pool_t::dealloc(lw_source_gpt2band_mem); } /* @@ -994,7 +1155,7 @@ static void get_subcolumn_mask(const int ncol, const int nlay, const int ngpt, c // c(i,j,k) = 0 for x(i,j,k) <= 1 - cldf(i,j) // // I am going to call this "cldx" to be just slightly less ambiguous - auto cldx = pool_t::template alloc_and_init(ncol, nlay, ngpt); + auto cldx = pool_t::template alloc(ncol, nlay, ngpt); // Apply overlap assumption to set cldx if (overlap_option == 0) { // Dummy mask, always cloudy @@ -1011,23 +1172,35 @@ static void get_subcolumn_mask(const int ncol, const int nlay, const int ngpt, c // Kokkos::deep_copy(seeds_host, seeds); // for (int icol = 0; icol < ncol; ++icol) { // Kokkos::Random_XorShift64_Pool<> random_pool(seeds_host(icol)); - // Kokkos::parallel_for(MDRP::template get<2>({ngpt, nlay}), KOKKOS_LAMBDA(int igpt, int ilay) { + // TIMED_KERNEL(FLATTEN_MD_KERNEL2(nlay, ngpt, ilay, igpt, // auto generator = random_pool.get_state(); // cldx(icol,ilay,igpt) = generator.drand(0., 1.); // random_pool.free_state(generator); - // }); + // })); // } - Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { - conv::Random rand(seeds(icol)); - for (int igpt = 0; igpt < ngpt; igpt++) { - for (int ilay = 0; ilay < nlay; ilay++) { - cldx(icol,ilay,igpt) = rand.genFP(); - } - } - }); + // TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { + // conv::Random rand(seeds(icol)); + // for (int igpt = 0; igpt < ngpt; igpt++) { + // for (int ilay = 0; ilay < nlay; ilay++) { + // cldx(icol,ilay,igpt) = rand.genFP(); + // } + // } + // })); + // Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { + // conv::Random rand(seeds(icol)); + // for (int igpt = 0; igpt < ngpt; igpt++) { + // for (int ilay = 0; ilay < nlay; ilay++) { + // cldx(icol,ilay,igpt) = rand.genFP(); + // } + // } + // }); + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay, ngpt, icol, ilay, igpt, + conv::Random rand(seeds(icol) + ilay*ngpt + igpt); + cldx(icol,ilay,igpt) = rand.genFP(); + )); // Step down columns and apply algorithm from eq (14) - Kokkos::parallel_for(MDRP::template get<2>({ngpt,ncol}), KOKKOS_LAMBDA(int igpt, int icol) { + TIMED_KERNEL(FLATTEN_MD_KERNEL2(ncol,ngpt, icol, igpt, for (int ilay = 1; ilay < nlay; ilay++) { // Check cldx in level above and see if it satisfies conditions to create a cloudy subcolumn if (cldx(icol,ilay-1,igpt) > 1.0 - cldf(icol,ilay-1)) { @@ -1043,17 +1216,17 @@ static void get_subcolumn_mask(const int ncol, const int nlay, const int ngpt, c cldx(icol,ilay,igpt) = cldx(icol,ilay ,igpt) * (1.0 - cldf(icol,ilay-1)); } } - }); + )); } // Use cldx array to create subcolumn mask - Kokkos::parallel_for(MDRP::template get<3>({ngpt,nlay,ncol}), KOKKOS_LAMBDA(int igpt, int ilay, int icol) { + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol,nlay,ngpt, icol, ilay, igpt, if (cldx(icol,ilay,igpt) > 1.0 - cldf(icol,ilay)) { subcolumn_mask(icol,ilay,igpt) = 1; } else { subcolumn_mask(icol,ilay,igpt) = 0; } - }); + )); pool_t::dealloc(cldx); } @@ -1068,22 +1241,22 @@ static void compute_cloud_area( // Subcolumn binary cld mask; if any layers with pressure between pmin and pmax are cloudy // then 2d subcol mask is 1, otherwise it is 0 auto subcol_mask = pool_t::template alloc_and_init(ncol, ngpt); - Kokkos::parallel_for(MDRP::template get<3>({ngpt, nlay, ncol}), KOKKOS_LAMBDA(int igpt, int ilay, int icol) { + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay, ngpt, icol, ilay, igpt, // NOTE: using plev would need to assume level ordering (top to bottom or bottom to top), but // using play/pmid does not if (cld_tau_gpt(icol,ilay,igpt) > 0 && pmid(icol,ilay) >= pmin && pmid(icol,ilay) < pmax) { subcol_mask(icol,igpt) = 1; } - }); + )); // Compute average over subcols to get cloud area auto ngpt_inv = 1.0 / ngpt; Kokkos::deep_copy(cld_area, 0); - Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { + TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { // This loop needs to be serial because of the atomic reduction for (int igpt = 0; igpt < ngpt; ++igpt) { cld_area(icol) += subcol_mask(icol,igpt) * ngpt_inv; } - }); + })); pool_t::dealloc(subcol_mask); } @@ -1118,7 +1291,7 @@ static void compute_aerocom_cloudtop( Kokkos::deep_copy(eff_radius_qi_at_cldtop, 0.0); // Initialize the 1D "clear fraction" as 1 (totally clear) - auto aerocom_clr = pool_t::template alloc_and_init(ncol); + auto aerocom_clr = pool_t::template alloc(ncol); Kokkos::deep_copy(aerocom_clr, 1.0); // Get gravity acceleration constant from constants @@ -1131,7 +1304,7 @@ static void compute_aerocom_cloudtop( constexpr RealT cldfrac_tot_threshold = 0.001; // BAD_CONSTANT! // Loop over all columns in parallel - Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { + TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { // Loop over all layers in serial (due to accumulative // product), starting at 2 (second highest) layer because the // highest is assumed to hav no clouds @@ -1190,7 +1363,7 @@ static void compute_aerocom_cloudtop( // aerocom_clr is the result of accumulative probabilities // (their products) cldfrac_tot_at_cldtop(icol) = 1.0 - aerocom_clr(icol); - }); + })); pool_t::dealloc(aerocom_clr); } @@ -1211,7 +1384,7 @@ static void mixing_ratio_to_cloud_mass( int ncol = mixing_ratio.extent(0); int nlay = mixing_ratio.extent(1); using physconst = scream::physics::Constants; - Kokkos::parallel_for(MDRP::template get<2>({nlay, ncol}), KOKKOS_LAMBDA(int ilay, int icol) { + TIMED_KERNEL(FLATTEN_MD_KERNEL2(ncol, nlay, icol, ilay, // Compute in-cloud mixing ratio (mixing ratio of the cloudy part of the layer) // NOTE: these thresholds (from E3SM) seem arbitrary, but included here for consistency // This limits in-cloud mixing ratio to 0.005 kg/kg. According to note in cloud_diagnostics @@ -1223,7 +1396,7 @@ static void mixing_ratio_to_cloud_mass( } else { cloud_mass(icol,ilay) = 0; } - }); + )); } /* @@ -1234,30 +1407,34 @@ static void mixing_ratio_to_cloud_mass( */ template::type* dummy = nullptr> static void limit_to_bounds_k(InT const &arr_in, T const lower, T const upper, OutT &arr_out) { - Kokkos::parallel_for(arr_out.size(), KOKKOS_LAMBDA(int i) { + TIMED_KERNEL(Kokkos::parallel_for(arr_out.size(), KOKKOS_LAMBDA(int i) { arr_out(i) = std::min(std::max(arr_in(i), lower), upper); - }); + })); } template::type* dummy = nullptr> static void limit_to_bounds_k(InT const &arr_in, T const lower, T const upper, OutT &arr_out) { - Kokkos::parallel_for(MDRP::template get<2>({arr_out.extent(0), arr_out.extent(1)}), KOKKOS_LAMBDA(int i, int j) { + const int ex0 = (int) arr_out.extent(0); + const int ex1 = (int) arr_out.extent(1); + TIMED_KERNEL(FLATTEN_MD_KERNEL2(ex0, ex1, i, j, arr_out(i, j) = std::min(std::max(arr_in(i, j), lower), upper); - }); + )); } static int get_wavelength_index(optical_props_t &kdist, RealT wavelength) { + auto band_lims_wvn = kdist.get_band_lims_wavenumber(); + auto wavelength_bounds = pool_t::template alloc(band_lims_wvn.extent(0), band_lims_wvn.extent(1)); // Get wavelength bounds for all wavelength bands - auto wavelength_bounds = kdist.get_band_lims_wavelength(); + kdist.get_band_lims_wavelength(wavelength_bounds); // Find the band index for the specified wavelength // Note that bands are stored in wavenumber space, units of cm-1, so if we are passed wavelength // in units of meters, we need a conversion factor of 10^2 const int nbnds = kdist.get_nband(); int band_index = -1; - Kokkos::parallel_reduce(nbnds, KOKKOS_LAMBDA(int ibnd, int& band_index_inner) { + TIMED_KERNEL(Kokkos::parallel_reduce(nbnds, KOKKOS_LAMBDA(int ibnd, int& band_index_inner) { if (wavelength_bounds(0,ibnd) < wavelength_bounds(1,ibnd)) { if (wavelength_bounds(0,ibnd) <= wavelength * 1e2 && wavelength * 1e2 <= wavelength_bounds(1,ibnd)) { band_index_inner = ibnd; @@ -1267,36 +1444,38 @@ static int get_wavelength_index(optical_props_t &kdist, RealT wavelength) band_index_inner = ibnd; } } - }, Kokkos::Max(band_index)); + }, Kokkos::Max(band_index))); + pool_t::dealloc(wavelength_bounds); return band_index; } static inline int get_wavelength_index_sw_k(RealT wavelength) { - return get_wavelength_index(k_dist_sw_k, wavelength); + return get_wavelength_index(*k_dist_sw_k, wavelength); } static inline int get_wavelength_index_lw_k(RealT wavelength) { - return get_wavelength_index(k_dist_lw_k, wavelength); + return get_wavelength_index(*k_dist_lw_k, wavelength); } static optical_props2_t get_cloud_optics_sw( const int ncol, const int nlay, cloud_optics_t &cloud_optics, gas_optics_t &kdist, - const real2dk &lwp, const real2dk &iwp, const creal2dk &rel, const creal2dk &rei) { + const real2dk &lwp, const real2dk &iwp, const creal2dk &rel, const creal2dk &rei, + const int2dk& sw_band2gpt_mem, const int1dk& sw_gpt2band_mem, const real3dk& sw_tau_mem, const real3dk& sw_ssa_mem, const real3dk& sw_g_mem) { // Initialize optics optical_props2_t clouds; - clouds.init(kdist.get_band_lims_wavenumber()); - clouds.alloc_2str(ncol, nlay); + clouds.init_no_alloc(kdist.get_band_lims_wavenumber(), sw_band2gpt_mem, sw_gpt2band_mem); + clouds.alloc_2str_no_alloc(ncol, nlay, sw_tau_mem, sw_ssa_mem, sw_g_mem); // Needed for consistency with all-sky example problem? cloud_optics.set_ice_roughness(2); // Limit effective radii to be within bounds of lookup table - auto rel_limited = pool_t::template alloc_and_init(ncol, nlay); - auto rei_limited = pool_t::template alloc_and_init(ncol, nlay); + auto rel_limited = pool_t::template alloc(ncol, nlay); + auto rei_limited = pool_t::template alloc(ncol, nlay); limit_to_bounds_k(rel, cloud_optics.radliq_lwr, cloud_optics.radliq_upr, rel_limited); limit_to_bounds_k(rei, cloud_optics.radice_lwr, cloud_optics.radice_upr, rei_limited); @@ -1313,19 +1492,20 @@ static optical_props2_t get_cloud_optics_sw( static optical_props1_t get_cloud_optics_lw( const int ncol, const int nlay, cloud_optics_t &cloud_optics, gas_optics_t &kdist, - const real2dk &lwp, const real2dk &iwp, const creal2dk &rel, const creal2dk &rei) { + const real2dk &lwp, const real2dk &iwp, const creal2dk &rel, const creal2dk &rei, + const int2dk& lw_band2gpt_mem, const int1dk& lw_gpt2band_mem, const real3dk& lw_tau_mem) { // Initialize optics optical_props1_t clouds; - clouds.init(kdist.get_band_lims_wavenumber()); - clouds.alloc_1scl(ncol, nlay); // this is dumb, why do we need to init and alloc separately?! + clouds.init_no_alloc(kdist.get_band_lims_wavenumber(), lw_band2gpt_mem, lw_gpt2band_mem); + clouds.alloc_1scl_no_alloc(ncol, nlay, lw_tau_mem); // this is dumb, why do we need to init and alloc separately?! // Needed for consistency with all-sky example problem? cloud_optics.set_ice_roughness(2); // Limit effective radii to be within bounds of lookup table - auto rel_limited = pool_t::template alloc_and_init(ncol, nlay); - auto rei_limited = pool_t::template alloc_and_init(ncol, nlay); + auto rel_limited = pool_t::template alloc(ncol, nlay); + auto rei_limited = pool_t::template alloc(ncol, nlay); limit_to_bounds_k(rel, cloud_optics.radliq_lwr, cloud_optics.radliq_upr, rel_limited); limit_to_bounds_k(rei, cloud_optics.radice_lwr, cloud_optics.radice_upr, rei_limited); @@ -1341,14 +1521,15 @@ static optical_props1_t get_cloud_optics_lw( static optical_props2_t get_subsampled_clouds( const int ncol, const int nlay, const int nbnd, const int ngpt, - optical_props2_t &cloud_optics, gas_optics_t &kdist, const real2dk &cld, const creal2dk &p_lay) { + optical_props2_t &cloud_optics, gas_optics_t &kdist, const real2dk &cld, const creal2dk &p_lay, + const int2dk& sw_band2gpt_mem, const int1dk& sw_gpt2band_mem, const real3dk& sw_tau_mem, const real3dk& sw_ssa_mem, const real3dk& sw_g_mem) { // Initialized subsampled optics optical_props2_t subsampled_optics; - subsampled_optics.init(kdist.get_band_lims_wavenumber(), kdist.get_band_lims_gpoint(), "subsampled_optics"); - subsampled_optics.alloc_2str(ncol, nlay); + subsampled_optics.init_no_alloc(kdist.get_band_lims_wavenumber(), kdist.get_band_lims_gpoint(), sw_band2gpt_mem, sw_gpt2band_mem, "subsampled_optics"); + subsampled_optics.alloc_2str_no_alloc(ncol, nlay, sw_tau_mem, sw_ssa_mem, sw_g_mem); // Subcolumn mask with values of 0 indicating no cloud, 1 indicating cloud - auto cldmask = pool_t::template alloc_and_init(ncol, nlay, ngpt); + auto cldmask = pool_t::template alloc(ncol, nlay, ngpt); // Check that we do not have clouds with no optical properties; this would get corrected // when we assign optical props, but we want to use a "radiative cloud fraction" @@ -1358,11 +1539,12 @@ static optical_props2_t get_subsampled_clouds( // even when separated by layers with no cloud properties, when in fact those layers should be // randomly overlapped. auto cldfrac_rad = pool_t::template alloc_and_init(ncol, nlay); - Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay,ncol}), KOKKOS_LAMBDA (int ibnd, int ilay, int icol) { + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay, nbnd, icol, ilay, ibnd, if (cloud_optics.tau(icol,ilay,ibnd) > 0) { cldfrac_rad(icol,ilay) = cld(icol,ilay); } - }); + )); + // Get subcolumn cloud mask; note that get_subcolumn_mask exposes overlap assumption as an option, // but the only currently supported options are 0 (trivial all-or-nothing cloud) or 1 (max-rand), // so overlap has not been exposed as an option beyond this subcolumn. In the future, we should @@ -1371,14 +1553,15 @@ static optical_props2_t get_subsampled_clouds( int overlap = 1; // Get unique seeds for each column that are reproducible across different MPI rank layouts; // use decimal part of pressure for this, consistent with the implementation in EAM - auto seeds = pool_t::template alloc_and_init(ncol); - Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { + auto seeds = pool_t::template alloc(ncol); + TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { seeds(icol) = 1e9 * (p_lay(icol,nlay-1) - int(p_lay(icol,nlay-1))); - }); + })); get_subcolumn_mask(ncol, nlay, ngpt, cldfrac_rad, overlap, seeds, cldmask); + // Assign optical properties to subcolumns (note this implements MCICA) auto gpoint_bands = kdist.get_gpoint_bands(); - Kokkos::parallel_for(MDRP::template get<3>({ngpt,nlay,ncol}), KOKKOS_LAMBDA(int igpt, int ilay, int icol) { + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol,nlay,ngpt, icol, ilay, igpt, auto ibnd = gpoint_bands(igpt); if (cldmask(icol,ilay,igpt) == 1) { subsampled_optics.tau(icol,ilay,igpt) = cloud_optics.tau(icol,ilay,ibnd); @@ -1389,7 +1572,7 @@ static optical_props2_t get_subsampled_clouds( subsampled_optics.ssa(icol,ilay,igpt) = 0; subsampled_optics.g (icol,ilay,igpt) = 0; } - }); + )); pool_t::dealloc(cldmask); pool_t::dealloc(cldfrac_rad); @@ -1400,15 +1583,16 @@ static optical_props2_t get_subsampled_clouds( static optical_props1_t get_subsampled_clouds( const int ncol, const int nlay, const int nbnd, const int ngpt, - optical_props1_t &cloud_optics, gas_optics_t &kdist, const real2dk &cld, const creal2dk &p_lay) { + optical_props1_t &cloud_optics, gas_optics_t &kdist, const real2dk &cld, const creal2dk &p_lay, + const int2dk& lw_band2gpt_mem, const int1dk& lw_gpt2band_mem, const real3dk& lw_tau_mem) { // Initialized subsampled optics optical_props1_t subsampled_optics; - subsampled_optics.init(kdist.get_band_lims_wavenumber(), kdist.get_band_lims_gpoint(), "subsampled_optics"); - subsampled_optics.alloc_1scl(ncol, nlay); + subsampled_optics.init_no_alloc(kdist.get_band_lims_wavenumber(), kdist.get_band_lims_gpoint(), lw_band2gpt_mem, lw_gpt2band_mem, "subsampled_optics"); + subsampled_optics.alloc_1scl_no_alloc(ncol, nlay, lw_tau_mem); // Subcolumn mask with values of 0 indicating no cloud, 1 indicating cloud - auto cldmask = pool_t::template alloc_and_init(ncol, nlay, ngpt); + auto cldmask = pool_t::template alloc(ncol, nlay, ngpt); // Check that we do not have clouds with no optical properties; this would get corrected // when we assign optical props, but we want to use a "radiative cloud fraction" @@ -1418,31 +1602,31 @@ static optical_props1_t get_subsampled_clouds( // even when separated by layers with no cloud properties, when in fact those layers should be // randomly overlapped. auto cldfrac_rad = pool_t::template alloc_and_init(ncol, nlay); - Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay,ncol}), KOKKOS_LAMBDA (int ibnd, int ilay, int icol) { + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol,nlay,nbnd, icol, ilay, ibnd, if (cloud_optics.tau(icol,ilay,ibnd) > 0) { cldfrac_rad(icol,ilay) = cld(icol,ilay); } - }); + )); // Get subcolumn cloud mask int overlap = 1; // Get unique seeds for each column that are reproducible across different MPI rank layouts; // use decimal part of pressure for this, consistent with the implementation in EAM; use different // seed values for longwave and shortwave - auto seeds = pool_t::template alloc_and_init(ncol); - Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { + auto seeds = pool_t::template alloc(ncol); + TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) { seeds(icol) = 1e9 * (p_lay(icol,nlay-2) - int(p_lay(icol,nlay-2))); - }); + })); get_subcolumn_mask(ncol, nlay, ngpt, cldfrac_rad, overlap, seeds, cldmask); // Assign optical properties to subcolumns (note this implements MCICA) auto gpoint_bands = kdist.get_gpoint_bands(); - Kokkos::parallel_for(MDRP::template get<3>({ngpt,nlay,ncol}), KOKKOS_LAMBDA(int igpt, int ilay, int icol) { - auto ibnd = gpoint_bands(igpt); - if (cldmask(icol,ilay,igpt) == 1) { - subsampled_optics.tau(icol,ilay,igpt) = cloud_optics.tau(icol,ilay,ibnd); - } else { - subsampled_optics.tau(icol,ilay,igpt) = 0; - } - }); + TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol,nlay,ngpt, icol, ilay, igpt, + auto ibnd = gpoint_bands(igpt); + if (cldmask(icol,ilay,igpt) == 1) { + subsampled_optics.tau(icol,ilay,igpt) = cloud_optics.tau(icol,ilay,ibnd); + } else { + subsampled_optics.tau(icol,ilay,igpt) = 0; + } + )); pool_t::dealloc(cldmask); pool_t::dealloc(cldfrac_rad); diff --git a/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_tests.cpp b/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_tests.cpp index fedb8c3d0478..165272a237d4 100644 --- a/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_tests.cpp +++ b/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_tests.cpp @@ -401,7 +401,7 @@ int run_kokkos(int argc, char** argv) { // Initialize absorption coefficients logger->info("Initialize RRTMGP...\n"); - interface_t::rrtmgp_initialize(gas_concs, coefficients_file_sw, coefficients_file_lw, cloud_optics_file_sw, cloud_optics_file_lw, logger); + interface_t::rrtmgp_initialize(gas_concs, coefficients_file_sw, coefficients_file_lw, cloud_optics_file_sw, cloud_optics_file_lw, logger, 2.0); // Setup our dummy atmosphere based on the input data we read in logger->info("Setup dummy atmos...\n"); @@ -428,8 +428,8 @@ int run_kokkos(int argc, char** argv) { // we would just have to setup the pointers to them in the // FluxesBroadband object logger->info("Setup fluxes...\n"); - const auto nswbands = interface_t::k_dist_sw_k.get_nband(); - const auto nlwbands = interface_t::k_dist_lw_k.get_nband(); + const auto nswbands = interface_t::k_dist_sw_k->get_nband(); + const auto nlwbands = interface_t::k_dist_lw_k->get_nband(); real2dk sw_flux_up ("sw_flux_up" , ncol, nlay+1); real2dk sw_flux_dn ("sw_flux_dn" , ncol, nlay+1); real2dk sw_flux_dir("sw_flux_dir", ncol, nlay+1); @@ -470,19 +470,19 @@ int run_kokkos(int argc, char** argv) { auto aer_ssa_sw = real3dk("aer_ssa_sw", ncol, nlay, nswbands); auto aer_asm_sw = real3dk("aer_asm_sw", ncol, nlay, nswbands); auto aer_tau_lw = real3dk("aer_tau_lw", ncol, nlay, nlwbands); - Kokkos::parallel_for(MDRP::template get<3>({nswbands,nlay,ncol}), KOKKOS_LAMBDA(int ibnd, int ilay, int icol) { + Kokkos::parallel_for(MDRP::template get<3>({ncol, nlay, nswbands}), KOKKOS_LAMBDA(int icol, int ilay, int ibnd) { aer_tau_sw(icol,ilay,ibnd) = 0; aer_ssa_sw(icol,ilay,ibnd) = 0; aer_asm_sw(icol,ilay,ibnd) = 0; }); - Kokkos::parallel_for(MDRP::template get<3>({nlwbands,nlay,ncol}), KOKKOS_LAMBDA(int ibnd, int ilay, int icol) { + Kokkos::parallel_for(MDRP::template get<3>({ncol, nlay, nlwbands}), KOKKOS_LAMBDA(int icol, int ilay, int ibnd) { aer_tau_lw(icol,ilay,ibnd) = 0; }); // These are returned as outputs now from rrtmgp_main // TODO: provide as inputs consistent with how aerosol is treated? - const auto nswgpts = interface_t::k_dist_sw_k.get_ngpt(); - const auto nlwgpts = interface_t::k_dist_lw_k.get_ngpt(); + const auto nswgpts = interface_t::k_dist_sw_k->get_ngpt(); + const auto nlwgpts = interface_t::k_dist_lw_k->get_ngpt(); auto cld_tau_sw_bnd = real3dk("cld_tau_sw_bnd", ncol, nlay, nswbands); auto cld_tau_lw_bnd = real3dk("cld_tau_lw_bnd", ncol, nlay, nlwbands); auto cld_tau_sw = real3dk("cld_tau_sw", ncol, nlay, nswgpts); diff --git a/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_unit_tests.cpp b/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_unit_tests.cpp index 06ca64af5f24..f716e64791b8 100644 --- a/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_unit_tests.cpp +++ b/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_unit_tests.cpp @@ -1110,7 +1110,7 @@ TEST_CASE("rrtmgp_test_compute_broadband_surface_flux_k") { auto sw_bnd_flux_dir = real3dk("sw_bnd_flux_dir", ncol, nlay+1, nbnd); auto sw_bnd_flux_dif = real3dk("sw_bnd_flux_dif", ncol, nlay+1, nbnd); logger->info("Populate band-resolved 3d fluxes for test case with only transition band flux...\n"); - Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay+1,ncol}), KOKKOS_LAMBDA(int ibnd, int ilay, int icol) { + Kokkos::parallel_for(MDRP::template get<3>({ncol, nlay+1, nbnd}), KOKKOS_LAMBDA(int icol, int ilay, int ibnd) { if (ibnd < 9) { sw_bnd_flux_dir(icol,ilay,ibnd) = 0; sw_bnd_flux_dif(icol,ilay,ibnd) = 0; @@ -1142,7 +1142,7 @@ TEST_CASE("rrtmgp_test_compute_broadband_surface_flux_k") { // --------------------------------- // Test case, only flux in NIR bands logger->info("Populate band-resolved 3d fluxes for test case with only NIR flux...\n"); - Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay+1,ncol}), KOKKOS_LAMBDA(int ibnd, int ilay, int icol) { + Kokkos::parallel_for(MDRP::template get<3>({ncol, nlay+1, nbnd}), KOKKOS_LAMBDA(int icol, int ilay, int ibnd) { if (ibnd < 9) { sw_bnd_flux_dir(icol,ilay,ibnd) = 1; sw_bnd_flux_dif(icol,ilay,ibnd) = 1; @@ -1173,7 +1173,7 @@ TEST_CASE("rrtmgp_test_compute_broadband_surface_flux_k") { // --------------------------------- // Test case, only flux in VIS bands logger->info("Populate band-resolved 3d fluxes for test case with only VIS/UV flux...\n"); - Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay+1,ncol}), KOKKOS_LAMBDA(int ibnd, int ilay, int icol) { + Kokkos::parallel_for(MDRP::template get<3>({ncol, nlay+1, nbnd}), KOKKOS_LAMBDA(int icol, int ilay, int ibnd) { if (ibnd < 9) { sw_bnd_flux_dir(icol,ilay,ibnd) = 0; sw_bnd_flux_dif(icol,ilay,ibnd) = 0; @@ -1204,7 +1204,7 @@ TEST_CASE("rrtmgp_test_compute_broadband_surface_flux_k") { // --------------------------------- // Test case, only flux in all bands logger->info("Populate band-resolved 3d fluxes for test with non-zero flux in all bands...\n"); - Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay+1,ncol}), KOKKOS_LAMBDA(int ibnd, int ilay, int icol) { + Kokkos::parallel_for(MDRP::template get<3>({ncol, nlay+1, nbnd}), KOKKOS_LAMBDA(int icol, int ilay, int ibnd) { if (ibnd < 9) { sw_bnd_flux_dir(icol,ilay,ibnd) = 1.0; sw_bnd_flux_dif(icol,ilay,ibnd) = 2.0; @@ -1307,13 +1307,13 @@ TEST_CASE("rrtmgp_test_subcol_gen_k") { interface_t::get_subcolumn_mask(ncol, nlay, ngpt, cldfrac, 1, seeds, cldmask); // Check answers by computing new cldfrac from mask Kokkos::deep_copy(cldfrac_from_mask, 0.0); - Kokkos::parallel_for(MDRP::template get<2>({nlay,ncol}), KOKKOS_LAMBDA(int ilay, int icol) { + Kokkos::parallel_for(MDRP::template get<2>({ncol, nlay}), KOKKOS_LAMBDA(int icol, int ilay) { for (int igpt = 0; igpt < ngpt; ++igpt) { real cldmask_real = cldmask(icol,ilay,igpt); cldfrac_from_mask(icol,ilay) += cldmask_real; } }); - Kokkos::parallel_for(MDRP::template get<2>({nlay,ncol}), KOKKOS_LAMBDA(int ilay, int icol) { + Kokkos::parallel_for(MDRP::template get<2>({ncol, nlay}), KOKKOS_LAMBDA(int icol, int ilay) { cldfrac_from_mask(icol,ilay) = cldfrac_from_mask(icol,ilay) / ngpt; }); // For cldfrac 1 we should get 1, for cldfrac 0 we should get 0, but in between we cannot be sure diff --git a/components/eamxx/tests/single-process/rrtmgp/CMakeLists.txt b/components/eamxx/tests/single-process/rrtmgp/CMakeLists.txt index b16c2e732884..c8b57e98948b 100644 --- a/components/eamxx/tests/single-process/rrtmgp/CMakeLists.txt +++ b/components/eamxx/tests/single-process/rrtmgp/CMakeLists.txt @@ -6,20 +6,25 @@ set (FIXTURES_BASE_NAME ${TEST_BASE_NAME}_generate_output_nc_files) # Ensure test input files are present in the data dir GetInputFile(scream/init/${EAMxx_tests_IC_FILE_72lev}) +set(YAKL_LIB_NAME "") +if (SCREAM_RRTMGP_ENABLE_YAKL) + set(YAKL_LIB_NAME "yakl") +endif() + if (SCREAM_ENABLE_BASELINE_TESTS AND NOT SCREAM_ONLY_GENERATE_BASELINES) # Unit test to compare against raw rrtmgp output configure_file(${CMAKE_CURRENT_SOURCE_DIR}/input_unit.yaml ${CMAKE_CURRENT_BINARY_DIR}/input_unit.yaml) CreateUnitTest(${TEST_BASE_NAME}_unit rrtmgp_standalone_unit.cpp LABELS rrtmgp physics driver - LIBS scream_rrtmgp rrtmgp scream_control yakl diagnostics rrtmgp_test_utils - EXE_ARGS "--args --inputfile ${SCREAM_DATA_DIR}/init/rrtmgp-allsky.nc --baseline ${SCREAM_BASELINES_DIR}/data/rrtmgp-allsky-baseline.nc" + LIBS scream_rrtmgp rrtmgp scream_control ${YAKL_LIB_NAME} diagnostics rrtmgp_test_utils + EXE_ARGS "--args --rrtmgp_inputfile ${SCREAM_DATA_DIR}/init/rrtmgp-allsky.nc --rrtmgp_baseline ${SCREAM_BASELINES_DIR}/data/rrtmgp-allsky-baseline.nc" ) endif() ## Create rrtmgp stand alone executable CreateUnitTestExec(${TEST_BASE_NAME} "rrtmgp_standalone.cpp" - LIBS scream_rrtmgp rrtmgp scream_control yakl diagnostics + LIBS scream_rrtmgp rrtmgp scream_control ${YAKL_LIB_NAME} diagnostics ) # The RRTMGP stand-alone test that runs multi-step diff --git a/components/eamxx/tests/single-process/rrtmgp/input_unit.yaml b/components/eamxx/tests/single-process/rrtmgp/input_unit.yaml index 4a1f474c8778..92961fb30d99 100644 --- a/components/eamxx/tests/single-process/rrtmgp/input_unit.yaml +++ b/components/eamxx/tests/single-process/rrtmgp/input_unit.yaml @@ -21,6 +21,7 @@ atmosphere_processes: rrtmgp_coefficients_file_lw: ${SCREAM_DATA_DIR}/init/rrtmgp-data-lw-g256-2018-12-04.nc rrtmgp_cloud_optics_file_sw: ${SCREAM_DATA_DIR}/init/rrtmgp-cloud-optics-coeffs-sw.nc rrtmgp_cloud_optics_file_lw: ${SCREAM_DATA_DIR}/init/rrtmgp-cloud-optics-coeffs-lw.nc + pool_size_multiplier: 2.0 grids_manager: Type: Mesh Free