diff --git a/components/eam/src/physics/rrtmgp/external b/components/eam/src/physics/rrtmgp/external
index b24ca1f616e4..91d9076c7b18 160000
--- a/components/eam/src/physics/rrtmgp/external
+++ b/components/eam/src/physics/rrtmgp/external
@@ -1 +1 @@
-Subproject commit b24ca1f616e45659b334dbd7297017cb7927367e
+Subproject commit 91d9076c7b18b0e67dbd03d208ef2f385efefadd
diff --git a/components/eamxx/CMakeLists.txt b/components/eamxx/CMakeLists.txt
index 1119a3f031f7..3462cef9a976 100644
--- a/components/eamxx/CMakeLists.txt
+++ b/components/eamxx/CMakeLists.txt
@@ -212,8 +212,8 @@ endif()
 #     #cmakedefine RRTMGP_EXPENSIVE_CHECKS
 option (SCREAM_RRTMGP_DEBUG "Turn on extra debug checks in RRTMGP" ${SCREAM_DEBUG})
 
-option(SCREAM_RRTMGP_ENABLE_YAKL "Use YAKL under rrtmgp" TRUE)
-option(SCREAM_RRTMGP_ENABLE_KOKKOS "Use Kokkos under rrtmgp" FALSE)
+option(SCREAM_RRTMGP_ENABLE_YAKL "Use YAKL under rrtmgp" FALSE)
+option(SCREAM_RRTMGP_ENABLE_KOKKOS "Use Kokkos under rrtmgp" TRUE)
 if (SCREAM_RRTMGP_ENABLE_YAKL)
   add_definitions("-DRRTMGP_ENABLE_YAKL")
 endif()
diff --git a/components/eamxx/cime_config/namelist_defaults_scream.xml b/components/eamxx/cime_config/namelist_defaults_scream.xml
index 9a06de10e1ec..c0dab4ac030b 100644
--- a/components/eamxx/cime_config/namelist_defaults_scream.xml
+++ b/components/eamxx/cime_config/namelist_defaults_scream.xml
@@ -531,6 +531,7 @@ be lost if SCREAM_HACK_XML is not enabled.
       <do_subcol_sampling type="logical" doc="Flag to turn on/off subcolumn sampling of optical properties; if false treat cells as either completely clear or cloudy">
           true
       </do_subcol_sampling>
+      <pool_size_multiplier type="real">1.0</pool_size_multiplier>
     </rrtmgp>
 
     <mac_aero_mic inherit="atm_proc_group">
diff --git a/components/eamxx/cime_config/testdefs/testmods_dirs/scream/kokkos_rrtmgp/shell_commands b/components/eamxx/cime_config/testdefs/testmods_dirs/scream/kokkos_rrtmgp/shell_commands
new file mode 100644
index 000000000000..817dcf42fef5
--- /dev/null
+++ b/components/eamxx/cime_config/testdefs/testmods_dirs/scream/kokkos_rrtmgp/shell_commands
@@ -0,0 +1,3 @@
+./xmlchange --append SCREAM_CMAKE_OPTIONS='SCREAM_RRTMGP_ENABLE_YAKL Off'
+./xmlchange --append SCREAM_CMAKE_OPTIONS='SCREAM_RRTMGP_ENABLE_KOKKOS On'
+
diff --git a/components/eamxx/cime_config/testdefs/testmods_dirs/scream/yakl_rrtmgp/shell_commands b/components/eamxx/cime_config/testdefs/testmods_dirs/scream/yakl_rrtmgp/shell_commands
new file mode 100644
index 000000000000..61d571c95974
--- /dev/null
+++ b/components/eamxx/cime_config/testdefs/testmods_dirs/scream/yakl_rrtmgp/shell_commands
@@ -0,0 +1,2 @@
+./xmlchange --append SCREAM_CMAKE_OPTIONS='SCREAM_RRTMGP_ENABLE_YAKL On'
+./xmlchange --append SCREAM_CMAKE_OPTIONS='SCREAM_RRTMGP_ENABLE_KOKKOS Off'
diff --git a/components/eamxx/src/physics/rrtmgp/CMakeLists.txt b/components/eamxx/src/physics/rrtmgp/CMakeLists.txt
index a8e47384552a..d3d72a9a1f42 100644
--- a/components/eamxx/src/physics/rrtmgp/CMakeLists.txt
+++ b/components/eamxx/src/physics/rrtmgp/CMakeLists.txt
@@ -51,59 +51,60 @@ endmacro()
 ##################################
 
 # RRTMGP++ requires YAKL
+if (SCREAM_RRTMGP_ENABLE_YAKL)
+  string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_ci)
+  if (TARGET yakl)
+    # Other E3SM components are building YAKL...
+    message ("It appears some other part of E3SM is building YAKL.\n"
+      "We will reuse that, but if this is a debug build we will\n"
+      "add the --fmad=false flag to the cuda flags used by YAKL\n")
+  else ()
+    # Prepare CUDA/HIP flags for YAKL
+    if (CUDA_BUILD)
+      string(REPLACE ";" " " KOKKOS_CUDA_OPTIONS_STR "${KOKKOS_CUDA_OPTIONS}")
+      set(YAKL_ARCH "CUDA")
+      set(YAKL_CUDA_FLAGS "-DYAKL_ARCH_CUDA ${KOKKOS_CUDA_OPTIONS_STR} --expt-relaxed-constexpr -ccbin ${CMAKE_CXX_COMPILER}")
+      string (REPLACE " " ";" YAKL_CUDA_FLAGS_LIST ${YAKL_CUDA_FLAGS})
+    endif()
+    if (HIP_BUILD)
+      set(YAKL_ARCH "HIP")
+      set(YAKL_HIP_FLAGS "-DYAKL_ARCH_HIP -O3 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip")
+      string (REPLACE " " ";" YAKL_HIP_FLAGS_LIST ${YAKL_HIP_FLAGS})
+    endif()
 
-string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_ci)
-if (TARGET yakl)
-  # Other E3SM components are building YAKL...
-  message ("It appears some other part of E3SM is building YAKL.\n"
-           "We will reuse that, but if this is a debug build we will\n"
-           "add the --fmad=false flag to the cuda flags used by YAKL\n")
-else ()
-  # Prepare CUDA/HIP flags for YAKL
-  if (CUDA_BUILD)
-    string(REPLACE ";" " " KOKKOS_CUDA_OPTIONS_STR "${KOKKOS_CUDA_OPTIONS}")
-    set(YAKL_ARCH "CUDA")
-    set(YAKL_CUDA_FLAGS "-DYAKL_ARCH_CUDA ${KOKKOS_CUDA_OPTIONS_STR} --expt-relaxed-constexpr -ccbin ${CMAKE_CXX_COMPILER}")
-    string (REPLACE " " ";" YAKL_CUDA_FLAGS_LIST ${YAKL_CUDA_FLAGS})
-  endif()
-  if (HIP_BUILD)
-    set(YAKL_ARCH "HIP")
-    set(YAKL_HIP_FLAGS "-DYAKL_ARCH_HIP -O3 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip")
-    string (REPLACE " " ";" YAKL_HIP_FLAGS_LIST ${YAKL_HIP_FLAGS})
-  endif()
+    set (YAKL_SOURCE_DIR ${SCREAM_BASE_DIR}/../../externals/YAKL)
+    add_subdirectory(${YAKL_SOURCE_DIR} ${CMAKE_BINARY_DIR}/externals/YAKL)
 
-  set (YAKL_SOURCE_DIR ${SCREAM_BASE_DIR}/../../externals/YAKL)
-  add_subdirectory(${YAKL_SOURCE_DIR} ${CMAKE_BINARY_DIR}/externals/YAKL)
+    # Set some additional flag/cpp option on the yakl target
 
-  # Set some additional flag/cpp option on the yakl target
+    cmake_policy (SET CMP0079 NEW) # Allow to link to a tgt from a different directory
 
-  cmake_policy (SET CMP0079 NEW) # Allow to link to a tgt from a different directory
+    # EAMxx *requires* MPI, so simply look for it, then link against it
+    find_package(MPI REQUIRED COMPONENTS C)
+    target_link_libraries (yakl INTERFACE MPI::MPI_C)
 
-  # EAMxx *requires* MPI, so simply look for it, then link against it
-  find_package(MPI REQUIRED COMPONENTS C)
-  target_link_libraries (yakl INTERFACE MPI::MPI_C)
+    # For debug builds, set -DYAKL_DEBUG
+    if (CMAKE_BUILD_TYPE_ci STREQUAL "debug")
+      target_compile_definitions(yakl INTERFACE YAKL_DEBUG)
+    endif()
+  endif()
 
-  # For debug builds, set -DYAKL_DEBUG
-  if (CMAKE_BUILD_TYPE_ci STREQUAL "debug")
-    target_compile_definitions(yakl INTERFACE YAKL_DEBUG)
+  # See eamxx/src/dynamics/homme/CMakeLists.txt for an explanation of this
+  # workaround.
+  if ((SCREAM_MACHINE STREQUAL "ascent" OR SCREAM_MACHINE STREQUAL "pm-gpu") AND CMAKE_BUILD_TYPE_ci STREQUAL "debug")
+    SetCudaFlagsYakl(yakl CUDA_LANG FLAGS -UNDEBUG)
+  else()
+    SetCudaFlagsYakl(yakl CUDA_LANG)
   endif()
-endif()
 
-# See eamxx/src/dynamics/homme/CMakeLists.txt for an explanation of this
-# workaround.
-if ((SCREAM_MACHINE STREQUAL "ascent" OR SCREAM_MACHINE STREQUAL "pm-gpu") AND CMAKE_BUILD_TYPE_ci STREQUAL "debug")
-  SetCudaFlagsYakl(yakl CUDA_LANG FLAGS -UNDEBUG)
-else()
-  SetCudaFlagsYakl(yakl CUDA_LANG)
+  list(APPEND CMAKE_MODULE_PATH ${YAKL_SOURCE_DIR})
+  include (yakl_utils)
 endif()
 
 ##################################
 #           RRTMGP               #
 ##################################
 
-list(APPEND CMAKE_MODULE_PATH ${YAKL_SOURCE_DIR})
-include (yakl_utils)
-
 set(EAM_RRTMGP_DIR ${SCREAM_BASE_DIR}/../eam/src/physics/rrtmgp)
 # Build RRTMGP library; this builds the core RRTMGP external source as a library named "rrtmgp"
 # NOTE: The external RRTMGP build needs some fixes to work with CUDA in a library build, so for now we will build these ourselves
@@ -122,7 +123,13 @@ set(EXTERNAL_SRC
 add_library(rrtmgp ${EXTERNAL_SRC})
 target_compile_definitions(rrtmgp PUBLIC EAMXX_HAS_RRTMGP)
 EkatDisableAllWarning(rrtmgp)
-yakl_process_target(rrtmgp)
+if (SCREAM_RRTMGP_ENABLE_YAKL)
+  yakl_process_target(rrtmgp)
+else()
+  if (CUDA_BUILD)
+    target_compile_options(rrtmgp PUBLIC $<$<COMPILE_LANGUAGE:CXX>:--expt-relaxed-constexpr>)
+  endif()
+endif()
 
 # NOTE: cannot use 'PUBLIC' in target_link_libraries,
 #       since yakl_process_target already used it
@@ -130,7 +137,11 @@ yakl_process_target(rrtmgp)
 if (NOT TARGET Kokkos::kokkos)
   find_package(Kokkos REQUIRED)
 endif ()
-target_link_libraries(rrtmgp yakl Kokkos::kokkos)
+if (SCREAM_RRTMGP_ENABLE_YAKL)
+  target_link_libraries(rrtmgp yakl Kokkos::kokkos)
+else()
+  target_link_libraries(rrtmgp Kokkos::kokkos)
+endif()
 target_include_directories(rrtmgp PUBLIC
     ${SCREAM_BASE_DIR}/../../externals/YAKL
     ${EAM_RRTMGP_DIR}/external/cpp
@@ -158,21 +169,23 @@ target_include_directories(rrtmgp PUBLIC
 #       SCREAM_RRTMGP_YAKL       #
 ##################################
 
-set(SCREAM_RRTMGP_SOURCES_YAKL
+set(SCREAM_RRTMGP_SOURCES_INTERFACE
   scream_rrtmgp_interface.cpp
 )
 
-add_library(scream_rrtmgp_yakl ${SCREAM_RRTMGP_SOURCES_YAKL})
-yakl_process_target(scream_rrtmgp_yakl)
+add_library(scream_rrtmgp_interface ${SCREAM_RRTMGP_SOURCES_INTERFACE})
+if (SCREAM_RRTMGP_ENABLE_YAKL)
+  yakl_process_target(scream_rrtmgp_interface)
+endif()
 
 # NOTE: cannot use 'PUBLIC' in target_link_libraries,
 #       since yakl_process_target already used it
 #       with the "plain" signature
 find_library(NETCDF_C netcdf HINTS ${NetCDF_C_PATH}/lib)
-target_link_libraries(scream_rrtmgp_yakl ${NETCDF_C} rrtmgp scream_share Kokkos::kokkos)
-target_include_directories(scream_rrtmgp_yakl PUBLIC
+target_link_libraries(scream_rrtmgp_interface ${NETCDF_C} rrtmgp scream_share Kokkos::kokkos)
+target_include_directories(scream_rrtmgp_interface PUBLIC
     ${CMAKE_CURRENT_SOURCE_DIR})
-target_include_directories(scream_rrtmgp_yakl SYSTEM PUBLIC
+target_include_directories(scream_rrtmgp_interface SYSTEM PUBLIC
     ${NetCDF_C_PATH}/include
     ${EAM_RRTMGP_DIR}/external)
 
@@ -186,7 +199,7 @@ set(SCREAM_RRTMGP_SOURCES
 )
 
 add_library(scream_rrtmgp ${SCREAM_RRTMGP_SOURCES})
-target_link_libraries(scream_rrtmgp PUBLIC scream_share physics_share csm_share scream_rrtmgp_yakl Kokkos::kokkos)
+target_link_libraries(scream_rrtmgp PUBLIC scream_share physics_share csm_share scream_rrtmgp_interface Kokkos::kokkos)
 set_target_properties(scream_rrtmgp PROPERTIES
   Fortran_MODULE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/modules
 )
@@ -199,9 +212,11 @@ target_include_directories(scream_rrtmgp PUBLIC
 # ${YAKL_${YAKL_ARCH}_FLAGS} flags to the CXX flags of scream_rrtmgp.
 # In particular, this will ensure that all the yakl macros
 # are correctly defined in YAKL headers, depending on the backend
-if (YAKL_ARCH)
-  target_compile_options(scream_rrtmgp PUBLIC
-        "$<$<COMPILE_LANGUAGE:CXX>:${YAKL_${YAKL_ARCH}_FLAGS_LIST}>")
+if (SCREAM_RRTMGP_ENABLE_YAKL)
+  if (YAKL_ARCH)
+    target_compile_options(scream_rrtmgp PUBLIC
+      "$<$<COMPILE_LANGUAGE:CXX>:${YAKL_${YAKL_ARCH}_FLAGS_LIST}>")
+  endif()
 endif()
 
 
diff --git a/components/eamxx/src/physics/rrtmgp/eamxx_rrtmgp_process_interface.cpp b/components/eamxx/src/physics/rrtmgp/eamxx_rrtmgp_process_interface.cpp
index 0121741963d9..ab61158152c1 100644
--- a/components/eamxx/src/physics/rrtmgp/eamxx_rrtmgp_process_interface.cpp
+++ b/components/eamxx/src/physics/rrtmgp/eamxx_rrtmgp_process_interface.cpp
@@ -636,6 +636,8 @@ void RRTMGPRadiation::initialize_impl(const RunType /* run_type */) {
   m_n2vmr      = m_params.get<double>("n2vmr", 0.7906);
   m_covmr      = m_params.get<double>("covmr", 1.0e-7);
 
+  const double multiplier = m_params.get<double>("pool_size_multiplier", 1.0);
+
   // Whether or not to do MCICA subcolumn sampling
   m_do_subcol_sampling = m_params.get<bool>("do_subcol_sampling",true);
 
@@ -676,13 +678,14 @@ void RRTMGPRadiation::initialize_impl(const RunType /* run_type */) {
           m_gas_concs_k,
           coefficients_file_sw, coefficients_file_lw,
           cloud_optics_file_sw, cloud_optics_file_lw,
-          m_atm_logger
+          m_atm_logger,
+          multiplier
   );
   VALIDATE_KOKKOS(m_gas_concs, m_gas_concs_k);
-  VALIDATE_KOKKOS(rrtmgp::k_dist_sw, interface_t::k_dist_sw_k);
-  VALIDATE_KOKKOS(rrtmgp::k_dist_lw, interface_t::k_dist_lw_k);
-  VALIDATE_KOKKOS(rrtmgp::cloud_optics_sw, interface_t::cloud_optics_sw_k);
-  VALIDATE_KOKKOS(rrtmgp::cloud_optics_lw, interface_t::cloud_optics_lw_k);
+  VALIDATE_KOKKOS(rrtmgp::k_dist_sw, *interface_t::k_dist_sw_k);
+  VALIDATE_KOKKOS(rrtmgp::k_dist_lw, *interface_t::k_dist_lw_k);
+  VALIDATE_KOKKOS(rrtmgp::cloud_optics_sw, *interface_t::cloud_optics_sw_k);
+  VALIDATE_KOKKOS(rrtmgp::cloud_optics_lw, *interface_t::cloud_optics_lw_k);
 #endif
 
   // Set property checks for fields in this process
@@ -907,6 +910,7 @@ void RRTMGPRadiation::run_impl (const double dt) {
       ulrreal2dk d_dz   = ulrreal2dk(m_buffer.d_dz.data(), m_col_chunk_size, m_nlay);
       auto d_mu0 = m_buffer.cosine_zenith;
 #ifdef RRTMGP_ENABLE_YAKL
+      TIMED_INLINE_KERNEL(init_views,
       // Create YAKL arrays. RRTMGP expects YAKL arrays with styleFortran, i.e., data has ncol
       // as the fastest index. For this reason we must copy the data.
       auto subview_1d = [&](const real1d v) -> real1d {
@@ -976,9 +980,11 @@ void RRTMGPRadiation::run_impl (const double dt) {
       auto cld_tau_lw_bnd  = subview_3d(m_buffer.cld_tau_lw_bnd);
       auto cld_tau_sw_gpt  = subview_3d(m_buffer.cld_tau_sw_gpt);
       auto cld_tau_lw_gpt  = subview_3d(m_buffer.cld_tau_lw_gpt);
+                   );
 #endif
 #ifdef RRTMGP_ENABLE_KOKKOS
       ConvertToRrtmgpSubview conv = {beg, ncol};
+      TIMED_INLINE_KERNEL(init_views,
 
       // Note, ncol will not necessary be m_col_chunk_size because the number of cols
       // will not always be evenly divided by m_col_chunk_size. In most cases, the
@@ -1039,6 +1045,7 @@ void RRTMGPRadiation::run_impl (const double dt) {
       auto cld_tau_lw_bnd_k  = conv.subview3d(m_buffer.cld_tau_lw_bnd_k);
       auto cld_tau_sw_gpt_k  = conv.subview3d(m_buffer.cld_tau_sw_gpt_k);
       auto cld_tau_lw_gpt_k  = conv.subview3d(m_buffer.cld_tau_lw_gpt_k);
+                   );
 #endif
 
       // Set gas concs to "view" only the first ncol columns
@@ -1072,6 +1079,7 @@ void RRTMGPRadiation::run_impl (const double dt) {
         Kokkos::deep_copy(d_mu0,h_mu0);
 
         const auto policy = ekat::ExeSpaceUtils<ExeSpace>::get_default_team_policy(ncol, m_nlay);
+        TIMED_KERNEL(
         Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const MemberType& team) {
           const int i = team.league_rank();
           const int icol = i+beg;
@@ -1215,6 +1223,7 @@ void RRTMGPRadiation::run_impl (const double dt) {
           }
 #endif
         });
+                     );
       }
       Kokkos::fence();
 #ifdef RRTMGP_ENABLE_KOKKOS
@@ -1362,18 +1371,22 @@ void RRTMGPRadiation::run_impl (const double dt) {
       // Compute band-by-band surface_albedos. This is needed since
       // the AD passes broadband albedos, but rrtmgp require band-by-band.
 #ifdef RRTMGP_ENABLE_YAKL
+      TIMED_KERNEL(
       rrtmgp::compute_band_by_band_surface_albedos(
         ncol, nswbands,
         sfc_alb_dir_vis, sfc_alb_dir_nir,
         sfc_alb_dif_vis, sfc_alb_dif_nir,
         sfc_alb_dir, sfc_alb_dif);
+                   );
 #endif
 #ifdef RRTMGP_ENABLE_KOKKOS
+      TIMED_KERNEL(
       interface_t::compute_band_by_band_surface_albedos(
         ncol, nswbands,
         sfc_alb_dir_vis_k, sfc_alb_dir_nir_k,
         sfc_alb_dif_vis_k, sfc_alb_dif_nir_k,
         sfc_alb_dir_k, sfc_alb_dif_k);
+                   );
       COMPARE_ALL_WRAP(std::vector<real2d>({sfc_alb_dir, sfc_alb_dif}),
                        std::vector<real2dk>({sfc_alb_dir_k, sfc_alb_dif_k}));
 #endif
@@ -1381,6 +1394,7 @@ void RRTMGPRadiation::run_impl (const double dt) {
 
       // Run RRTMGP driver
 #ifdef RRTMGP_ENABLE_YAKL
+      TIMED_KERNEL(
       rrtmgp::rrtmgp_main(
         ncol, m_nlay,
         p_lay, t_lay, p_lev, t_lev,
@@ -1401,8 +1415,10 @@ void RRTMGPRadiation::run_impl (const double dt) {
         eccf, m_atm_logger,
         m_extra_clnclrsky_diag, m_extra_clnsky_diag
       );
+                   );
 #endif
 #ifdef RRTMGP_ENABLE_KOKKOS
+      TIMED_KERNEL(
       interface_t::rrtmgp_main(
         ncol, m_nlay,
         p_lay_k, t_lay_k, p_lev_k, t_lev_k,
@@ -1423,6 +1439,7 @@ void RRTMGPRadiation::run_impl (const double dt) {
         eccf, m_atm_logger,
         m_extra_clnclrsky_diag, m_extra_clnsky_diag
       );
+                   );
       COMPARE_ALL_WRAP(std::vector<real2d>({
         sw_flux_up, sw_flux_dn, sw_flux_dn_dir, lw_flux_up, lw_flux_dn,
         sw_clnclrsky_flux_up, sw_clnclrsky_flux_dn, sw_clnclrsky_flux_dn_dir,
@@ -1445,6 +1462,7 @@ void RRTMGPRadiation::run_impl (const double dt) {
 
       // Update heating tendency
 #ifdef RRTMGP_ENABLE_YAKL
+      TIMED_INLINE_KERNEL(heating_tendency,
       auto sw_heating  = m_buffer.sw_heating;
       auto lw_heating  = m_buffer.lw_heating;
       rrtmgp::compute_heating_rate(
@@ -1466,8 +1484,10 @@ void RRTMGPRadiation::run_impl (const double dt) {
         });
       }
       Kokkos::fence();
+                   );
 #endif
 #ifdef RRTMGP_ENABLE_KOKKOS
+      TIMED_INLINE_KERNEL(heating_tendency,
       auto sw_heating_k  = m_buffer.sw_heating_k;
       auto lw_heating_k  = m_buffer.lw_heating_k;
       rrtmgp::compute_heating_rate(
@@ -1489,6 +1509,7 @@ void RRTMGPRadiation::run_impl (const double dt) {
         });
       }
       Kokkos::fence();
+                   );
       COMPARE_ALL_WRAP(std::vector<real2d>({sw_heating, lw_heating}),
                        std::vector<real2dk>({sw_heating_k, lw_heating_k}));
 #endif
@@ -1497,6 +1518,7 @@ void RRTMGPRadiation::run_impl (const double dt) {
 #ifdef RRTMGP_ENABLE_YAKL
       const int kbot = nlay+1;
 
+      TIMED_KERNEL(
       // Compute diffuse flux as difference between total and direct
       Kokkos::parallel_for(Kokkos::RangePolicy<ExeSpace>(0,nswbands*(nlay+1)*ncol),
                            KOKKOS_LAMBDA (const int idx) {
@@ -1513,10 +1535,12 @@ void RRTMGPRadiation::run_impl (const double dt) {
           sfc_flux_dir_vis, sfc_flux_dir_nir,
           sfc_flux_dif_vis, sfc_flux_dif_nir
       );
+                   );
 #endif
 #ifdef RRTMGP_ENABLE_KOKKOS
       const int kbot_k = nlay;
 
+      TIMED_KERNEL(
       // Compute diffuse flux as difference between total and direct
       Kokkos::parallel_for(Kokkos::RangePolicy<ExeSpace>(0,nswbands*(nlay+1)*ncol),
                            KOKKOS_LAMBDA (const int idx) {
@@ -1533,12 +1557,14 @@ void RRTMGPRadiation::run_impl (const double dt) {
           sfc_flux_dir_vis_k, sfc_flux_dir_nir_k,
           sfc_flux_dif_vis_k, sfc_flux_dif_nir_k
       );
+                   );
       COMPARE_ALL_WRAP(std::vector<real1d>({sfc_flux_dir_vis, sfc_flux_dir_nir, sfc_flux_dif_vis, sfc_flux_dif_nir}),
                        std::vector<real1dk>({sfc_flux_dir_vis_k, sfc_flux_dir_nir_k, sfc_flux_dif_vis_k, sfc_flux_dif_nir_k}));
 #endif
 
       // Compute diagnostic total cloud area (vertically-projected cloud cover)
 #ifdef RRTMGP_ENABLE_YAKL
+      TIMED_KERNEL(
       real1d cldlow ("cldlow", d_cldlow.data() + m_col_chunk_beg[ic], ncol);
       real1d cldmed ("cldmed", d_cldmed.data() + m_col_chunk_beg[ic], ncol);
       real1d cldhgh ("cldhgh", d_cldhgh.data() + m_col_chunk_beg[ic], ncol);
@@ -1553,8 +1579,10 @@ void RRTMGPRadiation::run_impl (const double dt) {
       rrtmgp::compute_cloud_area(ncol, nlay, nlwgpts, 400e2,                            700e2, p_lay, cld_tau_lw_gpt, cldmed);
       rrtmgp::compute_cloud_area(ncol, nlay, nlwgpts,     0,                            400e2, p_lay, cld_tau_lw_gpt, cldhgh);
       rrtmgp::compute_cloud_area(ncol, nlay, nlwgpts,     0, std::numeric_limits<Real>::max(), p_lay, cld_tau_lw_gpt, cldtot);
+                   );
 #endif
 #ifdef RRTMGP_ENABLE_KOKKOS
+      TIMED_KERNEL(
       real1dk cldlow_k (d_cldlow.data() + m_col_chunk_beg[ic], ncol);
       real1dk cldmed_k (d_cldmed.data() + m_col_chunk_beg[ic], ncol);
       real1dk cldhgh_k (d_cldhgh.data() + m_col_chunk_beg[ic], ncol);
@@ -1569,12 +1597,15 @@ void RRTMGPRadiation::run_impl (const double dt) {
       interface_t::compute_cloud_area(ncol, nlay, nlwgpts, 400e2,                            700e2, p_lay_k, cld_tau_lw_gpt_k, cldmed_k);
       interface_t::compute_cloud_area(ncol, nlay, nlwgpts,     0,                            400e2, p_lay_k, cld_tau_lw_gpt_k, cldhgh_k);
       interface_t::compute_cloud_area(ncol, nlay, nlwgpts,     0, std::numeric_limits<Real>::max(), p_lay_k, cld_tau_lw_gpt_k, cldtot_k);
+                   );
       COMPARE_ALL_WRAP(std::vector<real1d>({cldlow, cldmed, cldhgh, cldtot}),
                        std::vector<real1dk>({cldlow_k, cldmed_k, cldhgh_k, cldtot_k}));
 #endif
 
       // Compute cloud-top diagnostics following AeroCOM recommendation
 #ifdef RRTMGP_ENABLE_YAKL
+      TIMED_INLINE_KERNEL(cloud_top,
+
       // Get visible 0.67 micron band for COSP
       auto idx_067 = rrtmgp::get_wavelength_index_sw(0.67e-6);
       // Get IR 10.5 micron band for COSP
@@ -1595,8 +1626,10 @@ void RRTMGPRadiation::run_impl (const double dt) {
           nc, T_mid_at_cldtop, p_mid_at_cldtop, cldfrac_ice_at_cldtop,
           cldfrac_liq_at_cldtop, cldfrac_tot_at_cldtop, cdnc_at_cldtop,
           eff_radius_qc_at_cldtop, eff_radius_qi_at_cldtop);
+                          );
 #endif
 #ifdef RRTMGP_ENABLE_KOKKOS
+      TIMED_INLINE_KERNEL(cloud_top,
       // Get visible 0.67 micron band for COSP
       auto idx_067_k = interface_t::get_wavelength_index_sw_k(0.67e-6);
       // Get IR 10.5 micron band for COSP
@@ -1616,6 +1649,7 @@ void RRTMGPRadiation::run_impl (const double dt) {
           nc_k, T_mid_at_cldtop_k, p_mid_at_cldtop_k, cldfrac_ice_at_cldtop_k,
           cldfrac_liq_at_cldtop_k, cldfrac_tot_at_cldtop_k, cdnc_at_cldtop_k,
           eff_radius_qc_at_cldtop_k, eff_radius_qi_at_cldtop_k);
+                          );
       COMPARE_ALL_WRAP(std::vector<real1d>({
             T_mid_at_cldtop, p_mid_at_cldtop, cldfrac_ice_at_cldtop,
             cldfrac_liq_at_cldtop, cldfrac_tot_at_cldtop, cdnc_at_cldtop,
@@ -1629,6 +1663,7 @@ void RRTMGPRadiation::run_impl (const double dt) {
       // Copy output data back to FieldManager
       const auto policy = ekat::ExeSpaceUtils<ExeSpace>::get_default_team_policy(ncol, m_nlay);
 #ifdef RRTMGP_ENABLE_YAKL
+      TIMED_KERNEL(
       Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const MemberType& team) {
         const int i = team.league_rank();
         const int icol = i + beg;
@@ -1671,8 +1706,10 @@ void RRTMGPRadiation::run_impl (const double dt) {
             d_sunlit(icol) = 0.0;
         }
       });
+                   );
 #endif
 #ifdef RRTMGP_ENABLE_KOKKOS
+      TIMED_KERNEL(
       Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const MemberType& team) {
         const int i = team.league_rank();
         const int icol = i + beg;
@@ -1714,6 +1751,7 @@ void RRTMGPRadiation::run_impl (const double dt) {
             d_sunlit(icol) = 0.0;
         }
       });
+                   );
 #ifdef RRTMGP_ENABLE_YAKL
       // Sync back to gas_concs_k
       real3dk temp(gas_concs_k, std::make_pair(0, ncol), Kokkos::ALL, Kokkos::ALL);
diff --git a/components/eamxx/src/physics/rrtmgp/rrtmgp_test_utils.hpp b/components/eamxx/src/physics/rrtmgp/rrtmgp_test_utils.hpp
index 466467919fcf..b1cb1f424ec1 100644
--- a/components/eamxx/src/physics/rrtmgp/rrtmgp_test_utils.hpp
+++ b/components/eamxx/src/physics/rrtmgp/rrtmgp_test_utils.hpp
@@ -88,7 +88,7 @@ static void dummy_clouds(
   // put them in 2/3 of the columns since that's roughly the total cloudiness of earth.
   // Set sane values for liquid and ice water path.
   // NOTE: these "sane" values are in g/m2!
-  Kokkos::parallel_for( MDRP::template get<2>({nlay,ncol}) , KOKKOS_LAMBDA (int ilay, int icol) {
+  Kokkos::parallel_for( MDRP::template get<2>({ncol, nlay}) , KOKKOS_LAMBDA (int icol, int ilay) {
     cloud_mask(icol,ilay) = p_lay(icol,ilay) > 100. * 100. && p_lay(icol,ilay) < 900. * 100. && ((icol+1)%3) != 0;
     // Ice and liquid will overlap in a few layers
     lwp(icol,ilay) = conv::merge(10.,  0., cloud_mask(icol,ilay) && t_lay(icol,ilay) > 263.);
@@ -123,7 +123,7 @@ static void dummy_atmos(
   // needs the CloudOptics object only because it uses the min and max
   // valid values from the lookup tables for liquid and ice water path to
   // create a dummy atmosphere.
-  dummy_clouds(interface_t::cloud_optics_sw_k, p_lay, t_lay, lwp, iwp, rel, rei, cld);
+  dummy_clouds(*interface_t::cloud_optics_sw_k, p_lay, t_lay, lwp, iwp, rel, rei, cld);
 }
 
 static void read_fluxes(
diff --git a/components/eamxx/src/physics/rrtmgp/rrtmgp_utils.hpp b/components/eamxx/src/physics/rrtmgp/rrtmgp_utils.hpp
index aa6326c81811..863597a4fea3 100644
--- a/components/eamxx/src/physics/rrtmgp/rrtmgp_utils.hpp
+++ b/components/eamxx/src/physics/rrtmgp/rrtmgp_utils.hpp
@@ -40,12 +40,12 @@ void compute_heating_rate (
   using physconst = scream::physics::Constants<Real>;
   auto ncol = flux_up.dimension[0];
   auto nlay = flux_up.dimension[1]-1;
-  yakl::fortran::parallel_for(yakl::fortran::SimpleBounds<2>(nlay,ncol), YAKL_LAMBDA(int ilay, int icol) {
+  TIMED_KERNEL(yakl::fortran::parallel_for(yakl::fortran::SimpleBounds<2>(nlay,ncol), YAKL_LAMBDA(int ilay, int icol) {
       heating_rate(icol,ilay) = (
         flux_up(icol,ilay+1) - flux_up(icol,ilay) -
         flux_dn(icol,ilay+1) + flux_dn(icol,ilay)
                                  ) * physconst::gravit / (physconst::Cpair * pdel(icol,ilay));
-    });
+      }));
 }
 #endif
 #ifdef RRTMGP_ENABLE_KOKKOS
@@ -57,15 +57,15 @@ void compute_heating_rate (
   View4 const &heating_rate)
 {
   using physconst = scream::physics::Constants<Real>;
-  using MDRP = typename conv::MDRP<typename View1::array_layout>;
-  auto ncol = flux_up.extent(0);
-  auto nlay = flux_up.extent(1)-1;
-  Kokkos::parallel_for(MDRP::template get<2>({nlay,ncol}), KOKKOS_LAMBDA(int ilay, int icol) {
+  using LayoutT = typename View1::array_layout;
+  const int ncol = (int)flux_up.extent(0);
+  const int nlay = (int)flux_up.extent(1)-1;
+  TIMED_KERNEL(FLATTEN_MD_KERNEL2(ncol, nlay, icol, ilay,
     heating_rate(icol,ilay) = (
       flux_up(icol,ilay+1) - flux_up(icol,ilay) -
       flux_dn(icol,ilay+1) + flux_dn(icol,ilay)
                                ) * physconst::gravit / (physconst::Cpair * pdel(icol,ilay));
-  });
+                                  ));
 }
 #endif
 
diff --git a/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.cpp b/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.cpp
index 10281df8078c..9796ce49dbe0 100644
--- a/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.cpp
+++ b/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.cpp
@@ -122,11 +122,11 @@ OpticalProps2str get_subsampled_clouds(
   // randomly overlapped.
   auto cldfrac_rad = real2d("cldfrac_rad", ncol, nlay);
   memset(cldfrac_rad, 0.0);  // Start with all zeros
-  parallel_for(SimpleBounds<3>(nbnd,nlay,ncol), YAKL_LAMBDA (int ibnd, int ilay, int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(nbnd,nlay,ncol), YAKL_LAMBDA (int ibnd, int ilay, int icol) {
       if (cloud_optics.tau(icol,ilay,ibnd) > 0) {
         cldfrac_rad(icol,ilay) = cld(icol,ilay);
       }
-    });
+    }));
   // Get subcolumn cloud mask; note that get_subcolumn_mask exposes overlap assumption as an option,
   // but the only currently supported options are 0 (trivial all-or-nothing cloud) or 1 (max-rand),
   // so overlap has not been exposed as an option beyond this subcolumn. In the future, we should
@@ -136,14 +136,14 @@ OpticalProps2str get_subsampled_clouds(
   // Get unique seeds for each column that are reproducible across different MPI rank layouts;
   // use decimal part of pressure for this, consistent with the implementation in EAM
   auto seeds = int1d("seeds", ncol);
-  parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) {
       seeds(icol) = 1e9 * (p_lay(icol,nlay) - int(p_lay(icol,nlay)));
-    });
+    }));
   auto cldmask = get_subcolumn_mask(ncol, nlay, ngpt, cldfrac_rad, overlap, seeds);
 
   // Assign optical properties to subcolumns (note this implements MCICA)
   auto gpoint_bands = kdist.get_gpoint_bands();
-  parallel_for(SimpleBounds<3>(ngpt,nlay,ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(ngpt,nlay,ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) {
       auto ibnd = gpoint_bands(igpt);
       if (cldmask(icol,ilay,igpt) == 1) {
         subsampled_optics.tau(icol,ilay,igpt) = cloud_optics.tau(icol,ilay,ibnd);
@@ -154,7 +154,7 @@ OpticalProps2str get_subsampled_clouds(
         subsampled_optics.ssa(icol,ilay,igpt) = 0;
         subsampled_optics.g  (icol,ilay,igpt) = 0;
       }
-    });
+    }));
   return subsampled_optics;
 }
 
@@ -174,31 +174,31 @@ OpticalProps1scl get_subsampled_clouds(
   // randomly overlapped.
   auto cldfrac_rad = real2d("cldfrac_rad", ncol, nlay);
   memset(cldfrac_rad, 0.0);  // Start with all zeros
-  parallel_for(SimpleBounds<3>(nbnd,nlay,ncol), YAKL_LAMBDA (int ibnd, int ilay, int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(nbnd,nlay,ncol), YAKL_LAMBDA (int ibnd, int ilay, int icol) {
       if (cloud_optics.tau(icol,ilay,ibnd) > 0) {
         cldfrac_rad(icol,ilay) = cld(icol,ilay);
       }
-    });
+    }));
   // Get subcolumn cloud mask
   int overlap = 1;
   // Get unique seeds for each column that are reproducible across different MPI rank layouts;
   // use decimal part of pressure for this, consistent with the implementation in EAM; use different
   // seed values for longwave and shortwave
   auto seeds = int1d("seeds", ncol);
-  parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) {
       seeds(icol) = 1e9 * (p_lay(icol,nlay-1) - int(p_lay(icol,nlay-1)));
-    });
+    }));
   auto cldmask = get_subcolumn_mask(ncol, nlay, ngpt, cldfrac_rad, overlap, seeds);
   // Assign optical properties to subcolumns (note this implements MCICA)
   auto gpoint_bands = kdist.get_gpoint_bands();
-  parallel_for(SimpleBounds<3>(ngpt,nlay,ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(ngpt,nlay,ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) {
       auto ibnd = gpoint_bands(igpt);
       if (cldmask(icol,ilay,igpt) == 1) {
         subsampled_optics.tau(icol,ilay,igpt) = cloud_optics.tau(icol,ilay,ibnd);
       } else {
         subsampled_optics.tau(icol,ilay,igpt) = 0;
       }
-    });
+    }));
   return subsampled_optics;
 }
 
@@ -260,7 +260,7 @@ void compute_band_by_band_surface_albedos(
 
   // Loop over bands, and determine for each band whether it is broadly in the
   // visible or infrared part of the spectrum (visible or "not visible")
-  parallel_for(SimpleBounds<2>(nswbands, ncol), YAKL_LAMBDA(const int ibnd, const int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<2>(nswbands, ncol), YAKL_LAMBDA(const int ibnd, const int icol) {
 
       // Threshold between visible and infrared is 0.7 micron, or 14286 cm^-1.
       const real visible_wavenumber_threshold = 14286;
@@ -291,7 +291,7 @@ void compute_band_by_band_surface_albedos(
         sfc_alb_dif(icol,ibnd) = 0.5*(sfc_alb_dif_vis(icol) + sfc_alb_dif_nir(icol));
 
       }
-    });
+    }));
 }
 
 void compute_broadband_surface_fluxes(
@@ -317,7 +317,7 @@ void compute_broadband_surface_fluxes(
   // Threshold between visible and infrared is 0.7 micron, or 14286 cm^-1.
   const real visible_wavenumber_threshold = 14286;
   auto wavenumber_limits = k_dist_sw.get_band_lims_wavenumber();
-  parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(const int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(const int icol) {
       for (int ibnd = 1; ibnd <= nswbands; ++ibnd) {
         // Wavenumber is in the visible if it is above the visible wavenumber
         // threshold, and in the infrared if it is below the threshold
@@ -346,7 +346,7 @@ void compute_broadband_surface_fluxes(
           sfc_flux_dif_nir(icol) += 0.5 * sw_bnd_flux_dif(icol,ktop,ibnd);
         }
       }
-    });
+    }));
 }
 
 void rrtmgp_main(
@@ -439,16 +439,16 @@ void rrtmgp_main(
   OpticalProps1scl aerosol_lw;
   aerosol_sw.init(k_dist_sw.get_band_lims_wavenumber());
   aerosol_sw.alloc_2str(ncol, nlay);
-  parallel_for(SimpleBounds<3>(nswbands,nlay,ncol) , YAKL_LAMBDA (int ibnd, int ilay, int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(nswbands,nlay,ncol) , YAKL_LAMBDA (int ibnd, int ilay, int icol) {
       aerosol_sw.tau(icol,ilay,ibnd) = aer_tau_sw(icol,ilay,ibnd);
       aerosol_sw.ssa(icol,ilay,ibnd) = aer_ssa_sw(icol,ilay,ibnd);
       aerosol_sw.g  (icol,ilay,ibnd) = aer_asm_sw(icol,ilay,ibnd);
-    });
+    }));
   aerosol_lw.init(k_dist_lw.get_band_lims_wavenumber());
   aerosol_lw.alloc_1scl(ncol, nlay);
-  parallel_for(SimpleBounds<3>(nlwbands,nlay,ncol) , YAKL_LAMBDA (int ibnd, int ilay, int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(nlwbands,nlay,ncol) , YAKL_LAMBDA (int ibnd, int ilay, int icol) {
       aerosol_lw.tau(icol,ilay,ibnd) = aer_tau_lw(icol,ilay,ibnd);
-    });
+    }));
 
 #ifdef SCREAM_RRTMGP_DEBUG
   // Check aerosol optical properties
@@ -477,12 +477,12 @@ void rrtmgp_main(
 
   // Copy cloud properties to outputs (is this needed, or can we just use pointers?)
   // Alternatively, just compute and output a subcolumn cloud mask
-  parallel_for(SimpleBounds<3>(nswgpts, nlay, ncol), YAKL_LAMBDA (int igpt, int ilay, int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(nswgpts, nlay, ncol), YAKL_LAMBDA (int igpt, int ilay, int icol) {
       cld_tau_sw_gpt(icol,ilay,igpt) = clouds_sw_gpt.tau(icol,ilay,igpt);
-    });
-  parallel_for(SimpleBounds<3>(nlwgpts, nlay, ncol), YAKL_LAMBDA (int igpt, int ilay, int icol) {
+    }));
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(nlwgpts, nlay, ncol), YAKL_LAMBDA (int igpt, int ilay, int icol) {
       cld_tau_lw_gpt(icol,ilay,igpt) = clouds_lw_gpt.tau(icol,ilay,igpt);
-    });
+    }));
 
 #ifdef SCREAM_RRTMGP_DEBUG
   // Perform checks on optics; these would be caught by RRTMGP_EXPENSIVE_CHECKS in the RRTMGP code,
@@ -542,16 +542,16 @@ int3d get_subcolumn_mask(const int ncol, const int nlay, const int ngpt, real2d
     // https://github.com/AER-RC/RRTMG_SW/blob/master/src/mcica_subcol_gen_sw.f90)
     //
     // First, fill cldx with random numbers. Need to use a unique seed for each column!
-    parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) {
+    TIMED_KERNEL(parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) {
         yakl::Random rand(seeds(icol));
         for (int igpt = 1; igpt <= ngpt; igpt++) {
           for (int ilay = 1; ilay <= nlay; ilay++) {
             cldx(icol,ilay,igpt) = rand.genFP<Real>();
           }
         }
-      });
+      }));
     // Step down columns and apply algorithm from eq (14)
-    parallel_for(SimpleBounds<2>(ngpt,ncol), YAKL_LAMBDA(int igpt, int icol) {
+    TIMED_KERNEL(parallel_for(SimpleBounds<2>(ngpt,ncol), YAKL_LAMBDA(int igpt, int icol) {
         for (int ilay = 2; ilay <= nlay; ilay++) {
           // Check cldx in level above and see if it satisfies conditions to create a cloudy subcolumn
           if (cldx(icol,ilay-1,igpt) > 1.0 - cldf(icol,ilay-1)) {
@@ -567,17 +567,17 @@ int3d get_subcolumn_mask(const int ncol, const int nlay, const int ngpt, real2d
             cldx(icol,ilay,igpt) = cldx(icol,ilay  ,igpt) * (1.0 - cldf(icol,ilay-1));
           }
         }
-      });
+      }));
   }
 
   // Use cldx array to create subcolumn mask
-  parallel_for(SimpleBounds<3>(ngpt,nlay,ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(ngpt,nlay,ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) {
       if (cldx(icol,ilay,igpt) > 1.0 - cldf(icol,ilay)) {
         subcolumn_mask(icol,ilay,igpt) = 1;
       } else {
         subcolumn_mask(icol,ilay,igpt) = 0;
       }
-    });
+    }));
   return subcolumn_mask;
 }
 
@@ -616,7 +616,7 @@ void rrtmgp_sw(
   auto &clnsky_flux_dn_dir = clnsky_fluxes.flux_dn_dir;
 
   // Reset fluxes to zero
-  parallel_for(SimpleBounds<2>(nlay+1,ncol), YAKL_LAMBDA(int ilev, int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay+1,ncol), YAKL_LAMBDA(int ilev, int icol) {
       flux_up    (icol,ilev) = 0;
       flux_dn    (icol,ilev) = 0;
       flux_dn_dir(icol,ilev) = 0;
@@ -629,12 +629,12 @@ void rrtmgp_sw(
       clnsky_flux_up    (icol,ilev) = 0;
       clnsky_flux_dn    (icol,ilev) = 0;
       clnsky_flux_dn_dir(icol,ilev) = 0;
-    });
-  parallel_for(SimpleBounds<3>(nbnd,nlay+1,ncol), YAKL_LAMBDA(int ibnd, int ilev, int icol) {
+    }));
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(nbnd,nlay+1,ncol), YAKL_LAMBDA(int ibnd, int ilev, int icol) {
       bnd_flux_up    (icol,ilev,ibnd) = 0;
       bnd_flux_dn    (icol,ilev,ibnd) = 0;
       bnd_flux_dn_dir(icol,ilev,ibnd) = 0;
-    });
+    }));
 
   // Get daytime indices
   auto dayIndices = int1d("dayIndices", ncol);
@@ -660,23 +660,23 @@ void rrtmgp_sw(
 
   // Subset mu0
   auto mu0_day = real1d("mu0_day", nday);
-  parallel_for(SimpleBounds<1>(nday), YAKL_LAMBDA(int iday) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<1>(nday), YAKL_LAMBDA(int iday) {
       mu0_day(iday) = mu0(dayIndices(iday));
-    });
+    }));
 
   // subset state variables
   auto p_lay_day = real2d("p_lay_day", nday, nlay);
   auto t_lay_day = real2d("t_lay_day", nday, nlay);
-  parallel_for(SimpleBounds<2>(nlay,nday), YAKL_LAMBDA(int ilay, int iday) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay,nday), YAKL_LAMBDA(int ilay, int iday) {
       p_lay_day(iday,ilay) = p_lay(dayIndices(iday),ilay);
       t_lay_day(iday,ilay) = t_lay(dayIndices(iday),ilay);
-    });
+    }));
   auto p_lev_day = real2d("p_lev_day", nday, nlay+1);
   auto t_lev_day = real2d("t_lev_day", nday, nlay+1);
-  parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) {
       p_lev_day(iday,ilev) = p_lev(dayIndices(iday),ilev);
       t_lev_day(iday,ilev) = t_lev(dayIndices(iday),ilev);
-    });
+    }));
 
   // Subset gases
   auto gas_names = gas_concs.get_gas_names();
@@ -686,9 +686,9 @@ void rrtmgp_sw(
     auto vmr_day = real2d("vmr_day", nday, nlay);
     auto vmr     = real2d("vmr"    , ncol, nlay);
     gas_concs.get_vmr(gas_names[igas], vmr);
-    parallel_for(SimpleBounds<2>(nlay,nday), YAKL_LAMBDA(int ilay, int iday) {
+    TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay,nday), YAKL_LAMBDA(int ilay, int iday) {
         vmr_day(iday,ilay) = vmr(dayIndices(iday),ilay);
-      });
+      }));
     gas_concs_day.set_vmr(gas_names[igas], vmr_day);
   }
 
@@ -696,32 +696,32 @@ void rrtmgp_sw(
   OpticalProps2str aerosol_day;
   aerosol_day.init(k_dist.get_band_lims_wavenumber());
   aerosol_day.alloc_2str(nday, nlay);
-  parallel_for(SimpleBounds<3>(nbnd,nlay,nday), YAKL_LAMBDA(int ibnd, int ilay, int iday) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(nbnd,nlay,nday), YAKL_LAMBDA(int ibnd, int ilay, int iday) {
       aerosol_day.tau(iday,ilay,ibnd) = aerosol.tau(dayIndices(iday),ilay,ibnd);
       aerosol_day.ssa(iday,ilay,ibnd) = aerosol.ssa(dayIndices(iday),ilay,ibnd);
       aerosol_day.g  (iday,ilay,ibnd) = aerosol.g  (dayIndices(iday),ilay,ibnd);
-    });
+    }));
 
   // Subset cloud optics
   // TODO: nbnd -> ngpt once we pass sub-sampled cloud state
   OpticalProps2str clouds_day;
   clouds_day.init(k_dist.get_band_lims_wavenumber(), k_dist.get_band_lims_gpoint());
   clouds_day.alloc_2str(nday, nlay);
-  parallel_for(SimpleBounds<3>(ngpt,nlay,nday), YAKL_LAMBDA(int igpt, int ilay, int iday) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(ngpt,nlay,nday), YAKL_LAMBDA(int igpt, int ilay, int iday) {
       clouds_day.tau(iday,ilay,igpt) = clouds.tau(dayIndices(iday),ilay,igpt);
       clouds_day.ssa(iday,ilay,igpt) = clouds.ssa(dayIndices(iday),ilay,igpt);
       clouds_day.g  (iday,ilay,igpt) = clouds.g  (dayIndices(iday),ilay,igpt);
-    });
+    }));
 
   // RRTMGP assumes surface albedos have a screwy dimension ordering
   // for some strange reason, so we need to transpose these; also do
   // daytime subsetting in the same kernel
   real2d sfc_alb_dir_T("sfc_alb_dir", nbnd, nday);
   real2d sfc_alb_dif_T("sfc_alb_dif", nbnd, nday);
-  parallel_for(SimpleBounds<2>(nbnd,nday), YAKL_LAMBDA(int ibnd, int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<2>(nbnd,nday), YAKL_LAMBDA(int ibnd, int icol) {
       sfc_alb_dir_T(ibnd,icol) = sfc_alb_dir(dayIndices(icol),ibnd);
       sfc_alb_dif_T(ibnd,icol) = sfc_alb_dif(dayIndices(icol),ibnd);
-    });
+    }));
 
   // Temporaries we need for daytime-only fluxes
   auto flux_up_day = real2d("flux_up_day", nday, nlay+1);
@@ -770,20 +770,20 @@ void rrtmgp_sw(
 #endif
 
   // Apply tsi_scaling
-  parallel_for(SimpleBounds<2>(ngpt,nday), YAKL_LAMBDA(int igpt, int iday) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<2>(ngpt,nday), YAKL_LAMBDA(int igpt, int iday) {
       toa_flux(iday,igpt) = tsi_scaling * toa_flux(iday,igpt);
-    });
+    }));
 
   if (extra_clnclrsky_diag) {
     // Compute clear-clean-sky (just gas) fluxes on daytime columns
     rte_sw(optics, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day);
     // Expand daytime fluxes to all columns
-    parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) {
+    TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) {
         int icol = dayIndices(iday);
         clnclrsky_flux_up    (icol,ilev) = flux_up_day    (iday,ilev);
         clnclrsky_flux_dn    (icol,ilev) = flux_dn_day    (iday,ilev);
         clnclrsky_flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev);
-      });
+      }));
   }
 
   // Combine gas and aerosol optics
@@ -794,12 +794,12 @@ void rrtmgp_sw(
   rte_sw(optics, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day);
 
   // Expand daytime fluxes to all columns
-  parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) {
       int icol = dayIndices(iday);
       clrsky_flux_up    (icol,ilev) = flux_up_day    (iday,ilev);
       clrsky_flux_dn    (icol,ilev) = flux_dn_day    (iday,ilev);
       clrsky_flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev);
-    });
+    }));
 
   // Now merge in cloud optics and do allsky calculations
 
@@ -809,18 +809,18 @@ void rrtmgp_sw(
   // Compute fluxes on daytime columns
   rte_sw(optics, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day);
   // Expand daytime fluxes to all columns
-  parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) {
       int icol = dayIndices(iday);
       flux_up    (icol,ilev) = flux_up_day    (iday,ilev);
       flux_dn    (icol,ilev) = flux_dn_day    (iday,ilev);
       flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev);
-    });
-  parallel_for(SimpleBounds<3>(nbnd,nlay+1,nday), YAKL_LAMBDA(int ibnd, int ilev, int iday) {
+    }));
+  TIMED_KERNEL(parallel_for(SimpleBounds<3>(nbnd,nlay+1,nday), YAKL_LAMBDA(int ibnd, int ilev, int iday) {
       int icol = dayIndices(iday);
       bnd_flux_up    (icol,ilev,ibnd) = bnd_flux_up_day    (iday,ilev,ibnd);
       bnd_flux_dn    (icol,ilev,ibnd) = bnd_flux_dn_day    (iday,ilev,ibnd);
       bnd_flux_dn_dir(icol,ilev,ibnd) = bnd_flux_dn_dir_day(iday,ilev,ibnd);
-    });
+    }));
 
   if (extra_clnsky_diag) {
     // First increment clouds in optics_no_aerosols
@@ -828,12 +828,12 @@ void rrtmgp_sw(
     // Compute cleansky (gas + clouds) fluxes on daytime columns
     rte_sw(optics_no_aerosols, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day);
     // Expand daytime fluxes to all columns
-    parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) {
+    TIMED_KERNEL(parallel_for(SimpleBounds<2>(nlay+1,nday), YAKL_LAMBDA(int ilev, int iday) {
         int icol = dayIndices(iday);
         clnsky_flux_up    (icol,ilev) = flux_up_day    (iday,ilev);
         clnsky_flux_dn    (icol,ilev) = flux_dn_day    (iday,ilev);
         clnsky_flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev);
-      });
+      }));
   }
 }
 
@@ -863,7 +863,7 @@ void rrtmgp_lw(
   auto &clnsky_flux_dn    = clnsky_fluxes.flux_dn;
 
   // Reset fluxes to zero
-  parallel_for(
+  TIMED_KERNEL(parallel_for(
     SimpleBounds<2>(nlay + 1, ncol), YAKL_LAMBDA(int ilev, int icol) {
       flux_up(icol, ilev)           = 0;
       flux_dn(icol, ilev)           = 0;
@@ -873,13 +873,13 @@ void rrtmgp_lw(
       clrsky_flux_dn(icol, ilev)    = 0;
       clnsky_flux_up(icol, ilev)    = 0;
       clnsky_flux_dn(icol, ilev)    = 0;
-    });
-  parallel_for(
+    }));
+  TIMED_KERNEL(parallel_for(
     SimpleBounds<3>(nbnd, nlay + 1, ncol),
     YAKL_LAMBDA(int ibnd, int ilev, int icol) {
       bnd_flux_up(icol, ilev, ibnd) = 0;
       bnd_flux_dn(icol, ilev, ibnd) = 0;
-    });
+    }));
 
   // Allocate space for optical properties
   OpticalProps1scl optics;
@@ -899,9 +899,9 @@ void rrtmgp_lw(
   // Surface temperature
   auto p_lay_host = p_lay.createHostCopy();
   bool top_at_1 = p_lay_host(1, 1) < p_lay_host(1, nlay);
-  parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) {
+  TIMED_KERNEL(parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) {
       t_sfc(icol) = t_lev(icol, merge(nlay+1, 1, top_at_1));
-    });
+    }));
   memset(emis_sfc , 0.98_wp);
 
   // Get Gaussian quadrature weights
@@ -977,22 +977,22 @@ void compute_cloud_area(
   // then 2d subcol mask is 1, otherwise it is 0
   auto subcol_mask = real2d("subcol_mask", ncol, ngpt);
   memset(subcol_mask, 0);
-  yakl::fortran::parallel_for(SimpleBounds<3>(ngpt, nlay, ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) {
+  TIMED_KERNEL(yakl::fortran::parallel_for(SimpleBounds<3>(ngpt, nlay, ncol), YAKL_LAMBDA(int igpt, int ilay, int icol) {
       // NOTE: using plev would need to assume level ordering (top to bottom or bottom to top), but
       // using play/pmid does not
       if (cld_tau_gpt(icol,ilay,igpt) > 0 && pmid(icol,ilay) >= pmin && pmid(icol,ilay) < pmax) {
         subcol_mask(icol,igpt) = 1;
       }
-    });
+    }));
   // Compute average over subcols to get cloud area
   auto ngpt_inv = 1.0 / ngpt;
   memset(cld_area, 0);
-  yakl::fortran::parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) {
+  TIMED_KERNEL(yakl::fortran::parallel_for(SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) {
       // This loop needs to be serial because of the atomic reduction
       for (int igpt = 1; igpt <= ngpt; ++igpt) {
         cld_area(icol) += subcol_mask(icol,igpt) * ngpt_inv;
       }
-    });
+    }));
 }
 
 int get_wavelength_index_sw(double wavelength) { return get_wavelength_index(k_dist_sw, wavelength); }
@@ -1008,7 +1008,7 @@ int get_wavelength_index(OpticalProps &kdist, double wavelength) {
   // in units of meters, we need a conversion factor of 10^2
   int nbnds = kdist.get_nband();
   yakl::ScalarLiveOut<int> band_index(-1);
-  yakl::fortran::parallel_for(SimpleBounds<1>(nbnds), YAKL_LAMBDA(int ibnd) {
+  TIMED_KERNEL(yakl::fortran::parallel_for(SimpleBounds<1>(nbnds), YAKL_LAMBDA(int ibnd) {
       if (wavelength_bounds(1,ibnd) < wavelength_bounds(2,ibnd)) {
         if (wavelength_bounds(1,ibnd) <= wavelength * 1e2 && wavelength * 1e2 <= wavelength_bounds(2,ibnd)) {
           band_index = ibnd;
@@ -1018,7 +1018,7 @@ int get_wavelength_index(OpticalProps &kdist, double wavelength) {
           band_index = ibnd;
         }
       }
-    });
+    }));
   return band_index.hostRead();
 }
 
@@ -1056,7 +1056,7 @@ void compute_aerocom_cloudtop(
   // TODO: move tunable constant to namelist
   constexpr real cldfrac_tot_threshold = 0.001;  // BAD_CONSTANT!
   // Loop over all columns in parallel
-  yakl::fortran::parallel_for(
+  TIMED_KERNEL(yakl::fortran::parallel_for(
     SimpleBounds<1>(ncol), YAKL_LAMBDA(int icol) {
       // Loop over all layers in serial (due to accumulative
       // product), starting at 2 (second highest) layer because the
@@ -1116,7 +1116,7 @@ void compute_aerocom_cloudtop(
       // aerocom_clr is the result of accumulative probabilities
       // (their products)
       cldfrac_tot_at_cldtop(icol) = 1.0 - aerocom_clr(icol);
-    });
+    }));
 }
 
 }  // namespace rrtmgp
diff --git a/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.hpp b/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.hpp
index e807643b05a9..44cd16ac2314 100644
--- a/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.hpp
+++ b/components/eamxx/src/physics/rrtmgp/scream_rrtmgp_interface.hpp
@@ -130,7 +130,7 @@ void mixing_ratio_to_cloud_mass(
   int ncol = mixing_ratio.dimension[0];
   int nlay = mixing_ratio.dimension[1];
   using physconst = scream::physics::Constants<Real>;
-  yakl::fortran::parallel_for(yakl::fortran::SimpleBounds<2>(nlay, ncol), YAKL_LAMBDA(int ilay, int icol) {
+  TIMED_KERNEL(yakl::fortran::parallel_for(yakl::fortran::SimpleBounds<2>(nlay, ncol), YAKL_LAMBDA(int ilay, int icol) {
     // Compute in-cloud mixing ratio (mixing ratio of the cloudy part of the layer)
     // NOTE: these thresholds (from E3SM) seem arbitrary, but included here for consistency
     // This limits in-cloud mixing ratio to 0.005 kg/kg. According to note in cloud_diagnostics
@@ -142,14 +142,14 @@ void mixing_ratio_to_cloud_mass(
     } else {
       cloud_mass(icol,ilay) = 0;
     }
-  });
+      }));
 }
 
 template<class S, class T>
 void limit_to_bounds(S const &arr_in, T const lower, T const upper, S &arr_out) {
-  yakl::c::parallel_for(arr_in.totElems(), YAKL_LAMBDA(int i) {
+  TIMED_KERNEL(yakl::c::parallel_for(arr_in.totElems(), YAKL_LAMBDA(int i) {
     arr_out.data()[i] = std::min(std::max(arr_in.data()[i], lower), upper);
-  });
+      }));
 }
 
 int get_wavelength_index(OpticalProps &kdist, double wavelength);
@@ -170,7 +170,7 @@ using view_t = Kokkos::View<T, LayoutT, DeviceT>;
 template <typename T>
 using hview_t = Kokkos::View<T, LayoutT, HostDevice>;
 
-using pool_t = conv::MemPoolSingleton<RealT, DeviceT>;
+using pool_t = conv::MemPoolSingleton<RealT, LayoutT, DeviceT>;
 
 using real1dk = view_t<RealT*>;
 using real2dk = view_t<RealT**>;
@@ -179,6 +179,7 @@ using creal1dk = view_t<const RealT*>;
 using creal2dk = view_t<const RealT**>;
 using creal3dk = view_t<const RealT***>;
 using int1dk  = view_t<int*>;
+using int2dk  = view_t<int**>;
 using int3dk  = view_t<int***>;
 
 using gas_optics_t   = GasOpticsRRTMGPK<RealT, LayoutT, DeviceT>;
@@ -196,16 +197,16 @@ using source_func_t = SourceFuncLWK<RealT, LayoutT, DeviceT>;
  * once and then persist throughout the life of the program, so we
  * declare them here within the rrtmgp namespace.
  */
-static inline gas_optics_t k_dist_sw_k;
-static inline gas_optics_t k_dist_lw_k;
+static inline std::unique_ptr<gas_optics_t> k_dist_sw_k;
+static inline std::unique_ptr<gas_optics_t> k_dist_lw_k;
 
 /*
  * Objects containing cloud optical property look-up table information.
  * We want to initialize these once and use throughout the life of the
  * program, so declare here and read data in during rrtmgp_initialize().
  */
-static inline cloud_optics_t cloud_optics_sw_k;
-static inline cloud_optics_t cloud_optics_lw_k;
+static inline std::unique_ptr<cloud_optics_t> cloud_optics_sw_k;
+static inline std::unique_ptr<cloud_optics_t> cloud_optics_lw_k;
 
 /*
  * Flag to indicate whether or not we have initialized RRTMGP
@@ -213,13 +214,14 @@ static inline cloud_optics_t cloud_optics_lw_k;
 static inline bool initialized_k = false;
 
 /*
- * Initialize data for RRTMGP driver
+ * Initialize data for RRTMGP driver. Increase multiplier to allocate more pool space.
  */
 static void rrtmgp_initialize(
   const gas_concs_t &gas_concs,
   const std::string& coefficients_file_sw, const std::string& coefficients_file_lw,
   const std::string& cloud_optics_file_sw, const std::string& cloud_optics_file_lw,
-  const std::shared_ptr<spdlog::logger>& logger)
+  const std::shared_ptr<spdlog::logger>& logger,
+  const double multiplier = 1.0)
 {
   // If we've already initialized, just exit
   if (initialized_k) {
@@ -231,21 +233,27 @@ static void rrtmgp_initialize(
   // Initialize Kokkos
   if (!Kokkos::is_initialized()) {  Kokkos::initialize(); }
 
+  // Create objects for static ptrs
+  k_dist_sw_k = std::make_unique<gas_optics_t>();
+  k_dist_lw_k = std::make_unique<gas_optics_t>();
+  cloud_optics_sw_k = std::make_unique<cloud_optics_t>();
+  cloud_optics_lw_k = std::make_unique<cloud_optics_t>();
+
   // Load and initialize absorption coefficient data
-  load_and_init(k_dist_sw_k, coefficients_file_sw, gas_concs);
-  load_and_init(k_dist_lw_k, coefficients_file_lw, gas_concs);
+  load_and_init(*k_dist_sw_k, coefficients_file_sw, gas_concs);
+  load_and_init(*k_dist_lw_k, coefficients_file_lw, gas_concs);
 
   // Load and initialize cloud optical property look-up table information
-  load_cld_lutcoeff(cloud_optics_sw_k, cloud_optics_file_sw);
-  load_cld_lutcoeff(cloud_optics_lw_k, cloud_optics_file_lw);
+  load_cld_lutcoeff(*cloud_optics_sw_k, cloud_optics_file_sw);
+  load_cld_lutcoeff(*cloud_optics_lw_k, cloud_optics_file_lw);
 
   // initialize kokkos rrtmgp pool allocator
-  const size_t base_ref = 80000;
+  const size_t base_ref = 40000;
   const size_t ncol = gas_concs.ncol;
   const size_t nlay = gas_concs.nlay;
   const size_t nlev = SCREAM_NUM_VERTICAL_LEV;
   const size_t my_size_ref = ncol * nlay * nlev;
-  pool_t::init(2e6 * (float(my_size_ref) / base_ref));
+  pool_t::init(2e6 * (float(my_size_ref) / base_ref) * multiplier);
 
   // We are now initialized!
   initialized_k = true;
@@ -261,7 +269,7 @@ static void compute_band_by_band_surface_albedos(
   const real2dk &sfc_alb_dir,     const real2dk &sfc_alb_dif)
 {
   EKAT_ASSERT_MSG(initialized_k, "Error! rrtmgp_initialize must be called before GasOpticsRRTMGP object can be used.");
-  auto wavenumber_limits = k_dist_sw_k.get_band_lims_wavenumber();
+  auto wavenumber_limits = k_dist_sw_k->get_band_lims_wavenumber();
 
   EKAT_ASSERT_MSG(wavenumber_limits.extent(0) == 2,
                   "Error! 1st dimension for wavenumber_limits should be 2. It's " << wavenumber_limits.extent(0));
@@ -270,8 +278,7 @@ static void compute_band_by_band_surface_albedos(
 
   // Loop over bands, and determine for each band whether it is broadly in the
   // visible or infrared part of the spectrum (visible or "not visible")
-  Kokkos::parallel_for(MDRP::template get<2>({nswbands, ncol}), KOKKOS_LAMBDA(const int ibnd, const int icol) {
-
+  TIMED_KERNEL(FLATTEN_MD_KERNEL2(ncol, nswbands, icol, ibnd,
     // Threshold between visible and infrared is 0.7 micron, or 14286 cm^-1.
     const RealT visible_wavenumber_threshold = 14286;
 
@@ -297,7 +304,7 @@ static void compute_band_by_band_surface_albedos(
       sfc_alb_dir(icol,ibnd) = 0.5*(sfc_alb_dir_vis(icol) + sfc_alb_dir_nir(icol));
       sfc_alb_dif(icol,ibnd) = 0.5*(sfc_alb_dif_vis(icol) + sfc_alb_dif_nir(icol));
     }
-  });
+  ));
 }
 
 /*
@@ -326,8 +333,8 @@ static void compute_broadband_surface_fluxes(
 
   // Threshold between visible and infrared is 0.7 micron, or 14286 cm^-1.
   const RealT visible_wavenumber_threshold = 14286;
-  auto wavenumber_limits = k_dist_sw_k.get_band_lims_wavenumber();
-  Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(const int icol) {
+  auto wavenumber_limits = k_dist_sw_k->get_band_lims_wavenumber();
+  TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(const int icol) {
     for (int ibnd = 0; ibnd < nswbands; ++ibnd) {
       // Wavenumber is in the visible if it is above the visible wavenumber
       // threshold, and in the infrared if it is below the threshold
@@ -353,7 +360,7 @@ static void compute_broadband_surface_fluxes(
         sfc_flux_dif_nir(icol) += 0.5 * sw_bnd_flux_dif(icol,ktop,ibnd);
       }
     }
-  });
+  }));
 }
 
 /*
@@ -384,12 +391,17 @@ static void rrtmgp_main(
   const std::shared_ptr<spdlog::logger>& logger,
   const bool extra_clnclrsky_diag = false, const bool extra_clnsky_diag = false)
 {
+  const int sw_nband = k_dist_sw_k->get_nband();
+  const int lw_nband = k_dist_lw_k->get_nband();
+  const int sw_ngpt = k_dist_sw_k->get_ngpt();
+  const int lw_ngpt = k_dist_lw_k->get_ngpt();
+
 #ifdef SCREAM_RRTMGP_DEBUG
   // Sanity check inputs, and possibly repair
-  check_range_k(t_lay      ,  k_dist_sw_k.get_temp_min(),         k_dist_sw_k.get_temp_max(), "rrtmgp_main::t_lay");
-  check_range_k(t_lev      ,  k_dist_sw_k.get_temp_min(),         k_dist_sw_k.get_temp_max(), "rrtmgp_main::t_lev");
-  check_range_k(p_lay      , k_dist_sw_k.get_press_min(),        k_dist_sw_k.get_press_max(), "rrtmgp_main::p_lay");
-  check_range_k(p_lev      , k_dist_sw_k.get_press_min(),        k_dist_sw_k.get_press_max(), "rrtmgp_main::p_lev");
+  check_range_k(t_lay      ,  k_dist_sw_k->get_temp_min(),         k_dist_sw_k->get_temp_max(), "rrtmgp_main::t_lay");
+  check_range_k(t_lev      ,  k_dist_sw_k->get_temp_min(),         k_dist_sw_k->get_temp_max(), "rrtmgp_main::t_lev");
+  check_range_k(p_lay      , k_dist_sw_k->get_press_min(),        k_dist_sw_k->get_press_max(), "rrtmgp_main::p_lay");
+  check_range_k(p_lev      , k_dist_sw_k->get_press_min(),        k_dist_sw_k->get_press_max(), "rrtmgp_main::p_lev");
   check_range_k(sfc_alb_dir,                         0,                                1, "rrtmgp_main::sfc_alb_dir");
   check_range_k(sfc_alb_dif,                         0,                                1, "rrtmgp_main::sfc_alb_dif");
   check_range_k(mu0        ,                         0,                                1, "rrtmgp_main::mu0");
@@ -399,6 +411,36 @@ static void rrtmgp_main(
   check_range_k(rei        ,                         0, std::numeric_limits<RealT>::max(), "rrtmgp_main::rei");
 #endif
 
+  auto sw_band2gpt_mem = pool_t::template alloc<int>(2, sw_nband);
+  auto sw_gpt2band_mem = pool_t::template alloc<int>(   sw_nband);
+  auto lw_band2gpt_mem = pool_t::template alloc<int>(2, lw_nband);
+  auto lw_gpt2band_mem = pool_t::template alloc<int>(   lw_nband);
+
+  auto sw_cloud_band2gpt_mem = pool_t::template alloc<int>(2, sw_nband);
+  auto sw_cloud_gpt2band_mem = pool_t::template alloc<int>(   sw_nband);
+  auto lw_cloud_band2gpt_mem = pool_t::template alloc<int>(2, lw_nband);
+  auto lw_cloud_gpt2band_mem = pool_t::template alloc<int>(   lw_nband);
+
+  auto sw_subcloud_band2gpt_mem = pool_t::template alloc<int>(2, sw_nband);
+  auto sw_subcloud_gpt2band_mem = pool_t::template alloc<int>(   sw_ngpt);
+  auto lw_subcloud_band2gpt_mem = pool_t::template alloc<int>(2, lw_nband);
+  auto lw_subcloud_gpt2band_mem = pool_t::template alloc<int>(   lw_ngpt);
+
+  auto sw_tau_mem = pool_t::template alloc<RealT>(ncol, nlay, sw_nband);
+  auto sw_ssa_mem = pool_t::template alloc<RealT>(ncol, nlay, sw_nband);
+  auto sw_g_mem   = pool_t::template alloc<RealT>(ncol, nlay, sw_nband);
+  auto lw_tau_mem = pool_t::template alloc<RealT>(ncol, nlay, lw_nband);
+
+  auto sw_cloud_tau_mem = pool_t::template alloc<RealT>(ncol, nlay, sw_nband);
+  auto sw_cloud_ssa_mem = pool_t::template alloc<RealT>(ncol, nlay, sw_nband);
+  auto sw_cloud_g_mem   = pool_t::template alloc<RealT>(ncol, nlay, sw_nband);
+  auto lw_cloud_tau_mem = pool_t::template alloc<RealT>(ncol, nlay, lw_nband);
+
+  auto sw_subcloud_tau_mem = pool_t::template alloc<RealT>(ncol, nlay, sw_ngpt);
+  auto sw_subcloud_ssa_mem = pool_t::template alloc<RealT>(ncol, nlay, sw_ngpt);
+  auto sw_subcloud_g_mem   = pool_t::template alloc<RealT>(ncol, nlay, sw_ngpt);
+  auto lw_subcloud_tau_mem = pool_t::template alloc<RealT>(ncol, nlay, lw_ngpt);
+
   // Setup pointers to RRTMGP SW fluxes
   fluxes_t fluxes_sw;
   fluxes_sw.flux_up = sw_flux_up;
@@ -442,24 +484,24 @@ static void rrtmgp_main(
   clnsky_fluxes_lw.flux_up = lw_clnsky_flux_up;
   clnsky_fluxes_lw.flux_dn = lw_clnsky_flux_dn;
 
-  auto nswbands = k_dist_sw_k.get_nband();
-  auto nlwbands = k_dist_lw_k.get_nband();
+  auto nswbands = k_dist_sw_k->get_nband();
+  auto nlwbands = k_dist_lw_k->get_nband();
 
   // Setup aerosol optical properties
   optical_props2_t aerosol_sw;
   optical_props1_t aerosol_lw;
-  aerosol_sw.init(k_dist_sw_k.get_band_lims_wavenumber());
-  aerosol_sw.alloc_2str(ncol, nlay);
-  Kokkos::parallel_for(MDRP::template get<3>({nswbands,nlay,ncol}) , KOKKOS_LAMBDA (int ibnd, int ilay, int icol) {
+  aerosol_sw.init_no_alloc(k_dist_sw_k->get_band_lims_wavenumber(), sw_band2gpt_mem, sw_gpt2band_mem);
+  aerosol_sw.alloc_2str_no_alloc(ncol, nlay, sw_tau_mem, sw_ssa_mem, sw_g_mem);
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol,nlay,nswbands, icol, ilay, ibnd,
     aerosol_sw.tau(icol,ilay,ibnd) = aer_tau_sw(icol,ilay,ibnd);
     aerosol_sw.ssa(icol,ilay,ibnd) = aer_ssa_sw(icol,ilay,ibnd);
     aerosol_sw.g  (icol,ilay,ibnd) = aer_asm_sw(icol,ilay,ibnd);
-  });
-  aerosol_lw.init(k_dist_lw_k.get_band_lims_wavenumber());
-  aerosol_lw.alloc_1scl(ncol, nlay);
-  Kokkos::parallel_for(MDRP::template get<3>({nlwbands,nlay,ncol}) , KOKKOS_LAMBDA (int ibnd, int ilay, int icol) {
+  ));
+  aerosol_lw.init_no_alloc(k_dist_lw_k->get_band_lims_wavenumber(), lw_band2gpt_mem, lw_gpt2band_mem);
+  aerosol_lw.alloc_1scl_no_alloc(ncol, nlay, lw_tau_mem);
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol,nlay,nlwbands, icol, ilay, ibnd,
     aerosol_lw.tau(icol,ilay,ibnd) = aer_tau_lw(icol,ilay,ibnd);
-  });
+  ));
 
 #ifdef SCREAM_RRTMGP_DEBUG
   // Check aerosol optical properties
@@ -472,28 +514,29 @@ static void rrtmgp_main(
 #endif
 
   // Convert cloud physical properties to optical properties for input to RRTMGP
-  optical_props2_t clouds_sw = get_cloud_optics_sw(ncol, nlay, cloud_optics_sw_k, k_dist_sw_k, lwp, iwp, rel, rei);
-  optical_props1_t clouds_lw = get_cloud_optics_lw(ncol, nlay, cloud_optics_lw_k, k_dist_lw_k, lwp, iwp, rel, rei);
+  optical_props2_t clouds_sw = get_cloud_optics_sw(ncol, nlay, *cloud_optics_sw_k, *k_dist_sw_k, lwp, iwp, rel, rei, sw_cloud_band2gpt_mem, sw_cloud_gpt2band_mem, sw_cloud_tau_mem, sw_cloud_ssa_mem, sw_cloud_g_mem);
+  optical_props1_t clouds_lw = get_cloud_optics_lw(ncol, nlay, *cloud_optics_lw_k, *k_dist_lw_k, lwp, iwp, rel, rei, lw_cloud_band2gpt_mem, lw_cloud_gpt2band_mem, lw_cloud_tau_mem);
   Kokkos::deep_copy(cld_tau_sw_bnd, clouds_sw.tau);
   Kokkos::deep_copy(cld_tau_lw_bnd, clouds_lw.tau);
 
   // Do subcolumn sampling to map bands -> gpoints based on cloud fraction and overlap assumption;
   // This implements the Monte Carlo Independing Column Approximation by mapping only a single
   // subcolumn (cloud state) to each gpoint.
-  auto nswgpts = k_dist_sw_k.get_ngpt();
-  auto clouds_sw_gpt = get_subsampled_clouds(ncol, nlay, nswbands, nswgpts, clouds_sw, k_dist_sw_k, cldfrac, p_lay);
+  auto nswgpts = k_dist_sw_k->get_ngpt();
+  auto clouds_sw_gpt = get_subsampled_clouds(ncol, nlay, nswbands, nswgpts, clouds_sw, *k_dist_sw_k, cldfrac, p_lay, sw_subcloud_band2gpt_mem, sw_subcloud_gpt2band_mem, sw_subcloud_tau_mem, sw_subcloud_ssa_mem, sw_subcloud_g_mem);
+
   // Longwave
-  auto nlwgpts = k_dist_lw_k.get_ngpt();
-  auto clouds_lw_gpt = get_subsampled_clouds(ncol, nlay, nlwbands, nlwgpts, clouds_lw, k_dist_lw_k, cldfrac, p_lay);
+  auto nlwgpts = k_dist_lw_k->get_ngpt();
+  auto clouds_lw_gpt = get_subsampled_clouds(ncol, nlay, nlwbands, nlwgpts, clouds_lw, *k_dist_lw_k, cldfrac, p_lay, lw_subcloud_band2gpt_mem, lw_subcloud_gpt2band_mem, lw_subcloud_tau_mem);
 
   // Copy cloud properties to outputs (is this needed, or can we just use pointers?)
   // Alternatively, just compute and output a subcolumn cloud mask
-  Kokkos::parallel_for(MDRP::template get<3>({nswgpts, nlay, ncol}), KOKKOS_LAMBDA (int igpt, int ilay, int icol) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay, nswgpts, icol, ilay, igpt,
     cld_tau_sw_gpt(icol,ilay,igpt) = clouds_sw_gpt.tau(icol,ilay,igpt);
-  });
-  Kokkos::parallel_for(MDRP::template get<3>({nlwgpts, nlay, ncol}), KOKKOS_LAMBDA (int igpt, int ilay, int icol) {
+  ));
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay, nlwgpts, icol, ilay, igpt,
     cld_tau_lw_gpt(icol,ilay,igpt) = clouds_lw_gpt.tau(icol,ilay,igpt);
-  });
+  ));
 
 #ifdef SCREAM_RRTMGP_DEBUG
   // Perform checks on optics; these would be caught by RRTMGP_EXPENSIVE_CHECKS in the RRTMGP code,
@@ -511,7 +554,7 @@ static void rrtmgp_main(
   // Do shortwave
   rrtmgp_sw(
     ncol, nlay,
-    k_dist_sw_k, p_lay, t_lay, p_lev, t_lev, gas_concs,
+    *k_dist_sw_k, p_lay, t_lay, p_lev, t_lev, gas_concs,
     sfc_alb_dir, sfc_alb_dif, mu0, aerosol_sw, clouds_sw_gpt,
     fluxes_sw, clnclrsky_fluxes_sw, clrsky_fluxes_sw, clnsky_fluxes_sw,
     tsi_scaling, logger,
@@ -521,12 +564,41 @@ static void rrtmgp_main(
   // Do longwave
   rrtmgp_lw(
     ncol, nlay,
-    k_dist_lw_k, p_lay, t_lay, p_lev, t_lev, gas_concs,
+    *k_dist_lw_k, p_lay, t_lay, p_lev, t_lev, gas_concs,
     aerosol_lw, clouds_lw_gpt,
     fluxes_lw, clnclrsky_fluxes_lw, clrsky_fluxes_lw, clnsky_fluxes_lw,
     extra_clnclrsky_diag, extra_clnsky_diag
             );
 
+  pool_t::dealloc(sw_band2gpt_mem);
+  pool_t::dealloc(sw_gpt2band_mem);
+  pool_t::dealloc(lw_band2gpt_mem);
+  pool_t::dealloc(lw_gpt2band_mem);
+
+  pool_t::dealloc(sw_cloud_band2gpt_mem);
+  pool_t::dealloc(sw_cloud_gpt2band_mem);
+  pool_t::dealloc(lw_cloud_band2gpt_mem);
+  pool_t::dealloc(lw_cloud_gpt2band_mem);
+
+  pool_t::dealloc(sw_subcloud_band2gpt_mem);
+  pool_t::dealloc(sw_subcloud_gpt2band_mem);
+  pool_t::dealloc(lw_subcloud_band2gpt_mem);
+  pool_t::dealloc(lw_subcloud_gpt2band_mem);
+
+  pool_t::dealloc(sw_tau_mem);
+  pool_t::dealloc(sw_ssa_mem);
+  pool_t::dealloc(sw_g_mem);
+  pool_t::dealloc(lw_tau_mem);
+
+  pool_t::dealloc(sw_cloud_tau_mem);
+  pool_t::dealloc(sw_cloud_ssa_mem);
+  pool_t::dealloc(sw_cloud_g_mem);
+  pool_t::dealloc(lw_cloud_tau_mem);
+
+  pool_t::dealloc(sw_subcloud_tau_mem);
+  pool_t::dealloc(sw_subcloud_ssa_mem);
+  pool_t::dealloc(sw_subcloud_g_mem);
+  pool_t::dealloc(lw_subcloud_tau_mem);
 }
 
 /*
@@ -535,10 +607,14 @@ static void rrtmgp_main(
 static void rrtmgp_finalize()
 {
   initialized_k = false;
-  k_dist_sw_k.finalize();
-  k_dist_lw_k.finalize();
-  cloud_optics_sw_k.finalize(); //~CloudOptics();
-  cloud_optics_lw_k.finalize(); //~CloudOptics();
+  k_dist_sw_k->finalize();
+  k_dist_lw_k->finalize();
+  cloud_optics_sw_k->finalize(); //~CloudOptics();
+  cloud_optics_lw_k->finalize(); //~CloudOptics();
+  k_dist_sw_k = nullptr;
+  k_dist_lw_k = nullptr;
+  cloud_optics_sw_k = nullptr;
+  cloud_optics_lw_k = nullptr;
   pool_t::finalize();
 }
 
@@ -580,7 +656,7 @@ static void rrtmgp_sw(
   auto &clnsky_flux_dn_dir = clnsky_fluxes.flux_dn_dir;
 
   // Reset fluxes to zero
-  Kokkos::parallel_for(MDRP::template get<2>({nlay+1,ncol}), KOKKOS_LAMBDA(int ilev, int icol) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL2(ncol, nlay+1, icol, ilev,
     flux_up    (icol,ilev) = 0;
     flux_dn    (icol,ilev) = 0;
     flux_dn_dir(icol,ilev) = 0;
@@ -593,15 +669,15 @@ static void rrtmgp_sw(
     clnsky_flux_up    (icol,ilev) = 0;
     clnsky_flux_dn    (icol,ilev) = 0;
     clnsky_flux_dn_dir(icol,ilev) = 0;
-  });
-  Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay+1,ncol}), KOKKOS_LAMBDA(int ibnd, int ilev, int icol) {
+  ));
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay+1, nbnd, icol, ilev, ibnd,
     bnd_flux_up    (icol,ilev,ibnd) = 0;
     bnd_flux_dn    (icol,ilev,ibnd) = 0;
     bnd_flux_dn_dir(icol,ilev,ibnd) = 0;
-  });
+  ));
 
   // Get daytime indices
-  auto dayIndices = pool_t::template alloc_and_init<int>(ncol);
+  auto dayIndices = pool_t::template alloc<int>(ncol);
   Kokkos::deep_copy(dayIndices, -1);
 
   int nday = 0;
@@ -621,99 +697,113 @@ static void rrtmgp_sw(
   }
 
   // Allocate temporaries from pool
-  const int size1 = nday;
-  const int size2 = nday*nlay; // 4
-  const int size3 = nday*(nlay+1); // 5
-  const int size4 = ncol*nlay;
-  const int size5 = nbnd*nday; //2
-  const int size6 = nday*ngpt;
-  const int size7 = nday*(nlay+1)*nbnd; // 3
-  const int size8 = ncol*nlay*(k_dist.get_ngas()+1);
 
-  const int total_size = size1 + size2*4 + size3*5 + size4 + size5*2 + size6 + size7*3 + size8;
-  auto data = pool_t::template alloc_and_init<RealT>(total_size); RealT* dcurr = data.data();
+  auto mu0_day = pool_t::template alloc<RealT>(nday);
 
-  auto mu0_day             = view_t<RealT*>  (dcurr, nday); dcurr += size1;
+  auto p_lay_day = pool_t::template alloc<RealT>(nday, nlay);
+  auto t_lay_day = pool_t::template alloc<RealT>(nday, nlay);
+  auto vmr_day = pool_t::template alloc<RealT>(nday, nlay);
+  auto t_lay_limited = pool_t::template alloc<RealT>(nday, nlay);
 
-  auto p_lay_day           = view_t<RealT**> (dcurr, nday, nlay); dcurr += size2;
-  auto t_lay_day           = view_t<RealT**> (dcurr, nday, nlay); dcurr += size2;
-  auto vmr_day             = view_t<RealT**> (dcurr, nday, nlay); dcurr += size2;
-  auto t_lay_limited       = view_t<RealT**> (dcurr, nday, nlay); dcurr += size2;
+  auto p_lev_day = pool_t::template alloc<RealT>(nday, nlay+1);
+  auto t_lev_day = pool_t::template alloc<RealT>(nday, nlay+1);
+  auto flux_up_day = pool_t::template alloc<RealT>(nday, nlay+1);
+  auto flux_dn_day = pool_t::template alloc<RealT>(nday, nlay+1);
+  auto flux_dn_dir_day = pool_t::template alloc<RealT>(nday, nlay+1);
 
-  auto p_lev_day           = view_t<RealT**> (dcurr, nday, nlay+1); dcurr += size3;
-  auto t_lev_day           = view_t<RealT**> (dcurr, nday, nlay+1); dcurr += size3;
-  auto flux_up_day         = view_t<RealT**> (dcurr, nday, nlay+1); dcurr += size3;
-  auto flux_dn_day         = view_t<RealT**> (dcurr, nday, nlay+1); dcurr += size3;
-  auto flux_dn_dir_day     = view_t<RealT**> (dcurr, nday, nlay+1); dcurr += size3;
+  auto vmr = pool_t::template alloc<RealT>(ncol, nlay);
 
-  auto vmr                 = view_t<RealT**> (dcurr, ncol, nlay); dcurr += size4;
+  auto sfc_alb_dir_T = pool_t::template alloc<RealT>(nbnd, nday);
+  auto sfc_alb_dif_T = pool_t::template alloc<RealT>(nbnd, nday);
 
-  auto sfc_alb_dir_T       = view_t<RealT**> (dcurr, nbnd, nday); dcurr += size5;
-  auto sfc_alb_dif_T       = view_t<RealT**> (dcurr, nbnd, nday); dcurr += size5;
+  auto toa_flux = pool_t::template alloc<RealT>(nday, ngpt);
 
-  auto toa_flux            = view_t<RealT**> (dcurr, nday, ngpt); dcurr += size6;
+  auto bnd_flux_up_day = pool_t::template alloc<RealT>(nday, nlay+1, nbnd);
+  auto bnd_flux_dn_day = pool_t::template alloc<RealT>(nday, nlay+1, nbnd);
+  auto bnd_flux_dn_dir_day = pool_t::template alloc<RealT>(nday, nlay+1, nbnd);
 
-  auto bnd_flux_up_day     = view_t<RealT***>(dcurr, nday, nlay+1, nbnd); dcurr += size7;
-  auto bnd_flux_dn_day     = view_t<RealT***>(dcurr, nday, nlay+1, nbnd); dcurr += size7;
-  auto bnd_flux_dn_dir_day = view_t<RealT***>(dcurr, nday, nlay+1, nbnd); dcurr += size7;
+  auto col_gas = pool_t::template alloc<RealT>(ncol, nlay, k_dist.get_ngas()+1);
 
-  auto col_gas             = view_t<RealT***>(dcurr, ncol, nlay, k_dist.get_ngas()+1); dcurr += size8;
+  auto concs_mem = pool_t::template alloc<RealT>(nday, nlay, ngas);
+
+  auto sw_aero_tau_mem = pool_t::template alloc<RealT>(nday, nlay, nbnd);
+  auto sw_aero_ssa_mem = pool_t::template alloc<RealT>(nday, nlay, nbnd);
+  auto sw_aero_g_mem = pool_t::template alloc<RealT>(nday, nlay, nbnd);
+
+  auto sw_cloud_tau_mem = pool_t::template alloc<RealT>(nday, nlay, ngpt);
+  auto sw_cloud_ssa_mem = pool_t::template alloc<RealT>(nday, nlay, ngpt);
+  auto sw_cloud_g_mem = pool_t::template alloc<RealT>(nday, nlay, ngpt);
+  auto sw_optics_tau_mem = pool_t::template alloc<RealT>(nday, nlay, ngpt);
+  auto sw_optics_ssa_mem = pool_t::template alloc<RealT>(nday, nlay, ngpt);
+  auto sw_optics_g_mem = pool_t::template alloc<RealT>(nday, nlay, ngpt);
+  auto sw_noaero_tau_mem = pool_t::template alloc<RealT>(nday, nlay, ngpt);
+  auto sw_noaero_ssa_mem = pool_t::template alloc<RealT>(nday, nlay, ngpt);
+  auto sw_noaero_g_mem = pool_t::template alloc<RealT>(nday, nlay, ngpt);
+
+  auto sw_aero_band2gpt_mem = pool_t::template alloc<int>(2, nbnd);
+  auto sw_aero_gpt2band_mem = pool_t::template alloc<int>(   nbnd);
+  auto sw_cloud_band2gpt_mem = pool_t::template alloc<int>(2, nbnd);
+  auto sw_cloud_gpt2band_mem = pool_t::template alloc<int>(   ngpt);
+  auto sw_optics_band2gpt_mem = pool_t::template alloc<int>(2, nbnd);
+  auto sw_optics_gpt2band_mem = pool_t::template alloc<int>(   ngpt);
+  auto sw_noaero_band2gpt_mem = pool_t::template alloc<int>(2, nbnd);
+  auto sw_noaero_gpt2band_mem = pool_t::template alloc<int>(   ngpt);
 
   // Subset mu0
-  Kokkos::parallel_for(nday, KOKKOS_LAMBDA(int iday) {
+  TIMED_KERNEL(Kokkos::parallel_for(nday, KOKKOS_LAMBDA(int iday) {
     mu0_day(iday) = mu0(dayIndices(iday));
-  });
+  }));
 
   // subset state variables
-  Kokkos::parallel_for(MDRP::template get<2>({nlay,nday}), KOKKOS_LAMBDA(int ilay, int iday) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay, iday, ilay,
     p_lay_day(iday,ilay) = p_lay(dayIndices(iday),ilay);
     t_lay_day(iday,ilay) = t_lay(dayIndices(iday),ilay);
-  });
-  Kokkos::parallel_for(MDRP::template get<2>({nlay+1,nday}), KOKKOS_LAMBDA(int ilev, int iday) {
+  ));
+  TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay+1, iday, ilev,
     p_lev_day(iday,ilev) = p_lev(dayIndices(iday),ilev);
     t_lev_day(iday,ilev) = t_lev(dayIndices(iday),ilev);
-  });
+  ));
 
   // Subset gases
   auto gas_names = gas_concs.get_gas_names();
   gas_concs_t gas_concs_day;
-  gas_concs_day.init(gas_names, nday, nlay);
+  gas_concs_day.init_no_alloc(gas_names, nday, nlay, concs_mem);
   for (int igas = 0; igas < ngas; igas++) {
     gas_concs.get_vmr(gas_names[igas], vmr);
-    Kokkos::parallel_for(MDRP::template get<2>({nlay,nday}), KOKKOS_LAMBDA(int ilay, int iday) {
+    TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay, iday, ilay,
       vmr_day(iday,ilay) = vmr(dayIndices(iday),ilay);
-    });
+    ));
     gas_concs_day.set_vmr(gas_names[igas], vmr_day);
   }
 
   // Subset aerosol optics
   optical_props2_t aerosol_day;
-  aerosol_day.init(k_dist.get_band_lims_wavenumber());
-  aerosol_day.alloc_2str(nday, nlay);
-  Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay,nday}), KOKKOS_LAMBDA(int ibnd, int ilay, int iday) {
+  aerosol_day.init_no_alloc(k_dist.get_band_lims_wavenumber(), sw_aero_band2gpt_mem, sw_aero_gpt2band_mem);
+  aerosol_day.alloc_2str_no_alloc(nday, nlay, sw_aero_tau_mem, sw_aero_ssa_mem, sw_aero_g_mem);
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(nday,nlay,nbnd, iday, ilay, ibnd,
     aerosol_day.tau(iday,ilay,ibnd) = aerosol.tau(dayIndices(iday),ilay,ibnd);
     aerosol_day.ssa(iday,ilay,ibnd) = aerosol.ssa(dayIndices(iday),ilay,ibnd);
     aerosol_day.g  (iday,ilay,ibnd) = aerosol.g  (dayIndices(iday),ilay,ibnd);
-  });
+  ));
 
   // Subset cloud optics
   // TODO: nbnd -> ngpt once we pass sub-sampled cloud state
   optical_props2_t clouds_day;
-  clouds_day.init(k_dist.get_band_lims_wavenumber(), k_dist.get_band_lims_gpoint());
-  clouds_day.alloc_2str(nday, nlay);
-  Kokkos::parallel_for(MDRP::template get<3>({ngpt,nlay,nday}), KOKKOS_LAMBDA(int igpt, int ilay, int iday) {
+  clouds_day.init_no_alloc(k_dist.get_band_lims_wavenumber(), k_dist.get_band_lims_gpoint(), sw_cloud_band2gpt_mem, sw_cloud_gpt2band_mem);
+  clouds_day.alloc_2str_no_alloc(nday, nlay, sw_cloud_tau_mem, sw_cloud_ssa_mem, sw_cloud_g_mem);
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(nday,nlay,ngpt, iday, ilay, igpt,
     clouds_day.tau(iday,ilay,igpt) = clouds.tau(dayIndices(iday),ilay,igpt);
     clouds_day.ssa(iday,ilay,igpt) = clouds.ssa(dayIndices(iday),ilay,igpt);
     clouds_day.g  (iday,ilay,igpt) = clouds.g  (dayIndices(iday),ilay,igpt);
-  });
+  ));
 
   // RRTMGP assumes surface albedos have a screwy dimension ordering
   // for some strange reason, so we need to transpose these; also do
   // daytime subsetting in the same kernel
-  Kokkos::parallel_for(MDRP::template get<2>({nbnd,nday}), KOKKOS_LAMBDA(int ibnd, int icol) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL2(nbnd,nday, ibnd, icol,
     sfc_alb_dir_T(ibnd,icol) = sfc_alb_dir(dayIndices(icol),ibnd);
     sfc_alb_dif_T(ibnd,icol) = sfc_alb_dif(dayIndices(icol),ibnd);
-  });
+  ));
 
   // Temporaries we need for daytime-only fluxes
   fluxes_t fluxes_day;
@@ -726,16 +816,16 @@ static void rrtmgp_sw(
 
   // Allocate space for optical properties
   optical_props2_t optics;
-  optics.alloc_2str(nday, nlay, k_dist);
+  optics.alloc_2str_no_alloc(nday, nlay, k_dist, sw_optics_band2gpt_mem, sw_optics_gpt2band_mem, sw_optics_tau_mem, sw_optics_ssa_mem, sw_optics_g_mem);
 
   optical_props2_t optics_no_aerosols;
   if (extra_clnsky_diag) {
     // Allocate space for optical properties (no aerosols)
-    optics_no_aerosols.alloc_2str(nday, nlay, k_dist);
+    optics_no_aerosols.alloc_2str_no_alloc(nday, nlay, k_dist, sw_noaero_band2gpt_mem, sw_noaero_gpt2band_mem, sw_noaero_tau_mem, sw_noaero_ssa_mem, sw_noaero_g_mem);
   }
 
   // Limit temperatures for gas optics look-up tables
-  limit_to_bounds_k(t_lay_day, k_dist_sw_k.get_temp_min(), k_dist_sw_k.get_temp_max(), t_lay_limited);
+  limit_to_bounds_k(t_lay_day, k_dist_sw_k->get_temp_min(), k_dist_sw_k->get_temp_max(), t_lay_limited);
 
   // Do gas optics
   bool top_at_1 = false;
@@ -756,20 +846,20 @@ static void rrtmgp_sw(
 #endif
 
   // Apply tsi_scaling
-  Kokkos::parallel_for(MDRP::template get<2>({ngpt,nday}), KOKKOS_LAMBDA(int igpt, int iday) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, ngpt, iday, igpt,
     toa_flux(iday,igpt) = tsi_scaling * toa_flux(iday,igpt);
-  });
+  ));
 
   if (extra_clnclrsky_diag) {
     // Compute clear-clean-sky (just gas) fluxes on daytime columns
     rte_sw(optics, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day);
     // Expand daytime fluxes to all columns
-    Kokkos::parallel_for(MDRP::template get<2>({nlay+1,nday}), KOKKOS_LAMBDA(int ilev, int iday) {
+    TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay+1, iday, ilev,
       const int icol = dayIndices(iday);
       clnclrsky_flux_up    (icol,ilev) = flux_up_day    (iday,ilev);
       clnclrsky_flux_dn    (icol,ilev) = flux_dn_day    (iday,ilev);
       clnclrsky_flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev);
-    });
+    ));
   }
 
   // Combine gas and aerosol optics
@@ -780,12 +870,12 @@ static void rrtmgp_sw(
   rte_sw(optics, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day);
 
   // Expand daytime fluxes to all columns
-  Kokkos::parallel_for(MDRP::template get<2>({nlay+1,nday}), KOKKOS_LAMBDA(int ilev, int iday) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay+1, iday, ilev,
     const int icol = dayIndices(iday);
     clrsky_flux_up    (icol,ilev) = flux_up_day    (iday,ilev);
     clrsky_flux_dn    (icol,ilev) = flux_dn_day    (iday,ilev);
     clrsky_flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev);
-  });
+  ));
 
   // Now merge in cloud optics and do allsky calculations
 
@@ -795,18 +885,18 @@ static void rrtmgp_sw(
   // Compute fluxes on daytime columns
   rte_sw(optics, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day);
   // Expand daytime fluxes to all columns
-  Kokkos::parallel_for(MDRP::template get<2>({nlay+1,nday}), KOKKOS_LAMBDA(int ilev, int iday) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay+1, iday, ilev,
     const int icol = dayIndices(iday);
     flux_up    (icol,ilev) = flux_up_day    (iday,ilev);
     flux_dn    (icol,ilev) = flux_dn_day    (iday,ilev);
     flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev);
-  });
-  Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay+1,nday}), KOKKOS_LAMBDA(int ibnd, int ilev, int iday) {
+  ));
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(nday, nlay+1, nbnd, iday, ilev, ibnd,
     const int icol = dayIndices(iday);
     bnd_flux_up    (icol,ilev,ibnd) = bnd_flux_up_day    (iday,ilev,ibnd);
     bnd_flux_dn    (icol,ilev,ibnd) = bnd_flux_dn_day    (iday,ilev,ibnd);
     bnd_flux_dn_dir(icol,ilev,ibnd) = bnd_flux_dn_dir_day(iday,ilev,ibnd);
-  });
+  ));
 
   if (extra_clnsky_diag) {
     // First increment clouds in optics_no_aerosols
@@ -814,16 +904,66 @@ static void rrtmgp_sw(
     // Compute cleansky (gas + clouds) fluxes on daytime columns
     rte_sw(optics_no_aerosols, top_at_1, mu0_day, toa_flux, sfc_alb_dir_T, sfc_alb_dif_T, fluxes_day);
     // Expand daytime fluxes to all columns
-    Kokkos::parallel_for(MDRP::template get<2>({nlay+1,nday}), KOKKOS_LAMBDA(int ilev, int iday) {
+    TIMED_KERNEL(FLATTEN_MD_KERNEL2(nday, nlay+1, iday, ilev,
       const int icol = dayIndices(iday);
       clnsky_flux_up    (icol,ilev) = flux_up_day    (iday,ilev);
       clnsky_flux_dn    (icol,ilev) = flux_dn_day    (iday,ilev);
       clnsky_flux_dn_dir(icol,ilev) = flux_dn_dir_day(iday,ilev);
-    });
+    ));
   }
 
-  pool_t::dealloc(data);
   pool_t::dealloc(dayIndices);
+
+  pool_t::dealloc(mu0_day);
+
+  pool_t::dealloc(p_lay_day);
+  pool_t::dealloc(t_lay_day);
+  pool_t::dealloc(vmr_day);
+  pool_t::dealloc(t_lay_limited);
+
+  pool_t::dealloc(p_lev_day);
+  pool_t::dealloc(t_lev_day);
+  pool_t::dealloc(flux_up_day);
+  pool_t::dealloc(flux_dn_day);
+  pool_t::dealloc(flux_dn_dir_day);
+
+  pool_t::dealloc(vmr);
+
+  pool_t::dealloc(sfc_alb_dir_T);
+  pool_t::dealloc(sfc_alb_dif_T);
+
+  pool_t::dealloc(toa_flux);
+
+  pool_t::dealloc(bnd_flux_up_day);
+  pool_t::dealloc(bnd_flux_dn_day);
+  pool_t::dealloc(bnd_flux_dn_dir_day);
+
+  pool_t::dealloc(col_gas);
+
+  pool_t::dealloc(concs_mem);
+
+  pool_t::dealloc(sw_aero_tau_mem);
+  pool_t::dealloc(sw_aero_ssa_mem);
+  pool_t::dealloc(sw_aero_g_mem);
+
+  pool_t::dealloc(sw_cloud_tau_mem);
+  pool_t::dealloc(sw_cloud_ssa_mem);
+  pool_t::dealloc(sw_cloud_g_mem);
+  pool_t::dealloc(sw_optics_tau_mem);
+  pool_t::dealloc(sw_optics_ssa_mem);
+  pool_t::dealloc(sw_optics_g_mem);
+  pool_t::dealloc(sw_noaero_tau_mem);
+  pool_t::dealloc(sw_noaero_ssa_mem);
+  pool_t::dealloc(sw_noaero_g_mem);
+
+  pool_t::dealloc(sw_aero_band2gpt_mem);
+  pool_t::dealloc(sw_aero_gpt2band_mem);
+  pool_t::dealloc(sw_cloud_band2gpt_mem);
+  pool_t::dealloc(sw_cloud_gpt2band_mem);
+  pool_t::dealloc(sw_optics_band2gpt_mem);
+  pool_t::dealloc(sw_optics_gpt2band_mem);
+  pool_t::dealloc(sw_noaero_band2gpt_mem);
+  pool_t::dealloc(sw_noaero_gpt2band_mem);
 }
 
 /*
@@ -841,24 +981,28 @@ static void rrtmgp_lw(
   // Problem size
   int nbnd = k_dist.get_nband();
   int constexpr max_gauss_pts = 4;
+  const int ngpt = k_dist.get_ngpt();
 
-  const int size1 = ncol;
-  const int size2 = nbnd*ncol;
-  const int size3 = max_gauss_pts*max_gauss_pts;
-  const int size4 = ncol*nlay;
-  const int size5 = ncol*(nlay+1);
-  const int size6 = ncol*nlay*(k_dist.get_ngas()+1);
-
-  const int total_size = size1 + size2 + size3*2 + size4 + size5 + size6;
-  auto data = pool_t::template alloc_and_init<RealT>(total_size); RealT *dcurr = data.data();
-
-  view_t<RealT*>   t_sfc        (dcurr, ncol); dcurr += size1;
-  view_t<RealT**>  emis_sfc     (dcurr, nbnd,ncol); dcurr += size2;
-  view_t<RealT**>  gauss_Ds     (dcurr, max_gauss_pts,max_gauss_pts); dcurr += size3;
-  view_t<RealT**>  gauss_wts    (dcurr, max_gauss_pts,max_gauss_pts); dcurr += size3;
-  view_t<RealT**>  t_lay_limited(dcurr, ncol, nlay); dcurr += size4;
-  view_t<RealT**>  t_lev_limited(dcurr, ncol, nlay+1); dcurr += size5;
-  view_t<RealT***> col_gas      (dcurr, ncol, nlay, k_dist.get_ngas()+1); dcurr += size6;
+  auto t_sfc               = pool_t::template alloc<RealT>(ncol);
+  auto emis_sfc            = pool_t::template alloc<RealT>(nbnd, ncol);
+  auto gauss_Ds            = pool_t::template alloc<RealT>(max_gauss_pts, max_gauss_pts);
+  auto gauss_wts           = pool_t::template alloc<RealT>(max_gauss_pts, max_gauss_pts);
+  auto t_lay_limited       = pool_t::template alloc<RealT>(ncol, nlay);
+  auto t_lev_limited       = pool_t::template alloc<RealT>(ncol, nlay+1);
+  auto col_gas             = pool_t::template alloc<RealT>(ncol, nlay, k_dist.get_ngas()+1);
+  auto lw_optics_tau_mem   = pool_t::template alloc<RealT>(ncol, nlay, ngpt);
+  auto lw_noaero_tau_mem   = pool_t::template alloc<RealT>(ncol, nlay, ngpt);
+  auto lay_source_mem      = pool_t::template alloc<RealT>(ncol, nlay, ngpt);
+  auto lev_source_inc_mem  = pool_t::template alloc<RealT>(ncol, nlay, ngpt);
+  auto lev_source_dec_mem  = pool_t::template alloc<RealT>(ncol, nlay, ngpt);
+  auto sfc_source_mem      = pool_t::template alloc<RealT>(ncol, ngpt);
+
+  auto lw_optics_band2gpt_mem = pool_t::template alloc<int>(2, nbnd);
+  auto lw_optics_gpt2band_mem = pool_t::template alloc<int>(   ngpt);
+  auto lw_noaero_band2gpt_mem = pool_t::template alloc<int>(2, nbnd);
+  auto lw_noaero_gpt2band_mem = pool_t::template alloc<int>(   ngpt);
+  auto lw_source_band2gpt_mem = pool_t::template alloc<int>(2, nbnd);
+  auto lw_source_gpt2band_mem = pool_t::template alloc<int>(   ngpt);
 
   // Associate local pointers for fluxes
   auto &flux_up           = fluxes.flux_up;
@@ -873,8 +1017,7 @@ static void rrtmgp_lw(
   auto &clnsky_flux_dn    = clnsky_fluxes.flux_dn;
 
   // Reset fluxes to zero
-  Kokkos::parallel_for(
-    MDRP::template get<2>({nlay + 1, ncol}), KOKKOS_LAMBDA(int ilev, int icol) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL2(ncol, nlay + 1, icol, ilev,
       flux_up(icol, ilev)           = 0;
       flux_dn(icol, ilev)           = 0;
       clnclrsky_flux_up(icol, ilev) = 0;
@@ -883,26 +1026,25 @@ static void rrtmgp_lw(
       clrsky_flux_dn(icol, ilev)    = 0;
       clnsky_flux_up(icol, ilev)    = 0;
       clnsky_flux_dn(icol, ilev)    = 0;
-    });
-  Kokkos::parallel_for(
-    MDRP::template get<3>({nbnd, nlay + 1, ncol}),
-    KOKKOS_LAMBDA(int ibnd, int ilev, int icol) {
+  ));
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay + 1, nbnd, icol, ilev, ibnd,
       bnd_flux_up(icol, ilev, ibnd) = 0;
       bnd_flux_dn(icol, ilev, ibnd) = 0;
-    });
+  ));
 
   // Allocate space for optical properties
   optical_props1_t optics;
-  optics.alloc_1scl(ncol, nlay, k_dist);
+  optics.alloc_1scl_no_alloc(ncol, nlay, k_dist, lw_optics_band2gpt_mem, lw_optics_gpt2band_mem, lw_optics_tau_mem);
+
   optical_props1_t optics_no_aerosols;
   if (extra_clnsky_diag) {
     // Allocate space for optical properties (no aerosols)
-    optics_no_aerosols.alloc_1scl(ncol, nlay, k_dist);
+    optics_no_aerosols.alloc_1scl_no_alloc(ncol, nlay, k_dist, lw_noaero_band2gpt_mem, lw_noaero_gpt2band_mem, lw_noaero_tau_mem);
   }
 
   // Boundary conditions
   source_func_t lw_sources;
-  lw_sources.alloc(ncol, nlay, k_dist);
+  lw_sources.alloc_no_alloc(ncol, nlay, k_dist, lw_source_band2gpt_mem, lw_source_gpt2band_mem, sfc_source_mem, lay_source_mem, lev_source_inc_mem, lev_source_dec_mem);
 
   bool top_at_1 = false;
   Kokkos::parallel_reduce(1, KOKKOS_LAMBDA(int, bool& val) {
@@ -910,9 +1052,9 @@ static void rrtmgp_lw(
   }, Kokkos::LOr<bool>(top_at_1));
 
   // Surface temperature
-  Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
+  TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
     t_sfc(icol) = t_lev(icol, conv::merge(nlay, 0, top_at_1));
-  });
+  }));
   Kokkos::deep_copy(emis_sfc , 0.98);
 
   // Get Gaussian quadrature weights
@@ -941,8 +1083,8 @@ static void rrtmgp_lw(
   Kokkos::deep_copy(gauss_wts, gauss_wts_host);
 
   // Limit temperatures for gas optics look-up tables
-  limit_to_bounds_k(t_lay, k_dist_lw_k.get_temp_min(), k_dist_lw_k.get_temp_max(), t_lay_limited);
-  limit_to_bounds_k(t_lev, k_dist_lw_k.get_temp_min(), k_dist_lw_k.get_temp_max(), t_lev_limited);
+  limit_to_bounds_k(t_lay, k_dist_lw_k->get_temp_min(), k_dist_lw_k->get_temp_max(), t_lay_limited);
+  limit_to_bounds_k(t_lev, k_dist_lw_k->get_temp_min(), k_dist_lw_k->get_temp_max(), t_lev_limited);
 
   // Do gas optics
   k_dist.gas_optics(ncol, nlay, top_at_1, p_lay, p_lev, t_lay_limited, t_sfc, gas_concs, col_gas, optics, lw_sources, view_t<RealT**>(), t_lev_limited);
@@ -979,7 +1121,26 @@ static void rrtmgp_lw(
     rte_lw(max_gauss_pts, gauss_Ds, gauss_wts, optics_no_aerosols, top_at_1, lw_sources, emis_sfc, clnsky_fluxes);
   }
 
-  pool_t::dealloc(data);
+  pool_t::dealloc(t_sfc);
+  pool_t::dealloc(emis_sfc);
+  pool_t::dealloc(gauss_Ds);
+  pool_t::dealloc(gauss_wts);
+  pool_t::dealloc(t_lay_limited);
+  pool_t::dealloc(t_lev_limited);
+  pool_t::dealloc(col_gas);
+  pool_t::dealloc(lw_optics_tau_mem);
+  pool_t::dealloc(lw_noaero_tau_mem);
+  pool_t::dealloc(lay_source_mem);
+  pool_t::dealloc(lev_source_inc_mem);
+  pool_t::dealloc(lev_source_dec_mem);
+  pool_t::dealloc(sfc_source_mem);
+
+  pool_t::dealloc(lw_optics_band2gpt_mem);
+  pool_t::dealloc(lw_optics_gpt2band_mem);
+  pool_t::dealloc(lw_noaero_band2gpt_mem);
+  pool_t::dealloc(lw_noaero_gpt2band_mem);
+  pool_t::dealloc(lw_source_band2gpt_mem);
+  pool_t::dealloc(lw_source_gpt2band_mem);
 }
 
 /*
@@ -994,7 +1155,7 @@ static void get_subcolumn_mask(const int ncol, const int nlay, const int ngpt, c
   //     c(i,j,k) = 0 for x(i,j,k) <= 1 - cldf(i,j)
   //
   // I am going to call this "cldx" to be just slightly less ambiguous
-  auto cldx = pool_t::template alloc_and_init<RealT>(ncol, nlay, ngpt);
+  auto cldx = pool_t::template alloc<RealT>(ncol, nlay, ngpt);
 
   // Apply overlap assumption to set cldx
   if (overlap_option == 0) {  // Dummy mask, always cloudy
@@ -1011,23 +1172,35 @@ static void get_subcolumn_mask(const int ncol, const int nlay, const int ngpt, c
     // Kokkos::deep_copy(seeds_host, seeds);
     // for (int icol = 0; icol < ncol; ++icol) {
     //   Kokkos::Random_XorShift64_Pool<> random_pool(seeds_host(icol));
-    //   Kokkos::parallel_for(MDRP::template get<2>({ngpt, nlay}), KOKKOS_LAMBDA(int igpt, int ilay) {
+    //   TIMED_KERNEL(FLATTEN_MD_KERNEL2(nlay, ngpt, ilay, igpt,
     //     auto generator = random_pool.get_state();
     //     cldx(icol,ilay,igpt) = generator.drand(0., 1.);
     //     random_pool.free_state(generator);
-    //   });
+    //   }));
     // }
-    Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
-      conv::Random rand(seeds(icol));
-      for (int igpt = 0; igpt < ngpt; igpt++) {
-        for (int ilay = 0; ilay < nlay; ilay++) {
-          cldx(icol,ilay,igpt) = rand.genFP<RealT>();
-        }
-      }
-    });
+    // TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
+    //   conv::Random rand(seeds(icol));
+    //   for (int igpt = 0; igpt < ngpt; igpt++) {
+    //     for (int ilay = 0; ilay < nlay; ilay++) {
+    //       cldx(icol,ilay,igpt) = rand.genFP<RealT>();
+    //     }
+    //   }
+    // }));
+    // Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
+    //     conv::Random rand(seeds(icol));
+    //     for (int igpt = 0; igpt < ngpt; igpt++) {
+    //       for (int ilay = 0; ilay < nlay; ilay++) {
+    //         cldx(icol,ilay,igpt) = rand.genFP<RealT>();
+    //       }
+    //     }
+    //   });
+    TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay, ngpt, icol, ilay, igpt,
+      conv::Random rand(seeds(icol) + ilay*ngpt + igpt);
+      cldx(icol,ilay,igpt) = rand.genFP<RealT>();
+    ));
 
     // Step down columns and apply algorithm from eq (14)
-    Kokkos::parallel_for(MDRP::template get<2>({ngpt,ncol}), KOKKOS_LAMBDA(int igpt, int icol) {
+    TIMED_KERNEL(FLATTEN_MD_KERNEL2(ncol,ngpt, icol, igpt,
       for (int ilay = 1; ilay < nlay; ilay++) {
         // Check cldx in level above and see if it satisfies conditions to create a cloudy subcolumn
         if (cldx(icol,ilay-1,igpt) > 1.0 - cldf(icol,ilay-1)) {
@@ -1043,17 +1216,17 @@ static void get_subcolumn_mask(const int ncol, const int nlay, const int ngpt, c
           cldx(icol,ilay,igpt) = cldx(icol,ilay  ,igpt) * (1.0 - cldf(icol,ilay-1));
         }
       }
-    });
+    ));
   }
 
   // Use cldx array to create subcolumn mask
-  Kokkos::parallel_for(MDRP::template get<3>({ngpt,nlay,ncol}), KOKKOS_LAMBDA(int igpt, int ilay, int icol) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol,nlay,ngpt, icol, ilay, igpt,
     if (cldx(icol,ilay,igpt) > 1.0 - cldf(icol,ilay)) {
       subcolumn_mask(icol,ilay,igpt) = 1;
     } else {
       subcolumn_mask(icol,ilay,igpt) = 0;
     }
-  });
+  ));
 
   pool_t::dealloc(cldx);
 }
@@ -1068,22 +1241,22 @@ static void compute_cloud_area(
   // Subcolumn binary cld mask; if any layers with pressure between pmin and pmax are cloudy
   // then 2d subcol mask is 1, otherwise it is 0
   auto subcol_mask = pool_t::template alloc_and_init<RealT>(ncol, ngpt);
-  Kokkos::parallel_for(MDRP::template get<3>({ngpt, nlay, ncol}), KOKKOS_LAMBDA(int igpt, int ilay, int icol) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay, ngpt, icol, ilay, igpt,
     // NOTE: using plev would need to assume level ordering (top to bottom or bottom to top), but
     // using play/pmid does not
     if (cld_tau_gpt(icol,ilay,igpt) > 0 && pmid(icol,ilay) >= pmin && pmid(icol,ilay) < pmax) {
       subcol_mask(icol,igpt) = 1;
     }
-  });
+  ));
   // Compute average over subcols to get cloud area
   auto ngpt_inv = 1.0 / ngpt;
   Kokkos::deep_copy(cld_area, 0);
-  Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
+  TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
     // This loop needs to be serial because of the atomic reduction
     for (int igpt = 0; igpt < ngpt; ++igpt) {
       cld_area(icol) += subcol_mask(icol,igpt) * ngpt_inv;
     }
-  });
+  }));
 
   pool_t::dealloc(subcol_mask);
 }
@@ -1118,7 +1291,7 @@ static void compute_aerocom_cloudtop(
   Kokkos::deep_copy(eff_radius_qi_at_cldtop, 0.0);
 
   // Initialize the 1D "clear fraction" as 1 (totally clear)
-  auto aerocom_clr = pool_t::template alloc_and_init<RealT>(ncol);
+  auto aerocom_clr = pool_t::template alloc<RealT>(ncol);
   Kokkos::deep_copy(aerocom_clr, 1.0);
 
   // Get gravity acceleration constant from constants
@@ -1131,7 +1304,7 @@ static void compute_aerocom_cloudtop(
   constexpr RealT cldfrac_tot_threshold = 0.001;  // BAD_CONSTANT!
 
   // Loop over all columns in parallel
-  Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
+  TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
     // Loop over all layers in serial (due to accumulative
     // product), starting at 2 (second highest) layer because the
     // highest is assumed to hav no clouds
@@ -1190,7 +1363,7 @@ static void compute_aerocom_cloudtop(
     // aerocom_clr is the result of accumulative probabilities
     // (their products)
     cldfrac_tot_at_cldtop(icol) = 1.0 - aerocom_clr(icol);
-  });
+  }));
 
   pool_t::dealloc(aerocom_clr);
 }
@@ -1211,7 +1384,7 @@ static void mixing_ratio_to_cloud_mass(
   int ncol = mixing_ratio.extent(0);
   int nlay = mixing_ratio.extent(1);
   using physconst = scream::physics::Constants<Real>;
-  Kokkos::parallel_for(MDRP::template get<2>({nlay, ncol}), KOKKOS_LAMBDA(int ilay, int icol) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL2(ncol, nlay, icol, ilay,
     // Compute in-cloud mixing ratio (mixing ratio of the cloudy part of the layer)
     // NOTE: these thresholds (from E3SM) seem arbitrary, but included here for consistency
     // This limits in-cloud mixing ratio to 0.005 kg/kg. According to note in cloud_diagnostics
@@ -1223,7 +1396,7 @@ static void mixing_ratio_to_cloud_mass(
     } else {
       cloud_mass(icol,ilay) = 0;
     }
-  });
+  ));
 }
 
 /*
@@ -1234,30 +1407,34 @@ static void mixing_ratio_to_cloud_mass(
  */
 template<typename InT, typename T, typename OutT, typename std::enable_if<OutT::rank == 1>::type* dummy = nullptr>
 static void limit_to_bounds_k(InT const &arr_in, T const lower, T const upper, OutT &arr_out) {
-  Kokkos::parallel_for(arr_out.size(), KOKKOS_LAMBDA(int i) {
+  TIMED_KERNEL(Kokkos::parallel_for(arr_out.size(), KOKKOS_LAMBDA(int i) {
     arr_out(i) = std::min(std::max(arr_in(i), lower), upper);
-  });
+  }));
 }
 
 template<typename InT, typename T, typename OutT, typename std::enable_if<OutT::rank == 2>::type* dummy = nullptr>
 static void limit_to_bounds_k(InT const &arr_in, T const lower, T const upper, OutT &arr_out) {
-  Kokkos::parallel_for(MDRP::template get<2>({arr_out.extent(0), arr_out.extent(1)}), KOKKOS_LAMBDA(int i, int j) {
+  const int ex0 = (int) arr_out.extent(0);
+  const int ex1 = (int) arr_out.extent(1);
+  TIMED_KERNEL(FLATTEN_MD_KERNEL2(ex0, ex1, i, j,
     arr_out(i, j) = std::min(std::max(arr_in(i, j), lower), upper);
-  });
+  ));
 }
 
 
 static int get_wavelength_index(optical_props_t &kdist, RealT wavelength)
 {
+  auto band_lims_wvn = kdist.get_band_lims_wavenumber();
+  auto wavelength_bounds = pool_t::template alloc<RealT>(band_lims_wvn.extent(0), band_lims_wvn.extent(1));
   // Get wavelength bounds for all wavelength bands
-  auto wavelength_bounds = kdist.get_band_lims_wavelength();
+  kdist.get_band_lims_wavelength(wavelength_bounds);
 
   // Find the band index for the specified wavelength
   // Note that bands are stored in wavenumber space, units of cm-1, so if we are passed wavelength
   // in units of meters, we need a conversion factor of 10^2
   const int nbnds = kdist.get_nband();
   int band_index = -1;
-  Kokkos::parallel_reduce(nbnds, KOKKOS_LAMBDA(int ibnd, int& band_index_inner) {
+  TIMED_KERNEL(Kokkos::parallel_reduce(nbnds, KOKKOS_LAMBDA(int ibnd, int& band_index_inner) {
     if (wavelength_bounds(0,ibnd) < wavelength_bounds(1,ibnd)) {
       if (wavelength_bounds(0,ibnd) <= wavelength * 1e2 && wavelength * 1e2 <= wavelength_bounds(1,ibnd)) {
         band_index_inner = ibnd;
@@ -1267,36 +1444,38 @@ static int get_wavelength_index(optical_props_t &kdist, RealT wavelength)
         band_index_inner = ibnd;
       }
     }
-  }, Kokkos::Max<int>(band_index));
+      }, Kokkos::Max<int>(band_index)));
+  pool_t::dealloc(wavelength_bounds);
   return band_index;
 }
 
 static inline int get_wavelength_index_sw_k(RealT wavelength)
 {
-  return get_wavelength_index(k_dist_sw_k, wavelength);
+  return get_wavelength_index(*k_dist_sw_k, wavelength);
 }
 
 static inline int get_wavelength_index_lw_k(RealT wavelength)
 {
-  return get_wavelength_index(k_dist_lw_k, wavelength);
+  return get_wavelength_index(*k_dist_lw_k, wavelength);
 }
 
 static optical_props2_t get_cloud_optics_sw(
   const int ncol, const int nlay,
   cloud_optics_t &cloud_optics, gas_optics_t &kdist,
-  const real2dk &lwp, const real2dk &iwp, const creal2dk &rel, const creal2dk &rei) {
+  const real2dk &lwp, const real2dk &iwp, const creal2dk &rel, const creal2dk &rei,
+  const int2dk& sw_band2gpt_mem, const int1dk& sw_gpt2band_mem, const real3dk& sw_tau_mem, const real3dk& sw_ssa_mem, const real3dk& sw_g_mem) {
 
   // Initialize optics
   optical_props2_t clouds;
-  clouds.init(kdist.get_band_lims_wavenumber());
-  clouds.alloc_2str(ncol, nlay);
+  clouds.init_no_alloc(kdist.get_band_lims_wavenumber(), sw_band2gpt_mem, sw_gpt2band_mem);
+  clouds.alloc_2str_no_alloc(ncol, nlay, sw_tau_mem, sw_ssa_mem, sw_g_mem);
 
   // Needed for consistency with all-sky example problem?
   cloud_optics.set_ice_roughness(2);
 
   // Limit effective radii to be within bounds of lookup table
-  auto rel_limited = pool_t::template alloc_and_init<RealT>(ncol, nlay);
-  auto rei_limited = pool_t::template alloc_and_init<RealT>(ncol, nlay);
+  auto rel_limited = pool_t::template alloc<RealT>(ncol, nlay);
+  auto rei_limited = pool_t::template alloc<RealT>(ncol, nlay);
   limit_to_bounds_k(rel, cloud_optics.radliq_lwr, cloud_optics.radliq_upr, rel_limited);
   limit_to_bounds_k(rei, cloud_optics.radice_lwr, cloud_optics.radice_upr, rei_limited);
 
@@ -1313,19 +1492,20 @@ static optical_props2_t get_cloud_optics_sw(
 static optical_props1_t get_cloud_optics_lw(
   const int ncol, const int nlay,
   cloud_optics_t &cloud_optics, gas_optics_t &kdist,
-  const real2dk &lwp, const real2dk &iwp, const creal2dk &rel, const creal2dk &rei) {
+  const real2dk &lwp, const real2dk &iwp, const creal2dk &rel, const creal2dk &rei,
+  const int2dk& lw_band2gpt_mem, const int1dk& lw_gpt2band_mem, const real3dk& lw_tau_mem) {
 
   // Initialize optics
   optical_props1_t clouds;
-  clouds.init(kdist.get_band_lims_wavenumber());
-  clouds.alloc_1scl(ncol, nlay);  // this is dumb, why do we need to init and alloc separately?!
+  clouds.init_no_alloc(kdist.get_band_lims_wavenumber(), lw_band2gpt_mem, lw_gpt2band_mem);
+  clouds.alloc_1scl_no_alloc(ncol, nlay, lw_tau_mem);  // this is dumb, why do we need to init and alloc separately?!
 
   // Needed for consistency with all-sky example problem?
   cloud_optics.set_ice_roughness(2);
 
   // Limit effective radii to be within bounds of lookup table
-  auto rel_limited = pool_t::template alloc_and_init<RealT>(ncol, nlay);
-  auto rei_limited = pool_t::template alloc_and_init<RealT>(ncol, nlay);
+  auto rel_limited = pool_t::template alloc<RealT>(ncol, nlay);
+  auto rei_limited = pool_t::template alloc<RealT>(ncol, nlay);
   limit_to_bounds_k(rel, cloud_optics.radliq_lwr, cloud_optics.radliq_upr, rel_limited);
   limit_to_bounds_k(rei, cloud_optics.radice_lwr, cloud_optics.radice_upr, rei_limited);
 
@@ -1341,14 +1521,15 @@ static optical_props1_t get_cloud_optics_lw(
 
 static optical_props2_t get_subsampled_clouds(
   const int ncol, const int nlay, const int nbnd, const int ngpt,
-  optical_props2_t &cloud_optics, gas_optics_t &kdist, const real2dk &cld, const creal2dk &p_lay) {
+  optical_props2_t &cloud_optics, gas_optics_t &kdist, const real2dk &cld, const creal2dk &p_lay,
+  const int2dk& sw_band2gpt_mem, const int1dk& sw_gpt2band_mem, const real3dk& sw_tau_mem, const real3dk& sw_ssa_mem, const real3dk& sw_g_mem) {
   // Initialized subsampled optics
   optical_props2_t subsampled_optics;
-  subsampled_optics.init(kdist.get_band_lims_wavenumber(), kdist.get_band_lims_gpoint(), "subsampled_optics");
-  subsampled_optics.alloc_2str(ncol, nlay);
+  subsampled_optics.init_no_alloc(kdist.get_band_lims_wavenumber(), kdist.get_band_lims_gpoint(), sw_band2gpt_mem, sw_gpt2band_mem, "subsampled_optics");
+  subsampled_optics.alloc_2str_no_alloc(ncol, nlay, sw_tau_mem, sw_ssa_mem, sw_g_mem);
 
   // Subcolumn mask with values of 0 indicating no cloud, 1 indicating cloud
-  auto cldmask = pool_t::template alloc_and_init<int>(ncol, nlay, ngpt);
+  auto cldmask = pool_t::template alloc<int>(ncol, nlay, ngpt);
 
   // Check that we do not have clouds with no optical properties; this would get corrected
   // when we assign optical props, but we want to use a "radiative cloud fraction"
@@ -1358,11 +1539,12 @@ static optical_props2_t get_subsampled_clouds(
   // even when separated by layers with no cloud properties, when in fact those layers should be
   // randomly overlapped.
   auto cldfrac_rad = pool_t::template alloc_and_init<RealT>(ncol, nlay);
-  Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay,ncol}), KOKKOS_LAMBDA (int ibnd, int ilay, int icol) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol, nlay, nbnd, icol, ilay, ibnd,
     if (cloud_optics.tau(icol,ilay,ibnd) > 0) {
       cldfrac_rad(icol,ilay) = cld(icol,ilay);
     }
-  });
+  ));
+
   // Get subcolumn cloud mask; note that get_subcolumn_mask exposes overlap assumption as an option,
   // but the only currently supported options are 0 (trivial all-or-nothing cloud) or 1 (max-rand),
   // so overlap has not been exposed as an option beyond this subcolumn. In the future, we should
@@ -1371,14 +1553,15 @@ static optical_props2_t get_subsampled_clouds(
   int overlap = 1;
   // Get unique seeds for each column that are reproducible across different MPI rank layouts;
   // use decimal part of pressure for this, consistent with the implementation in EAM
-  auto seeds = pool_t::template alloc_and_init<int>(ncol);
-  Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
+  auto seeds = pool_t::template alloc<int>(ncol);
+  TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
     seeds(icol) = 1e9 * (p_lay(icol,nlay-1) - int(p_lay(icol,nlay-1)));
-  });
+  }));
   get_subcolumn_mask(ncol, nlay, ngpt, cldfrac_rad, overlap, seeds, cldmask);
+
   // Assign optical properties to subcolumns (note this implements MCICA)
   auto gpoint_bands = kdist.get_gpoint_bands();
-  Kokkos::parallel_for(MDRP::template get<3>({ngpt,nlay,ncol}), KOKKOS_LAMBDA(int igpt, int ilay, int icol) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol,nlay,ngpt, icol, ilay, igpt,
     auto ibnd = gpoint_bands(igpt);
     if (cldmask(icol,ilay,igpt) == 1) {
       subsampled_optics.tau(icol,ilay,igpt) = cloud_optics.tau(icol,ilay,ibnd);
@@ -1389,7 +1572,7 @@ static optical_props2_t get_subsampled_clouds(
       subsampled_optics.ssa(icol,ilay,igpt) = 0;
       subsampled_optics.g  (icol,ilay,igpt) = 0;
     }
-  });
+  ));
 
   pool_t::dealloc(cldmask);
   pool_t::dealloc(cldfrac_rad);
@@ -1400,15 +1583,16 @@ static optical_props2_t get_subsampled_clouds(
 
 static optical_props1_t get_subsampled_clouds(
   const int ncol, const int nlay, const int nbnd, const int ngpt,
-  optical_props1_t &cloud_optics, gas_optics_t &kdist, const real2dk &cld, const creal2dk &p_lay) {
+  optical_props1_t &cloud_optics, gas_optics_t &kdist, const real2dk &cld, const creal2dk &p_lay,
+  const int2dk& lw_band2gpt_mem, const int1dk& lw_gpt2band_mem, const real3dk& lw_tau_mem) {
 
   // Initialized subsampled optics
   optical_props1_t subsampled_optics;
-  subsampled_optics.init(kdist.get_band_lims_wavenumber(), kdist.get_band_lims_gpoint(), "subsampled_optics");
-  subsampled_optics.alloc_1scl(ncol, nlay);
+  subsampled_optics.init_no_alloc(kdist.get_band_lims_wavenumber(), kdist.get_band_lims_gpoint(), lw_band2gpt_mem, lw_gpt2band_mem, "subsampled_optics");
+  subsampled_optics.alloc_1scl_no_alloc(ncol, nlay, lw_tau_mem);
 
   // Subcolumn mask with values of 0 indicating no cloud, 1 indicating cloud
-  auto cldmask = pool_t::template alloc_and_init<int>(ncol, nlay, ngpt);
+  auto cldmask = pool_t::template alloc<int>(ncol, nlay, ngpt);
 
   // Check that we do not have clouds with no optical properties; this would get corrected
   // when we assign optical props, but we want to use a "radiative cloud fraction"
@@ -1418,31 +1602,31 @@ static optical_props1_t get_subsampled_clouds(
   // even when separated by layers with no cloud properties, when in fact those layers should be
   // randomly overlapped.
   auto cldfrac_rad = pool_t::template alloc_and_init<RealT>(ncol, nlay);
-  Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay,ncol}), KOKKOS_LAMBDA (int ibnd, int ilay, int icol) {
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol,nlay,nbnd, icol, ilay, ibnd,
     if (cloud_optics.tau(icol,ilay,ibnd) > 0) {
       cldfrac_rad(icol,ilay) = cld(icol,ilay);
     }
-  });
+  ));
   // Get subcolumn cloud mask
   int overlap = 1;
   // Get unique seeds for each column that are reproducible across different MPI rank layouts;
   // use decimal part of pressure for this, consistent with the implementation in EAM; use different
   // seed values for longwave and shortwave
-  auto seeds = pool_t::template alloc_and_init<int>(ncol);
-  Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
+  auto seeds = pool_t::template alloc<int>(ncol);
+  TIMED_KERNEL(Kokkos::parallel_for(ncol, KOKKOS_LAMBDA(int icol) {
     seeds(icol) = 1e9 * (p_lay(icol,nlay-2) - int(p_lay(icol,nlay-2)));
-  });
+  }));
   get_subcolumn_mask(ncol, nlay, ngpt, cldfrac_rad, overlap, seeds, cldmask);
   // Assign optical properties to subcolumns (note this implements MCICA)
   auto gpoint_bands = kdist.get_gpoint_bands();
-  Kokkos::parallel_for(MDRP::template get<3>({ngpt,nlay,ncol}), KOKKOS_LAMBDA(int igpt, int ilay, int icol) {
-      auto ibnd = gpoint_bands(igpt);
-      if (cldmask(icol,ilay,igpt) == 1) {
-        subsampled_optics.tau(icol,ilay,igpt) = cloud_optics.tau(icol,ilay,ibnd);
-      } else {
-        subsampled_optics.tau(icol,ilay,igpt) = 0;
-      }
-    });
+  TIMED_KERNEL(FLATTEN_MD_KERNEL3(ncol,nlay,ngpt, icol, ilay, igpt,
+    auto ibnd = gpoint_bands(igpt);
+    if (cldmask(icol,ilay,igpt) == 1) {
+      subsampled_optics.tau(icol,ilay,igpt) = cloud_optics.tau(icol,ilay,ibnd);
+    } else {
+      subsampled_optics.tau(icol,ilay,igpt) = 0;
+    }
+  ));
 
   pool_t::dealloc(cldmask);
   pool_t::dealloc(cldfrac_rad);
diff --git a/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_tests.cpp b/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_tests.cpp
index fedb8c3d0478..165272a237d4 100644
--- a/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_tests.cpp
+++ b/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_tests.cpp
@@ -401,7 +401,7 @@ int run_kokkos(int argc, char** argv) {
 
     // Initialize absorption coefficients
   logger->info("Initialize RRTMGP...\n");
-  interface_t::rrtmgp_initialize(gas_concs, coefficients_file_sw, coefficients_file_lw, cloud_optics_file_sw, cloud_optics_file_lw, logger);
+  interface_t::rrtmgp_initialize(gas_concs, coefficients_file_sw, coefficients_file_lw, cloud_optics_file_sw, cloud_optics_file_lw, logger, 2.0);
 
   // Setup our dummy atmosphere based on the input data we read in
   logger->info("Setup dummy atmos...\n");
@@ -428,8 +428,8 @@ int run_kokkos(int argc, char** argv) {
   // we would just have to setup the pointers to them in the
   // FluxesBroadband object
   logger->info("Setup fluxes...\n");
-  const auto nswbands = interface_t::k_dist_sw_k.get_nband();
-  const auto nlwbands = interface_t::k_dist_lw_k.get_nband();
+  const auto nswbands = interface_t::k_dist_sw_k->get_nband();
+  const auto nlwbands = interface_t::k_dist_lw_k->get_nband();
   real2dk sw_flux_up ("sw_flux_up" , ncol, nlay+1);
   real2dk sw_flux_dn ("sw_flux_dn" , ncol, nlay+1);
   real2dk sw_flux_dir("sw_flux_dir", ncol, nlay+1);
@@ -470,19 +470,19 @@ int run_kokkos(int argc, char** argv) {
   auto aer_ssa_sw = real3dk("aer_ssa_sw", ncol, nlay, nswbands);
   auto aer_asm_sw = real3dk("aer_asm_sw", ncol, nlay, nswbands);
   auto aer_tau_lw = real3dk("aer_tau_lw", ncol, nlay, nlwbands);
-  Kokkos::parallel_for(MDRP::template get<3>({nswbands,nlay,ncol}), KOKKOS_LAMBDA(int ibnd, int ilay, int icol) {
+  Kokkos::parallel_for(MDRP::template get<3>({ncol, nlay, nswbands}), KOKKOS_LAMBDA(int icol, int ilay, int ibnd) {
     aer_tau_sw(icol,ilay,ibnd) = 0;
     aer_ssa_sw(icol,ilay,ibnd) = 0;
     aer_asm_sw(icol,ilay,ibnd) = 0;
   });
-  Kokkos::parallel_for(MDRP::template get<3>({nlwbands,nlay,ncol}), KOKKOS_LAMBDA(int ibnd, int ilay, int icol) {
+  Kokkos::parallel_for(MDRP::template get<3>({ncol, nlay, nlwbands}), KOKKOS_LAMBDA(int icol, int ilay, int ibnd) {
     aer_tau_lw(icol,ilay,ibnd) = 0;
   });
 
   // These are returned as outputs now from rrtmgp_main
   // TODO: provide as inputs consistent with how aerosol is treated?
-  const auto nswgpts = interface_t::k_dist_sw_k.get_ngpt();
-  const auto nlwgpts = interface_t::k_dist_lw_k.get_ngpt();
+  const auto nswgpts = interface_t::k_dist_sw_k->get_ngpt();
+  const auto nlwgpts = interface_t::k_dist_lw_k->get_ngpt();
   auto cld_tau_sw_bnd = real3dk("cld_tau_sw_bnd", ncol, nlay, nswbands);
   auto cld_tau_lw_bnd = real3dk("cld_tau_lw_bnd", ncol, nlay, nlwbands);
   auto cld_tau_sw = real3dk("cld_tau_sw", ncol, nlay, nswgpts);
diff --git a/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_unit_tests.cpp b/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_unit_tests.cpp
index 06ca64af5f24..f716e64791b8 100644
--- a/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_unit_tests.cpp
+++ b/components/eamxx/src/physics/rrtmgp/tests/rrtmgp_unit_tests.cpp
@@ -1110,7 +1110,7 @@ TEST_CASE("rrtmgp_test_compute_broadband_surface_flux_k") {
   auto sw_bnd_flux_dir = real3dk("sw_bnd_flux_dir", ncol, nlay+1, nbnd);
   auto sw_bnd_flux_dif = real3dk("sw_bnd_flux_dif", ncol, nlay+1, nbnd);
   logger->info("Populate band-resolved 3d fluxes for test case with only transition band flux...\n");
-  Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay+1,ncol}), KOKKOS_LAMBDA(int ibnd, int ilay, int icol) {
+  Kokkos::parallel_for(MDRP::template get<3>({ncol, nlay+1, nbnd}), KOKKOS_LAMBDA(int icol, int ilay, int ibnd) {
     if (ibnd < 9) {
       sw_bnd_flux_dir(icol,ilay,ibnd) = 0;
       sw_bnd_flux_dif(icol,ilay,ibnd) = 0;
@@ -1142,7 +1142,7 @@ TEST_CASE("rrtmgp_test_compute_broadband_surface_flux_k") {
   // ---------------------------------
   // Test case, only flux in NIR bands
   logger->info("Populate band-resolved 3d fluxes for test case with only NIR flux...\n");
-  Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay+1,ncol}), KOKKOS_LAMBDA(int ibnd, int ilay, int icol) {
+  Kokkos::parallel_for(MDRP::template get<3>({ncol, nlay+1, nbnd}), KOKKOS_LAMBDA(int icol, int ilay, int ibnd) {
     if (ibnd < 9) {
       sw_bnd_flux_dir(icol,ilay,ibnd) = 1;
       sw_bnd_flux_dif(icol,ilay,ibnd) = 1;
@@ -1173,7 +1173,7 @@ TEST_CASE("rrtmgp_test_compute_broadband_surface_flux_k") {
   // ---------------------------------
   // Test case, only flux in VIS bands
   logger->info("Populate band-resolved 3d fluxes for test case with only VIS/UV flux...\n");
-  Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay+1,ncol}), KOKKOS_LAMBDA(int ibnd, int ilay, int icol) {
+  Kokkos::parallel_for(MDRP::template get<3>({ncol, nlay+1, nbnd}), KOKKOS_LAMBDA(int icol, int ilay, int ibnd) {
     if (ibnd < 9) {
       sw_bnd_flux_dir(icol,ilay,ibnd) = 0;
       sw_bnd_flux_dif(icol,ilay,ibnd) = 0;
@@ -1204,7 +1204,7 @@ TEST_CASE("rrtmgp_test_compute_broadband_surface_flux_k") {
   // ---------------------------------
   // Test case, only flux in all bands
   logger->info("Populate band-resolved 3d fluxes for test with non-zero flux in all bands...\n");
-  Kokkos::parallel_for(MDRP::template get<3>({nbnd,nlay+1,ncol}), KOKKOS_LAMBDA(int ibnd, int ilay, int icol) {
+  Kokkos::parallel_for(MDRP::template get<3>({ncol, nlay+1, nbnd}), KOKKOS_LAMBDA(int icol, int ilay, int ibnd) {
     if (ibnd < 9) {
       sw_bnd_flux_dir(icol,ilay,ibnd) = 1.0;
       sw_bnd_flux_dif(icol,ilay,ibnd) = 2.0;
@@ -1307,13 +1307,13 @@ TEST_CASE("rrtmgp_test_subcol_gen_k") {
     interface_t::get_subcolumn_mask(ncol, nlay, ngpt, cldfrac, 1, seeds, cldmask);
     // Check answers by computing new cldfrac from mask
     Kokkos::deep_copy(cldfrac_from_mask, 0.0);
-    Kokkos::parallel_for(MDRP::template get<2>({nlay,ncol}), KOKKOS_LAMBDA(int ilay, int icol) {
+    Kokkos::parallel_for(MDRP::template get<2>({ncol, nlay}), KOKKOS_LAMBDA(int icol, int ilay) {
       for (int igpt = 0; igpt < ngpt; ++igpt) {
         real cldmask_real = cldmask(icol,ilay,igpt);
         cldfrac_from_mask(icol,ilay) += cldmask_real;
       }
     });
-    Kokkos::parallel_for(MDRP::template get<2>({nlay,ncol}), KOKKOS_LAMBDA(int ilay, int icol) {
+    Kokkos::parallel_for(MDRP::template get<2>({ncol, nlay}), KOKKOS_LAMBDA(int icol, int ilay) {
       cldfrac_from_mask(icol,ilay) = cldfrac_from_mask(icol,ilay) / ngpt;
     });
     // For cldfrac 1 we should get 1, for cldfrac 0 we should get 0, but in between we cannot be sure
diff --git a/components/eamxx/tests/single-process/rrtmgp/CMakeLists.txt b/components/eamxx/tests/single-process/rrtmgp/CMakeLists.txt
index b16c2e732884..c8b57e98948b 100644
--- a/components/eamxx/tests/single-process/rrtmgp/CMakeLists.txt
+++ b/components/eamxx/tests/single-process/rrtmgp/CMakeLists.txt
@@ -6,20 +6,25 @@ set (FIXTURES_BASE_NAME ${TEST_BASE_NAME}_generate_output_nc_files)
 # Ensure test input files are present in the data dir
 GetInputFile(scream/init/${EAMxx_tests_IC_FILE_72lev})
 
+set(YAKL_LIB_NAME "")
+if (SCREAM_RRTMGP_ENABLE_YAKL)
+  set(YAKL_LIB_NAME "yakl")
+endif()
+
 if (SCREAM_ENABLE_BASELINE_TESTS AND NOT SCREAM_ONLY_GENERATE_BASELINES)
   # Unit test to compare against raw rrtmgp output
   configure_file(${CMAKE_CURRENT_SOURCE_DIR}/input_unit.yaml
                  ${CMAKE_CURRENT_BINARY_DIR}/input_unit.yaml)
   CreateUnitTest(${TEST_BASE_NAME}_unit rrtmgp_standalone_unit.cpp
     LABELS rrtmgp physics driver
-    LIBS scream_rrtmgp rrtmgp scream_control yakl diagnostics rrtmgp_test_utils
-    EXE_ARGS "--args --inputfile ${SCREAM_DATA_DIR}/init/rrtmgp-allsky.nc --baseline ${SCREAM_BASELINES_DIR}/data/rrtmgp-allsky-baseline.nc"
+    LIBS scream_rrtmgp rrtmgp scream_control ${YAKL_LIB_NAME} diagnostics rrtmgp_test_utils
+    EXE_ARGS "--args --rrtmgp_inputfile ${SCREAM_DATA_DIR}/init/rrtmgp-allsky.nc --rrtmgp_baseline ${SCREAM_BASELINES_DIR}/data/rrtmgp-allsky-baseline.nc"
   )
 endif()
 
 ## Create rrtmgp stand alone executable
 CreateUnitTestExec(${TEST_BASE_NAME} "rrtmgp_standalone.cpp"
-  LIBS scream_rrtmgp rrtmgp scream_control yakl diagnostics
+  LIBS scream_rrtmgp rrtmgp scream_control ${YAKL_LIB_NAME} diagnostics
 )
 
 # The RRTMGP stand-alone test that runs multi-step
diff --git a/components/eamxx/tests/single-process/rrtmgp/input_unit.yaml b/components/eamxx/tests/single-process/rrtmgp/input_unit.yaml
index 4a1f474c8778..92961fb30d99 100644
--- a/components/eamxx/tests/single-process/rrtmgp/input_unit.yaml
+++ b/components/eamxx/tests/single-process/rrtmgp/input_unit.yaml
@@ -21,6 +21,7 @@ atmosphere_processes:
     rrtmgp_coefficients_file_lw: ${SCREAM_DATA_DIR}/init/rrtmgp-data-lw-g256-2018-12-04.nc
     rrtmgp_cloud_optics_file_sw: ${SCREAM_DATA_DIR}/init/rrtmgp-cloud-optics-coeffs-sw.nc
     rrtmgp_cloud_optics_file_lw: ${SCREAM_DATA_DIR}/init/rrtmgp-cloud-optics-coeffs-lw.nc
+    pool_size_multiplier: 2.0
 
 grids_manager:
   Type: Mesh Free