diff --git a/libglide/glide.F90 b/libglide/glide.F90 index 69f7f5e3..aca02953 100644 --- a/libglide/glide.F90 +++ b/libglide/glide.F90 @@ -384,8 +384,7 @@ subroutine glide_initialise(model) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) ! calculate lower and upper ice surface call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus,model%geometry%lsrf) @@ -503,16 +502,10 @@ subroutine glide_init_state_diagnostic(model, evolve_ice) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) endif ! calving_init - ! Compute total areas of grounded and floating ice - call calc_iareaf_iareag(model%numerics%dew, model%numerics%dns, & - model%geometry%thkmask, & - model%geometry%iareaf, model%geometry%iareag) - ! ------------------------------------------------------------------------ ! ***Part 2: Calculate geometry related fields ! ------------------------------------------------------------------------ @@ -874,13 +867,10 @@ subroutine glide_tstep_p2(model) call glide_prof_start(model,model%glide_prof%ice_mask2) - !TODO - Calculate area and vol separately from glide_set_mask? 
- call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) call glide_prof_stop(model,model%glide_prof%ice_mask2) @@ -909,14 +899,7 @@ subroutine glide_tstep_p2(model) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) - endif ! oldglide = F - - if (.not. oldglide) then ! calculate area of floating and grounded ice - call calc_iareaf_iareag(model%numerics%dew, model%numerics%dns, & - model%geometry%thkmask, & - model%geometry%iareaf, model%geometry%iareag) + model%climate%eus, model%geometry%thkmask) endif ! oldglide = F ! ------------------------------------------------------------------------ diff --git a/libglide/glide_diagnostics.F90 b/libglide/glide_diagnostics.F90 index e49ebb2f..a4571ed1 100644 --- a/libglide/glide_diagnostics.F90 +++ b/libglide/glide_diagnostics.F90 @@ -39,12 +39,14 @@ module glide_diagnostics use cism_parallel, only: this_rank, main_task, lhalo, uhalo, nhalo, & parallel_type, broadcast, & parallel_localindex, parallel_globalindex, & - parallel_reduce_sum, parallel_reduce_max, & + parallel_global_sum, parallel_reduce_max, & parallel_reduce_maxloc, parallel_reduce_minloc, & parallel_is_zero implicit none + logical, parameter :: verbose_diagnostics = .false. + contains subroutine glide_write_diagnostics (model, time, & @@ -64,8 +66,6 @@ subroutine glide_write_diagnostics (model, time, & ! local arguments - logical, parameter :: verbose_diagnostics = .false. - ! debug if (main_task .and. verbose_diagnostics) then write(iulog,*) ' ' @@ -231,8 +231,12 @@ subroutine glide_write_diag (model, time) integer, dimension(model%general%ewn,model%general%nsn) :: & ice_mask, & ! 
= 1 where ice is present with thck > minthck, else = 0 floating_mask, & ! = 1 where ice is present and floating, else = 0 + grounded_mask, & ! = 1 where ice is present and grounded, else = 0 glacier_ice_mask ! = 1 where glacier ice is present, initially and/or currently + integer, dimension(model%general%ewn-1,model%general%nsn-1) :: & + stag_ice_mask ! staggered mask; = 1 if ice_mask = 1 for any of the four neighbors + real(dp), dimension(model%general%upn) :: & temp_diag, & ! Note: sfc temp not included if temps are staggered ! (use artm instead) @@ -273,8 +277,14 @@ subroutine glide_write_diag (model, time) velo_ew_ubound, velo_ns_ubound ! upper bounds for velocity variables real(dp), dimension(model%general%ewn, model%general%nsn) :: & - velo_sfc, & ! surface ice speed - thck_obs ! observed ice thickness, derived from usrf_obs and topg + mass_above_flotation,& ! ice mass above flotation (kg) + thck_obs ! observed ice thickness (m), derived from usrf_obs and topg + + real(dp), dimension(model%general%ewn-1, model%general%nsn-1) :: & + velo_sfc ! surface ice speed (m/s) + + real(dp), dimension(:,:,:), allocatable :: & + local_energy ! internal energy (J) per layer in a column character(len=100) :: message @@ -337,12 +347,26 @@ subroutine glide_write_diag (model, time) ice_mask(i,j) = 1 if (model%geometry%topg(i,j) - model%climate%eus < (-rhoi/rhoo)*model%geometry%thck(i,j)) then floating_mask(i,j) = 1 + grounded_mask(i,j) = 0 else floating_mask(i,j) = 0 + grounded_mask(i,j) = 1 endif else ice_mask(i,j) = 0 floating_mask(i,j) = 0 + grounded_mask(i,j) = 0 + endif + enddo + enddo + + do j = 1, nsn-1 + do i = 1, ewn-1 + if (ice_mask(i,j+1) == 1 .or. ice_mask(i+1,j+1) == 1 .or. & + ice_mask(i,j) == 1 .or. ice_mask(i+1,j) == 1) then + stag_ice_mask(i,j) = 1 + else + stag_ice_mask(i,j) = 0 endif enddo enddo @@ -358,109 +382,75 @@ subroutine glide_write_diag (model, time) call write_log(' ') ! 
total ice area (m^2) - - tot_area = 0.d0 - tot_area_ground = 0.d0 - tot_area_float = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - if (ice_mask(i,j) == 1) then - tot_area = tot_area + cell_area(i,j) - if (floating_mask(i,j) == 1) then - tot_area_float = tot_area_float + cell_area(i,j) - else - tot_area_ground = tot_area_ground + cell_area(i,j) - endif - endif - enddo - enddo - - tot_area = parallel_reduce_sum(tot_area) - tot_area_ground = parallel_reduce_sum(tot_area_ground) - tot_area_float = parallel_reduce_sum(tot_area_float) + tot_area = parallel_global_sum(cell_area, parallel, ice_mask) + tot_area_float = parallel_global_sum(cell_area, parallel, floating_mask) + tot_area_ground = parallel_global_sum(cell_area, parallel, grounded_mask) ! total ice volume (m^3) - - tot_volume = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - if (ice_mask(i,j) == 1) then - tot_volume = tot_volume + model%geometry%thck(i,j) * cell_area(i,j) - endif - enddo - enddo - tot_volume = parallel_reduce_sum(tot_volume) + tot_volume = parallel_global_sum(model%geometry%thck*cell_area, parallel, ice_mask) ! total ice mass (kg) tot_mass = tot_volume * rhoi ! total ice mass above flotation (kg) - tot_mass_above_flotation = 0.d0 - - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - if (ice_mask(i,j) == 1) then - if (floating_mask(i,j) == 0) then ! grounded ice - if (model%geometry%topg(i,j) - model%climate%eus < 0.0d0) then ! grounded below sea level - thck_floating = (-rhoo/rhoi) * (model%geometry%topg(i,j) - model%climate%eus) ! exactly floating - thck_above_flotation = model%geometry%thck(i,j) - thck_floating - tot_mass_above_flotation = tot_mass_above_flotation & - + thck_above_flotation * cell_area(i,j) - else ! grounded above sea level - tot_mass_above_flotation = tot_mass_above_flotation & - + model%geometry%thck(i,j) * cell_area(i,j) - endif + mass_above_flotation = 0.0d0 + do j = 1, nsn + do i = 1, ewn + if (ice_mask(i,j) == 1 .and. 
floating_mask(i,j) == 0) then + if (model%geometry%topg(i,j) - model%climate%eus < 0.0d0) then ! grounded below sea level + thck_floating = (-rhoo/rhoi) * (model%geometry%topg(i,j) - model%climate%eus) ! exactly floating + mass_above_flotation(i,j) = (model%geometry%thck(i,j) - thck_floating) * cell_area(i,j) + else ! grounded above sea level + mass_above_flotation(i,j) = model%geometry%thck(i,j) * cell_area(i,j) endif endif enddo enddo - - tot_mass_above_flotation = tot_mass_above_flotation * rhoi ! convert from m^3 to kg - tot_mass_above_flotation = parallel_reduce_sum(tot_mass_above_flotation) + mass_above_flotation = mass_above_flotation * rhoi + tot_mass_above_flotation = parallel_global_sum(mass_above_flotation, parallel) ! total ice energy relative to T = 0 deg C (J) - - tot_energy = 0.d0 - if (size(model%temper%temp,1) == upn+1) then ! temps are staggered in vertical - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo + local_energy = 0.0d0 + if (size(model%temper%temp,1) == upn+1) then ! temps are staggered in vertical, located at layer centers + allocate(local_energy(model%general%upn-1, model%general%ewn, model%general%nsn)) + do j = 1, nsn + do i = 1, ewn if (ice_mask(i,j) == 1) then - do k = 1, upn-1 - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & - *(model%numerics%sigma(k+1) - model%numerics%sigma(k)) + do k = 1, upn-1 ! (upn-1) layers + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * (model%numerics%sigma(k+1) - model%numerics%sigma(k)) enddo endif enddo enddo - - else ! temps are unstaggered in vertical - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo + else ! temps are unstaggered in vertical, located at layer interfaces + allocate(local_energy(model%general%upn, model%general%ewn, model%general%nsn)) + do j = 1, nsn + do i = 1, ewn if (ice_mask(i,j) == 1) then - ! 
upper half-layer, T = upper sfc temp - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(1,i,j) * cell_area(i,j) & - * 0.5d0 * model%numerics%sigma(2) - do k = 2, upn-1 - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & - * 0.5d0*(model%numerics%sigma(k+1) - model%numerics%sigma(k-1)) + k = 1 ! top interface; assign this temperature to the top half layer + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * 0.5d0 * model%numerics%sigma(k+1) + do k = 2, upn-1 ! interior layers + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * 0.5d0*(model%numerics%sigma(k+1) - model%numerics%sigma(k-1)) enddo - ! lower half-layer, T = lower sfc temp - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(upn,i,j) * cell_area(i,j) & - * 0.5d0 * (1.0d0 - model%numerics%sigma(upn-1)) + k = upn ! bottom interface; assign this temperature to the bottom half layer + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * 0.5d0 * (1.0d0 - model%numerics%sigma(k-1)) endif enddo enddo endif - - tot_energy = tot_energy * rhoi * shci - tot_energy = parallel_reduce_sum(tot_energy) + local_energy = local_energy * rhoi * shci + tot_energy = parallel_global_sum(local_energy, parallel, ice_mask) + deallocate(local_energy) ! mean thickness - if (tot_area > eps) then mean_thck = tot_volume/tot_area else @@ -468,7 +458,6 @@ subroutine glide_write_diag (model, time) endif ! mean temperature - if (tot_volume > eps) then mean_temp = tot_energy/ (rhoi*shci*tot_volume) else @@ -490,19 +479,11 @@ subroutine glide_write_diag (model, time) if (model%options%whichdycore == DYCORE_GLISSADE) then ! 
total surface accumulation/ablation rate (m^3/yr ice) - - tot_acab = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_acab = tot_acab + model%climate%acab_applied(i,j) * cell_area(i,j) - enddo - enddo - - tot_acab = tot_acab * scyr ! convert to m^3/yr - tot_acab = parallel_reduce_sum(tot_acab) + tot_acab = parallel_global_sum(model%climate%acab_applied*cell_area, parallel) + tot_acab = tot_acab * scyr ! convert from m^3/s to m^3/yr ! total surface mass balance flux (kg/s) - tot_smb_flux = tot_acab * rhoi / scyr ! convert m^3/yr to kg/s + tot_smb_flux = tot_acab * rhoi / scyr ! convert m^3/yr to kg/s ! mean accumulation/ablation rate (m/yr) ! Note: This will be only approximate if some ice has melted completely during the time step @@ -513,15 +494,8 @@ subroutine glide_write_diag (model, time) endif ! total basal melting rate (positive for ice loss) - tot_bmlt = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_bmlt = tot_bmlt + model%basal_melt%bmlt_applied(i,j) * cell_area(i,j) - enddo - enddo - - tot_bmlt = tot_bmlt * scyr ! convert to m^3/yr - tot_bmlt = parallel_reduce_sum(tot_bmlt) + tot_bmlt = parallel_global_sum(model%basal_melt%bmlt_applied*cell_area, parallel) + tot_bmlt = tot_bmlt * scyr ! convert from m^3/s to m^3/yr ! total basal mass balance (kg/s, positive for freeze-on, negative for melt) tot_bmb_flux = -tot_bmlt * rhoi / scyr ! convert m^3/yr to kg/s @@ -536,14 +510,7 @@ subroutine glide_write_diag (model, time) ! total calving rate (m^3/yr ice) ! Note: calving%calving_rate has units of m/yr ice - - tot_calving = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_calving = tot_calving + model%calving%calving_rate(i,j) * cell_area(i,j) ! m^3/yr ice - enddo - enddo - tot_calving = parallel_reduce_sum(tot_calving) + tot_calving = parallel_global_sum(model%calving%calving_rate*cell_area, parallel) ! 
total calving mass balance flux (kg/s, negative for ice loss by calving) tot_calving_flux = -tot_calving * rhoi / scyr ! convert m^3/yr to kg/s @@ -559,15 +526,10 @@ subroutine glide_write_diag (model, time) ! total grounding line mass balance flux (< 0 by definition) ! Note: At this point, gl_flux_east and gl_flux_north are already dimensionalized in kg/m/s, ! so tot_gl_flux will have units of kg/s - - tot_gl_flux = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_gl_flux = tot_gl_flux - abs(model%geometry%gl_flux_east(i,j)) * model%numerics%dns & - - abs(model%geometry%gl_flux_north(i,j)) * model%numerics%dew - enddo - enddo - tot_gl_flux = parallel_reduce_sum(tot_gl_flux) + tot_gl_flux = parallel_global_sum(abs(model%geometry%gl_flux_east) * model%numerics%dns & + + abs(model%geometry%gl_flux_north) * model%numerics%dew, & + parallel) + tot_gl_flux = -tot_gl_flux ! negative by definition ! total rate of change of ice mass (kg/s) ! Note: dthck_dt has units of m/s @@ -575,14 +537,8 @@ subroutine glide_write_diag (model, time) ! in successive time steps, instead of summing over dthck_dt. ! Note that dthck_dt does not account for global outflow fluxes (i.e., removal of ice ! near the global boundary in halo updates). - tot_dmass_dt = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_dmass_dt = tot_dmass_dt + model%geometry%dthck_dt(i,j) * cell_area(i,j) - enddo - enddo + tot_dmass_dt = parallel_global_sum(model%geometry%dthck_dt*cell_area, parallel) tot_dmass_dt = tot_dmass_dt * rhoi ! convert to kg/s - tot_dmass_dt = parallel_reduce_sum(tot_dmass_dt) ! mass conservation error ! Note: For most runs, this should be close to zero. @@ -957,8 +913,8 @@ subroutine glide_write_diag (model, time) + model%velocity%vvel(1,:,:)**2) call glissade_rms_error(& - ewn, nsn, & - ice_mask, & + ewn-1, nsn-1, & + stag_ice_mask, & parallel, & velo_sfc * scyr, & ! m/yr model%velocity%velo_sfc_obs * scyr, & ! 
m/yr @@ -1310,48 +1266,57 @@ subroutine glide_write_diag (model, time) ng = model%glacier%ngdiag - write(message,'(a35,i14)') 'Diagnostic glacier index (RGI) ', & - model%glacier%cism_to_rgi_glacier_id(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + if (ng > 0) then - write(message,'(a35,i14)') 'Diagnostic glacier index (CISM) ', ng - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,i14)') 'Diagnostic glacier index (RGI) ', & + model%glacier%cism_to_rgi_glacier_id(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier area_init (km^2) ', & - model%glacier%area_init(ng) / 1.0d6 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,i14)') 'Diagnostic glacier index (CISM) ', ng + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier area (km^2) ', & - model%glacier%area(ng) / 1.0d6 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier area_init (km^2) ', & + model%glacier%area_init(ng) / 1.0d6 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier area_init_extent (km^2) ', & - model%glacier%area_init_extent(ng) / 1.0d6 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier area (km^2) ', & + model%glacier%area(ng) / 1.0d6 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier volume (km^3) ', & - model%glacier%volume(ng) / 1.0d9 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier area_init_extent (km^2) ', & + model%glacier%area_init_extent(ng) / 1.0d6 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier volume_init (km^3) ', & - model%glacier%volume_init(ng) / 1.0d9 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier volume (km^3) ', & + 
model%glacier%volume(ng) / 1.0d9 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier volume_init_extent (km^3) ', & - model%glacier%volume_init_extent(ng) / 1.0d9 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier volume_init (km^3) ', & + model%glacier%volume_init(ng) / 1.0d9 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'mu_star (mm/yr w.e./deg C) ', & - model%glacier%mu_star(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier volume_init_extent (km^3) ', & + model%glacier%volume_init_extent(ng) / 1.0d9 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'alpha_snow ', & - model%glacier%alpha_snow(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'mu_star (mm/yr w.e./deg C) ', & + model%glacier%mu_star(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'beta_artm (deg C) ', & - model%glacier%beta_artm(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'alpha_snow ', & + model%glacier%alpha_snow(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) + + write(message,'(a35,f14.6)') 'beta_artm (deg C) ', & + model%glacier%beta_artm(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) + + else ! glacier ID = 0 + + write(message,'(a35,i14)') 'Diagnostic glacier index (CISM) ', ng + call write_log(trim(message), type = GM_DIAGNOSTIC) + + endif call write_log(' ') diff --git a/libglide/glide_mask.F90 b/libglide/glide_mask.F90 index c08dfd17..1e4ef585 100644 --- a/libglide/glide_mask.F90 +++ b/libglide/glide_mask.F90 @@ -41,14 +41,11 @@ module glide_mask contains -!TODO - Remove iarea and ivol calculations? They are now computed in glide_write_diag.. - !TODO - Write a new subroutine (in addition to glide_set_mask) to compute mask for staggered grid? 
! This subroutine is now called from glissade_velo_driver with stagthck and stagtopg ! as input arguments. - subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol, & - exec_serial, parallel) + subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask) use glide_types use glimmer_physcon, only : rhoi, rhoo @@ -60,14 +57,9 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol integer, intent(in) :: ewn, nsn ! Grid size real(dp), intent(in) :: eus ! Sea level integer, dimension(:,:), intent(inout) :: mask ! Output mask - real(dp), intent(inout), optional :: ivol, iarea ! Area and volume of ice - - logical, optional :: exec_serial !JEFF If executing in serial in MPI program. - type(parallel_type), optional :: parallel ! info for parallel communication ! local variables integer ew,ns - logical :: exec_serial_flag !Note - This array may not be needed, at least in parallel. @@ -76,21 +68,8 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol integer, dimension(0:ewn+1,0:nsn+1) :: maskWithBounds; - !TODO - What is the exec_serial option? Is it still needed? - - !JEFF Handle exec_serial optional parameter - if ( present(exec_serial) ) then - exec_serial_flag = exec_serial - else - ! Default to off - exec_serial_flag = .FALSE. - endif - mask = 0 - if (present(iarea)) iarea = 0.d0 - if (present(ivol)) ivol = 0.d0 - !Note - This mask is confusing. Wondering if we should replace it by a series of logical masks. ! Would need the following: @@ -138,10 +117,6 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol mask = ior(mask, GLIDE_MASK_LAND) ! GLIDE_MASK_LAND = 4 endwhere - if (present(iarea) .and. present(ivol)) then - call get_area_vol(thck, numerics%dew, numerics%dns, numerics%thklim, iarea, ivol, exec_serial_flag) - end if - !TODO - Replace the following with a halo call for 'mask', with appropriate global BC? 
maskWithBounds = 0 @@ -185,14 +160,9 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol end do end do - !JEFF Don't call halo update if running in serial mode - !WHL - I think the halo update will now work in serial mode. - if (.NOT. exec_serial_flag .and. present(parallel)) then - call parallel_halo(mask, parallel) - endif - end subroutine glide_set_mask + subroutine augment_kinbc_mask(mask, kinbcmask) ! Augments the Glide mask with the location of kinematic (dirichlet) boundary @@ -217,83 +187,8 @@ subroutine augment_kinbc_mask(mask, kinbcmask) endwhere end subroutine augment_kinbc_mask - subroutine get_area_vol(thck, dew, dns, thklim, iarea, ivol, exec_serial) - - implicit none - real(dp), dimension(:,:) :: thck - real(dp) :: dew, dns, thklim - real(dp) :: iarea, ivol, sum(2) - logical :: exec_serial - - integer :: i,j - - do i = 1+lhalo, size(thck,1)-uhalo - do j = 1+lhalo, size(thck,2)-uhalo - if (thck(i,j) > thklim ) then - iarea = iarea + 1 - ivol = ivol + thck(i,j) - end if - end do - end do - - iarea = iarea * dew * dns - ivol = ivol * dew * dns - - if (.NOT. exec_serial) then - sum(1) = iarea - sum(2) = ivol - sum = parallel_reduce_sum(sum) - iarea = sum(1) - ivol = sum(2) - endif - - end subroutine get_area_vol - - subroutine calc_iareaf_iareag(dew, dns, mask, iareaf, iareag, exec_serial) - - implicit none - real(dp), intent(in) :: dew, dns - real(dp), intent(out) :: iareaf, iareag - integer, dimension(:,:), intent(in) :: mask - logical, optional :: exec_serial ! If executing in serial in MPI program. - - integer :: i,j - logical :: exec_serial_flag - real(dp) :: sum(2) - - !TODO - exec_serial option may not be needed - if ( present(exec_serial) ) then - exec_serial_flag = exec_serial - else - ! Default to off - exec_serial_flag = .FALSE. 
- endif - - iareaf = 0.d0 - iareag = 0.d0 - - !loop over locally owned scalars - do j = 1+lhalo, size(mask,2)-uhalo - do i = 1+lhalo, size(mask,1)-uhalo - if (GLIDE_IS_FLOAT(mask(i,j))) then - iareaf = iareaf + dew * dns - else if(GLIDE_IS_GROUND_OR_GNDLINE(mask(i,j))) then - iareag = iareag + dew * dns - end if - end do - end do - - if (.NOT. exec_serial_flag) then - sum(1) = iareaf - sum(2) = iareag - sum = parallel_reduce_sum(sum) - iareaf = sum(1) - iareag = sum(2) - endif - - end subroutine calc_iareaf_iareag - subroutine glide_marine_margin_normal(thck, mask, marine_bc_normal, & + subroutine glide_marine_margin_normal(thck, mask, marine_bc_normal, & exec_serial, parallel) !TODO - Remove subroutine glide_marine_margin_normal? Old PBJ routine. diff --git a/libglide/glide_nc_custom.F90 b/libglide/glide_nc_custom.F90 index a672203a..bd464a4f 100644 --- a/libglide/glide_nc_custom.F90 +++ b/libglide/glide_nc_custom.F90 @@ -116,7 +116,8 @@ subroutine glide_nc_filldvars(outfile, model) ! This does not work, in general, when computing on active blocks only, because the local versions ! of model%general%x1 may not span the global domain. ! The revised code calls parallel_put_var to write (x0,y0) and (x1,y1) to the output file. - ! This assumes that x1_global and y1_global were read from the input file and saved in a global array. + ! This assumes that x1_global and y1_global were read from the input file and saved in a global array + ! (e.g., in subroutine glide_io_read). status = parallel_inq_varid(NCO%id,'x1',varid) status = parallel_put_var(NCO%id,varid,model%general%x1_global) @@ -126,8 +127,8 @@ subroutine glide_nc_filldvars(outfile, model) status = parallel_put_var(NCO%id,varid,model%general%y1_global) call nc_errorhandle(__FILE__,__LINE__,status) - ! create the x0 and y0 grids from x1 and y1 - + ! create the x0 and y0 grids from x1 and y1; + ! 
this does not require model%general%x0_global and y0_global to have been filled status = parallel_inq_varid(NCO%id,'x0',varid) do i = 1, global_ewn-1 x0_global(i) = (model%general%x1_global(i) + model%general%x1_global(i+1)) / 2.0d0 diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90 index 1f5fd0a2..e25fcc3e 100644 --- a/libglide/glide_setup.F90 +++ b/libglide/glide_setup.F90 @@ -907,6 +907,7 @@ subroutine handle_ho_options(section, model) call GetValue(section, 'linear_solve_ncheck', model%options%linear_solve_ncheck) call GetValue(section, 'linear_maxiters', model%options%linear_maxiters) call GetValue(section, 'linear_tolerance', model%options%linear_tolerance) + call GetValue(section, 'reproducible_sums', model%options%reproducible_sums) end subroutine handle_ho_options @@ -1128,9 +1129,9 @@ subroutine print_options(model) 'power law ', & 'Coulomb friction law w/ effec press ', & 'Schoof friction law ', & + 'modified Schoof friction law ', & 'min of Coulomb stress and power-law stress (Tsai)', & 'power law using effective pressure ', & - 'simple pattern of beta ', & 'till yield stress (Picard) ' /) character(len=*), dimension(0:1), parameter :: ho_whichbeta_limit = (/ & @@ -1828,8 +1829,8 @@ subroutine print_options(model) if (model%options%use_c_space_factor) then if (model%options%which_ho_babc == HO_BABC_COULOMB_FRICTION .or. & - model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_SCHOOF .or. & - model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_TSAI) then + model%options%which_ho_babc == HO_BABC_SCHOOF .or. & + model%options%which_ho_babc == HO_BABC_TSAI) then write(message,*) 'Multiplying beta by C_space_factor' call write_log(message) else @@ -1878,18 +1879,19 @@ subroutine print_options(model) ! Inversion options - ! Note: Inversion for Cp is currently supported for the Schoof sliding law, Tsai law, and basic power law + ! 
Note: Inversion for Cp is supported for the basic power law plus the Schoof and Tsai laws if (model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION .or. & model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION_BASIN) then - if (model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_SCHOOF .or. & - model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_TSAI .or. & - model%options%which_ho_babc == HO_BABC_POWERLAW) then + if (model%options%which_ho_babc == HO_BABC_POWERLAW .or. & + model%options%which_ho_babc == HO_BABC_SCHOOF .or. & + model%options%which_ho_babc == HO_BABC_MODIFIED_SCHOOF .or. & + model%options%which_ho_babc == HO_BABC_TSAI) then ! inversion for Cp is supported else call write_log('Error, Cp inversion is not supported for this basal BC option') write(message,*) 'Cp inversion is supported for these options: ', & - HO_BABC_COULOMB_POWERLAW_SCHOOF, HO_BABC_COULOMB_POWERLAW_TSAI, HO_BABC_POWERLAW + HO_BABC_POWERLAW, HO_BABC_SCHOOF, HO_BABC_MODIFIED_SCHOOF, HO_BABC_TSAI call write_log(message, GM_FATAL) endif endif @@ -1899,12 +1901,13 @@ subroutine print_options(model) model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION_BASIN) then if (model%options%which_ho_babc == HO_BABC_ZOET_IVERSON .or. & - model%options%which_ho_babc == HO_BABC_PSEUDO_PLASTIC) then + model%options%which_ho_babc == HO_BABC_PSEUDO_PLASTIC .or. & + model%options%which_ho_babc == HO_BABC_MODIFIED_SCHOOF) then ! 
inversion for Cc is supported else call write_log('Error, Cc inversion is not supported for this basal BC option') write(message,*) 'Cc inversion is supported for these options: ', & - HO_BABC_ZOET_IVERSON, HO_BABC_PSEUDO_PLASTIC + HO_BABC_ZOET_IVERSON, HO_BABC_PSEUDO_PLASTIC, HO_BABC_MODIFIED_SCHOOF call write_log(message, GM_FATAL) endif endif @@ -1970,9 +1973,6 @@ subroutine print_options(model) model%basal_hydro%ho_flux_routing_scheme >= size(ho_flux_routing_scheme)) then call write_log('Error, HO flux routing scheme out of range', GM_FATAL) end if - write(message,*) 'ho_flux_routing_scheme : ',model%basal_hydro%ho_flux_routing_scheme, & - ho_flux_routing_scheme(model%basal_hydro%ho_flux_routing_scheme) - call write_log(message) endif write(message,*) 'ho_whicheffecpress : ',model%options%which_ho_effecpress, & @@ -2183,6 +2183,10 @@ subroutine print_options(model) write(message,*) 'linear_tolerance : ',model%options%linear_tolerance call write_log(message) + if (model%options%reproducible_sums) then + call write_log('Global sums will be reproducible') + endif + end if ! DYCORE_GLISSADE if (model%options%whichdycore == DYCORE_GLISSADE .and. & @@ -2195,8 +2199,15 @@ subroutine print_options(model) if (model%options%whichdycore == DYCORE_GLISSADE .and. & (model%options%which_ho_sparse == HO_SPARSE_PCG_STANDARD .or. & model%options%which_ho_sparse == HO_SPARSE_PCG_CHRONGEAR) ) then + if (model%options%reproducible_sums) then + if (model%options%which_ho_precond == HO_PRECOND_TRIDIAG_LOCAL .or. 
& + model%options%which_ho_precond == HO_PRECOND_TRIDIAG_GLOBAL) then + call write_log ('Tridiagonal preconditioners are not supported with reproducible sums.') + call write_log ('Please choose a different preconditioner (e.g., diagonal)', GM_FATAL) + endif + endif write(message,*) 'ho_whichprecond : ',model%options%which_ho_precond, & - ho_whichprecond(model%options%which_ho_precond) + ho_whichprecond(model%options%which_ho_precond) call write_log(message) if (model%options%which_ho_precond < 0 .or. model%options%which_ho_precond >= size(ho_whichprecond)) then call write_log('Error, glissade preconditioner out of range', GM_FATAL) @@ -2694,6 +2705,9 @@ subroutine print_parameters(model) if (model%options%which_ho_babc == HO_BABC_BETA_CONSTANT) then write(message,*) 'uniform beta (Pa yr/m) : ',model%basal_physics%ho_beta_const call write_log(message) + elseif (model%options%which_ho_babc == HO_BABC_BETA_LARGE) then + write(message,*) 'large beta (Pa yr/m) : ',model%basal_physics%ho_beta_large + call write_log(message) elseif (model%options%which_ho_babc == HO_BABC_BETA_BPMP) then write(message,*) 'large (frozen) beta (Pa yr/m) : ',model%basal_physics%ho_beta_large call write_log(message) @@ -2762,7 +2776,7 @@ subroutine print_parameters(model) call write_log(message) write(message,*) 'bed bump wavelength for Coulomb friction law : ', model%basal_physics%coulomb_bump_wavelength call write_log(message) - elseif (model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_SCHOOF) then + elseif (model%options%which_ho_babc == HO_BABC_SCHOOF) then ! Note: The Schoof law typically uses a spatially variable powerlaw_c. ! If so, the value written here is just the initial value. 
write(message,*) 'Cc for Schoof Coulomb law : ', model%basal_physics%coulomb_c_const @@ -2775,7 +2789,24 @@ subroutine print_parameters(model) call write_log(message) write(message,*) 'm exponent for Schoof power law : ', model%basal_physics%powerlaw_m call write_log(message) - elseif (model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_TSAI) then + elseif (model%options%which_ho_babc == HO_BABC_MODIFIED_SCHOOF) then + ! Note: This law supports inversion for both Cc and Cp. + ! When inverting, the values here are just the initial values. + write(message,*) 'Cc for modified Schoof law : ', model%basal_physics%coulomb_c_const + call write_log(message) + write(message,*) 'Max Cc : ', model%basal_physics%coulomb_c_max + call write_log(message) + write(message,*) 'Min Cc : ', model%basal_physics%coulomb_c_min + call write_log(message) + write(message,*) 'Cp for modified Schoof law, Pa (m/yr)^(-1/3) : ', model%basal_physics%powerlaw_c_const + call write_log(message) + write(message,*) 'Max Cp : ', model%basal_physics%powerlaw_c_max + call write_log(message) + write(message,*) 'Min Cp : ', model%basal_physics%powerlaw_c_min + call write_log(message) + write(message,*) 'm exponent for power law : ', model%basal_physics%powerlaw_m + call write_log(message) + elseif (model%options%which_ho_babc == HO_BABC_TSAI) then ! Note: The Tsai law typically uses a spatially variable powerlaw_c. ! If so, the value written here is just the initial value. write(message,*) 'Cc for Tsai Coulomb law : ', model%basal_physics%coulomb_c_const @@ -2877,6 +2908,9 @@ subroutine print_parameters(model) write(message,*) 'coulomb_c min : ', & model%basal_physics%coulomb_c_min call write_log(message) + write(message,*) 'coulomb_c const : ', & + model%basal_physics%coulomb_c_const + call write_log(message) write(message,*) 'thickness scale (m) for C_c inversion : ', & model%inversion%babc_thck_scale call write_log(message) @@ -3305,7 +3339,8 @@ subroutine handle_basal_hydro(section, model) ! 
flux routing call GetValue(section, 'ho_flux_routing_scheme', model%basal_hydro%ho_flux_routing_scheme) call GetValue(section, 'const_source', model%basal_hydro%const_source) - call GetValue(section, 'btemp_scale', model%basal_hydro%btemp_scale) + call GetValue(section, 'btemp_flow_scale', model%basal_hydro%btemp_flow_scale) + call GetValue(section, 'btemp_freeze_scale', model%basal_hydro%btemp_freeze_scale) ! effective pressure options and parameters call GetValue(section, 'effecpress_delta', model%basal_hydro%effecpress_delta) @@ -3380,6 +3415,19 @@ subroutine print_basal_hydro(model) model%basal_hydro%ho_flux_routing_scheme >= size(ho_flux_routing_scheme)) then call write_log('Error, HO flux routing scheme out of range', GM_FATAL) end if + if (model%options%reproducible_sums) then + if (model%basal_hydro%ho_flux_routing_scheme /= HO_FLUX_ROUTING_D8) then + write(message,*) 'With reproducible sums, only D8 flux-routing is supported; switching to D8' + model%basal_hydro%ho_flux_routing_scheme = HO_FLUX_ROUTING_D8 + call write_log(message) + endif + if (model%basal_hydro%btemp_freeze_scale > 0.0d0) then + write(message,*) 'With reproducible sums, the flux-routing does not support refreezing;' // & + ' setting btemp_freeze_scale = 0' + call write_log(message) + model%basal_hydro%btemp_freeze_scale = 0.0d0 + endif + endif ! 
reproducible sums write(message,*) 'ho_flux_routing_scheme : ',model%basal_hydro%ho_flux_routing_scheme, & ho_flux_routing_scheme(model%basal_hydro%ho_flux_routing_scheme) call write_log(message) @@ -3387,8 +3435,14 @@ subroutine print_basal_hydro(model) write(message,*) 'constant melt source at the bed (m/yr): ', model%basal_hydro%const_source call write_log(message) endif - if (model%basal_hydro%btemp_scale > 0.0d0) then - write(message,*) 'temp scale (deg C) for frz/thaw transition: ', model%basal_hydro%btemp_scale + if (model%basal_hydro%btemp_flow_scale > 0.0d0) then + write(message,*) 'temp scale (deg C) for flow around frozen bed: ', & + model%basal_hydro%btemp_flow_scale + call write_log(message) + endif + if (model%basal_hydro%btemp_freeze_scale > 0.0d0) then + write(message,*) 'temp scale (deg C) for refreezing at the bed: ', & + model%basal_hydro%btemp_freeze_scale call write_log(message) endif if (model%options%which_ho_effecpress == HO_EFFECPRESS_BWAT) then @@ -3995,12 +4049,12 @@ subroutine define_glide_restart_variables(model, model_id) ! basal sliding option select case (options%which_ho_babc) !WHL - Removed effecpress as a restart variable; it is recomputed with each velocity solve. -!! case (HO_BABC_POWERLAW, HO_BABC_COULOMB_FRICTION, HO_BABC_COULOMB_POWERLAW_SCHOOF) +!! case (HO_BABC_POWERLAW, HO_BABC_COULOMB_FRICTION, HO_BABC_SCHOOF) !! ! These friction laws need effective pressure !! call glide_add_to_restart_variable_list('effecpress', model_id) !! case(HO_BABC_COULOMB_POWERLAW_TSAI) !! call glide_add_to_restart_variable_list('effecpress', model_id) - case (HO_BABC_COULOMB_FRICTION, HO_BABC_COULOMB_POWERLAW_SCHOOF, HO_BABC_COULOMB_POWERLAW_TSAI) + case (HO_BABC_COULOMB_FRICTION, HO_BABC_SCHOOF, HO_BABC_TSAI) ! Note: These options compute beta internally, so it does not need to be in the restart file. if (options%use_c_space_factor) then ! 
c_space_factor needs to be in the restart file diff --git a/libglide/glide_types.F90 b/libglide/glide_types.F90 index 0be4b0fe..e52641f1 100644 --- a/libglide/glide_types.F90 +++ b/libglide/glide_types.F90 @@ -249,7 +249,7 @@ module glide_types integer, parameter :: HO_THERMAL_AFTER_TRANSPORT = 1 integer, parameter :: HO_THERMAL_SPLIT_TIMESTEP = 2 - !TODO - Deprecate the last two options? Rarely if ever used. + !TODO - Deprecate some little-used options? integer, parameter :: HO_BABC_BETA_CONSTANT = 0 integer, parameter :: HO_BABC_BETA_BPMP = 1 integer, parameter :: HO_BABC_PSEUDO_PLASTIC = 2 @@ -261,10 +261,10 @@ module glide_types integer, parameter :: HO_BABC_ISHOMC = 8 integer, parameter :: HO_BABC_POWERLAW = 9 integer, parameter :: HO_BABC_COULOMB_FRICTION = 10 - integer, parameter :: HO_BABC_COULOMB_POWERLAW_SCHOOF = 11 - integer, parameter :: HO_BABC_COULOMB_POWERLAW_TSAI = 12 - integer, parameter :: HO_BABC_POWERLAW_EFFECPRESS = 13 - integer, parameter :: HO_BABC_SIMPLE = 14 + integer, parameter :: HO_BABC_SCHOOF = 11 + integer, parameter :: HO_BABC_MODIFIED_SCHOOF = 12 + integer, parameter :: HO_BABC_TSAI = 13 + integer, parameter :: HO_BABC_POWERLAW_EFFECPRESS = 14 integer, parameter :: HO_BABC_YIELD_PICARD = 15 integer, parameter :: HO_BETA_LIMIT_ABSOLUTE = 0 @@ -851,10 +851,10 @@ module glide_types !> \item[8] beta field as prescribed for ISMIP-HOM test C (serial only) !> \item[9] power law !> \item[10] Coulomb friction law using effective pressure, with flwa from lowest ice layer - !> \item[11] Coulomb friction law using effective pressure, with constant basal flwa - !> \item[12] basal stress is the minimum of Coulomb and power-law values, as in Tsai et al. 
(2015) - !> \item[13] power law using effective pressure - !> \item[14] simple hard-coded pattern (useful for debugging) + !> \item[11] Schoof law that blends powerlaw and Coulomb behavior + !> \item[12] modified version of the Schoof law + !> \item[13] basal stress is the minimum of Coulomb and power-law values, as in Tsai et al. (2015) + !> \item[14] power law using effective pressure !> \item[15] treat beta value as a till yield stress (in Pa) using Picard iteration !> \end{description} @@ -1157,6 +1157,10 @@ module glide_types real(dp) :: linear_tolerance = 1.0d-08 !> error tolerance for linear solver + logical :: reproducible_sums = .false. + !> if true, then compute reproducible global sums + !> (independent of the number of tasks) + ! The remaining options are not currently supported !integer :: which_bproc = 0 @@ -2154,7 +2158,11 @@ module glide_types !TODO - Add visc_water and omega_hydro? Currently set in glissade_basal_water module real(dp) :: const_source = 0.0d0 !> constant melt source at the bed (m/yr) !> could be used to represent an englacial or surface source - real(dp) :: btemp_scale = 0.0d0 !> temperature scale (degC) for transition between thawed and frozen bed + real(dp) :: btemp_flow_scale = 0.0d0 !> temperature scale (degC) for transition between thawed and frozen bed; + !> used to route flow away from cells with a frozen bed; + !> btemp_scale = 0 => temperature-independent flow + real(dp) :: btemp_freeze_scale = 0.0d0 !> temperature scale (degC) for transition between thawed and frozen bed; + !> used to refreeze water beneath cells with a frozen bed; !> btemp_scale = 0 => temperature-independent flow ! parameters for macroporous sheet real(dp) :: bwat_threshold = 1.0d-3 !> scale over which N ramps down from overburden to a small value (m) @@ -2196,7 +2204,7 @@ module glide_types !> Note: Defined on velocity grid, whereas temp and bpmp are on ice grid - ! 
Note: c_space_factor supported for which_ho_babc = HO_BABC_COULOMB_FRICTION, *COULOMB_POWERLAW_SCHOOF AND *COULOMB_POWERLAW_TSAI + ! Note: c_space_factor supported for which_ho_babc = HO_BABC_COULOMB_FRICTION, *SCHOOF AND *TSAI real(dp), dimension(:,:), pointer :: c_space_factor => null() !> spatial factor for basal shear stress (no dimension) real(dp), dimension(:,:), pointer :: c_space_factor_stag => null() !> spatial factor for basal shear stress on staggered grid @@ -2238,7 +2246,7 @@ module glide_types coulomb_c_hi => null(), & !> coulomb_c value at high bed elevation, topg >= bed_hi coulomb_c_lo => null() !> coulomb_c value at low bed elevation, topg <= bed_lo - ! parameters for power law, taub_b = C * u_b^(1/m); used for HO_BABC_COULOMB_POWERLAW_TSAI/SCHOOF + ! parameters for power law, taub_b = C * u_b^(1/m); used for HO_BABC_SCHOOF AND *_TSAI ! The default values are from Asay-Davis et al. (2016). ! The value of powerlaw_c suggested by Tsai et al. (2015) is 7.624d6 Pa m^(-1/3) s^(1/3). ! This value can be converted to CISM units by dividing by scyr^(1/3), to obtain 2.413d4 Pa m^(-1/3) yr^(1/3). diff --git a/libglide/isostasy_elastic.F90 b/libglide/isostasy_elastic.F90 index 60f2f55e..d309137d 100644 --- a/libglide/isostasy_elastic.F90 +++ b/libglide/isostasy_elastic.F90 @@ -136,7 +136,7 @@ subroutine calc_elastic(& !> the load for simulations on more than one task. use cism_parallel, only: this_rank, main_task, & - parallel_type, distributed_gather_var, distributed_scatter_var, parallel_halo + parallel_type, gather_var, scatter_var, parallel_halo implicit none @@ -182,8 +182,8 @@ subroutine calc_elastic(& ! Gather the local arrays onto the main task ! 
Note: global arrays are allocated in the subroutine - call distributed_gather_var(load_factors, load_factors_global, parallel) - call distributed_gather_var(load, load_global, parallel) + call gather_var(load_factors, load_factors_global, parallel) + call gather_var(load, load_global, parallel) if (main_task) then do j = 1, global_nsn @@ -207,9 +207,9 @@ subroutine calc_elastic(& ! Scatter the load values back to local arrays ! Note: The global array is deallocated in the subroutine - call distributed_scatter_var(load, load_global, parallel) + call scatter_var(load, load_global, parallel) - ! distributed_scatter_var does not update the halo, so do an update here + ! scatter_var does not update the halo, so do an update here call parallel_halo(load, parallel) ! Deallocate the other global array (which is intent(in) and does not need to be scattered) @@ -251,7 +251,7 @@ subroutine init_rbel(rbel, a) use isostasy_kelvin implicit none type(isos_elastic) :: rbel !> structure holding elastic litho data - real(dp), intent(in) :: a !> radius of disk + real(dp), intent(in) :: a !> radius of disk real(dp) :: dummy_a diff --git a/libglimmer/cism_reprosum_mod.F90 b/libglimmer/cism_reprosum_mod.F90 new file mode 100644 index 00000000..1b4161a8 --- /dev/null +++ b/libglimmer/cism_reprosum_mod.F90 @@ -0,0 +1,2084 @@ +module cism_reprosum_mod +!------------------------------------------------------------------------ +! +! Purpose: +! Compute reproducible global sums of a set of arrays across an MPI +! subcommunicator +! +! Methods: +! Compute using either or both a scalable, reproducible algorithm and a +! scalable, nonreproducible algorithm: +! * Reproducible (scalable): +! Convert each floating point summand to an integer vector +! representation, to enable reproducibility when using +! MPI_Allreduce, then convert the resulting global sum back to a +! floating point representation locally; +! * Alternative usually reproducible (scalable): +! 
Use parallel double-double algorithm due to Helen He and +! Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm; +! * Nonreproducible (scalable): +! Floating point and MPI_Allreduce based. +! If computing both reproducible and nonreproducible sums, compare +! these and report relative difference (if absolute difference +! less than sum) or absolute difference back to calling routine. +! +! Author: P. Worley (based on suggestions from J. White for integer +! vector algorithm and on He/Ding paper for DDPDD +! algorithm) +! +! William Lipscomb, Jan. 2026: +! Renamed this module to cism_reprosum_mod. Modified as follows to build +! without using CESM shared code: +! * Replaced some use statements +! * Changed 'shr' to 'cism' to avoid name conflicts with shared code +! * Output directed to iulog (declared in glimmer_paramets) +! * Added some optional debugging statements +! I started from a version that includes some logic fixes and code cleanup +! done by Pat Worley in 2023. Pat's revised version differs from the code +! in the CESM repo as of Jan. 2026. +! See here for information on Pat's mods: +! * https://github.com/E3SM-Project/E3SM/pull/5534 +! * https://github.com/E3SM-Project/E3SM/pull/5549 +! * https://github.com/E3SM-Project/E3SM/pull/5560 +!------------------------------------------------------------------------ + +!------------------------------------------------------------------------ +!- use statements ------------------------------------------------------- +!------------------------------------------------------------------------ + !WHL - Use CISM modules instead of shared modules +!#if ( defined noI8 ) +! ! Workaround for when shr_kind_i8 is not supported. +! use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i4 +!#else +! use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i8 +!#endif +! use shr_log_mod, only: s_loglev => shr_log_Level +! use shr_log_mod, only: s_logunit => shr_log_Unit +! 
use shr_sys_mod, only: shr_sys_abort +! use shr_infnan_mod,only: shr_infnan_inf_type, assignment(=), & +! shr_infnan_posinf, shr_infnan_neginf, & +! shr_infnan_nan, & +! shr_infnan_isnan, shr_infnan_isinf, & +! shr_infnan_isposinf, shr_infnan_isneginf +!#ifndef EAMXX_STANDALONE +! use perf_mod +!#endif + use glimmer_global, only: r8 => dp + use glimmer_global, only: i8 + use glimmer_paramets, only: iulog + use profile, only: t_startf, t_stopf !WHL - replace with perf_mod? + ! end WHL mods + + ! Import MPI fcns/types + use mpi + +!------------------------------------------------------------------------ +!- module boilerplate --------------------------------------------------- +!------------------------------------------------------------------------ + implicit none + private + + save + +!------------------------------------------------------------------------ +! Public interfaces ----------------------------------------------------- +!------------------------------------------------------------------------ + !WHL - cism_reprosum_setopts is not currently called + public :: & + cism_reprosum_setopts, &! set runtime options + cism_reprosum_calc, &! calculate distributed sum + cism_reprosum_tolExceeded ! utility function to check relative + ! differences against the tolerance + +!------------------------------------------------------------------------ +! Public data ----------------------------------------------------------- +!------------------------------------------------------------------------ + logical, public :: cism_reprosum_recompute = .false. + + real(r8), public :: cism_reprosum_reldiffmax = -1.0_r8 + + logical, parameter, public :: verbose_reprosum = .false. + +!------------------------------------------------------------------------ +! Private interfaces ---------------------------------------------------- +!------------------------------------------------------------------------ + private :: & + ddpdd, &! double-double sum routine + split_indices ! 
split indices among OMP threads + +!------------------------------------------------------------------------ +! Private data ---------------------------------------------------------- +!------------------------------------------------------------------------ + + !--------------------------------------------------------------------- + ! cism_reprosum_mod options + !--------------------------------------------------------------------- + logical :: repro_sum_use_ddpdd = .false. + + !WHL - Should this code be declared? + ! Not sure what EAMXX_STANDALONE means +#ifdef EAMXX_STANDALONE + ! Declare the C function interface + interface + subroutine cism_reprosumx86_fix_start(arg) bind(c) + use iso_c_binding + integer, intent(out) :: arg + end subroutine cism_reprosumx86_fix_start + end interface + + interface + subroutine cism_reprosumx86_fix_end(arg) bind(c) + use iso_c_binding + integer, intent(in) :: arg + end subroutine cism_reprosumx86_fix_end + end interface +#endif + + CONTAINS + +! +!======================================================================== +! + subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & + repro_sum_rel_diff_max_in, & + repro_sum_recompute_in, & + repro_sum_master, & + repro_sum_logunit ) + +!------------------------------------------------------------------------ +! Purpose: Set runtime options +! Author: P. Worley +!------------------------------------------------------------------------ +!------------------------------Arguments--------------------------------- + ! Use DDPDD algorithm instead of integer vector algorithm + logical, intent(in), optional :: repro_sum_use_ddpdd_in + real(r8), intent(in), optional :: repro_sum_rel_diff_max_in + ! recompute using different algorithm when difference between + ! reproducible and nonreproducible sums is too great + logical, intent(in), optional :: repro_sum_recompute_in + ! flag indicating whether this MPI task should output + ! 
log messages + logical, intent(in), optional :: repro_sum_master + ! unit number for log messages + integer, intent(in), optional :: repro_sum_logunit +!---------------------------Local Workspace------------------------------ + integer logunit ! unit number for log messages + logical master ! local master? + logical,save :: firstcall = .true. ! first call + integer :: ierr ! MPI error return +!------------------------------------------------------------------------ + + if ( present(repro_sum_master) ) then + master = repro_sum_master + else + master = .false. + endif + + if ( present(repro_sum_logunit) ) then + logunit = repro_sum_logunit + else +!! logunit = s_logunit + logunit = iulog + endif + + if (.not. firstcall) then + !WHL mod +! write(logunit,*) 'shr_reprosum_setopts: ERROR can only be called once' +! call shr_sys_abort('shr_reprosum_setopts ERROR: multiple calls') + write(logunit,*) 'cism_reprosum_setopts: ERROR can only be called once' + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + ! end WHL mod + endif + firstcall = .false. 
+ + if ( present(repro_sum_use_ddpdd_in) ) then + repro_sum_use_ddpdd = repro_sum_use_ddpdd_in + endif + if ( present(repro_sum_rel_diff_max_in) ) then + cism_reprosum_reldiffmax = repro_sum_rel_diff_max_in + endif + if ( present(repro_sum_recompute_in) ) then + cism_reprosum_recompute = repro_sum_recompute_in + endif + if (master) then + if ( repro_sum_use_ddpdd ) then + write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& + 'Using double-double-based (scalable) usually reproducible ', & + 'distributed sum algorithm' + else + write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& + 'Using integer-vector-based (scalable) reproducible ', & + 'distributed sum algorithm' + endif + + if (cism_reprosum_reldiffmax >= 0.0_r8) then + write(logunit,*) ' ',& + 'with a maximum relative error tolerance of ', & + cism_reprosum_reldiffmax + if (cism_reprosum_recompute) then + write(logunit,*) ' ',& + 'If tolerance exceeded, sum is recomputed using ', & + 'a serial algorithm.' + else + write(logunit,*) ' ',& + 'If tolerance exceeded, integer-vector-based sum is used ', & + 'but a warning is output.' + endif + else + write(logunit,*) ' ',& + 'and not comparing with floating point algorithms.' + endif + + endif + end subroutine cism_reprosum_setopts + +! +!======================================================================== +! + + subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & + nflds, ddpdd_sum, & + arr_gbl_max, arr_gbl_max_out, & + arr_max_levels, arr_max_levels_out, & + gbl_max_nsummands, gbl_max_nsummands_out,& + gbl_count, repro_sum_validate, & + repro_sum_stats, rel_diff, commid ) +!------------------------------------------------------------------------ +! +! Purpose: +! Compute the global sum of each field in 'arr' using the indicated +! communicator with a reproducible yet scalable implementation based +! on first converting each floating point summand into an equivalent +! representation using a vector of integers, summing the integer +! 
vectors, then converting the resulting sum back to a floating point +! representation. An alternative is to use an 'almost always +! reproducible' floating point algorithm (DDPDD), as described below. +! +! Description of integer vector algorithm: +!----------------------------------------- +! The basic idea is to represent the mantissa of each floating point +! value as an integer, add these integers, and then convert back to a +! floating point value. For a real*8 value, there are enough digits in +! an integer*8 variable to not lose any information (in the +! mantissa). However, each of these integers would have a different +! implicit exponent if done in a naive way, and so the sum would not +! be accurate. Also, even with the same 'normalization', the sum might +! exceed the maximum value representable by an integer*8, causing +! an overflow. Instead, a vector of integers is generated, where a +! given element (or level using the terminology used in the code) of +! the vector is associated with a particular exponent. The mantissa +! for a given floating point value is then converted to some number of +! integer values, depending on the exponent of the floating point +! value, the normalization of its mantissa, the maximum number of +! summands, the number of participating MPI tasks and of OpenMP +! threads, and the exponents associated with the levels of the integer +! vector, and added into the appropriate levels of the integer +! vector. Each MPI task has its own integer vector representing the +! local sum. This is then summed across all participating MPI tasks +! using an MPI_Allreduce, and, lastly, converted back to a floating +! point value. Note that the same approach works for a vector of +! integer*4 variables, simply requiring more levels, both for the full +! summation vector and for each individual real*8 summand. This is a +! compile time option in the code, in support of systems for which the +! 
compiler or MPI library has issues when using integer*8. As +! implemented, this algorithm should work for any floating point and +! integer type as long as they share the same base. The code is +! written as if for real*8 and integer*8 variables, but the only +! dependence is on the types 'r8' and 'i8', which are defined in the +! code, currently with reference to the corresponding types in +! shr_kind_mod. This is how integer*4 support is implemented, by +! defining i8 to be shr_kind_i4 instead of shr_kind_i8. + ! !WHL - The CISM types are declared in glimmer_global. +! For this to work, each MPI task must have the same number of levels +! and same implicit exponent for each level. These levels must be +! sufficient to represent the smallest and largest nonzero individual +! summands (in absolute value) and the largest possible intermediate +! sum, including the final sum. Most of the complexity in the +! algorithm is in identifying the number of levels, the exponent +! associated with each level, and the appropriate levels to target +! when converting a floating point value into its integer vector +! representation. There are also some subtleties in reconstructing the +! final sum from the integer vector, as described below. For each +! floating point value, the exponent and mantissa are extracted using +! the fortran intrinsics 'exponent' and 'fraction'. The mantissa is +! then 'shifted' to match the exponent for a target level in the +! integer vector using the 'scale' intrinsic. 'int(X,i8)' is used +! for the conversion for the given level, and subtraction between +! this integer and the original 'shifted' value identifies the +! remainder that will be converted to an integer for the next level +! in the vector. The logic continues until the remainder is zero. As +! mentioned above, the only requirement, due to the implementation +! using these fortran intrinsics, is that floating point and integer +! models use the same base, e.g. +! 
radix(1.0_r8) == radix(1_i8) +! for real*8 and integer*8. If not, then the alternative algorithm +! DDPDD mentioned above and described below is used instead. The +! integer representation must also have enough digits for the +! potential growth of the sum for each level, so could conceivably be +! too small for a large number of summands. +! +! Upper bounds on the total number of summands and on all intermediate +! sums are calculated as +! * +! and +! * +! * +! respectively. The maximum number of summands per MPI task and the +! maximum absolute value over all nonzero summands are global +! information that need to be determined with additional MPI +! collectives. The minimum nonzero absolute value summand is also +! global information. Fortunately, all of these can be determined with +! a single MPI_Allreduce call, so only one more than that required for +! the sum itself. (Note that, in actuality, the exponents of max and +! min summands are determined, and these are used to calculate bounds +! on the maximum and minimum, allowing the use of an MPI_INTEGER +! vector in the MPI_Allreduce call.) +! +! The actual code is made a little messier by (a) supporting summation +! of multiple fields without increasing the number of MPI_Allreduce +! calls, (b) supporting OpenMP threading of the local arithmetic, (c) +! allowing the user to specify estimates for the global information +! (to avoid the additional MPI_Allreduce), (d) including a check of +! whether user specified bounds were sufficient and, if not, +! determining the actual bounds and recomputing the sum, and (e) +! allowing the user to specify the maximum number of levels to use, +! potentially losing accuracy but still preserving reproducibility and +! being somewhat cheaper to compute. +! +! The conversion of the local summands to vectors of integers, the +! summation of the local vectors of integers, and the summation of the +! distributed vectors of integers will be exact (if optional parameters +! 
are not used to decrease the accuracy - see below). However, the +! conversion of the vector of integer representation to a floating +! point value may be subject to rounding errors. Before the +! conversion, the vector of integers is adjusted so that all elements +! have the same sign, and so that the value, in absolute value, at a +! given level is strictly less than what can be represented at the +! next lower level (larger exponent) and strictly greater than what +! can represented at the next higher level (smaller exponent). Since +! all elements have the same sign, the sign is set to positive +! temporarily and then restored when the conversion to floating point +! is complete. These are all integer operations, so no accuracy is +! lost. These adjustments eliminate the possibility of catastrophic +! cancellation. Also, when converting the individual elements to +! floating point values and summing them, the summation is now +! equivalent to concatenating the digits in the mantissas for the +! component summands. In consequence, in the final step when each +! element of this modified vector of integers is converted to a +! floating point value and added into the intermediate sum, any +! rounding is limited to the least significant digit representable +! in the final floating point sum. +! +! Any such rounding error will be sensitive to the particular floating +! values generated from the integer vector, and so will be +! sensitive to the number of levels in the vector and the implicit +! exponent associated with each level, which are themselves functions +! of the numbers of MPI tasks and OpenMP threads and the number of +! digits representable in an integer. To avoid this sensitivity, +! (effectively) generate a new integer vector in which each component +! integer has a fixed number of significant digits (e.g., +! digits(1.0_r8)) and generate the floating point values from these +! before summing. (See comments in code for more details.) This +! 
creates a sequence of floating point values to be summed that are +! independent of, for example, the numbers of MPI tasks and OpenMP +! threads or whether using integer*8 or integer*4 internal +! representations in the integer vector, and thus ensure +! reproducibility with respect to these options. +! +! Description of optional parameters for integer vector algorithm: +!----------------------------------------------------------------- +! The accuracy of the integer vector algorithm is controlled by the +! total number of levels of integer expansion. The algorithm +! calculates the number of levels that is required for the sum to be +! essentially exact. (The sum as represented by the integer expansion +! is exact, but roundoff may perturb the least significant digit of +! the returned floating point representation of the sum.) The optional +! parameter arr_max_levels can be used to override the calculated +! value for each field. The optional parameter arr_max_levels_out can +! be used to return the values used. +! +! The algorithm requires an upper bound on the maximum summand +! (in absolute value) for each field, and will calculate this internally +! using an MPI_Allreduce. However, if the optional parameters +! arr_max_levels and arr_gbl_max are both set, then the algorithm will +! use the values in arr_gbl_max for the upper bounds instead. If only +! arr_gbl_max is present, then the maxima are computed internally +! (and the specified values are ignored). The optional parameter +! arr_gbl_max_out can be used to return the values used. +! +! The algorithm also requires an upper bound on the number of +! local summands across all MPI tasks. (By definition, the number of +! local summands is the same for each field on a given MPI task, i.e., +! the input parameter nsummands.) This will be calculated internally, +! using an MPI_Allreduce, but the value in the optional argument +! gbl_max_nsummands will be used instead if (1) it is present, +! 
(2) the value is > 0, and (3) the maximum values and required number +! of levels are also specified. (If the maximum values are calculated, +! then the same MPI_Allreduce is used to determine the maximum numbers +! of local summands.) The accuracy of the user-specified value is not +! checked. However, if set to < 1, the value will instead be calculated. +! If the optional parameter gbl_max_nsummands_out is present, +! then the value used (gbl_max_nsummands if >= 1; calculated otherwise) +! will be returned. +! +! If the user-specified upper bounds on maximum summands are +! inaccurate or if the user-specified upper bounds (maximum summands +! and number of local summands) and numbers of levels causes +! any of the global sums to have fewer than the expected +! number of significant digits, and if the optional parameter +! repro_sum_validate is NOT set to .false., then the algorithm will +! repeat the computations with internally calculated values for +! arr_max_levels, arr_gbl_max, and gbl_max_nsummands. +! +! If requested (by setting cism_reprosum_reldiffmax >= 0.0 and passing in +! the optional rel_diff parameter), results are compared with a +! nonreproducible floating point algorithm. +! +! Note that the cost of the algorithm is not strongly correlated with +! the number of levels, which primarily shows up as a (modest) increase +! in the cost of the MPI_Allreduce as a function of vector length. +! Rather the cost is more a function of (a) the number of integers +! required to represent an individual summand and (b) the number of +! MPI_Allreduce calls. The number of integers required to represent an +! individual summand is 1 or 2 when using 8-byte integers for 8-byte +! real summands when the number of local summands and number of MPI +! tasks are not too large. As the magnitude of either of these increase, +! the number of integers required increases. The number of +! MPI_Allreduce calls is either 2 (specifying nothing or just +! 
arr_max_levels and arr_gbl_max correctly) or 1 (specifying +! gbl_max_nsummands, arr_max_levels, and arr_gbl_max correctly). +! When specifying arr_max_nsummands, arr_max_levels, or arr_gbl_max +! incorrectly, 3 or 4 MPI_Allreduce calls will be required. +! +! Description of alternative (DDPDD) algorithm: +!---------------------------------------------- +! The alternative algorithm is a minor modification of a parallel +! implementation of David Bailey's routine DDPDD by Helen He +! and Chris Ding. See, for example, +! Y. He, and C. Ding, 'Using Accurate Arithmetics to Improve +! Numerical Reproducibility and Stability in Parallel Applications,' +! J. Supercomputing, vol. 18, no. 3, 2001, pp. 259–277 +! and the citations therein. Bailey uses the Knuth trick to implement +! quadruple precision summation of double precision values with 10 +! double precision operations. The advantage of this algorithm is that +! it requires a single MPI_Allreduce and is less expensive per summand +! than is the integer vector algorithm. The disadvantage is that it +! is not guaranteed to be reproducible (though it is reproducible +! much more often than is the standard floating point algorithm). +! This alternative is used when the optional parameter ddpdd_sum is +! set to .true. It is also used if the integer vector algorithm radix +! assumption does not hold. +! +!------------------------------------------------------------------------ +! +! Arguments +! + integer, intent(in) :: nsummands ! number of local summands + integer, intent(in) :: dsummands ! declared first dimension + integer, intent(in) :: nflds ! number of fields + real(r8), intent(in) :: arr(dsummands,nflds) + ! input array + + real(r8), intent(out):: arr_gsum(nflds) + ! global sums + + logical, intent(in), optional :: ddpdd_sum + ! use ddpdd algorithm instead + ! of integer vector algorithm + + real(r8), intent(in), optional :: arr_gbl_max(nflds) + ! 
upper bound on max(abs(arr)) + + real(r8), intent(out), optional :: arr_gbl_max_out(nflds) + ! calculated upper bound on + ! max(abs(arr)) + + integer, intent(in), optional :: arr_max_levels(nflds) + ! maximum number of levels of + ! integer expansion to use + + integer, intent(out), optional :: arr_max_levels_out(nflds) + ! output of number of levels of + ! integer expansion to used + + integer, intent(in), optional :: gbl_max_nsummands + ! maximum of nsummand over all + ! MPI tasks + + integer, intent(out), optional :: gbl_max_nsummands_out + ! calculated maximum nsummands + ! over all MPI tasks + + integer, intent(in), optional :: gbl_count + ! was total number of summands; + ! now is ignored; use + ! gbl_max_nsummands instead + + logical, intent(in), optional :: repro_sum_validate + ! flag enabling/disabling testing that gmax and max_levels are + ! accurate/sufficient. Default is enabled. + + integer, intent(inout), optional :: repro_sum_stats(6) + ! increment running totals for + ! (1) one-reduction repro_sum + ! (2) two-reduction repro_sum + ! (3) both types in one call + ! (4) nonrepro_sum + ! (5) global max nsummands reduction + ! (6) global lor 3*nflds reduction + + real(r8), intent(out), optional :: rel_diff(2,nflds) + ! relative and absolute + ! differences between integer + ! vector and floating point sums + + integer, intent(in), optional :: commid + ! MPI communicator + +! +! Local workspace +! + logical :: use_ddpdd_sum ! flag indicating whether to + ! use cism_reprosum_ddpdd or not + logical :: recompute ! flag indicating need to + ! determine gmax/gmin before + ! computing sum + logical :: validate ! flag indicating need to + ! verify gmax and max_levels + ! are accurate/sufficient + integer :: gbl_lor_red ! global lor reduction? (0/1) + integer :: gbl_max_red ! global max reduction? (0/1) + integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1) + integer :: repro_sum_slow ! 2 reduction repro_sum? (0/1) + integer :: repro_sum_both ! 
both fast and slow? (0/1) + integer :: nonrepro_sum ! nonrepro_sum? (0/1) + + integer :: omp_nthreads ! number of OpenMP threads + integer :: mpi_comm ! MPI subcommunicator + integer :: mypid ! MPI task ID (COMM_WORLD) + integer :: tasks ! number of MPI tasks + integer :: ierr ! MPI error return + integer :: ifld, isum, ithread ! loop variables + integer :: max_nsummands ! max nsummands over all MPI tasks + ! or threads (used in both ways) + + integer, allocatable :: isum_beg(:), isum_end(:) + ! range of summand indices for each + ! OpenMP thread + integer, allocatable :: arr_tlmin_exp(:,:) + ! per thread local exponent minima + integer, allocatable :: arr_tlmax_exp(:,:) + ! per thread local exponent maxima + integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax + ! summand exponent and working min/max + integer :: arr_lmin_exp(nflds) ! local exponent minima + integer :: arr_lmax_exp(nflds) ! local exponent maxima + integer :: arr_lextremes(0:nflds,2)! local exponent extrema + integer :: arr_gextremes(0:nflds,2)! global exponent extrema + + integer :: arr_gmax_exp(nflds) ! global exponents maxima + integer :: arr_gmin_exp(nflds) ! global exponents minima + integer :: arr_max_shift ! maximum safe exponent for + ! value < 1 (so that sum does + ! not overflow) + integer :: max_levels(nflds) ! maximum number of levels of + ! integer expansion to use + integer :: max_level ! maximum value in max_levels + integer :: extra_levels ! number of extra levels needed + ! to guarantee that sum over threads + ! or tasks does not cause overflow + + real(r8) :: xmax_nsummands ! real(max_nsummands,r8) + real(r8) :: arr_lsum(nflds) ! local sums + real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using + ! fast, nonreproducible, + ! floating point alg. + real(r8) :: abs_diff ! absolute difference between + ! integer vector and floating point + ! sums + !WHL mod + character(len=64) :: binary_str ! 
string to represent 64 bits of i8 integer + integer :: n + !end WHL mod +#ifdef _OPENMP + integer omp_get_max_threads + external omp_get_max_threads +#endif +! +!------------------------------------------------------------------------ +! +! Initialize local statistics variables + gbl_lor_red = 0 + gbl_max_red = 0 + repro_sum_fast = 0 + repro_sum_slow = 0 + repro_sum_both = 0 + nonrepro_sum = 0 + +! Set MPI communicator + if ( present(commid) ) then + mpi_comm = commid + else + mpi_comm = MPI_COMM_WORLD + endif +#ifndef EAMXX_STANDALONE +!WHL - commented out since the profile mod does not include tbarrier_f +! call t_barrierf('sync_repro_sum',mpi_comm) +#endif + +! Check whether should use cism_reprosum_ddpdd algorithm + use_ddpdd_sum = repro_sum_use_ddpdd + if ( present(ddpdd_sum) ) then + use_ddpdd_sum = ddpdd_sum + endif + +! Check whether intrinsic-based algorithm will work on this system +! (requires floating point and integer bases to be the same) +! If not, always use ddpdd. + use_ddpdd_sum = use_ddpdd_sum .or. (radix(1.0_r8) /= radix(1_i8)) + + if ( use_ddpdd_sum ) then + +#ifndef EAMXX_STANDALONE + call t_startf('cism_reprosum_ddpdd') +#endif + + call cism_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & + nflds, mpi_comm) + repro_sum_fast = 1 + +#ifndef EAMXX_STANDALONE + call t_stopf('cism_reprosum_ddpdd') +#endif + + else + +#ifndef EAMXX_STANDALONE + call t_startf('cism_reprosum_int') +#endif + +! Get number of MPI tasks + call mpi_comm_size(mpi_comm, tasks, ierr) + +! Get number of OpenMP threads +#ifdef _OPENMP + omp_nthreads = omp_get_max_threads() +#else + omp_nthreads = 1 +#endif + +! See if have sufficient information to not require max/min allreduce + recompute = .true. + validate = .false. + if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then + recompute = .false. + +! Setting lower bound on max_level*nflds to be 64 to improve OpenMP +! 
performance for loopb in cism_reprosum_int + max_level = (64/nflds) + 1 + do ifld=1,nflds + if ((arr_gbl_max(ifld) >= 0.0_r8) .and. & + (arr_max_levels(ifld) > 0)) then + + arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld)) + if (max_level < arr_max_levels(ifld)) & + max_level = arr_max_levels(ifld) + + else + recompute = .true. + endif + enddo + + if (.not. recompute) then + +! Determine maximum number of summands in local phases of the +! algorithm +#ifndef EAMXX_STANDALONE + call t_startf("repro_sum_allr_max") +#endif + if ( present(gbl_max_nsummands) ) then + if (gbl_max_nsummands < 1) then + call mpi_allreduce (nsummands, max_nsummands, 1, & + MPI_INTEGER, MPI_MAX, mpi_comm, ierr) + gbl_max_red = 1 + else + max_nsummands = gbl_max_nsummands + endif + else + call mpi_allreduce (nsummands, max_nsummands, 1, & + MPI_INTEGER, MPI_MAX, mpi_comm, ierr) + gbl_max_red = 1 + endif +#ifndef EAMXX_STANDALONE + call t_stopf("repro_sum_allr_max") +#endif + +! Determine maximum shift. Shift needs to be small enough that summation, +! in absolute value, does not exceed maximum value representable by i8. + +! If requested, return max_nsummands before it is redefined + if ( present( gbl_max_nsummands_out) ) then + gbl_max_nsummands_out = max_nsummands + endif + +! Summing within each thread first (adding 1 to max_nsummands +! to ensure that integer division rounds up) + max_nsummands = (max_nsummands/omp_nthreads) + 1 +! then over threads and tasks + max_nsummands = max(max_nsummands, tasks*omp_nthreads) +! A 'max' is used in the above calculation because the partial sum for +! each thread, calculated in cism_reprosum_int, is postprocessed so that +! each integer in the corresponding vector of integers is reduced in +! magnitude to be less than (radix(1_i8)**arr_max_shift). Therefore, +! the maximum shift can be calculated separately for per thread sums +! and sums over threads and tasks, and the smaller value used. This is +! equivalent to using max_nsummands as defined above. 
+ + xmax_nsummands = real(max_nsummands,r8) + arr_max_shift = digits(1_i8) - (exponent(xmax_nsummands) + 1) + if (arr_max_shift < 2) then + !WHL mod +! call shr_sys_abort('repro_sum failed: number of summands too '// & +! 'large for integer vector algorithm' ) + write(iulog,*) 'repro_sum failed: number of summands too '// & + 'large for integer vector algorithm' + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + ! end WHL mod + endif +! Note: by construction, each floating point value will be decomposed +! into a vector of integers each component of which will be strictly +! less than radix(1_i8)**arr_max_shift in absolute value, and the +! summation of max_nsummands of these, again in absolute value, will +! then be less than +! radix(1_i8)**(arr_max_shift + exponent(xmax_nsummands)) +! or radix(1_i8)**(digits(1_i8) - 1). This is more conservative than +! necessary, but it also allows the postprocessing mentioned above +! (and described later) to proceed without danger of introducing +! overflow. + +! Determine additional number of levels needed to support the +! postprocessing that reduces the magnitude of each component +! of the integer vector of the partial sum for each thread +! to be less than (radix(1_i8)**arr_max_shift). + extra_levels = (digits(1_i8) - 1)/arr_max_shift +! Extra levels are indexed by (-(extra_levels-1):0) +! Derivation of this is described in the comments in +! cism_reprosum_int. + +! Calculate sum + if (present(repro_sum_validate)) then + validate = repro_sum_validate + else + validate = .true. + endif + call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & + nflds, arr_max_shift, arr_gmax_exp, & + arr_max_levels, max_level, extra_levels, & + validate, recompute, & + omp_nthreads, mpi_comm) + +! Record statistics, etc. + repro_sum_fast = 1 + if (recompute) then + repro_sum_both = 1 + else +! 
If requested, return specified levels and upper bounds on maxima + if ( present(arr_max_levels_out) ) then + do ifld=1,nflds + arr_max_levels_out(ifld) = arr_max_levels(ifld) + enddo + endif + if ( present(arr_gbl_max_out) ) then + do ifld=1,nflds + arr_gbl_max_out(ifld) = arr_gbl_max(ifld) + enddo + endif + endif + endif + endif + +! Do not have sufficient information; calculate global max/min and +! use to compute required number of levels + if (recompute) then + +! Record statistic + repro_sum_slow = 1 + +! Determine maximum and minimum (non-zero) summand values and +! maximum number of local summands + +! Allocate thread-specific work space + allocate(arr_tlmax_exp(nflds,omp_nthreads)) + allocate(arr_tlmin_exp(nflds,omp_nthreads)) + allocate(isum_beg(omp_nthreads)) + allocate(isum_end(omp_nthreads)) + +! Split summand index range over OpenMP threads + call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) + +!$omp parallel do & +!$omp default(shared) & +!$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax) + do ithread=1,omp_nthreads +#ifndef EAMXX_STANDALONE + call t_startf('repro_sum_loopa') +#endif + do ifld=1,nflds + arr_exp_tlmin = MAXEXPONENT(1.0_r8) + arr_exp_tlmax = MINEXPONENT(1.0_r8) + do isum=isum_beg(ithread),isum_end(ithread) + if (arr(isum,ifld) /= 0.0_r8) then + arr_exp = exponent(arr(isum,ifld)) + arr_exp_tlmin = min(arr_exp,arr_exp_tlmin) + arr_exp_tlmax = max(arr_exp,arr_exp_tlmax) + endif + end do + arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin + arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax + end do +#ifndef EAMXX_STANDALONE + call t_stopf('repro_sum_loopa') +#endif + end do + + do ifld=1,nflds + arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:)) + arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:)) + end do + deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end) + + arr_lextremes(0,:) = -nsummands + arr_lextremes(1:nflds,1) = -arr_lmax_exp(:) + arr_lextremes(1:nflds,2) = arr_lmin_exp(:) +#ifndef EAMXX_STANDALONE 
+ call t_startf("repro_sum_allr_minmax") +#endif + call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), & + MPI_INTEGER, MPI_MIN, mpi_comm, ierr) +#ifndef EAMXX_STANDALONE + call t_stopf("repro_sum_allr_minmax") +#endif + max_nsummands = -arr_gextremes(0,1) + arr_gmax_exp(:) = -arr_gextremes(1:nflds,1) + arr_gmin_exp(:) = arr_gextremes(1:nflds,2) + +! If a field is identically zero, arr_gmin_exp still equals MAXEXPONENT +! and arr_gmax_exp still equals MINEXPONENT. +! In this case, set arr_gmin_exp = arr_gmax_exp = MINEXPONENT + do ifld=1,nflds + arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld)) + enddo + +! If requested, return upper bounds on observed maxima + if ( present(arr_gbl_max_out) ) then + do ifld=1,nflds + arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld)) + enddo + endif + +! If requested, return max_nsummands before it is redefined + if ( present( gbl_max_nsummands_out) ) then + gbl_max_nsummands_out = max_nsummands + endif + +! Determine maximum shift (same as in previous branch, but with calculated +! max_nsummands). Shift needs to be small enough that summation, in absolute +! value, does not exceed maximum value representable by i8. + +! Summing within each thread first (adding 1 to max_nsummands +! to ensure that integer division rounds up) + max_nsummands = (max_nsummands/omp_nthreads) + 1 +! then over threads and tasks + max_nsummands = max(max_nsummands, tasks*omp_nthreads) +! A 'max' is used in the above calculation because the partial sum for +! each thread, calculated in cism_reprosum_int, is postprocessed so that +! each integer in the corresponding vector of integers is reduced in +! magnitude to be less than (radix(1_i8)**arr_max_shift). Therefore, +! the maximum shift can be calculated separately for per thread sums +! and sums over threads and tasks, and the smaller value used. This is +! equivalent to using max_nsummands as defined above. 
+ + xmax_nsummands = real(max_nsummands,r8) + arr_max_shift = digits(1_i8) - (exponent(xmax_nsummands) + 1) + if (arr_max_shift < 2) then + !WHL mod +! call shr_sys_abort('repro_sum failed: number of summands too '// & +! 'large for integer vector algorithm' + write(iulog,*) 'repro_sum failed: number of summands too '// & + 'large for integer vector algorithm' + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + ! end WHL mod + endif +! Note: by construction, each floating point value will be decomposed +! into a vector of integers each component of which will be strictly +! less than radix(1_i8)**arr_max_shift in absolute value, and the +! summation of max_nsummands of these, again in absolute value, will +! then be less than +! radix(1_i8)**(arr_max_shift + exponent(xmax_nsummands)) +! or radix(1_i8)**(digits(1_i8) - 1). This is more conservative than +! necessary, but it also allows the postprocessing mentioned above +! (and described later) to proceed without danger of introducing +! overflow. + +! Determine maximum number of levels required for each field. +! Need enough levels to represent both the smallest and largest +! nonzero summands (in absolute value), and any values in between. +! The number of digits from the most significant digit in the +! largest summand to the most significant digit in the smallest +! summand is (arr_gmax_exp(ifld)-arr_gmin_exp(ifld)), and the maximum +! number of digits needed to represent the smallest value is +! digits(1.0_r8). Divide this total number of digits by the number of +! digits per level (arr_max_shift) to get the number of levels +! ((digits(1.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift) +! with some tweaks: +! + 1 because first truncation for any given summand probably does +! not involve a maximal shift (but this adds only one to the total) +! + 1 to guarantee that the integer division rounds up (not down) +! (setting lower bound on max_level*nflds to be 64 to improve OpenMP +! 
performance for loopb in cism_reprosum_int) + max_level = (64/nflds) + 1 + do ifld=1,nflds + max_levels(ifld) = 2 + & + ((digits(1.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) & + / arr_max_shift) + if ( present(arr_max_levels) .and. (.not. validate) ) then +! If validate true, then computation with arr_max_levels failed +! previously + if ( arr_max_levels(ifld) > 0 ) then + max_levels(ifld) = & + min(arr_max_levels(ifld),max_levels(ifld)) + endif + endif + if (max_level < max_levels(ifld)) & + max_level = max_levels(ifld) + enddo + +! If requested, return calculated levels + if ( present(arr_max_levels_out) ) then + do ifld=1,nflds + arr_max_levels_out(ifld) = max_levels(ifld) + enddo + endif + +! Determine additional number of levels needed to support the +! postprocessing that reduces the magnitude of each component +! of the integer vector of the partial sum for each thread +! to be less than (radix(1_i8)**arr_max_shift). + extra_levels = (digits(1_i8) - 1)/arr_max_shift +! Extra levels are indexed by (-(extra_levels-1):0) +! Derivation of this is described in the comments in +! cism_reprosum_int. + +! Calculate sum + validate = .false. + call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & + nflds, arr_max_shift, arr_gmax_exp, & + max_levels, max_level, extra_levels, & + validate, recompute, & + omp_nthreads, mpi_comm) + + endif + +#ifndef EAMXX_STANDALONE + call t_stopf('cism_reprosum_int') +#endif + + endif + +! Compare integer vector and floating point results + if ( present(rel_diff) ) then + if (cism_reprosum_reldiffmax >= 0.0_r8) then +#ifndef EAMXX_STANDALONE + !WHL - commented out since the profile mod does not include tbarrier_f +! call t_barrierf('sync_nonrepro_sum',mpi_comm) +#endif +#ifndef EAMXX_STANDALONE + call t_startf('nonrepro_sum') +#endif +! Record statistic + nonrepro_sum = 1 +! 
Compute nonreproducible sum + arr_lsum(:) = 0.0_r8 +!$omp parallel do & +!$omp default(shared) & +!$omp private(ifld, isum) + do ifld=1,nflds + do isum=1,nsummands + arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld) + end do + end do + +#ifndef EAMXX_STANDALONE + call t_startf("nonrepro_sum_allr_r8") +#endif + call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, & + MPI_REAL8, MPI_SUM, mpi_comm, ierr) +#ifndef EAMXX_STANDALONE + call t_stopf("nonrepro_sum_allr_r8") +#endif + +#ifndef EAMXX_STANDALONE + call t_stopf('nonrepro_sum') +#endif + +! Determine differences +!$omp parallel do & +!$omp default(shared) & +!$omp private(ifld, abs_diff) + do ifld=1,nflds + abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld)) + if (abs(arr_gsum(ifld)) > abs_diff) then + rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld)) + else + rel_diff(1,ifld) = abs_diff + endif + rel_diff(2,ifld) = abs_diff + enddo + else + rel_diff(:,:) = 0.0_r8 + endif + endif + +! Return statistics + if ( present(repro_sum_stats) ) then + repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast + repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow + repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both + repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum + repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red + repro_sum_stats(6) = repro_sum_stats(6) + gbl_lor_red + endif + + if (verbose_reprosum) then + call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) + if (mypid == 0) then + write(iulog,*) 'Exit reprosum, nflds, arr_gsum =', nflds, arr_gsum + endif + endif + + end subroutine cism_reprosum_calc + +! +!======================================================================== +! + + subroutine cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, & + arr_max_shift, arr_gmax_exp, max_levels, & +! max_level, extra_levels, skip_field, & + max_level, extra_levels, & + validate, recompute, omp_nthreads, mpi_comm ) +!------------------------------------------------------------------------ +! +! 
Purpose: +! Compute the global sum of each field in 'arr' using the indicated +! communicator with a reproducible yet scalable implementation based +! on first converting each floating point summand into an equivalent +! representation using a vector of integers, summing the integer +! vectors, then converting the resulting sum back to a floating point +! representation. The accuracy of the integer vector algorithm is +! controlled by the number of 'levels' of integer expansion, the maximum +! value of which is specified by max_level. +! +!------------------------------------------------------------------------ +! +! Arguments +! + integer, intent(in) :: nsummands ! number of local summands + integer, intent(in) :: dsummands ! declared first dimension + integer, intent(in) :: nflds ! number of fields + integer, intent(in) :: arr_max_shift ! maximum safe exponent for + ! value < 1 (so that sum + ! does not overflow) + integer, intent(in) :: arr_gmax_exp(nflds) + ! exponents of global maxima + integer, intent(in) :: max_levels(nflds) + ! maximum number of levels + ! of integer expansion + integer, intent(in) :: max_level ! maximum value in + ! max_levels + integer, intent(in) :: extra_levels ! number of extra levels + ! needed to guarantee that + ! sum over threads or tasks + ! does not cause overflow + integer, intent(in) :: omp_nthreads ! number of OpenMP threads + integer, intent(in) :: mpi_comm ! MPI subcommunicator + + real(r8), intent(in) :: arr(dsummands,nflds) + ! input array + + logical, intent(in) :: validate + ! flag indicating that accuracy of solution generated from + ! arr_gmax_exp and max_levels should be tested + + logical, intent(out):: recompute + ! flag indicating that either the upper bounds are inaccurate, + ! or max_levels and arr_gmax_exp do not generate accurate + ! enough sums + + real(r8), intent(out):: arr_gsum(nflds) ! global sums +! +! Local workspace +! 
+ integer, parameter :: max_svlevel_factor = & + 1 + (digits(1_i8)/digits(1.0_r8)) + + integer(i8) :: i8_arr_tlsum_level(-(extra_levels-1):max_level,nflds,omp_nthreads) + ! integer vector representing local + ! sum (per thread, per field) + integer(i8) :: i8_arr_lsum_level((max_level+extra_levels+2)*nflds) + ! integer vector representing local + ! sum + integer(i8) :: i8_arr_level ! integer part of summand for current + ! expansion level + integer(i8) :: i8_arr_gsum_level((max_level+extra_levels+2)*nflds) + ! integer vector representing global + ! sum + integer(i8) :: i8_gsum_level(-(extra_levels-1):max_level) + ! integer vector representing global + ! sum for one field + integer(i8) :: IX_8 ! integer representation of r8 value + integer(i8) :: i8_sign ! sign global sum + integer(i8) :: i8_radix ! radix for i8 variables (and r8 + ! variables by earlier if-test) + + integer :: max_error(nflds,omp_nthreads) + ! accurate upper bound on data? + integer :: not_exact(nflds,omp_nthreads) + ! max_levels sufficient to + ! capture all digits? + integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads) + ! range of summand indices for each + ! OpenMP thread + integer :: ifld, isum, ithread, jlevel + ! loop variables + integer :: arr_exp ! exponent of summand + integer :: arr_shift ! exponent used to generate integer + ! for current expansion level + integer :: ilevel ! current integer expansion level + integer :: offset(nflds) ! beginning location in + ! i8_arr_{g,l}sum_level for integer + ! expansion of current ifld + integer :: voffset ! modification to offset used to + ! include validation metrics + integer :: min_level ! index of minimum levels (including + ! extra levels) for i8_arr_tlsum_level + integer :: ioffset ! offset(ifld) + integer :: svlevel ! number of summands in summand_vector + integer :: ierr ! MPI error return + integer :: LX ! exponent of X_8 (see below) + integer :: veclth ! total length of i8_arr_lsum_level + integer :: i8_digit_count ! 
number of digits in integer + ! expansion of sum + integer :: i8_begin_level ! level starting from in + ! creating next 'exactly representable' + ! floating point value from modified + ! integer expansion of the sum + integer :: i8_trunc_level ! level at which the number of digits in + ! the modified integer expansion of the + ! sum exceeds the number of representable + ! digits in the floating point sum + integer :: i8_trunc_loc ! location of last digit at i8_trunc_level + ! in the modified integer expansion of the + ! sum that is representable in the floating + ! point sum + integer(i8) :: i8_trunc_level_rem + ! truncated digits at i8_trunc_level + ! in the modified integer expansion + ! of the sum + integer :: curr_exp ! exponent of partial sum during + ! reconstruction from integer vector + integer :: corr_exp ! exponent of current summand in + ! reconstruction from integer vector + + real(r8) :: arr_frac ! fraction of summand + real(r8) :: arr_remainder ! part of summand remaining after + ! current level of integer expansion + real(r8) :: X_8 ! r8 representation of current + ! i8_arr_gsum_level + real(r8) :: RX_8 ! r8 representation of (other) + ! integers used in calculation. + real(r8) :: summand_vector((max_level+extra_levels)*max_svlevel_factor) + ! vector of r8 values generated from + ! integer vector representation to be + ! summed to generate global sum + + logical :: first_stepd_iteration + ! flag used to indicate whether first + ! time through process of converting + ! vector of integers into a floating + ! point value, as it requires + ! special logic +! +!------------------------------------------------------------------------ +! Save radix of i8 variables in an i8 variable + i8_radix = radix(IX_8) + +! If validating upper bounds, reserve space for validation metrics +! In both cases, reserve extra levels for overflows from the top level + if (validate) then + voffset = extra_levels + 2 + else + voffset = extra_levels + endif + +! 
For convenience, define minimum level index for i8_arr_tlsum_level + min_level = -(extra_levels-1) + +! Compute offsets for each field + offset(1) = voffset + do ifld=2,nflds + offset(ifld) = offset(ifld-1) & + + (max_levels(ifld-1) + voffset) + enddo + veclth = offset(nflds) + max_levels(nflds) + +! Split summand index range over OpenMP threads + call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) + +! Convert local summands to vector of integers and sum +! (Using scale instead of set_exponent because arr_remainder may not be +! 'normal' after level 1 calculation) + i8_arr_lsum_level(:) = 0_i8 + +!$omp parallel do & +!$omp default(shared) & +!$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, & +!$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8) + do ithread=1,omp_nthreads +#ifndef EAMXX_STANDALONE + call t_startf('repro_sum_loopb') +#endif + do ifld=1,nflds + ioffset = offset(ifld) + + max_error(ifld,ithread) = 0 + not_exact(ifld,ithread) = 0 + i8_arr_tlsum_level(:,ifld,ithread) = 0_i8 + + do isum=isum_beg(ithread),isum_end(ithread) + arr_remainder = 0.0_r8 + + if (arr(isum,ifld) /= 0.0_r8) then + arr_exp = exponent(arr(isum,ifld)) + arr_frac = fraction(arr(isum,ifld)) + +! Test that global maximum upper bound is an upper bound + if (arr_exp > arr_gmax_exp(ifld)) then + max_error(ifld,ithread) = 1 + exit + endif + +! Calculate first shift + arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) + +! Determine first (probably) nonzero level (assuming initial fraction is +! 'normal' - algorithm still works if this is not true) +! NOTE: this is critical; scale will set to zero if min exponent is too small. + if (arr_shift < 1) then + ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift + arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) + + do while (arr_shift < 1) + arr_shift = arr_shift + arr_max_shift + ilevel = ilevel + 1 + enddo + else + ilevel = 1 + endif + + if (ilevel <= max_levels(ifld)) then +! 
Apply first shift/truncate, add it to the relevant running +! sum, and calculate the remainder. + arr_remainder = scale(arr_frac,arr_shift) + i8_arr_level = int(arr_remainder,i8) + i8_arr_tlsum_level(ilevel,ifld,ithread) = & + i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level + arr_remainder = arr_remainder - i8_arr_level + +! While the remainder is non-zero, continue to shift, truncate, +! sum, and calculate new remainder + do while ((arr_remainder /= 0.0_r8) & + .and. (ilevel < max_levels(ifld))) + ilevel = ilevel + 1 + arr_remainder = scale(arr_remainder,arr_max_shift) + i8_arr_level = int(arr_remainder,i8) + i8_arr_tlsum_level(ilevel,ifld,ithread) = & + i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level + arr_remainder = arr_remainder - i8_arr_level + enddo + + endif + endif + + if (arr_remainder /= 0.0_r8) then + not_exact(ifld,ithread) = 1 + endif + + enddo +! Postprocess integer vector to eliminate possibility of overflow +! during subsequent sum over threads and tasks, as per earlier +! comment on logic behind definition of max_nsummands. If value at a +! given level is larger than or equal to +! (radix(1_i8)**arr_max_shift), subtract this 'overlap' from the +! current value and add it (appropriately shifted) to the value at +! the next smaller level in the vector. +! (a) As described earlier, prior to this postprocessing the integer +! components are each strictly less than +! radix(1_i8)**(digits(1_i8) - 1) in absolute value. So, after +! shifting, the absolute value of the amount added to level +! max_levels(ifld)-1 from level max_levels(ifld) is less than +! radix(1_i8)**(digits(1_i8) - 1 - arr_max_shift) with the +! resulting sum, in absolute value, being less than +! (radix(1_i8)**(digits(1_i8) - 1))*(1 + radix(1_i8)**(-arr_max_shift)). +! Any overlap from this component is then added to the level +! max_levels(ifld)-2, etc., with resulting intermediate sums, in +! absolute value, for levels 1 to max_levels(ifld) being bounded +! 
from above by +! (radix(1_i8)**(digits(1_i8) - 1))*sum{i=0,inf}(radix(1_i8)**(-i*arr_max_shift)). +! Since radix(1_i8) >= 2 and arr_max_shift is also required to be +! >= 2 (otherwise the code exits with an error) this is less than +! or equal to +! (radix(1_i8)**(digits(1_i8) - 1))*sum{i=0,inf}(2**(-2i)), +! or +! (radix(1_i8)**(digits(1_i8) - 1))*(4/3). +! In summary, this shows that no absolute value generated during +! this process will exceed the maximum value representable in i8, +! i.e. (radix(1_i8)**(digits(1_i8)) - 1), as long as +! digits(1_i8) >= 2. +! (b) 'ilevel==0,...,-(extra_levels-1)' correspond to extra levels +! used to continue the above process until values at all levels +! are less than radix(1_i8)**arr_max_shift in absolute value +! (except level -(extra_levels-1), as described below). The +! result of shifting the overlap from level 1 to level 0, which +! is initially zero, is bounded in absolute value by +! (radix(1_i8)**(digits(1_i8) - 1 - arr_max_shift))*(4/3). +! After removing any overlap from level 0, the upper bound for +! level -1, which is also initially zero, is +! (radix(1_i8)**(digits(1_i8) - 1 - 2*arr_max_shift))*(4/3). +! Continuing the process, when we get to level -(extra_levels-1), +! the upper bound is +! (radix(1_i8)**(digits(1_i8) - 1 - extra_levels*arr_max_shift))*(4/3). +! If we define +! extra_levels = ceiling[(digits(1_i8) - 1)/arr_max_shift - 1] +! then the upper bound is +! (radix(1_i8)**(arr_max_shift))*(4/3). +! Setting +! extra_levels = (digits(1_i8) - 1)/arr_max_shift +! is then a slightly conservative estimate that achieves the same +! upper bound. While the above upper bound at level +! -(extra_levels-1) is a factor of (4/3) larger than the target +! radix(1_i8)**arr_max_shift, it is still small enough so that +! the sum over threads and tasks, bounded from above in absolute +! value by +! (radix(1_i8)**(digits(1_i8) - 1))*(4/3), +! will not cause an overflow at level -(extra_levels-1) as long as +! 
digits(1_i8) >= 2. + do ilevel=max_levels(ifld),min_level+1,-1 + if (abs(i8_arr_tlsum_level(ilevel,ifld,ithread)) >= & + (i8_radix**arr_max_shift)) then + + IX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread) & + / (i8_radix**arr_max_shift) + i8_arr_tlsum_level(ilevel-1,ifld,ithread) = & + i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8 + + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_arr_tlsum_level(ilevel,ifld,ithread) = & + i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8 + endif + enddo + enddo +#ifndef EAMXX_STANDALONE + call t_stopf('repro_sum_loopb') +#endif + enddo + +! Sum contributions from different threads + do ifld=1,nflds + ioffset = offset(ifld) + do ithread = 1,omp_nthreads + do ilevel = min_level,max_levels(ifld) + i8_arr_lsum_level(ioffset+ilevel) = & + i8_arr_lsum_level(ioffset+ilevel) & + + i8_arr_tlsum_level(ilevel,ifld,ithread) + enddo + enddo + enddo + +! Record if upper bound was inaccurate or if level expansion stopped +! before full accuracy was achieved + if (validate) then + do ifld=1,nflds + ioffset = offset(ifld) + i8_arr_lsum_level(ioffset-voffset+1) = maxval(max_error(ifld,:)) + i8_arr_lsum_level(ioffset-voffset+2) = maxval(not_exact(ifld,:)) + enddo + endif + +! Sum integer vector element-wise +#if ( defined noI8 ) + ! Workaround for when shr_kind_i8 is not supported. +#ifndef EAMXX_STANDALONE + call t_startf("repro_sum_allr_i4") +#endif + call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & + veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr) +#ifndef EAMXX_STANDALONE + call t_stopf("repro_sum_allr_i4") +#endif +#else +#ifndef EAMXX_STANDALONE + call t_startf("repro_sum_allr_i8") +#endif + call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & + veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr) +#ifndef EAMXX_STANDALONE + call t_stopf("repro_sum_allr_i8") +#endif +#endif + +#ifndef EAMXX_STANDALONE + call t_startf('repro_sum_finalsum') +#endif +! Construct global sum from integer vector representation: +! 
1) arr_max_shift is the shift applied to fraction(arr_gmax) . +! When shifting back, need to 'add back in' the true arr_gmax exponent. +! This was removed implicitly by working only with the fraction. +! 2) To avoid the possibility of catastrophic cancellation, and +! an unacceptable floating point rounding error, can do some arithmetic +! with the integer vector so that all components have the same sign. +! 3) If convert each integer in the integer vector to a floating +! point value and then add these together, smallest to largest, to +! calculate the final sum, there may be roundoff error in the least +! significant digit. This error will be sensitive to the particular +! floating values generated from the integer vector, and so will be +! sensitive to the number of levels in the vector and the implicit +! exponent associated with each level. So this approach is not +! guaranteed to be reproducible with respect to a change in the +! number of MPI tasks and OpenMP threads (as this changes the +! definition of max_nsummands, and thus also arr_max_shift). It is +! also not guaranteed to be reproducible with respect to changing +! the integer size, e.g. from i8 to i4, as this also changes +! arr_max_shift. However, can eliminate this potential loss of +! reproducibility by taking the following steps. +! a) Manipulate the integer vector so that +! i) the component values do not 'overlap', that is, the value +! represented by a component is strictly less than the value +! represented by the least significant digit in the previous +! component, and +! ii) all components are positive (saving the sign to be restored +! to the final result). +! b) Identify the digit in the resulting integer vector that is the +! last representable in the floating point representation, then +! truncate the vector at this point, i.e., all digits of lesser +! significance in the given component and all components +! representing digits of lesser significance (call this the +! remainder). +! 
c) Convert each integer component in the modified integer vector +! to its corresponding floating point value and sum the +! sequence. (Order is unimportant, as explained below, but here +! add largest to smallest.) +! d) Repeat (b) and (c) for the remainder (recursively, as +! necessary). +! e) Sum all floating point numbers generated by step (c), smallest +! to largest. +! f) Restore the sign. +! With the manipulations in (a) and (b), the summation in (c) is +! equivalent to concatenating the digits in the mantissas for the +! component summands, so rounding is irrelevant (so far). Repeating +! this with the remainder(s) generates a sequence of 'exact' +! floating point numbers. Summing these can still generate a +! rounding error in the least significant digit in the largest +! floating point value (which is the last representable digit in the +! final result), but the floating point values being summed and +! order of summation are independent of the number of levels and +! implicit exponents, so reproducibility is ensured. +! +! Note that assignment of an i8 integer value to an r8 floating point +! variable in step (c) can lead to a loss of accuracy because the +! maximum number of digits in the i8 integer can be greater than the +! maximum number of digits representable in the r8 variable (if the +! xmax_nsummands correction is not very large). With the same sign +! and nonoverlapping properties of the integer components, these lost +! digits will also not be representable in the final sum. The process +! described above of truncating at this last representable digit, and +! then separately generating floating point value(s) for the +! remainder, takes care of this automatically. Similar reasoning +! applies to r4 floating point values with either i8 or i4 integer +! components. + + recompute = .false. + do ifld=1,nflds + arr_gsum(ifld) = 0.0_r8 + ioffset = offset(ifld) + svlevel = 0 + +! If validate is .true., test whether the summand upper bound +! 
was exceeded on any of the MPI tasks + if (validate) then + if (i8_arr_gsum_level(ioffset-voffset+1) /= 0_i8) then + recompute = .true. + endif + endif + + if (.not. recompute) then +! Copy integer vector for current field from i8_arr_gsum_level, so that +! can be modified without changing i8_arr_gsum_level. (Preserving +! i8_arr_gsum_level unchanged is not necessary, but is convenient for debugging +! and makes indexing clearer and less error prone.) + i8_gsum_level(:) = 0_i8 + do ilevel=min_level,max_levels(ifld) + i8_gsum_level(ilevel) = i8_arr_gsum_level(ioffset+ilevel) + enddo + +! Preprocess integer vector (as described in 3(a) above): +! i) If value larger than or equal to (radix(1_i8)**arr_max_shift), +! add this 'overlap' to the value at the next smaller level +! in the vector, resulting in nonoverlapping ranges for each +! component. +! +! As before, no intermediate sums for levels +! max_levels(ifld) to -(extra_levels-2), in absolute value, +! will exceed the maximum value representable in i8, but the +! upper bound on the final sum, in absolute value, at +! level -(extra_levels-1) is now +! (radix(1_i8)**(digits(1_i8) - 1))*(4/3) + +! + sum{i=1,inf}(radix(1_i8)**(-i*arr_max_shift)) +! = (radix(1_i8)**(digits(1_i8) - 1))* +! ((4/3) + sum{i=1,inf}(radix(1_i8)**(-i*arr_max_shift))). +! which is less than or equal to +! (radix(1_i8)**(digits(1_i8) - 1))*((4/3) + (1/3)) +! or +! (radix(1_i8)**(digits(1_i8) - 1))*(5/3) +! which will not cause an overflow at level -(extra_levels-1) +! as long as digits(1_i8) >= 3. +! +! Since the exponents associated with each successive level +! differ by arr_max_shift, monotonically decreasing with +! increasing level, the absolute value at each level after this +! preprocessing is strictly less than what can be represented at +! the next lower level (larger exponent). If nonzero, it is also +! strictly greater than what is represented at the next higher +! level (smaller exponent). Note that the smallest level, +! 
-(extra_levels-1), does not have to be less than +! (radix(1_i8)**arr_max_shift) for this 'nonoverlap' property to +! hold. + do ilevel=max_levels(ifld),min_level+1,-1 + if (abs(i8_gsum_level(ilevel)) >= & + (i8_radix**arr_max_shift)) then + + IX_8 = i8_gsum_level(ilevel) & + / (i8_radix**arr_max_shift) + i8_gsum_level(ilevel-1) = & + i8_gsum_level(ilevel-1) + IX_8 + + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_gsum_level(ilevel) = & + i8_gsum_level(ilevel) - IX_8 + endif + enddo + +! ii) Working consecutively from the first level with a nonzero value +! up to level max_levels(ifld), subtract +/- 1 from level with +! larger exponent (e.g., ilevel) and add +/- +! (i8_radix**arr_max_shift) to level with smaller exponent +! (ilevel+1), when necessary, so that the value at ilevel+1 +! has the same sign as the value at ilevel. Treat a zero value at +! ilevel+1 as always a different sign from the value at ilevel so +! that the process always makes this nonzero. (Otherwise, the +! wrong sign could be reintroduced by subtracting from a zero +! value at the next step.) When finished with the process values +! at all levels are either greater than or equal to zero or all +! are less than or equal to zero. Note that this can decrease +! (but not increase) the absolute value at level +! -(extra_levels-1) by 1. All other levels are now less than or +! equal to (radix(1_i8)**arr_max_shift) in absolute value rather +! than strictly less than. + ilevel = min_level + do while ((i8_gsum_level(ilevel) == 0_i8) & + .and. (ilevel < max_levels(ifld))) + ilevel = ilevel + 1 + enddo +! + if (i8_gsum_level(ilevel) < 0_i8) then + i8_sign = -1_i8 + else + i8_sign = 1_i8 + endif +! + if (ilevel < max_levels(ifld)) then + do jlevel=ilevel,max_levels(ifld)-1 + if ((sign(1_i8,i8_gsum_level(jlevel)) & + /= sign(1_i8,i8_gsum_level(jlevel+1)))& + .or. 
(i8_gsum_level(jlevel+1) == 0_i8)) then + i8_gsum_level(jlevel) = & + i8_gsum_level(jlevel) - i8_sign + i8_gsum_level(jlevel+1) = & + i8_gsum_level(jlevel+1) & + + i8_sign*(i8_radix**arr_max_shift) + endif + enddo + endif + +! iii) If 'same sign' is negative, then change to positive +! temporarily. + if (i8_sign < 0_i8) then + do jlevel=ilevel,max_levels(ifld) + i8_gsum_level(jlevel) = -i8_gsum_level(jlevel) + enddo + endif + +! iv) Nonoverlap property can be lost after imposition of same sign +! over components. Reintroduce this property (retaining same sign +! property). Note that carryover is never more than '1' to the +! next smaller level, so, again, no intermediate or final sums +! will exceed the maximum value representable in i8, including +! level -(extra_levels-1) as long as digits(1_i8) >= 4. + do ilevel=max_levels(ifld),min_level+1,-1 + if (abs(i8_gsum_level(ilevel)) >= & + (i8_radix**arr_max_shift)) then + + IX_8 = i8_gsum_level(ilevel)/ & + (i8_radix**arr_max_shift) + i8_gsum_level(ilevel-1) = & + i8_gsum_level(ilevel-1) + IX_8 + + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_gsum_level(ilevel) = & + i8_gsum_level(ilevel) - IX_8 + endif + enddo + +! Step 3(d): iterate over steps 3(b) and 3(c), truncating integer +! vector to 'fit' into a floating point value, then repeating with +! remainder + first_stepd_iteration = .true. + arr_shift = arr_gmax_exp(ifld) - (min_level)*arr_max_shift + i8_digit_count = 0 + i8_begin_level = min_level + do while (i8_begin_level <= max_levels(ifld)) + +! Determine at which level the total number of integer digits equals +! or exceeds the number of digits representable in the floating point +! sum. Then determine which digit at this level is the last +! representable in the floating point sum. Note that this location +! (i8_trunc_loc) is zero-based, i.e. smallest digit is at location +! 0. Note that the exponent is a count of the number of digits for the +! first nonzero level. 
All subsequent levels contribute arr_max_shift +! digits. + i8_trunc_loc = 0 + i8_trunc_level = max_levels(ifld) + do ilevel=i8_begin_level,max_levels(ifld) + if (first_stepd_iteration) then +! Special logic for first time through. Subsequent iterations treat +! leading zeroes as significant. + if (i8_digit_count == 0) then + if (i8_gsum_level(ilevel) /= 0_i8) then + X_8 = i8_gsum_level(ilevel) + LX = exponent(X_8) +! Note that even if i8_gsum_level(ilevel) is truncated when assigned +! to X_8, the exponent LX will still capture the original number of +! digits. + else + LX = 0 + endif + else + LX = arr_max_shift + endif + else +! If i8_digit_count /= 0 during the first iteration +! (ilevel == i8_begin_level), then there is a remainder left at the +! previous i8_trunc_level and LX should be set to zero for this +! iteration. + if ((ilevel == i8_begin_level) .and. (i8_digit_count /= 0)) then + LX = 0 + else + LX = arr_max_shift + endif + endif + if (i8_digit_count + LX >= digits(1.0_r8)) then + i8_trunc_level = ilevel + i8_trunc_loc = (i8_digit_count + LX) - digits(1.0_r8) + exit + else + i8_digit_count = i8_digit_count + LX + endif + enddo + first_stepd_iteration = .false. + +! Truncate at i8_trunc_loc as needed and determine what the remainder +! is. + if (i8_trunc_loc == 0) then +! No truncation is necessary, and remainder is just the components +! for the remaining levels + i8_trunc_level_rem = 0 + else +! Shift right to identify the digits to be preserved and truncate +! there + IX_8 = i8_gsum_level(i8_trunc_level)/ & + (i8_radix**i8_trunc_loc) +! Shift left to put digits in the correct location (right fill with +! zeroes) + IX_8 = IX_8*(i8_radix**i8_trunc_loc) +! Calculate local remainder + i8_trunc_level_rem = (i8_gsum_level(i8_trunc_level) - IX_8) +! Update level with the truncated value + i8_gsum_level(i8_trunc_level) = IX_8 + endif + +! Calculate floating point value corresponding to modified integer +! vector. 
Note that, by construction, i8 integer value will fit into +! r8 floating point value, so do not need to test for this. + svlevel = svlevel + 1 + summand_vector(svlevel) = 0.0_r8 + do ilevel=i8_begin_level,i8_trunc_level + if (i8_gsum_level(ilevel) /= 0_i8) then + +! Convert integer to floating point representation + X_8 = i8_gsum_level(ilevel) + LX = exponent(X_8) + +! Add to vector of floating point summands, scaling first if exponent +! is too small to apply directly + curr_exp = LX + arr_shift + if (curr_exp >= MINEXPONENT(1.0_r8)) then + summand_vector(svlevel) = & + summand_vector(svlevel) + set_exponent(X_8,curr_exp) + else + RX_8 = set_exponent(X_8, & + curr_exp-MINEXPONENT(1.0_r8)) + summand_vector(svlevel) = & + summand_vector(svlevel) + scale(RX_8,MINEXPONENT(1.0_r8)) + endif + + endif + +! Note that same arr_shift should be used for next 'step 3(d)' +! iteration if i8_trunc_loc > 0. + if ((ilevel < i8_trunc_level) .or. (i8_trunc_loc == 0)) then + arr_shift = arr_shift - arr_max_shift + endif + + enddo + + if (i8_trunc_loc == 0) then + i8_digit_count = 0 + i8_begin_level = i8_trunc_level + 1 + else + i8_digit_count = i8_trunc_loc + i8_begin_level = i8_trunc_level + i8_gsum_level(i8_trunc_level) = i8_trunc_level_rem + endif + + enddo + +! Step 3(e): sum vector of floating point values, smallest to largest + arr_gsum(ifld) = 0.0_r8 + do jlevel=svlevel,1,-1 + arr_gsum(ifld) = arr_gsum(ifld) + summand_vector(jlevel) + enddo + +! Step 3(f): restore the sign + arr_gsum(ifld) = i8_sign*arr_gsum(ifld) + +! If validate is .true. and some precision lost, test whether 'too +! much' was lost, due to too loose an upper bound, too stringent a +! limit on number of levels of expansion, cancellation, ... +! Calculated by comparing lower bound on number of significant digits +! with number of digits in 1.0_r8 . + if (validate) then + if (i8_arr_gsum_level(ioffset-voffset+2) /= 0_i8) then + +! Find first nonzero level and use exponent for this level, then +! 
assume all subsequent levels contribute arr_max_shift digits. + i8_digit_count = 0 + do ilevel=min_level,max_levels(ifld) + if (i8_digit_count == 0) then + if (i8_arr_gsum_level(ioffset+ilevel) /= 0_i8) then + X_8 = i8_arr_gsum_level(ioffset+ilevel) + LX = exponent(X_8) + i8_digit_count = LX + endif + else + i8_digit_count = i8_digit_count + arr_max_shift + endif + enddo + + if (i8_digit_count < digits(1.0_r8)) then + recompute = .true. + endif + endif + endif + + endif + + enddo +#ifndef EAMXX_STANDALONE + call t_stopf('repro_sum_finalsum') +#endif + + end subroutine cism_reprosum_int + +! +!======================================================================== +! + + logical function cism_reprosum_tolExceeded(name, nflds, master, & + logunit, rel_diff ) +!------------------------------------------------------------------------ +! +! Purpose: +! Test whether distributed sum exceeds tolerance and print out a +! warning message. +! +!------------------------------------------------------------------------ +! +! Arguments +! + character(len=*), intent(in) :: name ! distributed sum identifier + integer, intent(in) :: nflds ! number of fields + logical, intent(in) :: master ! MPI task that will write + ! warning messages? + integer, optional, intent(in) :: logunit! unit warning messages + ! written to + real(r8), intent(in) :: rel_diff(2,nflds) + ! relative and absolute + ! differences between integer + ! vector and floating point sums + +! +! Local workspace +! + integer :: llogunit ! local log unit + integer :: ifld ! field index + integer :: exceeds_limit ! number of fields whose + ! sum exceeds tolerance + real(r8) :: max_rel_diff ! maximum relative difference + integer :: max_rel_diff_idx ! field index for max. rel. diff. + real(r8) :: max_abs_diff ! maximum absolute difference + integer :: max_abs_diff_idx ! field index for max. abs. diff. +! +!------------------------------------------------------------------------ +! + cism_reprosum_tolExceeded = .false. 
+ if (cism_reprosum_reldiffmax < 0.0_r8) return + + if ( present(logunit) ) then + llogunit = logunit + else +!! llogunit = s_logunit + llogunit = iulog + endif + +! Check that 'fast' reproducible sum is accurate enough. + exceeds_limit = 0 + max_rel_diff = 0.0_r8 + max_abs_diff = 0.0_r8 + max_rel_diff_idx = 0 + do ifld=1,nflds + if (rel_diff(1,ifld) > cism_reprosum_reldiffmax) then + exceeds_limit = exceeds_limit + 1 + if (rel_diff(1,ifld) > max_rel_diff) then + max_rel_diff = rel_diff(1,ifld) + max_rel_diff_idx = ifld + endif + if (rel_diff(2,ifld) > max_abs_diff) then + max_abs_diff = rel_diff(2,ifld) + max_abs_diff_idx = ifld + endif + endif + enddo + + if (exceeds_limit > 0) then + if (master) then + write(llogunit,*) trim(name), & + ': difference between integer vector and floating point sums ', & + ' exceeds tolerance in ', exceeds_limit, & + ' fields.' + write(llogunit,*) ' Maximum relative diff: (rel)', & + rel_diff(1,max_rel_diff_idx), ' (abs) ', & + rel_diff(2,max_rel_diff_idx) + write(llogunit,*) ' Maximum absolute diff: (rel)', & + rel_diff(1,max_abs_diff_idx), ' (abs) ', & + rel_diff(2,max_abs_diff_idx) + endif + cism_reprosum_tolExceeded = .true. + endif + + end function cism_reprosum_tolExceeded + +! +!======================================================================== +! + + subroutine cism_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & + nflds, mpi_comm ) +!------------------------------------------------------------------------ +! +! Purpose: +! Compute the global sum of each field in 'arr' using the indicated +! communicator with a reproducible yet scalable implementation based +! on He and Ding's implementation of the double-double algorithm. +! +!------------------------------------------------------------------------ +! +! Arguments +! + integer, intent(in) :: nsummands ! number of local summands + integer, intent(in) :: dsummands ! declared first dimension + integer, intent(in) :: nflds ! 
number of fields + real(r8), intent(in) :: arr(dsummands,nflds) + ! input array + integer, intent(in) :: mpi_comm ! MPI subcommunicator + + real(r8), intent(out):: arr_gsum(nflds) + ! global sums + +! +! Local workspace +! + integer :: old_cw ! for x86 processors, save + ! current arithmetic mode + integer :: ifld, isum ! loop variables + integer :: ierr ! MPI error return + + real(r8) :: e, t1, t2 ! temporaries + complex(r8) :: arr_lsum_dd(nflds) ! local sums (in double-double + ! format) + complex(r8) :: arr_gsum_dd(nflds) ! global sums (in double-double + ! format) + + integer, save :: mpi_sumdd + logical, save :: first_time = .true. + +! +!------------------------------------------------------------------------ +! + !WHL - commented out +! call cism_reprosumx86_fix_start (old_cw) + + if (first_time) then + call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr) + first_time = .false. + endif + + do ifld=1,nflds + arr_lsum_dd(ifld) = (0.0_r8,0.0_r8) + + do isum=1,nsummands + +! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s trick. + t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld)) + e = t1 - arr(isum,ifld) + t2 = ((real(arr_lsum_dd(ifld)) - e) & + + (arr(isum,ifld) - (t1 - e))) & + + aimag(arr_lsum_dd(ifld)) + + ! The result is t1 + t2, after normalization. + arr_lsum_dd(ifld) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) + enddo + + enddo + +#ifndef EAMXX_STANDALONE + call t_startf("repro_sum_allr_c16") +#endif + call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, & + MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr) +#ifndef EAMXX_STANDALONE + call t_stopf("repro_sum_allr_c16") +#endif + + do ifld=1,nflds + arr_gsum(ifld) = real(arr_gsum_dd(ifld)) + enddo + + !WHL - commented out +! call cism_reprosumx86_fix_end (old_cw) + + end subroutine cism_reprosum_ddpdd +! +!------------------------------------------------------------------------ +! + subroutine DDPDD (dda, ddb, len, itype) +!------------------------------------------------------------------------ +! +! 
Purpose: +! Modification of original codes written by David H. Bailey +! This subroutine computes ddb(i) = dda(i)+ddb(i) +! +!------------------------------------------------------------------------ +! +! Arguments +! + integer, intent(in) :: len ! array length + complex(r8), intent(in) :: dda(len) ! input + complex(r8), intent(inout) :: ddb(len) ! result + integer, intent(in) :: itype ! unused +! +! Local workspace +! + real(r8) e, t1, t2 + integer i +! +!------------------------------------------------------------------------ +! + do i = 1, len + +! Compute dda + ddb using Knuth's trick. + t1 = real(dda(i)) + real(ddb(i)) + e = t1 - real(dda(i)) + t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) & + + aimag(dda(i)) + aimag(ddb(i)) + +! The result is t1 + t2, after normalization. + ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) + enddo + + end subroutine DDPDD +! +!------------------------------------------------------------------------ +! + subroutine split_indices(total,num_pieces,ibeg,iend) +!------------------------------------------------------------------------ +! +! Purpose: +! Split range into 'num_pieces' +! +!------------------------------------------------------------------------ +! +! Arguments +! + integer, intent(in) :: total + integer, intent(in) :: num_pieces + integer, intent(out) :: ibeg(num_pieces), iend(num_pieces) +! +! Local workspace +! + integer :: itmp1, itmp2, ioffset, i +! +!------------------------------------------------------------------------ +! + itmp1 = total/num_pieces + itmp2 = mod(total,num_pieces) + ioffset = 0 + do i=1,itmp2 + ibeg(i) = ioffset + 1 + iend(i) = ioffset + (itmp1+1) + ioffset = iend(i) + enddo + do i=itmp2+1,num_pieces + ibeg(i) = ioffset + 1 + if (ibeg(i) > total) then + iend(i) = ibeg(i) - 1 + else + iend(i) = ioffset + itmp1 + ioffset = iend(i) + endif + enddo + + end subroutine split_indices +! +!======================================================================== +! 
+end module cism_reprosum_mod diff --git a/libglimmer/glimmer_global.F90 b/libglimmer/glimmer_global.F90 index e9eb745d..0aefadee 100644 --- a/libglimmer/glimmer_global.F90 +++ b/libglimmer/glimmer_global.F90 @@ -30,12 +30,12 @@ module glimmer_global - !> Module holding global variables for Glimmer. Holds real-type - !> kind values, and other global code parameters. + !> Module holding global variables for Glimmer. + !> Holds real and integer kind values and other global code parameters. implicit none - integer,parameter :: sp = kind(1.0) + integer,parameter :: sp = kind(1.0) !> Single precision --- Fortran single-precision real-type kind !> value. Used internally. @@ -44,7 +44,7 @@ module glimmer_global !> the -r8 flag), then this parameter may need to be set in agreement with !> that. - integer,parameter :: dp = kind(1.0d0) + integer,parameter :: dp = kind(1.0d0) !> Double precision --- Fortran double-precision real-type kind !> value. Used internally. @@ -53,6 +53,12 @@ module glimmer_global !> the -r8 flag), then this parameter may need to be set in agreement !> with that + ! Integer kinds + ! Note: Integers are i4 by default. + ! i8 integers can be used to generate reproducible sums + integer, parameter :: i4 = kind(1) + integer, parameter :: i8 = kind(1_8) + !WHL - Removed rk from the code, so commenting out these declarations !!#ifdef GLIMMER_SP !! integer,parameter :: rk=sp !< Precision of glimmer module --- the general Fortran real-type kind value for the Glimmer module and its interfaces. diff --git a/libglimmer/glimmer_utils.F90 b/libglimmer/glimmer_utils.F90 index c21b05c9..45038da6 100644 --- a/libglimmer/glimmer_utils.F90 +++ b/libglimmer/glimmer_utils.F90 @@ -589,6 +589,91 @@ subroutine point_diag_real8_2d(& end subroutine point_diag_real8_2d +!-------------------------------------------------------------------------- + + subroutine double_to_binary(& + x, binary_str, binary_full, binary_sign, binary_exponent, binary_mantissa) + + ! 
Find the internal binary representation of a double-precision floating point number + ! Based on the IEEE-754 standard + + use glimmer_global, only: dp, i8 + implicit none + + real(dp), intent(in) :: x + character(len=64), intent(out) :: binary_str ! string representation of the binary number + + integer(i8), intent(out), optional :: binary_full ! 64 bits + integer, intent(out), optional :: binary_sign ! 1 bit + integer, intent(out), optional :: binary_exponent ! 11 bits + integer(i8), intent(out), optional :: binary_mantissa ! 52 bits + + integer :: i + character(len=1) :: bin(64) + integer (i8) :: binary_number + integer :: sign_bit + integer :: exponent_bits + integer :: mantissa_bits + + logical :: verbose_binary = .false. + + ! Transfer the double value into a 64-bit integer + binary_number = transfer(x, binary_number) + + ! Get the sign bit (bit 64) + sign_bit = IAND(ishft(binary_number, -63), INT(1, 8)) + + ! Get the exponent bits (bits 63–53) + exponent_bits = IAND(ishft(binary_number, -52), INT(Z'7FF', 8)) + + ! Extract mantissa (fraction) bits (bits 52–1) + mantissa_bits = IAND(binary_number, INT(Z'FFFFFFFFFFFFF', 8)) + + if (present(binary_full)) binary_full = binary_number + if (present(binary_sign)) binary_sign = sign_bit + if (present(binary_exponent)) binary_exponent = exponent_bits + if (present(binary_mantissa)) binary_mantissa = mantissa_bits + + if (verbose_binary) then + write(iulog,*) ' ' + write(iulog,*) 'x =', x + write(iulog,*) 'IEEE-754 double precision representation of x:' + write(iulog,*) 'Sign bit: ', sign_bit + write(iulog,*) 'Exponent (11 bits):', exponent_bits + write(iulog,*) 'Mantissa (52 bits):', mantissa_bits + endif + + ! 
Convert full 64-bit integer to a binary string + do i = 1, 64 + if (btest(binary_number, 64 - i)) then + bin(i) = '1' + else + bin(i) = '0' + end if + end do + + binary_str = concat(bin) + if (verbose_binary) then + write(iulog,*) 'Full 64-bit binary:' + write(iulog,*) ' ', binary_str + endif + + end subroutine double_to_binary + + + pure function concat(arr) result(str) + ! Turn a character array into a string + + character(len=*), intent(in) :: arr(:) + character(len=size(arr)) :: str + integer :: k + + do k = 1, size(arr) + str(k:k) = arr(k) + end do + + end function concat + !**************************************************************************** end module glimmer_utils diff --git a/libglimmer/ncdf_template.F90.in b/libglimmer/ncdf_template.F90.in index 177c5e29..378b18b4 100644 --- a/libglimmer/ncdf_template.F90.in +++ b/libglimmer/ncdf_template.F90.in @@ -437,14 +437,14 @@ contains ic=>model%funits%frc_first do while(associated(ic)) -! if (main_task .and. verbose_read_forcing) write(6,*) 'possible forcing times', ic%times +! if (main_task .and. verbose_read_forcing) write(iulog,*) 'possible forcing times', ic%times if (ic%read_once) then ! read once at initialization; do not re-read at runtime ic%nc%just_processed = .true. ! prevent the file from being read if (main_task .and. verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_read_forcing; will not re-read the read_once file ', trim(ic%nc%filename) + write(iulog,*) ' ' + write(iulog,*) 'In NAME_read_forcing; will not re-read the read_once file ', trim(ic%nc%filename) endif else ! not a read_once file @@ -464,11 +464,11 @@ contains endif if (main_task .and. 
verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_read_forcing, model time + eps =', model%numerics%time + eps - write(6,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset - write(6,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle - write(6,*) 'current forcing time =', current_forcing_time + write(iulog,*) ' ' + write(iulog,*) 'In NAME_read_forcing, model time + eps =', model%numerics%time + eps + write(iulog,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset + write(iulog,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle + write(iulog,*) 'current forcing time =', current_forcing_time endif ! Find the time index associated with the previous model time step @@ -476,7 +476,7 @@ contains do t = ic%nt, 1, -1 ! look through the time array backwards if (ic%times(t) <= current_forcing_time - model%numerics%tinc) then t_prev = t - if (main_task .and. verbose_read_forcing) write(6,*) 'Previous time index =', t_prev + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Previous time index =', t_prev exit end if enddo @@ -486,7 +486,7 @@ contains if ( ic%times(t) <= current_forcing_time) then ! use the largest time that is smaller or equal to the current time (stepwise forcing) if (main_task .and. verbose_read_forcing) & - write(6,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) + write(iulog,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) ! If this time index (t) is larger than the previous index (t_prev), then read a new time slice. ! Otherwise, we already have the current slice, and there is nothing new to read. @@ -494,7 +494,7 @@ contains ! Set the desired time to be read ic%current_time = t ic%nc%just_processed = .false. ! set this to false so file will be read. - if (main_task .and. verbose_read_forcing) write(6,*) 'Read new forcing slice: t, times(t) =', t, ic%times(t) + if (main_task .and. 
verbose_read_forcing) write(iulog,*) 'Read new forcing slice: t, times(t) =', t, ic%times(t) endif ! t > t_prev exit ! once we find the time, exit the loop @@ -548,10 +548,10 @@ contains if (ic%read_once) then if (main_task .and. verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_read_forcing_once' - write(6,*) 'Filename =', trim(ic%nc%filename) - write(6,*) 'Number of slices =', ic%nt + write(iulog,*) ' ' + write(iulog,*) 'In NAME_read_forcing_once' + write(iulog,*) 'Filename =', trim(ic%nc%filename) + write(iulog,*) 'Number of slices =', ic%nt endif write(message,*) 'Reading', ic%nt, 'slices of file ', trim(ic%nc%filename), ' just once at initialization' @@ -568,7 +568,7 @@ contains do t = 1, ic%nt if (main_task .and. verbose_read_forcing) then - write(6,*) 'Read new forcing slice: t index, times(t) =', t, ic%times(t) + write(iulog,*) 'Read new forcing slice: t index, times(t) =', t, ic%times(t) endif ! Set the desired time to be read @@ -588,7 +588,7 @@ contains endif ! read_once if (main_task .and. verbose_read_forcing) then - write(6,*) 'Final ic%nc%vars = ', trim(ic%nc%vars) + write(iulog,*) 'Final ic%nc%vars = ', trim(ic%nc%vars) endif ic=>ic%next @@ -650,13 +650,13 @@ contains endif if (main_task .and. 
verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_retrieve_forcing, model time + eps =', model%numerics%time + eps - write(6,*) 'Filename = ', trim(ic%nc%filename) - write(6,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset - write(6,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle - write(6,*) 'current forcing time =', current_forcing_time - write(6,*) 'variable list:', trim(ic%nc%vars) + write(iulog,*) ' ' + write(iulog,*) 'In NAME_retrieve_forcing, model time + eps =', model%numerics%time + eps + write(iulog,*) 'Filename = ', trim(ic%nc%filename) + write(iulog,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset + write(iulog,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle + write(iulog,*) 'current forcing time =', current_forcing_time + write(iulog,*) 'variable list:', trim(ic%nc%vars) endif ! Optionally, associate the current forcing time with a different date in the forcing file. @@ -669,8 +669,8 @@ contains open(unit=11, file=trim(ic%shuffle_file), status='old') this_year = int(current_forcing_time - model%numerics%tstart) if (main_task .and. verbose_read_forcing) then - write(6,*) 'shuffle_file = ', trim(ic%shuffle_file) - write(6,*) 'tstart, this_year =', model%numerics%tstart, this_year + write(iulog,*) 'shuffle_file = ', trim(ic%shuffle_file) + write(iulog,*) 'tstart, this_year =', model%numerics%tstart, this_year endif forcing_year = 0 do while (forcing_year == 0) @@ -684,11 +684,11 @@ contains decimal_year = current_forcing_time - floor(current_forcing_time) current_forcing_time = real(forcing_year,dp) + decimal_year if (main_task .and. verbose_read_forcing) then - write(6,*) 'forcing_year, decimal =', forcing_year, decimal_year - write(6,*) 'shuffled forcing_time =', current_forcing_time + write(iulog,*) 'forcing_year, decimal =', forcing_year, decimal_year + write(iulog,*) 'shuffled forcing_time =', current_forcing_time endif else - if (main_task .and. 
verbose_read_forcing) write(6,*) 'no shuffle_file' + if (main_task .and. verbose_read_forcing) write(iulog,*) 'no shuffle_file' endif ! shuffle_file exists ! Find the time index associated with the previous model time step @@ -696,7 +696,7 @@ contains do t = ic%nt, 1, -1 ! look through the time array backwards if (ic%times(t) <= current_forcing_time - model%numerics%tinc) then t_prev = t - if (main_task .and. verbose_read_forcing) write(6,*) 'Previous time index =', t_prev + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Previous time index =', t_prev exit end if enddo @@ -706,14 +706,14 @@ contains if ( ic%times(t) <= current_forcing_time) then ! use the largest time that is smaller or equal to the current time (stepwise forcing) if (main_task .and. verbose_read_forcing) & - write(6,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) + write(iulog,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) ! If this time index (t) is larger than the previous index (t_prev), then retrieve a new time slice. ! Otherwise, we already have the current slice, and there is nothing new to read. if (t > t_prev) then ! Set the desired time to be read ic%current_time = t retrieve_new_slice = .true. - if (main_task .and. verbose_read_forcing) write(6,*) 'Retrieve new forcing slice' + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Retrieve new forcing slice' write(message,*) & 'Retrieve slice', t, 'at forcing time', ic%times(t), 'from file ', trim(ic%nc%filename) call write_log(message) diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index 39a06994..d043cde2 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -27,9 +27,11 @@ module cism_parallel use netcdf - use glimmer_global, only : dp, sp + use glimmer_global, only : dp, sp, i8 use glimmer_paramets, only: iulog + use cism_reprosum_mod, only: cism_reprosum_calc, verbose_reprosum + implicit none ! 
integers associated with the main global communicator @@ -144,6 +146,10 @@ module cism_parallel integer :: main_rank_col ! integer ID for the master task on the column logical :: main_task_col ! true if this_rank_col = main_rank_col + ! option to compute reproducible sums + logical :: reprosum ! if true, compute reproducible global sums + ! (interface parallel_reduce_sum) + end type parallel_type ! Information on the local & global bounds of an array @@ -182,29 +188,29 @@ module cism_parallel module procedure broadcast_real8_1d end interface - interface distributed_gather_var - module procedure distributed_gather_var_integer_2d - module procedure distributed_gather_var_logical_2d - module procedure distributed_gather_var_real4_2d - module procedure distributed_gather_var_real4_3d - module procedure distributed_gather_var_real8_2d - module procedure distributed_gather_var_real8_3d + interface gather_var + module procedure gather_var_integer_2d + module procedure gather_var_logical_2d + module procedure gather_var_real4_2d + module procedure gather_var_real4_3d + module procedure gather_var_real8_2d + module procedure gather_var_real8_3d end interface - interface distributed_gather_var_row - module procedure distributed_gather_var_row_real8_2d + interface gather_var_row + module procedure gather_var_row_real8_2d end interface - interface distributed_gather_all_var_row - module procedure distributed_gather_all_var_row_real8_2d + interface gather_all_var_row + module procedure gather_all_var_row_real8_2d end interface - interface distributed_gather_var_col - module procedure distributed_gather_var_col_real8_2d + interface gather_var_col + module procedure gather_var_col_real8_2d end interface - interface distributed_gather_all_var_col - module procedure distributed_gather_all_var_col_real8_2d + interface gather_all_var_col + module procedure gather_all_var_col_real8_2d end interface interface distributed_get_var @@ -232,21 +238,21 @@ module cism_parallel module 
procedure distributed_put_var_real8_3d end interface - interface distributed_scatter_var - module procedure distributed_scatter_var_integer_2d - module procedure distributed_scatter_var_logical_2d - module procedure distributed_scatter_var_real4_2d - module procedure distributed_scatter_var_real4_3d - module procedure distributed_scatter_var_real8_2d - module procedure distributed_scatter_var_real8_3d + interface scatter_var + module procedure scatter_var_integer_2d + module procedure scatter_var_logical_2d + module procedure scatter_var_real4_2d + module procedure scatter_var_real4_3d + module procedure scatter_var_real8_2d + module procedure scatter_var_real8_3d end interface - interface distributed_scatter_var_row - module procedure distributed_scatter_var_row_real8_2d + interface scatter_var_row + module procedure scatter_var_row_real8_2d end interface - interface distributed_scatter_var_col - module procedure distributed_scatter_var_col_real8_2d + interface scatter_var_col + module procedure scatter_var_col_real8_2d end interface interface parallel_boundary_value @@ -291,17 +297,22 @@ module cism_parallel interface parallel_global_sum module procedure parallel_global_sum_integer_2d module procedure parallel_global_sum_integer_3d - module procedure parallel_global_sum_real4_2d + module procedure parallel_global_sum_integer8_2d module procedure parallel_global_sum_real8_2d module procedure parallel_global_sum_real8_3d end interface - interface parallel_global_sum_staggered - module procedure parallel_global_sum_staggered_3d_real8 - module procedure parallel_global_sum_staggered_3d_real8_nvar - module procedure parallel_global_sum_staggered_2d_real8 - module procedure parallel_global_sum_staggered_2d_real8_nvar - end interface parallel_global_sum_staggered + interface parallel_global_sum_patch + module procedure parallel_global_sum_patch_integer_2d + module procedure parallel_global_sum_patch_real8_2d + end interface parallel_global_sum_patch + + interface 
parallel_global_sum_stagger + module procedure parallel_global_sum_stagger_real8_2d + module procedure parallel_global_sum_stagger_real8_3d + module procedure parallel_global_sum_stagger_real8_2d_nflds + module procedure parallel_global_sum_stagger_real8_3d_nflds + end interface parallel_global_sum_stagger interface parallel_halo module procedure parallel_halo_integer_2d @@ -311,9 +322,11 @@ module cism_parallel module procedure parallel_halo_real8_2d module procedure parallel_halo_real8_3d module procedure parallel_halo_real8_4d + module procedure parallel_halo_integer8_4d end interface interface parallel_halo_extrapolate + module procedure parallel_halo_extrapolate_real8_1d module procedure parallel_halo_extrapolate_integer_2d module procedure parallel_halo_extrapolate_real8_2d end interface @@ -330,6 +343,8 @@ module cism_parallel end interface interface parallel_is_zero + module procedure parallel_is_zero_integer_1d + module procedure parallel_is_zero_real8_1d module procedure parallel_is_zero_integer_2d module procedure parallel_is_zero_real8_2d module procedure parallel_is_zero_real8_3d @@ -388,6 +403,7 @@ module cism_parallel interface parallel_reduce_sum module procedure parallel_reduce_sum_integer + module procedure parallel_reduce_sum_integer8 module procedure parallel_reduce_sum_real4 module procedure parallel_reduce_sum_real8 module procedure parallel_reduce_sum_integer_nvar @@ -613,10 +629,10 @@ end function distributed_execution !======================================================================= - ! subroutines belonging to the distributed_gather_var interface + ! subroutines belonging to the gather_var interface ! WHL, July 2019: - ! There is an issue with allocating the global_values array in the distributed_gather_var_*, + ! There is an issue with allocating the global_values array in the gather_var_*, ! distributed_get_var_*, distributed_print_*, and distributed_put_var_* functions and subroutines ! 
when computing only on active blocks (compute_blocks = 1). ! This array is allocated based on the max and min of ewlb, ewub, nslb, and nsub over the global domain. @@ -628,7 +644,7 @@ end function distributed_execution ! global_minval_nslb, and global_maxval_nsub, which are now computed at initialization ! based on the bounds in all blocks (including inactive blocks), not just active blocks. - subroutine distributed_gather_var_integer_2d(values, global_values, parallel) + subroutine gather_var_integer_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -661,7 +677,7 @@ subroutine distributed_gather_var_integer_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -685,9 +701,7 @@ subroutine distributed_gather_var_integer_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -706,9 +720,7 @@ subroutine distributed_gather_var_integer_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. 
allocate(displs(1)) allocate(recvcounts(1)) @@ -733,10 +745,10 @@ subroutine distributed_gather_var_integer_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_integer_2d + end subroutine gather_var_integer_2d - subroutine distributed_gather_var_logical_2d(values, global_values, parallel) + subroutine gather_var_logical_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -769,7 +781,7 @@ subroutine distributed_gather_var_logical_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -793,9 +805,7 @@ subroutine distributed_gather_var_logical_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -814,9 +824,7 @@ subroutine distributed_gather_var_logical_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -840,10 +848,10 @@ subroutine distributed_gather_var_logical_2d(values, global_values, parallel) end associate ! 
automatic deallocation - end subroutine distributed_gather_var_logical_2d + end subroutine gather_var_logical_2d - subroutine distributed_gather_var_real4_2d(values, global_values, parallel) + subroutine gather_var_real4_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -876,7 +884,7 @@ subroutine distributed_gather_var_real4_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -900,9 +908,7 @@ subroutine distributed_gather_var_real4_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -921,9 +927,7 @@ subroutine distributed_gather_var_real4_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -947,10 +951,10 @@ subroutine distributed_gather_var_real4_2d(values, global_values, parallel) end associate ! 
automatic deallocation - end subroutine distributed_gather_var_real4_2d + end subroutine gather_var_real4_2d - subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, ud1) + subroutine gather_var_real4_3d(values, global_values, parallel, ld1, ud1) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -984,7 +988,7 @@ subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -1008,9 +1012,7 @@ subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) if (present(ld1)) then d1l = ld1 else @@ -1044,9 +1046,7 @@ subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -1073,10 +1073,10 @@ subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, end associate ! automatic deallocation - end subroutine distributed_gather_var_real4_3d + end subroutine gather_var_real4_3d - subroutine distributed_gather_var_real8_2d(values, global_values, parallel) + subroutine gather_var_real8_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! 
values = local portion of distributed variable @@ -1109,7 +1109,7 @@ subroutine distributed_gather_var_real8_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -1133,9 +1133,7 @@ subroutine distributed_gather_var_real8_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -1154,9 +1152,7 @@ subroutine distributed_gather_var_real8_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -1181,10 +1177,10 @@ subroutine distributed_gather_var_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_real8_2d + end subroutine gather_var_real8_2d - subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, ud1) + subroutine gather_var_real8_3d(values, global_values, parallel, ld1, ud1) ! Gather a distributed variable back to main_task node ! 
values = local portion of distributed variable @@ -1193,7 +1189,7 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, use mpi_mod implicit none - real(dp),dimension(:,:,:),intent(in) :: values + real(dp),dimension(:,:,:),intent(in) :: values ! i and j are indices 2 and 3 real(dp),dimension(:,:,:),allocatable,intent(inout) :: global_values integer,optional,intent(in) :: ld1, ud1 type(parallel_type) :: parallel @@ -1216,9 +1212,9 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, global_maxval_nsub => parallel%global_maxval_nsub & ) - if (uhalo==0 .and. size(values,1)==local_ewn-1) then + if (uhalo==0 .and. size(values,2)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -1242,9 +1238,7 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) if (present(ld1)) then d1l = ld1 else @@ -1278,9 +1272,7 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -1307,16 +1299,16 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, end associate ! 
automatic deallocation - end subroutine distributed_gather_var_real8_3d + end subroutine gather_var_real8_3d !======================================================================= - ! subroutines belonging to the distributed_gather_var_row interface + ! subroutines belonging to the gather_var_row interface - subroutine distributed_gather_var_row_real8_2d(values, global_values, parallel) + subroutine gather_var_row_real8_2d(values, global_values, parallel) ! Gather data along a row of tasks onto the main task for that row. - ! Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_ewn. ! The second index represents the north-south dimension, and is assumed @@ -1351,7 +1343,7 @@ subroutine distributed_gather_var_row_real8_2d(values, global_values, parallel) if (size(values,2) /= own_nsn) then ! Note: Removing this restriction would require some recoding below. - write(*,*) "ERROR: distributed_gather_var_row requires N-S array size of own_nsn" + write(*,*) "ERROR: gather_var_row requires N-S array size of own_nsn" write(*,*) 'rank, own_nsn, size(values,2) =', this_rank, own_nsn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1446,16 +1438,16 @@ subroutine distributed_gather_var_row_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_row_real8_2d + end subroutine gather_var_row_real8_2d !======================================================================= - ! subroutines belonging to the distributed_gather_all_var_row interface + ! subroutines belonging to the gather_all_var_row interface - subroutine distributed_gather_all_var_row_real8_2d(values, global_values, parallel) + subroutine gather_all_var_row_real8_2d(values, global_values, parallel) ! Gather global data along a row of tasks onto each task for that row. - ! 
Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_ewn. ! The second index represents the north-south dimension, and is assumed @@ -1493,7 +1485,7 @@ subroutine distributed_gather_all_var_row_real8_2d(values, global_values, parall ! TODO: Do this recoding. This subroutine currently fails with outflow BC, because ! the southern and western rows of tasks have an extra locally owned vertex, ! giving size(values,2) = own_nsn + 1 - write(*,*) "ERROR: distributed_gather_var_row requires N-S array size of own_nsn" + write(*,*) "ERROR: gather_var_row requires N-S array size of own_nsn" write(*,*) 'rank, own_nsn, size(values,2) =', this_rank, own_nsn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1568,16 +1560,16 @@ subroutine distributed_gather_all_var_row_real8_2d(values, global_values, parall end associate ! automatic deallocation - end subroutine distributed_gather_all_var_row_real8_2d + end subroutine gather_all_var_row_real8_2d !======================================================================= - ! subroutines belonging to the distributed_gather_var_col interface + ! subroutines belonging to the gather_var_col interface - subroutine distributed_gather_var_col_real8_2d(values, global_values, parallel) + subroutine gather_var_col_real8_2d(values, global_values, parallel) ! Gather data along a column of tasks onto the main task for that column. - ! Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_nsn. ! The second index represents the east-west dimension, and is assumed @@ -1612,7 +1604,7 @@ subroutine distributed_gather_var_col_real8_2d(values, global_values, parallel) if (size(values,2) /= own_ewn) then ! 
Note: Removing this restriction would require some recoding below. - write(*,*) "ERROR: distributed_gather_var_row requires E-W array size of own_ewn" + write(*,*) "ERROR: gather_var_row requires E-W array size of own_ewn" write(*,*) 'rank, own_ewn, size(values,2) =', this_rank, own_ewn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1708,16 +1700,16 @@ subroutine distributed_gather_var_col_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_col_real8_2d + end subroutine gather_var_col_real8_2d !======================================================================= - ! subroutines belonging to the distributed_gather_all_var_col interface + ! subroutines belonging to the gather_all_var_col interface - subroutine distributed_gather_all_var_col_real8_2d(values, global_values, parallel) + subroutine gather_all_var_col_real8_2d(values, global_values, parallel) ! Gather global data along a column of tasks onto each task for that column. - ! Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_nsn. ! The second index represents the east-west dimension, and is assumed @@ -1752,7 +1744,7 @@ subroutine distributed_gather_all_var_col_real8_2d(values, global_values, parall if (size(values,2) /= own_ewn) then ! Note: Removing this restriction would require some recoding below. - write(*,*) "ERROR: distributed_gather_var_row requires E-W array size of own_ewn" + write(*,*) "ERROR: gather_var_row requires E-W array size of own_ewn" write(*,*) 'rank, own_ewn, size(values,2) =', this_rank, own_ewn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1827,7 +1819,7 @@ subroutine distributed_gather_all_var_col_real8_2d(values, global_values, parall end associate ! 
automatic deallocation - end subroutine distributed_gather_all_var_col_real8_2d + end subroutine gather_all_var_col_real8_2d !======================================================================= @@ -2210,6 +2202,7 @@ function distributed_get_var_real8_1d(ncid, varid, values, parallel, start) global_values(:) = 0.0d0 distributed_get_var_real8_1d = & nf90_get_var(ncid,varid,global_values(1:myn),start) + allocate(displs(tasks+1)) allocate(sendcounts(tasks)) sendcounts(:) = bounds(2,:)-bounds(1,:)+1 @@ -2427,7 +2420,9 @@ end function distributed_get_var_real8_3d subroutine distributed_grid(ewn, nsn, & parallel, & - nhalo_in, global_bc_in) + nhalo_in, & + global_bc_in, & + reprosum_in) ! Divide the global domain into blocks, with one task per block. ! Set various grid and domain variables for the local task. @@ -2437,6 +2432,7 @@ subroutine distributed_grid(ewn, nsn, & type(parallel_type), intent(inout) :: parallel ! info for parallel communication, computed here integer, intent(in), optional :: nhalo_in ! number of rows of halo cells character(*), intent(in), optional :: global_bc_in ! string indicating the global BC option + logical, intent(in), optional :: reprosum_in ! if true, compute reproducible global sums integer :: best,i,j,metric real(dp) :: rewtasks,rnstasks @@ -2479,7 +2475,8 @@ subroutine distributed_grid(ewn, nsn, & staggered_ilo => parallel%staggered_ilo, & staggered_ihi => parallel%staggered_ihi, & staggered_jlo => parallel%staggered_jlo, & - staggered_jhi => parallel%staggered_jhi & + staggered_jhi => parallel%staggered_jhi, & + reprosum => parallel%reprosum & ) ! set the boundary conditions (periodic by default) @@ -2665,6 +2662,25 @@ subroutine distributed_grid(ewn, nsn, & call parallel_stop(__FILE__, __LINE__) endif + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + + ! If computing reproducible sums, then set some options + !TODO - Are these saved from one call to the next? + ! 
Note: For standalone CISM, reprosum = F by default; can set = T in the config file + ! For CESM coupled runs, reprosum = T by default + if (reprosum) then + +#ifdef CCSM_COUPLED +!! call shr_reprosum_setops() +#else +!! call cism_reprosum_setops() +#endif + endif ! reprosum + ! call parallel_barrier ! write(iulog,*) 'task, west, east, south, north:', this_rank, west, east, south, north @@ -2693,6 +2709,7 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & nx_block, ny_block, & ice_domain_mask, & parallel, & + reprosum_in, & inquire_only) ! Divide the global domain into blocks, setting various grid and domain variables @@ -2731,6 +2748,7 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & integer, intent(in), dimension(:,:) :: & ice_domain_mask ! = 1 where ice is potentially present and active, else = 0 type(parallel_type), intent(inout) :: parallel ! info for parallel communication, computed here + logical, intent(in), optional :: reprosum_in ! if true, compute reproducible global sums logical, intent(in), optional :: inquire_only ! if true, then report the number of active blocks and abort integer :: i, j, nb, nt @@ -2809,7 +2827,8 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & staggered_ilo => parallel%staggered_ilo, & staggered_ihi => parallel%staggered_ihi, & staggered_jlo => parallel%staggered_jlo, & - staggered_jhi => parallel%staggered_jhi & + staggered_jhi => parallel%staggered_jhi, & + reprosum => parallel%reprosum & ) if (present(inquire_only)) then @@ -3283,7 +3302,28 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & ! southwest_corner, southeast_corner, northwest_corner, northeast_corner endif - ! Uncomment to print grid geometry + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + + ! If computing reproducible sums, then set some options + !TODO - Are these saved from one call to the next? + ! Note: For standalone CISM, reprosum = F by default; can set = T in the config file + ! 
For CESM coupled runs, reprosum = T by default + + if (reprosum) then + +#ifdef CCSM_COUPLED +!! call shr_reprosum_setops() +#else +!! call cism_reprosum_setops() +#endif + + endif ! reprosum + +! Uncomment to print grid geometry ! write(iulog,*) " " ! write(iulog,*) "Process ", this_rank, " Total = ", tasks, " ewtasks = ", ewtasks, " nstasks = ", nstasks ! write(iulog,*) "Process ", this_rank, " ewrank = ", ewrank, " nsrank = ", nsrank @@ -4269,9 +4309,9 @@ end function distributed_put_var_real8_3d !======================================================================= - ! subroutines belonging to the distributed_scatter_var interface + ! subroutines belonging to the scatter_var interface - subroutine distributed_scatter_var_integer_2d(values, global_values, parallel) + subroutine scatter_var_integer_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4301,7 +4341,7 @@ subroutine distributed_scatter_var_integer_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4356,10 +4396,10 @@ subroutine distributed_scatter_var_integer_2d(values, global_values, parallel) deallocate(global_values) ! TODO - Is this deallocation necessary, here and below? ! automatic deallocation - end subroutine distributed_scatter_var_integer_2d + end subroutine scatter_var_integer_2d - subroutine distributed_scatter_var_logical_2d(values, global_values, parallel) + subroutine scatter_var_logical_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! 
values = local portion of distributed variable @@ -4389,7 +4429,7 @@ subroutine distributed_scatter_var_logical_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4444,10 +4484,10 @@ subroutine distributed_scatter_var_logical_2d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_logical_2d + end subroutine scatter_var_logical_2d - subroutine distributed_scatter_var_real4_2d(values, global_values, parallel) + subroutine scatter_var_real4_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4477,7 +4517,7 @@ subroutine distributed_scatter_var_real4_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4532,10 +4572,10 @@ subroutine distributed_scatter_var_real4_2d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real4_2d + end subroutine scatter_var_real4_2d - subroutine distributed_scatter_var_real4_3d(values, global_values, parallel) + subroutine scatter_var_real4_3d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! 
values = local portion of distributed variable @@ -4565,7 +4605,7 @@ subroutine distributed_scatter_var_real4_3d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4622,10 +4662,10 @@ subroutine distributed_scatter_var_real4_3d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real4_3d + end subroutine scatter_var_real4_3d - subroutine distributed_scatter_var_real8_2d(values, global_values, parallel) + subroutine scatter_var_real8_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4655,7 +4695,7 @@ subroutine distributed_scatter_var_real8_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4710,10 +4750,10 @@ subroutine distributed_scatter_var_real8_2d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real8_2d + end subroutine scatter_var_real8_2d - subroutine distributed_scatter_var_real8_3d(values, global_values, parallel, deallocflag) + subroutine scatter_var_real8_3d(values, global_values, parallel, deallocflag) ! Scatter a variable on the main_task node back to the distributed ! 
values = local portion of distributed variable @@ -4745,7 +4785,7 @@ subroutine distributed_scatter_var_real8_3d(values, global_values, parallel, dea if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4809,16 +4849,16 @@ subroutine distributed_scatter_var_real8_3d(values, global_values, parallel, dea if (deallocmem) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real8_3d + end subroutine scatter_var_real8_3d !======================================================================= - ! subroutines belonging to the distributed_scatter_var_row interface + ! subroutines belonging to the scatter_var_row interface - subroutine distributed_scatter_var_row_real8_2d(values, global_values, parallel) + subroutine scatter_var_row_real8_2d(values, global_values, parallel) ! Scatter data to a row of tasks from the main task for that row. - ! Based on distributed_scatter_var_real8_2d. + ! Based on scatter_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_ewn. ! The second index represents the north-south dimension, and is assumed @@ -4852,7 +4892,7 @@ subroutine distributed_scatter_var_row_real8_2d(values, global_values, parallel) if (size(values,2) /= own_nsn) then ! Note: Removing this restriction would require some recoding below. 
- write(iulog,*) "ERROR: distributed_scatter_var_row requires N-S array size of own_nsn" + write(iulog,*) "ERROR: scatter_var_row requires N-S array size of own_nsn" call parallel_stop(__FILE__, __LINE__) end if @@ -4909,16 +4949,16 @@ subroutine distributed_scatter_var_row_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_scatter_var_row_real8_2d + end subroutine scatter_var_row_real8_2d !======================================================================= - ! subroutines belonging to the distributed_scatter_var_col interface + ! subroutines belonging to the scatter_var_col interface - subroutine distributed_scatter_var_col_real8_2d(values, global_values, parallel) + subroutine scatter_var_col_real8_2d(values, global_values, parallel) ! Scatter data to a column of tasks from the main task for that column - ! Based on distributed_scatter_var_real8_2d. + ! Based on scatter_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_nsn. ! The second index represents the east-west dimension, and is assumed @@ -4951,7 +4991,7 @@ subroutine distributed_scatter_var_col_real8_2d(values, global_values, parallel) if (size(values,2) /= own_ewn) then ! Note: Removing this restriction would require some recoding below. - write(iulog,*) "ERROR: distributed_scatter_var_col requires E-W array size of own_nsn" + write(iulog,*) "ERROR: scatter_var_col requires E-W array size of own_nsn" call parallel_stop(__FILE__, __LINE__) end if @@ -5008,7 +5048,7 @@ subroutine distributed_scatter_var_col_real8_2d(values, global_values, parallel) end associate ! 
automatic deallocation - end subroutine distributed_scatter_var_col_real8_2d + end subroutine scatter_var_col_real8_2d !======================================================================= @@ -6047,7 +6087,7 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d) integer, dimension(:,:,:), intent(in), optional :: mask_3d integer :: i, j, k - integer :: kmax + integer :: nz integer, dimension(size(a,1),parallel%local_ewn,parallel%local_nsn) :: mask integer :: local_sum integer :: parallel_global_sum_integer_3d @@ -6056,7 +6096,7 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d) local_ewn => parallel%local_ewn, & local_nsn => parallel%local_nsn) - kmax = size(a,1) + nz = size(a,1) if (present(mask_3d)) then mask = mask_3d @@ -6067,7 +6107,7 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d) local_sum = 0 do j = nhalo+1, local_nsn-nhalo do i = nhalo+1, local_ewn-nhalo - do k = 1, kmax + do k = 1, nz if (mask(k,i,j) == 1) then local_sum = local_sum + a(k,i,j) endif @@ -6082,30 +6122,30 @@ end function parallel_global_sum_integer_3d !======================================================================= - function parallel_global_sum_real4_2d(a, parallel, mask_2d) + function parallel_global_sum_integer8_2d(a, parallel, mask_2d) - ! Calculates the global sum of a 2D single-precision field + ! 
Calculates the global sum of a 2D integer(i8) field - real(sp),dimension(:,:),intent(in) :: a + integer(i8), dimension(:,:),intent(in) :: a type(parallel_type) :: parallel integer, dimension(:,:), intent(in), optional :: mask_2d integer :: i, j integer, dimension(parallel%local_ewn,parallel%local_nsn) :: mask - real(sp) :: local_sum - real(sp) :: parallel_global_sum_real4_2d + integer(i8) :: local_sum + integer(i8) :: parallel_global_sum_integer8_2d associate( & local_ewn => parallel%local_ewn, & local_nsn => parallel%local_nsn) - if (present(mask_2d)) then + if (present(mask_2d)) then mask = mask_2d else mask = 1 endif - local_sum = 0.0 + local_sum = 0 do j = nhalo+1, local_nsn-nhalo do i = nhalo+1, local_ewn-nhalo if (mask(i,j) == 1) then @@ -6113,11 +6153,11 @@ function parallel_global_sum_real4_2d(a, parallel, mask_2d) endif enddo enddo - parallel_global_sum_real4_2d = parallel_reduce_sum(local_sum) + parallel_global_sum_integer8_2d = parallel_reduce_sum(local_sum) end associate - end function parallel_global_sum_real4_2d + end function parallel_global_sum_integer8_2d !======================================================================= @@ -6134,9 +6174,17 @@ function parallel_global_sum_real8_2d(a, parallel, mask_2d) real(dp) :: local_sum real(dp) :: parallel_global_sum_real8_2d + ! variables for computing reproductible sums + integer :: nsummands, nflds ! 
dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + associate( & local_ewn => parallel%local_ewn, & - local_nsn => parallel%local_nsn) + local_nsn => parallel%local_nsn, & + own_ewn => parallel%own_ewn, & + own_nsn => parallel%own_nsn) if (present(mask_2d)) then mask = mask_2d @@ -6144,15 +6192,59 @@ function parallel_global_sum_real8_2d(a, parallel, mask_2d) mask = 1 endif - local_sum = 0.0d0 - do j = nhalo+1, local_nsn-nhalo - do i = nhalo+1, local_ewn-nhalo - if (mask(i,j) == 1) then - local_sum = local_sum + a(i,j) - endif + if (parallel%reprosum) then ! compute using parallel_reduce_reprosum + + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = own_ewn*own_nsn + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + count = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + count = count + 1 + if (mask(i,j) == 1) then + arr(count,1) = a(i,j) + else + arr(count,1) = 0.0d0 + endif + enddo enddo - enddo - parallel_global_sum_real8_2d = parallel_reduce_sum(local_sum) + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_real8_2d = arr_gsum(1) + + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum = 0.0d0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + local_sum = local_sum + a(i,j) + endif + enddo + enddo + + ! Compute the global sum + parallel_global_sum_real8_2d = parallel_reduce_sum(local_sum) + + endif ! 
reprosum end associate @@ -6160,318 +6252,768 @@ end function parallel_global_sum_real8_2d !======================================================================= - function parallel_global_sum_real8_3d(a, parallel, mask_3d) + function parallel_global_sum_real8_3d(a, parallel, mask_2d) ! Calculates the global sum of a 3D double-precision field ! Note: The vertical dimension should be the first dimension of the input field. real(dp), dimension(:,:,:),intent(in) :: a type(parallel_type) :: parallel - integer, dimension(:,:,:), intent(in), optional :: mask_3d + integer, dimension(:,:), intent(in), optional :: mask_2d integer :: i, j, k - integer :: kmax - integer, dimension(size(a,1),parallel%local_ewn,parallel%local_nsn) :: mask + integer :: nz + integer, dimension(parallel%local_ewn,parallel%local_nsn) :: mask real(dp) :: local_sum real(dp) :: parallel_global_sum_real8_3d + ! variables for computing reproductible sums + integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + associate( & local_ewn => parallel%local_ewn, & - local_nsn => parallel%local_nsn) + local_nsn => parallel%local_nsn, & + own_ewn => parallel%own_ewn, & + own_nsn => parallel%own_nsn) - kmax = size(a,1) + nz = size(a,1) - if (present(mask_3d)) then - mask = mask_3d + ! Note: The mask is 2D, since typically all layers in a column are either masked in or masked out + if (present(mask_2d)) then + mask = mask_2d else mask = 1 endif - local_sum = 0 - do j = nhalo+1, local_nsn-nhalo - do i = nhalo+1, local_ewn-nhalo - do k = 1, kmax - if (mask(k,i,j) == 1) then - local_sum = local_sum + a(k,i,j) + if (parallel%reprosum) then ! compute using cism_reprosum_calc + + ! 
Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = own_ewn*own_nsn*nz + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + count = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + do k = 1, nz + count = count + 1 + arr(count,1) = a(k,i,j) + enddo + else + do k = 1, nz + count = count + 1 + arr(count,1) = 0.0d0 + enddo endif enddo enddo - enddo - parallel_global_sum_real8_3d = parallel_reduce_sum(local_sum) - end associate + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif - end function parallel_global_sum_real8_3d + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) -!======================================================================= + parallel_global_sum_real8_3d = arr_gsum(1) - ! subroutines belonging to the parallel_global_sum_staggered interface - !TODO - Turn these into functions, analogous to the parallel_global_sum functions above. + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif - subroutine parallel_global_sum_staggered_3d_real8(& - nx, ny, & - nz, parallel, & - global_sum, & - work1, work2) + deallocate(arr) + deallocate(arr_gsum) - ! Sum one or two local arrays on the staggered grid, then take the global sum. + else ! compute using parallel_reduce_sum (not reproducible) - integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - nz ! number of vertical layers at which velocity is computed + local_sum = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + do k = 1, nz + local_sum = local_sum + a(k,i,j) + enddo + endif + enddo + enddo + parallel_global_sum_real8_3d = parallel_reduce_sum(local_sum) - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + endif ! 
reprosum - real(dp), intent(out) :: global_sum ! global sum - real(dp), intent(in), dimension(nz,nx-1,ny-1) :: work1 ! local array - real(dp), intent(in), dimension(nz,nx-1,ny-1), optional :: work2 ! local array + end associate - integer :: i, j, k - real(dp) :: local_sum + end function parallel_global_sum_real8_3d - integer :: & - staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid - staggered_jlo, staggered_jhi +!======================================================================= + ! subroutines belonging to the parallel_global_sum_patch interface - staggered_ilo = parallel%staggered_ilo - staggered_ihi = parallel%staggered_ihi - staggered_jlo = parallel%staggered_jlo - staggered_jhi = parallel%staggered_jhi + function parallel_global_sum_patch_integer_2d(a, npatch, patch_id, parallel) - local_sum = 0.d0 + ! Calculates the global sum of a 2D double-precision field over each + ! user-defined patch of the domain. + ! The number of patches = npatch. + ! Each cell has an integer ID assigning it to at most one patch. + ! If a cell has patch_id = 0, it belongs to no patches. - ! 
sum over locally owned velocity points + integer, dimension(:,:), intent(in) :: a + integer, intent(in) :: npatch + integer, dimension(:,:), intent(in) :: patch_id + type(parallel_type) :: parallel - if (present(work2)) then - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - do k = 1, nz - local_sum = local_sum + work1(k,i,j) + work2(k,i,j) - enddo - enddo - enddo - else - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - do k = 1, nz - local_sum = local_sum + work1(k,i,j) - enddo - enddo + integer :: i, j, np + integer, dimension(npatch) :: local_patch_sum + integer, dimension(npatch) :: parallel_global_sum_patch_integer_2d + + associate( & + local_ewn => parallel%local_ewn, & + local_nsn => parallel%local_nsn) + + local_patch_sum = 0 + + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + np = patch_id(i,j) + if (np > 0) then + local_patch_sum(np) = local_patch_sum(np) + a(i,j) + endif enddo - endif + enddo - ! take the global sum + parallel_global_sum_patch_integer_2d = parallel_reduce_sum(local_patch_sum) - global_sum = parallel_reduce_sum(local_sum) + end associate - end subroutine parallel_global_sum_staggered_3d_real8 + end function parallel_global_sum_patch_integer_2d !======================================================================= - subroutine parallel_global_sum_staggered_3d_real8_nvar(& - nx, ny, & - nz, parallel, & - global_sum, & - work1, work2) + function parallel_global_sum_patch_real8_2d(a, npatch, patch_id, parallel) - ! Sum one or two local arrays on the staggered grid, then take the global sum. + ! Calculates the global sum of a 2D double-precision field over each + ! user-defined patch of the domain. + ! The number of patches = npatch. + ! Each cell has an integer ID assigning it to at most one patch. + ! If a cell has patch_id = 0, it belongs to no patches. + !TODO - Add a reprosum option with npatch = nflds - integer, intent(in) :: & - nx, ny, & ! 
horizontal grid dimensions (for scalars) - nz ! number of vertical layers at which velocity is computed + real(dp), dimension(:,:), intent(in) :: a + integer, intent(in) :: npatch + integer, dimension(:,:), intent(in) :: patch_id + type(parallel_type) :: parallel - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + integer :: i, j, np + real(dp), dimension(npatch) :: local_patch_sum + real(dp), dimension(npatch) :: parallel_global_sum_patch_real8_2d - real(dp), intent(out), dimension(:) :: global_sum ! global sum + ! variables for computing reproductible sums + integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum - real(dp), intent(in), dimension(nz,nx-1,ny-1,size(global_sum)) :: work1 ! local array - real(dp), intent(in), dimension(nz,nx-1,ny-1,size(global_sum)), optional :: work2 ! local array + associate( & + local_ewn => parallel%local_ewn, & + local_nsn => parallel%local_nsn, & + own_ewn => parallel%own_ewn, & + own_nsn => parallel%own_nsn) - integer :: i, j, k, n, nvar - real(dp), dimension(size(global_sum)) :: local_sum + if (parallel%reprosum) then ! compute using cism_reprosum_calc - integer :: & - staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid - staggered_jlo, staggered_jhi + ! 
Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = own_ewn*own_nsn + nflds = npatch + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) - staggered_ilo = parallel%staggered_ilo - staggered_ihi = parallel%staggered_ihi - staggered_jlo = parallel%staggered_jlo - staggered_jhi = parallel%staggered_jhi + count = 0 + arr(:,:) = 0.0d0 - nvar = size(global_sum) + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + count = count + 1 + np = patch_id(i,j) + if (np > 0) then + arr(count,np) = a(i,j) + endif + enddo + enddo - local_sum(:) = 0.d0 + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif - do n = 1, nvar + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) - ! sum over locally owned velocity points + parallel_global_sum_patch_real8_2d(:) = arr_gsum(:) - if (present(work2)) then - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - do k = 1, nz - local_sum(n) = local_sum(n) + work1(k,i,j,n) + work2(k,i,j,n) - enddo - enddo - enddo - else - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - do k = 1, nz - local_sum(n) = local_sum(n) + work1(k,i,j,n) - enddo - enddo - enddo + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum endif - enddo ! nvar + deallocate(arr) + deallocate(arr_gsum) - ! take the global sum + else ! 
compute using parallel_reduce_sum (not reproducible) - global_sum(:) = parallel_reduce_sum(local_sum(:)) + local_patch_sum = 0.0d0 - end subroutine parallel_global_sum_staggered_3d_real8_nvar + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + np = patch_id(i,j) + if (np > 0) then + local_patch_sum(np) = local_patch_sum(np) + a(i,j) + endif + enddo + enddo -!======================================================================= + parallel_global_sum_patch_real8_2d = parallel_reduce_sum(local_patch_sum) - subroutine parallel_global_sum_staggered_2d_real8(& - nx, ny, & - parallel, & - global_sum, & - work1, work2) + endif ! reprosum - ! Sum one or two local arrays on the staggered grid, then take the global sum. + end associate - integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) + end function parallel_global_sum_patch_real8_2d - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication +!======================================================================= + ! subroutines belonging to the parallel_global_sum_stagger interface - real(dp), intent(out) :: global_sum ! global sum + function parallel_global_sum_stagger_real8_2d(arr1, parallel, arr2) - real(dp), intent(in), dimension(nx-1,ny-1) :: work1 ! local array - real(dp), intent(in), dimension(nx-1,ny-1), optional :: work2 ! local array + ! Calculate the global sum of a 2D double-precision field on the staggered grid + ! Similar to unstagged version, except it uses staggered_ilo/ihi/jlo/jhi + + real(dp), dimension(:,:), intent(in) :: arr1 + type(parallel_type) :: parallel + real(dp), dimension(:,:), intent(in), optional :: arr2 integer :: i, j real(dp) :: local_sum + real(dp) :: parallel_global_sum_stagger_real8_2d integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + ! variables for computing reproductible sums + integer :: nsummands, nflds ! 
dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + + !TODO - associate staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - local_sum = 0.d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc + + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + if (present(arr2)) then ! compute global sum of arr1 + arr2 - ! sum over locally owned velocity points + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,1) = arr1(i,j) + arr2(i,j) + enddo + enddo - if (present(work2)) then - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - local_sum = local_sum + work1(i,j) + work2(i,j) + else ! compute global sum of arr1 + + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,1) = arr1(i,j) + enddo enddo - enddo - else - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - local_sum = local_sum + work1(i,j) + + endif + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_2d = arr_gsum(1) + + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum = 0.0d0 + + if (present(arr2)) then ! 
compute global sum of arr1 + arr2 + + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum = local_sum + arr1(i,j) + arr2(i,j) + enddo enddo - enddo - endif - ! take the global sum + else ! compute global sum of arr1 - global_sum = parallel_reduce_sum(local_sum) + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum = local_sum + arr1(i,j) + enddo + enddo - end subroutine parallel_global_sum_staggered_2d_real8 + endif -!======================================================================= + parallel_global_sum_stagger_real8_2d = parallel_reduce_sum(local_sum) - subroutine parallel_global_sum_staggered_2d_real8_nvar(& - nx, ny, & - parallel, & - global_sum, & - work1, work2) + endif ! reprosum - ! Sum one or two local arrays on the staggered grid, then take the global sum. + end function parallel_global_sum_stagger_real8_2d - integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) +!======================================================================= - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + function parallel_global_sum_stagger_real8_3d(arr1, parallel, arr2) - real(dp), intent(out), dimension(:) :: & - global_sum ! global sum + ! Calculate the global sum of a 3D double-precision field on the staggered grid + ! Assumes k is the first index, followed by i and j - real(dp), intent(in), dimension(nx-1,ny-1,size(global_sum)) :: work1 ! local array - real(dp), intent(in), dimension(nx-1,ny-1,size(global_sum)), optional :: work2 ! local array + real(dp), dimension(:,:,:), intent(in) :: arr1 + type(parallel_type) :: parallel + real(dp), dimension(:,:,:), intent(in), optional :: arr2 - integer :: i, j, n, nvar + integer :: i, j, k, nz + real(dp) :: local_sum + real(dp) :: parallel_global_sum_stagger_real8_3d - real(dp), dimension(size(global_sum)) :: local_sum + ! variables for computing reproductible sums + integer :: nsummands, nflds ! 
dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + !TODO - Associate these variables (and not the ones above) staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - nvar = size(global_sum) + nz = size(arr1,1) + + if (parallel%reprosum) then ! compute using cism_reprosum_calc - local_sum(:) = 0.d0 + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) * nz + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) - do n = 1, nvar + arr(:,:) = 0.0d0 - ! sum over locally owned velocity points + if (present(arr2)) then ! compute global sum of arr1 + arr2 - if (present(work2)) then + count = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - local_sum(n) = local_sum(n) + work1(i,j,n) + work2(i,j,n) + do k = 1, nz + count = count + 1 + arr(count,1) = arr1(k,i,j) + arr2(k,i,j) + enddo enddo enddo - else + + else ! compute global sum of arr1 + + count = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - local_sum(n) = local_sum(n) + work1(i,j,n) + do k = 1, nz + count = count + 1 + arr(count,1) = arr1(k,i,j) + enddo enddo enddo - endif - enddo ! nvar - - ! take the global sum + endif - global_sum(:) = parallel_reduce_sum(local_sum(:)) + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif - end subroutine parallel_global_sum_staggered_2d_real8_nvar + ! 
Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) -!======================================================================= + parallel_global_sum_stagger_real8_3d = arr_gsum(1) - ! functions belonging to the parallel_is_zero interface + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif - function parallel_is_zero_integer_2d(a) + deallocate(arr) + deallocate(arr_gsum) - ! returns .true. if the field has all zero values, else returns .false. + else ! compute using parallel_reduce_sum (not reproducible) - integer, dimension(:,:), intent(in) :: a - logical :: parallel_is_zero_integer_2d + local_sum = 0.0d0 - real(dp) :: maxval_a + if (present(arr2)) then ! compute global sum of arr1 + arr2 - maxval_a = maxval(abs(a)) - maxval_a = parallel_reduce_max(maxval_a) + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + local_sum = local_sum + arr1(k,i,j) + arr2(k,i,j) + enddo + enddo + enddo + + else ! compute global sum of arr1 + + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + local_sum = local_sum + arr1(k,i,j) + enddo + enddo + enddo + + endif + + parallel_global_sum_stagger_real8_3d = parallel_reduce_sum(local_sum) + + endif ! reprosum + + end function parallel_global_sum_stagger_real8_3d + +!======================================================================= + + function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) + + ! Sum one or two local arrays on the staggered grid, then take the global sum. + ! The final index is equal to the number of independent fields to be summed. + !TODO - Don't have to pass in nflds, since it equals size(a,3)? + + real(dp), dimension(:,:,:), intent(in) :: arr1 + + integer, intent(in) :: nflds + + type(parallel_type), intent(in) :: & + parallel ! 
info for parallel communication + + real(dp), dimension(:,:,:), intent(in), optional :: arr2 + + real(dp), dimension(size(arr1,3)) :: parallel_global_sum_stagger_real8_2d_nflds + + integer :: i, j, n + + real(dp), dimension(size(arr1,3)) :: local_sum + + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + ! variables for computing reproductible sums + integer :: nsummands ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + + if (parallel%reprosum) then ! compute using cism_reprosum_calc + + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + do n = 1, nflds + + if (present(arr2)) then ! compute global sum of arr1 + arr2 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,n) = arr1(i,j,n) + arr2(i,j,n) + enddo + enddo + else ! compute global sum of arr1 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,n) = arr1(i,j,n) + enddo + enddo + endif + + enddo ! nflds + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_2d_nflds = arr_gsum(:) + + if (verbose_reprosum .and. main_task) then +! 
write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum(:) = 0.d0 + + do n = 1, nflds + + ! sum over locally owned velocity points + + if (present(arr2)) then + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum(n) = local_sum(n) + arr1(i,j,n) + arr2(i,j,n) + enddo + enddo + else + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum(n) = local_sum(n) + arr1(i,j,n) + enddo + enddo + endif + + enddo ! nflds + + parallel_global_sum_stagger_real8_2d_nflds = parallel_reduce_sum(local_sum(:)) + + endif ! reprosum + + end function parallel_global_sum_stagger_real8_2d_nflds + +!======================================================================= + + function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) + + ! Sum one or two local arrays on the staggered grid, then take the global sum. + ! Assumes k is the first index, followed by i and j. + ! The final index is equal to the number of independent fields to be summed. + + real(dp), dimension(:,:,:,:), intent(in) :: arr1 + + integer, intent(in) :: nflds ! size of final index; number of global sums to be computed + + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + + real(dp), dimension(:,:,:,:), intent(in), optional :: arr2 + + real(dp), dimension(size(arr1,4)) :: parallel_global_sum_stagger_real8_3d_nflds + + integer :: i, j, k, n, nz + + real(dp), dimension(size(arr1,4)) :: local_sum + + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + ! variables for computing reproductible sums + integer :: nsummands ! 
dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + + !TODO - Associate these variables + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + + nz = size(arr1,1) + + if (parallel%reprosum) then ! compute using cism_reprosum_calc + + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) * nz + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + do n = 1, nflds + + if (present(arr2)) then ! compute global sum of arr1 + arr2 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + count = count + 1 + arr(count,n) = arr1(k,i,j,n) + arr2(k,i,j,n) + enddo + enddo + enddo + else ! compute global sum of arr1 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + count = count + 1 + arr(count,n) = arr1(k,i,j,n) + enddo + enddo + enddo + endif + + enddo ! nflds + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_3d_nflds = arr_gsum(:) + + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum(:) = 0.d0 + + do n = 1, nflds + + ! 
sum over locally owned velocity points + + if (present(arr2)) then + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + local_sum(n) = local_sum(n) + arr1(k,i,j,n) + arr2(k,i,j,n) + enddo + enddo + enddo + else + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + local_sum(n) = local_sum(n) + arr1(k,i,j,n) + enddo + enddo + enddo + endif + + enddo ! nflds + + parallel_global_sum_stagger_real8_3d_nflds = parallel_reduce_sum(local_sum(:)) + + endif ! reprosum + + end function parallel_global_sum_stagger_real8_3d_nflds + +!======================================================================= + ! functions belonging to the parallel_is_zero interface + + function parallel_is_zero_integer_1d(a) + + ! returns .true. if the field has all zero values, else returns .false. + + integer, dimension(:), intent(in) :: a + logical :: parallel_is_zero_integer_1d + + integer :: maxval_a + + maxval_a = maxval(abs(a)) + maxval_a = parallel_reduce_max(maxval_a) + if (maxval_a > 0) then + parallel_is_zero_integer_1d = .false. + else + parallel_is_zero_integer_1d = .true. + endif + + end function parallel_is_zero_integer_1d + +!======================================================================= + + function parallel_is_zero_real8_1d(a) + + ! returns .true. if the field has all zero values, else returns .false. + + real(dp), dimension(:), intent(in) :: a + logical :: parallel_is_zero_real8_1d + + real(dp) :: maxval_a + + maxval_a = maxval(abs(a)) + maxval_a = parallel_reduce_max(maxval_a) + if (maxval_a > 0.0d0) then + parallel_is_zero_real8_1d = .false. + else + parallel_is_zero_real8_1d = .true. + endif + + end function parallel_is_zero_real8_1d + +!======================================================================= + + function parallel_is_zero_integer_2d(a) + + ! returns .true. if the field has all zero values, else returns .false. 
+ + integer, dimension(:,:), intent(in) :: a + logical :: parallel_is_zero_integer_2d + + integer :: maxval_a + + maxval_a = maxval(abs(a)) + maxval_a = parallel_reduce_max(maxval_a) if (maxval_a > 0) then parallel_is_zero_integer_2d = .false. else @@ -7069,26 +7611,35 @@ subroutine parallel_halo_real4_2d(a, parallel) end subroutine parallel_halo_real4_2d - subroutine parallel_halo_real8_2d(a, parallel, periodic_offset_ew, periodic_offset_ns) + subroutine parallel_halo_real8_2d(a, parallel, & + periodic_offset_ew, periodic_offset_ns, zero_global_boundary_no_ice_bc) - !WHL - added optional arguments for periodic offsets, to support ismip-hom test cases + ! Added optional arguments for periodic offsets, to support ismip-hom test cases + ! Also added an optional argument related to the no_ice BCs use mpi_mod implicit none real(dp),dimension(:,:) :: a type(parallel_type) :: parallel + real(dp), intent(in), optional :: & - periodic_offset_ew, &! offset halo values by this amount - ! if positive, the offset is positive for W halo, negative for E halo - periodic_offset_ns ! offset halo values by this amount - ! if positive, the offset is positive for S halo, negative for N halo + periodic_offset_ew, &! offset halo values by this amount + ! if positive, the offset is positive for W halo, negative for E halo + periodic_offset_ns ! offset halo values by this amount + ! if positive, the offset is positive for S halo, negative for N halo + logical, intent(in), optional :: & + zero_global_boundary_no_ice_bc ! if true, then zero out values in grid cells adjacent + ! to the global boundary when using no_ice BCs + integer :: erequest,ierror,nrequest,srequest,wrequest real(dp),dimension(lhalo, parallel%local_nsn-lhalo-uhalo) :: esend,wrecv real(dp),dimension(uhalo, parallel%local_nsn-lhalo-uhalo) :: erecv,wsend real(dp),dimension(parallel%local_ewn, lhalo) :: nsend,srecv real(dp),dimension(parallel%local_ewn, uhalo) :: nrecv,ssend + logical :: zero_global_boundary_no_ice ! 
local version of zero_global_boundary_no_ice_bc + ! begin associate( & outflow_bc => parallel%outflow_bc, & @@ -7105,6 +7656,12 @@ subroutine parallel_halo_real8_2d(a, parallel, periodic_offset_ew, periodic_offs northwest_corner => parallel%northwest_corner & ) + if (present(zero_global_boundary_no_ice_bc)) then + zero_global_boundary_no_ice = zero_global_boundary_no_ice_bc + else + zero_global_boundary_no_ice = .true. + endif + ! staggered grid if (size(a,1)==local_ewn-1.and.size(a,2)==local_nsn-1) return @@ -7194,31 +7751,53 @@ subroutine parallel_halo_real8_2d(a, parallel, periodic_offset_ew, periodic_offs elseif (no_ice_bc) then - ! Set values to zero in cells adjacent to the global boundary; - ! includes halo cells and one row of locally owned cells + if (zero_global_boundary_no_ice) then - if (this_rank >= east) then ! at east edge of global domain - a(local_ewn-uhalo:,:) = 0.d0 - endif + ! Set values to zero in cells adjacent to the global boundary; + ! includes halo cells and one row of locally owned cells. - if (this_rank <= west) then ! at west edge of global domain - a(:lhalo+1,:) = 0.d0 - endif + if (this_rank >= east) then ! at east edge of global domain + a(local_ewn-uhalo:,:) = 0.d0 + endif - if (this_rank >= north) then ! at north edge of global domain - a(:,local_nsn-uhalo:) = 0.d0 - endif + if (this_rank <= west) then ! at west edge of global domain + a(:lhalo+1,:) = 0.d0 + endif - if (this_rank <= south) then ! at south edge of global domain - a(:,:lhalo+1) = 0.d0 - endif + if (this_rank >= north) then ! at north edge of global domain + a(:,local_nsn-uhalo:) = 0.d0 + endif - ! Some interior blocks have a single cell at a corner of the global boundary. - ! Set values in corner cells to zero, along with adjacent halo cells. 
- if (southwest_corner) a(:lhalo+1,:lhalo+1) = 0.d0 - if (southeast_corner) a(local_ewn-lhalo:,:lhalo+1) = 0.d0 - if (northeast_corner) a(local_ewn-lhalo:,local_nsn-lhalo:) = 0.d0 - if (northwest_corner) a(:lhalo+1,local_nsn-lhalo:) = 0.d0 + if (this_rank <= south) then ! at south edge of global domain + a(:,:lhalo+1) = 0.d0 + endif + + ! Some interior blocks have a single cell at a corner of the global boundary. + ! Set values in corner cells to zero, along with adjacent halo cells. + if (southwest_corner) a(:lhalo+1,:lhalo+1) = 0.d0 + if (southeast_corner) a(local_ewn-lhalo:,:lhalo+1) = 0.d0 + if (northeast_corner) a(local_ewn-lhalo:,local_nsn-lhalo:) = 0.d0 + if (northwest_corner) a(:lhalo+1,local_nsn-lhalo:) = 0.d0 + + else ! set values to zero in halo cells but not in locally owned cells + + if (this_rank >= east) then ! at east edge of global domain + a(local_ewn-uhalo+1:,:) = 0.d0 + endif + + if (this_rank <= west) then ! at west edge of global domain + a(:lhalo,:) = 0.d0 + endif + + if (this_rank >= north) then ! at north edge of global domain + a(:,local_nsn-uhalo+1:) = 0.d0 + endif + + if (this_rank <= south) then ! at south edge of global domain + a(:,:lhalo) = 0.d0 + endif + + endif ! zero_global_boundary_no_ice endif ! outflow or no_ice bc @@ -7474,6 +8053,165 @@ subroutine parallel_halo_real8_4d(a, parallel) end subroutine parallel_halo_real8_4d + + subroutine parallel_halo_integer8_4d(a, parallel) + + use mpi_mod + implicit none + integer(i8),dimension(:,:,:,:) :: a + type(parallel_type) :: parallel + + integer :: erequest,ierror,one,nrequest,srequest,wrequest + integer(i8),dimension(size(a,1), size(a,2), lhalo, parallel%local_nsn-lhalo-uhalo) :: esend,wrecv + integer(i8),dimension(size(a,1), size(a,2), uhalo, parallel%local_nsn-lhalo-uhalo) :: erecv,wsend + integer(i8),dimension(size(a,1), size(a,2), parallel%local_ewn, lhalo) :: nsend,srecv + integer(i8),dimension(size(a,1), size(a,2), parallel%local_ewn, uhalo) :: nrecv,ssend + + ! 
begin + associate( & + outflow_bc => parallel%outflow_bc, & + no_ice_bc => parallel%no_ice_bc, & + local_ewn => parallel%local_ewn, & + local_nsn => parallel%local_nsn, & + east => parallel%east, & + west => parallel%west, & + north => parallel%north, & + south => parallel%south, & + southwest_corner => parallel%southwest_corner, & + southeast_corner => parallel%southeast_corner, & + northeast_corner => parallel%northeast_corner, & + northwest_corner => parallel%northwest_corner & + ) + + ! staggered grid + if (size(a,3)==local_ewn-1.and.size(a,4)==local_nsn-1) return + + ! unknown grid + if (size(a,3)/=local_ewn.or.size(a,4)/=local_nsn) then + write(iulog,*) "Unknown Grid: Size a=(", size(a,1), ",", size(a,2), ",", size(a,3), ",", size(a,4), ") & + &and local_ewn and local_nsn = ", local_ewn, ",", local_nsn + call parallel_stop(__FILE__,__LINE__) + endif + + ! unstaggered grid + call mpi_irecv(wrecv,size(wrecv),mpi_real8,west,west,& + comm,wrequest,ierror) + call mpi_irecv(erecv,size(erecv),mpi_real8,east,east,& + comm,erequest,ierror) + call mpi_irecv(srecv,size(srecv),mpi_real8,south,south,& + comm,srequest,ierror) + call mpi_irecv(nrecv,size(nrecv),mpi_real8,north,north,& + comm,nrequest,ierror) + + esend(:,:,:,:) = & + a(:,:,local_ewn-uhalo-lhalo+1:local_ewn-uhalo,1+lhalo:local_nsn-uhalo) + call mpi_send(esend,size(esend),mpi_real8,east,this_rank,comm,ierror) + wsend(:,:,:,:) = a(:,:,1+lhalo:1+lhalo+uhalo-1,1+lhalo:local_nsn-uhalo) + call mpi_send(wsend,size(wsend),mpi_real8,west,this_rank,comm,ierror) + + call mpi_wait(wrequest,mpi_status_ignore,ierror) + a(:,:,:lhalo,1+lhalo:local_nsn-uhalo) = wrecv(:,:,:,:) + call mpi_wait(erequest,mpi_status_ignore,ierror) + a(:,:,local_ewn-uhalo+1:,1+lhalo:local_nsn-uhalo) = erecv(:,:,:,:) + + nsend(:,:,:,:) = a(:,:,:,local_nsn-uhalo-lhalo+1:local_nsn-uhalo) + call mpi_send(nsend,size(nsend),mpi_real8,north,this_rank,comm,ierror) + ssend(:,:,:,:) = a(:,:,:,1+lhalo:1+lhalo+uhalo-1) + call 
mpi_send(ssend,size(ssend),mpi_real8,south,this_rank,comm,ierror) + + call mpi_wait(srequest,mpi_status_ignore,ierror) + a(:,:,:,:lhalo) = srecv(:,:,:,:) + call mpi_wait(nrequest,mpi_status_ignore,ierror) + a(:,:,:,local_nsn-uhalo+1:) = nrecv(:,:,:,:) + + if (outflow_bc) then ! set values in global halo to zero + ! interior halo cells should not be affected + + if (this_rank >= east) then ! at east edge of global domain + a(:,:,local_ewn-uhalo+1:,:) = 0 + endif + + if (this_rank <= west) then ! at west edge of global domain + a(:,:,:lhalo,:) = 0 + endif + + if (this_rank >= north) then ! at north edge of global domain + a(:,:,:,local_nsn-uhalo+1:) = 0 + endif + + if (this_rank <= south) then ! at south edge of global domain + a(:,:,:,:lhalo) = 0 + endif + + elseif (no_ice_bc) then + + ! Set values to zero in cells adjacent to the global boundary; + ! includes halo cells and one row of locally owned cells + + if (this_rank >= east) then ! at east edge of global domain + a(:,:,local_ewn-uhalo:,:) = 0 + endif + + if (this_rank <= west) then ! at west edge of global domain + a(:,:,:lhalo+1,:) = 0 + endif + + if (this_rank >= north) then ! at north edge of global domain + a(:,:,:,local_nsn-uhalo:) = 0 + endif + + if (this_rank <= south) then ! at south edge of global domain + a(:,:,:,:lhalo+1) = 0 + endif + + ! Some interior blocks have a single cell at a corner of the global boundary. + ! Set values in corner cells to zero, along with adjacent halo cells. + if (southwest_corner) a(:,:,:lhalo+1,:lhalo+1) = 0 + if (southeast_corner) a(:,:,local_ewn-lhalo:,:lhalo+1) = 0 + if (northeast_corner) a(:,:,local_ewn-lhalo:,local_nsn-lhalo:) = 0 + if (northwest_corner) a(:,:,:lhalo+1,local_nsn-lhalo:) = 0 + + endif ! outflow or no_ice bc + + end associate + + end subroutine parallel_halo_integer8_4d + +!======================================================================= + + ! 
subroutines for 1D halo updates + + subroutine parallel_halo_extrapolate_real8_1d(a, parallel, interval_in) + + !Note: Extrapolate a 1D real8 variable into halo cells to the east and west. + ! Currently used only to compute halo values for grid cell coordinates. + + use mpi_mod + implicit none + real(dp),dimension(:) :: a + type(parallel_type) :: parallel + real(dp),intent(in), optional :: & + interval_in ! uniform difference between adjacent values, e.g. grid cell size dew or dns + + integer :: i + real(dp) :: interval ! local version of interval_in + + if (present(interval_in)) then + interval = interval_in + else + interval = 0.0d0 + endif + + do i = 1, lhalo + a(i) = a(lhalo+1) - interval*(lhalo+1-i) + enddo + + do i = size(a)-uhalo+1, size(a) + a(i) = a(size(a)-uhalo) + interval*(uhalo+i-size(a)) + enddo + + end subroutine parallel_halo_extrapolate_real8_1d + !======================================================================= ! subroutines belonging to the parallel_halo_extrapolate interface @@ -8681,6 +9419,23 @@ function parallel_reduce_sum_integer(x) end function parallel_reduce_sum_integer + function parallel_reduce_sum_integer8(x) + + use mpi_mod + implicit none + integer(i8) :: x + + integer :: ierror + integer(i8) :: recvbuf,sendbuf, parallel_reduce_sum_integer8 + + ! begin + sendbuf = x + call mpi_allreduce(sendbuf,recvbuf,1,mpi_integer8,mpi_sum,comm,ierror) + parallel_reduce_sum_integer8 = recvbuf + + end function parallel_reduce_sum_integer8 + + function parallel_reduce_sum_real4(x) use mpi_mod @@ -8701,13 +9456,20 @@ end function parallel_reduce_sum_real4 function parallel_reduce_sum_real8(x) use mpi_mod + implicit none + real(dp) :: x integer :: ierror real(dp) :: recvbuf,sendbuf, parallel_reduce_sum_real8 + ! Input and output arguments for subroutine cism_reprosum_calc + real(dp), dimension(:,:), allocatable :: arr ! array to be summed over processors + real(dp), dimension(:), allocatable :: arr_gsum ! global sum of arr + ! 
begin + sendbuf = x call mpi_allreduce(sendbuf,recvbuf,1,mpi_real8,mpi_sum,comm,ierror) parallel_reduce_sum_real8 = recvbuf @@ -8742,14 +9504,128 @@ function parallel_reduce_sum_real8_nvar(x) integer :: ierror, nvar real(dp), dimension(size(x)) :: recvbuf,sendbuf, parallel_reduce_sum_real8_nvar + ! Input and output arguments for subroutine cism_reprosum_calc + real(dp), dimension(:,:), allocatable :: arr ! array to be summed over processors + real(dp), dimension(:), allocatable :: arr_gsum ! global sum of arr + ! begin nvar = size(x) + sendbuf = x call mpi_allreduce(sendbuf,recvbuf,nvar,mpi_real8,mpi_sum,comm,ierror) parallel_reduce_sum_real8_nvar = recvbuf end function parallel_reduce_sum_real8_nvar +!======================================================================= + + subroutine parallel_reduce_reprosum(arr, arr_gsum) + + ! Compute a reproducible global sum for a floating-point variable or array. + ! Can be called from parallel_global_sum, parallel_global_sum_patch, or + ! parallel_global_sum_stagger. + + implicit none + + real(dp), dimension(:,:), intent(in) :: arr + real(dp), dimension(:), intent(out) :: arr_gsum + + ! Notes on subroutine cism_reprosum_calc: + ! The first five arguments are required: arr(dsummands,nflds), arr_gsum(dsummands), + ! dsummands, dflds and nsummands. Typically, nsummands = dsummands = number of local values. + ! We use the default fixed-precision algorithm (instead of ddpdd, which apparently is less robust). + ! We do not allow Inf or NaN values in the input array. + ! Typically, the algorithm calls mpi_allreduce twice. By passing in both arr_gbl_max and arr_max_levels, + ! it may be possible to call mpi_allreduce just once, improving performance. + ! If we don't pass these arguments, then arr_gbl_max and arr_max_levels are computed internally.. + ! By passing arr_glb_max_out and arr_max_levels_out, we can see the calculated values. + ! By passing rel_diff, we can verify that the computed reproducible sum is close + ! 
to the (nonreproducible) floating-point value. + ! +!! ! commid ! MPI communicator + ! See comments in cism_reprosum_calc for more info + + ! Required arguments + + integer :: dsummands, nflds ! dimensions of arr + integer :: nsummands ! number of processors + + ! Optional arguments + + real(dp), dimension(:), allocatable :: & + arr_gbl_max, & ! upper bound on max(abs(arr)) + arr_gbl_max_out ! calculated upper bound on max(abs(arr)) + + real(dp), dimension(:,:), allocatable :: & + rel_diff ! relative and absolute differences between fixed and floating point sums + + integer, dimension(:), allocatable :: & + arr_max_levels, & ! maximum number of levels of integer expansion to use + arr_max_levels_out ! output of number of levels of integer expansion to use + + integer :: & + gbl_max_nsummands, & ! maximum of nsummand over all processes + gbl_max_nsummands_out ! calculated maximum nsummands over all processes + + integer, dimension(6) :: & + repro_sum_stats(6) ! increment running totals for + ! (1) one-reduction repro_sum + ! (2) two-reduction repro_sum + ! (3) both types in one call + ! (4) nonrepro_sum + ! (5) global max nsummands reduction + ! (6) global lor 3*nflds reduction + + logical :: & + ddpdd_sum, & ! use ddpdd algorithm instead of fixed-precision algorithm + repro_sum_validate ! flag enabling/disabling testing that gmax and max_levels + ! are accurate/sufficient. Default is enabled. + + ! Set parameters and allocate arrays + dsummands = size(arr,1) + nflds = size(arr,2) + nsummands = dsummands + + allocate (arr_gbl_max(nflds)) + allocate (arr_gbl_max_out(nflds)) + allocate (arr_max_levels(nflds)) + allocate (arr_max_levels_out(nflds)) + allocate (rel_diff(2,nflds)) + + ddpdd_sum = .false. + repro_sum_validate = .true. + + ! The following subroutine is adapted from shr_reprosum_calc in CESM shared code. + + call cism_reprosum_calc(& + arr, arr_gsum, & + nsummands, dsummands, nflds, & + ddpdd_sum = ddpdd_sum, & +! 
arr_gbl_max = arr_gbl_max, & + arr_gbl_max_out = arr_gbl_max_out, & +! arr_max_levels = arr_max_levels, & + arr_max_levels_out = arr_max_levels_out, & +! gbl_max_nsummands = gbl_max_nsummands, & + gbl_max_nsummands_out = gbl_max_nsummands_out, & + repro_sum_validate = repro_sum_validate, & + repro_sum_stats = repro_sum_stats, & + rel_diff = rel_diff) + + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gbl_max_out =', arr_gbl_max_out +! write(iulog,*) 'arr_max_levels_out =', arr_max_levels_out +! write(iulog,*) 'gbl_max_nsummands_out =', gbl_max_nsummands_out +! write(iulog,*) 'rel diff =', rel_diff(1,:) +! write(iulog,*) 'abs diff =', rel_diff(2,:) +! write(iulog,*) 'stats =', repro_sum_stats(:) + endif + + deallocate(arr_gbl_max, arr_gbl_max_out) + deallocate(arr_max_levels, arr_max_levels_out) + deallocate(rel_diff) + + end subroutine parallel_reduce_reprosum + !======================================================================= ! functions belonging to the parallel_reduce_max interface @@ -9122,7 +9998,7 @@ subroutine parallel_test_comm_row_col(parallel) enddo endif ! this_rank - call distributed_gather_var_row(test_array, global_test_array, parallel) + call gather_var_row(test_array, global_test_array, parallel) !! if (parallel%main_task_row) then if (parallel%main_task_row .and. this_rank == 0) then @@ -9136,7 +10012,7 @@ subroutine parallel_test_comm_row_col(parallel) write(iulog,*) ' ' endif - call distributed_scatter_var_row(test_array, global_test_array, parallel) + call scatter_var_row(test_array, global_test_array, parallel) if (this_rank == 0) then write(iulog,*) ' ' @@ -9176,7 +10052,7 @@ subroutine parallel_test_comm_row_col(parallel) enddo endif ! this_rank - call distributed_gather_var_col(test_array, global_test_array, parallel) + call gather_var_col(test_array, global_test_array, parallel) !! if (parallel%main_task_col) then if (parallel%main_task_col .and. 
this_rank == 0) then @@ -9190,7 +10066,7 @@ subroutine parallel_test_comm_row_col(parallel) write(iulog,*) ' ' endif - call distributed_scatter_var_col(test_array, global_test_array, parallel) + call scatter_var_col(test_array, global_test_array, parallel) if (this_rank == 0) then write(iulog,*) ' ' diff --git a/libglimmer/profile.F90 b/libglimmer/profile.F90 index f88ee51c..92840b14 100644 --- a/libglimmer/profile.F90 +++ b/libglimmer/profile.F90 @@ -35,8 +35,6 @@ module profile #if (defined CCSMCOUPLED || defined CESMTIMERS) use perf_mod - !TODO - Add an 'only' for 'use cism_parallel'? - use cism_parallel #endif use glimmer_global, only: dp @@ -89,16 +87,18 @@ end subroutine profile_init !> register a new series of meassurements function profile_register(prof,msg) - use glimmer_log + use glimmer_paramets, only: iulog + use mpi implicit none type(profile_type) :: prof !< structure storing profile definitions character(len=*), intent(in) :: msg !< the message to be associated integer profile_register + integer :: ierr prof%nump = prof%nump+1 if (prof%nump > max_prof) then - call write_log('Maximum number of profiles reached',type=GM_FATAL, & - file=__FILE__,line=__LINE__) + write(iulog,*) ('Maximum number of profiles reached') + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) end if profile_register = prof%nump prof%pname(prof%nump) = trim(msg) diff --git a/libglint/glint_initialise.F90 b/libglint/glint_initialise.F90 index a94a105e..79cd9038 100644 --- a/libglint/glint_initialise.F90 +++ b/libglint/glint_initialise.F90 @@ -607,7 +607,7 @@ subroutine setup_lgrid_fulldomain(instance, grid, grid_orog) use glint_global_grid , only : global_grid use glimmer_coordinates, only : coordsystem_new use glide_types , only : get_dew, get_dns - use cism_parallel , only : parallel_type, distributed_gather_var + use cism_parallel , only : parallel_type, gather_var implicit none @@ -630,7 +630,7 @@ subroutine setup_lgrid_fulldomain(instance, grid, grid_orog) global_ewn = 
instance%model%parallel%global_ewn global_nsn = instance%model%parallel%global_nsn - call distributed_gather_var(instance%out_mask, out_mask_fulldomain, parallel) + call gather_var(instance%out_mask, out_mask_fulldomain, parallel) if (main_task) then diff --git a/libglint/glint_interp.F90 b/libglint/glint_interp.F90 index 83491b99..5ef84eb0 100644 --- a/libglint/glint_interp.F90 +++ b/libglint/glint_interp.F90 @@ -248,7 +248,7 @@ subroutine interp_to_local (lgrid_fulldomain, global, & use glimmer_utils use glimmer_coordinates use glimmer_log - use cism_parallel, only : main_task, parallel_type, distributed_scatter_var, parallel_halo + use cism_parallel, only : main_task, parallel_type, scatter_var, parallel_halo !TODO - Not sure we need localsp now that the code is fully double precision @@ -299,9 +299,9 @@ subroutine interp_to_local (lgrid_fulldomain, global, & ! Allocate variables to hold result of interpolation ! We allocate size 0 arrays on non-main task (rather than leaving variables - ! unallocated there), because distributed_scatter_var tries to do a deallocate on all tasks + ! unallocated there), because scatter_var tries to do a deallocate on all tasks ! Note that coordsystem_allocate can't be used here because it only works on pointer - ! variables, and the *_fulldomain variables are non-pointers (as is required for distributed_scatter_var) + ! variables, and the *_fulldomain variables are non-pointers (as is required for scatter_var) if (present(localsp)) then if (main_task) then @@ -450,25 +450,25 @@ subroutine interp_to_local (lgrid_fulldomain, global, & end if ! main_task ! Main task scatters interpolated data from the full domain to the task owning each point - ! Note that distributed_scatter_var doesn't set halo values, so we need to do a halo + ! Note that scatter_var doesn't set halo values, so we need to do a halo ! update if it's important to have correct values in the halo cells. ! 
Although it's not strictly necessary to have the halo values, we compute them just in ! case another part of the code (e.g., glissade_temp) assumes they are available. if (present(localsp)) then localsp(:,:) = 0.d0 - call distributed_scatter_var(localsp, localsp_fulldomain, parallel) + call scatter_var(localsp, localsp_fulldomain, parallel) call parallel_halo(localsp, parallel) endif if (present(localdp)) then localdp(:,:) = 0.d0 - call distributed_scatter_var(localdp, localdp_fulldomain, parallel) + call scatter_var(localdp, localdp_fulldomain, parallel) call parallel_halo(localdp, parallel) endif ! We do NOT deallocate the local*_fulldomain variables here, because the - ! distributed_scatter_var routines do this deallocation + ! scatter_var routines do this deallocation end subroutine interp_to_local @@ -489,7 +489,7 @@ subroutine copy_to_local (lgrid_fulldomain, global, & ! on the main task. use glimmer_coordinates - use cism_parallel, only : main_task, parallel_type, distributed_scatter_var, parallel_halo + use cism_parallel, only : main_task, parallel_type, scatter_var, parallel_halo ! Argument declarations @@ -524,16 +524,16 @@ subroutine copy_to_local (lgrid_fulldomain, global, & end if ! Main task scatters interpolated data from the full domain to the task owning each point - ! Note that distributed_scatter_var doesn't set halo values, so we need to do a halo + ! Note that scatter_var doesn't set halo values, so we need to do a halo ! update if it's important to have correct values in the halo cells. ! Although it's not strictly necessary to have the halo values, we compute them just in ! case another part of the code (e.g., glissade_temp) assumes they are available. local(:,:) = 0.d0 - call distributed_scatter_var(local, local_fulldomain, parallel) + call scatter_var(local, local_fulldomain, parallel) call parallel_halo(local, parallel) - ! We do NOT deallocate local_fulldomain here, because the distributed_scatter_var + ! 
We do NOT deallocate local_fulldomain here, because the scatter_var ! routine does this deallocation end subroutine copy_to_local @@ -695,7 +695,7 @@ subroutine local_to_global_avg(ups, parallel, local, global, mask) !> \texttt{interp\_to\_local} routine. !> \end{itemize} - use cism_parallel, only : main_task, parallel_type, distributed_gather_var + use cism_parallel, only : main_task, parallel_type, gather_var ! Arguments @@ -729,8 +729,8 @@ subroutine local_to_global_avg(ups, parallel, local, global, mask) ! Gather 'local' and 'tempmask' onto main task, which is the only one that does the regridding - call distributed_gather_var(local, local_fulldomain, parallel) - call distributed_gather_var(tempmask, tempmask_fulldomain, parallel) + call gather_var(local, local_fulldomain, parallel) + call gather_var(tempmask, tempmask_fulldomain, parallel) ! Main task does regridding @@ -785,7 +785,7 @@ subroutine local_to_global_sum(ups, parallel, local, global, mask) !> \item \texttt{gboxn} is the same size as \texttt{global} !> \end{itemize} - use cism_parallel, only : main_task, parallel_type, distributed_gather_var + use cism_parallel, only : main_task, parallel_type, gather_var ! Arguments @@ -816,8 +816,8 @@ subroutine local_to_global_sum(ups, parallel, local, global, mask) ! Gather 'local' and 'tempmask' onto main task, which is the only one that does the regridding - call distributed_gather_var(local, local_fulldomain, parallel) - call distributed_gather_var(tempmask, tempmask_fulldomain, parallel) + call gather_var(local, local_fulldomain, parallel) + call gather_var(tempmask, tempmask_fulldomain, parallel) ! Main task does regridding if (main_task) then @@ -854,7 +854,7 @@ subroutine local_to_global_min(ups, parallel, local, global, mask) !> \item \texttt{gboxn} is the same size as \texttt{global} !> \end{itemize} - use cism_parallel, only : main_task, parallel_type, distributed_gather_var + use cism_parallel, only : main_task, parallel_type, gather_var ! 
Arguments @@ -885,8 +885,8 @@ subroutine local_to_global_min(ups, parallel, local, global, mask) ! Gather 'local' and 'tempmask' onto main task, which is the only one that does the regridding - call distributed_gather_var(local, local_fulldomain, parallel) - call distributed_gather_var(tempmask, tempmask_fulldomain, parallel) + call gather_var(local, local_fulldomain, parallel) + call gather_var(tempmask, tempmask_fulldomain, parallel) ! Main task does regridding if (main_task) then diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index 83eaec79..2f3acb4f 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -85,20 +85,19 @@ module glissade !======================================================================= ! Note: There is no glissade_config subroutine; glide_config works for all dycores. - +! glide_config is called from cism_init_dycore before glissade_initialise. !======================================================================= subroutine glissade_initialise(model, evolve_ice) ! initialise Glissade model instance - use cism_parallel, only: parallel_type, distributed_gather_var, & - distributed_scatter_var, parallel_finalise, & + use cism_parallel, only: parallel_type, parallel_finalise, & distributed_grid, distributed_grid_active_blocks, parallel_global_edge_mask, & - parallel_halo, parallel_halo_extrapolate, parallel_reduce_max, & + parallel_halo, parallel_halo_extrapolate, & staggered_parallel_halo_extrapolate, staggered_no_penetration_mask, & parallel_create_comm_row, parallel_create_comm_col, & - parallel_is_zero, not_parallel + parallel_reduce_max, parallel_is_zero, not_parallel use glide_setup use glimmer_ncio, only: openall_in, openall_out, glimmer_nc_get_var, glimmer_nc_get_dimlength @@ -218,11 +217,12 @@ subroutine glissade_initialise(model, evolve_ice) ! The subroutine will report how many tasks are needed to compute on all active blocks, and then abort. ! 
The user can then resubmit (on an optimal number of processors) with model%options%compute_blocks = ACTIVE_BLOCKS. - call distributed_grid_active_blocks(model%general%ewn, model%general%nsn, & - model%general%nx_block, model%general%ny_block, & - model%general%ice_domain_mask, & - model%parallel, & - inquire_only = .true.) + call distributed_grid_active_blocks(& + model%general%ewn, model%general%nsn, & + model%general%nx_block, model%general%ny_block, & + model%general%ice_domain_mask, & + model%parallel, & + inquire_only = .true.) else ! compute_blocks = ACTIVE_BLOCKS_ONLY @@ -235,10 +235,12 @@ subroutine glissade_initialise(model, evolve_ice) model%general%global_bc = GLOBAL_BC_NO_ICE endif - call distributed_grid_active_blocks(model%general%ewn, model%general%nsn, & - model%general%nx_block, model%general%ny_block, & - model%general%ice_domain_mask, & - model%parallel) + call distributed_grid_active_blocks(& + model%general%ewn, model%general%nsn, & + model%general%nx_block, model%general%ny_block, & + model%general%ice_domain_mask, & + model%parallel, & + reprosum_in = model%options%reproducible_sums) endif ! 
compute_blocks @@ -247,13 +249,19 @@ subroutine glissade_initialise(model, evolve_ice) elseif (model%general%global_bc == GLOBAL_BC_OUTFLOW) then - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'outflow') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'outflow') elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'no_ice') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'no_ice') elseif (model%general%global_bc == GLOBAL_BC_NO_PENETRATION) then @@ -261,15 +269,19 @@ subroutine glissade_initialise(model, evolve_ice) ! The difference is that we also use no-penetration masks for (uvel,vvel) at the global boundary ! (computed by calling staggered_no_penetration_mask below). - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'no_penetration') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'no_penetration') else ! global_bc = GLOBAL_BC_PERIODIC -! call distributed_grid(model%general%ewn, model%general%nsn, global_bc_in = 'periodic') - - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'periodic') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'periodic') endif @@ -354,8 +366,47 @@ subroutine glissade_initialise(model, evolve_ice) itest = model%numerics%idiag_local jtest = model%numerics%jdiag_local - ! 
Check that lat and lon fields were read in, if desired - !TODO - Use the parallel_is_nonzero function instead, here and below + ! Make sure the grid coordinates (x1,y1) and (x0,y0) have been read in. + ! If (x1,y1) have not been read in, then abort the run. + ! If (x0,y0) have not been read in, then compute them from (x1,y1). + ! Extrapolate these coordinates to halo cells as needed. + ! Note: The extrapolation works only on a regular grid. + !TODO - Put the following code in a subroutine. + + if (parallel_is_zero(model%general%x1)) then + call write_log('model%general%x1 = 0.0 everywhere', GM_FATAL) + else ! extrapolate x1 to halo cells + call parallel_halo_extrapolate(model%general%x1, parallel, model%numerics%dew) + endif + + if (parallel_is_zero(model%general%y1)) then + call write_log('model%general%y1 = 0.0 everywhere', GM_FATAL) + else ! extrapolate y1 to halo cells + call parallel_halo_extrapolate(model%general%y1, parallel, model%numerics%dns) + endif + + ! Check whether x0 and y0 were read in. If not, then compute them from x1 and y1. + if (parallel_is_zero(model%general%x0)) then + if (main_task) write(iulog,*) 'x0 not read in; initialize from x1' + do i = 1, model%general%ewn-1 + model%general%x0(i) = 0.5d0 * (model%general%x1(i) + model%general%x1(i+1)) + enddo + else + ! extrapolate x0 to halo cells + call parallel_halo_extrapolate(model%general%x0, parallel, model%numerics%dew) + endif + + if (parallel_is_zero(model%general%y0)) then + if (main_task) write(iulog,*) 'y0 not read in; initialize from y1' + do j = 1, model%general%nsn-1 + model%general%y0(j) = 0.5d0 * (model%general%y1(j) + model%general%y1(j+1)) + enddo + else + ! extrapolate y0 to halo cells + call parallel_halo_extrapolate(model%general%y0, parallel, model%numerics%dns) + endif + + ! 
Check that lat and lon fields were read in if (model%options%read_lat_lon) then if (parallel_is_zero(model%general%lat)) then call write_log('Failed to read latitude (lat) field from input file', GM_FATAL) @@ -574,50 +625,9 @@ subroutine glissade_initialise(model, evolve_ice) endif ! geothermal heat flux - ! If running with glaciers, then process the input glacier data - ! On start-up, this subroutine counts the glaciers. It should be called before glide_io_createall, - ! which needs to know nglacier to set up glacier output files with the right dimensions. - ! On restart, most of the required glacier arrays are in the restart file, and this subroutine - ! computes a few remaining variable. - - if (model%options%enable_glaciers) then - - ! Glaciers are run with a no-ice BC to allow removal of inactive regions. - ! This can be problematic when running in a sub-region that has glaciers along the global boundary. - ! A halo update here for 'thck' will remove ice from cells along the global boundary. - ! It is best to do this before initializing glaciers, so that ice that initially exists - ! in these cells is removed before computing the area and thickness targets. - !TODO - These calls are repeated a few lines below. Try moving them up, before the call - ! to glissade_glacier_init. I don't think it's possible to move the glissade_glacier_init call - ! down, because we need to compute nglacier before setting up output files. - - call parallel_halo(model%geometry%thck, parallel) - ! calculate the lower and upper ice surface - call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus, model%geometry%lsrf) - model%geometry%usrf = max(0.d0, model%geometry%thck + model%geometry%lsrf) - - ! Initialize glaciers - ! Note: This subroutine can return modified values of model%numerics%dew, model%numerics%dns, - ! and model%geometry%cell_area. - ! This is a fix to deal with the fact that actual grid cell dimensions can be different - ! 
from the nominal dimensions on a projected grid. - ! See comments near the top of glissade_glacier_init. - - call glissade_glacier_init(model, model%glacier) - - endif - - ! open all output files - call openall_out(model) - - ! create glide I/O variables - call glide_io_createall(model, model) - - ! initialize glissade components - ! Set some variables in halo cells ! Note: We need thck and artm in halo cells so that temperature will be initialized correctly - ! (if not read from input file). + ! (if not read from the input file). ! We do an update here for temp in case temp is read from an input file. ! If temp is computed below in glissade_init_therm (based on the value of options%temp_init), ! then the halos will receive the correct values. @@ -626,37 +636,36 @@ subroutine glissade_initialise(model, evolve_ice) call parallel_halo(model%climate%artm, parallel) call parallel_halo(model%temper%temp, parallel) call parallel_halo(model%temper%tempunstag, parallel) - - ! calculate the lower and upper ice surface - call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus, model%geometry%lsrf) - model%geometry%usrf = max(0.d0, model%geometry%thck + model%geometry%lsrf) + if (model%options%whichtemp == TEMP_ENTHALPY) & + call parallel_halo(model%temper%waterfrac, parallel) ! Note: For outflow BCs, most fields (thck, usrf, temp, etc.) are set to zero in the global halo, ! to create ice-free conditions. However, we might not want to set topg = 0 in the global halo, ! because then the global halo will be interpreted as ice-free land, whereas we may prefer to ! treat it as ice-free ocean. For this reason, topg is extrapolated from adjacent cells. - ! Similarly, for no_ice BCs, we want to zero out ice state variables adjacent to the global boundary, + ! For no_ice BCs, we want to zero out ice state variables adjacent to the global boundary, ! but we do not want to zero out the topography. - ! 
Note: For periodic BCs, there is an optional argument periodic_offset_ew for topg. - ! This is for ismip-hom experiments. A positive EW offset means that + ! For periodic BCs, there are optional periodic_offset arguments for topg. + ! These are for ismip-hom experiments or similar geometries. A positive EW offset means that ! the topography in west halo cells will be raised, and the topography ! in east halo cells will be lowered. This ensures that the topography ! and upper surface elevation are continuous between halo cells ! and locally owned cells at the edge of the global domain. - ! In other cases (anything but ismip-hom), periodic_offset_ew = periodic_offset_ns = 0, - ! and this argument will have no effect. + ! After this call, topg does not need another halo update unless isostasy is active. - if (model%general%global_bc == GLOBAL_BC_OUTFLOW .or. & - model%general%global_bc == GLOBAL_BC_NO_ICE) then + if (model%general%global_bc == GLOBAL_BC_OUTFLOW) then call parallel_halo_extrapolate(model%geometry%topg, parallel) + elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then + call parallel_halo(model%geometry%topg, parallel, zero_global_boundary_no_ice_bc = .false.) else ! other global BCs, including periodic call parallel_halo(model%geometry%topg, parallel, & periodic_offset_ew = model%numerics%periodic_offset_ew, & periodic_offset_ns = model%numerics%periodic_offset_ns) endif - if (model%options%whichtemp == TEMP_ENTHALPY) & - call parallel_halo(model%temper%waterfrac, parallel) + ! calculate the lower and upper ice surface (will be correct in halos following the halo updates above) + call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus, model%geometry%lsrf) + model%geometry%usrf = max(0.d0, model%geometry%thck + model%geometry%lsrf) ! halo update for kinbcmask (= 1 where uvel and vvel are prescribed, elsewhere = 0) ! 
Note: Instead of assuming that kinbcmask is periodic, we extrapolate it into the global halo @@ -664,6 +673,34 @@ ! on the global staggered grid). call staggered_parallel_halo_extrapolate (model%velocity%kinbcmask, parallel) ! = 1 for Dirichlet BCs + if (model%options%enable_glaciers) then + + ! If running with glaciers, then process the input glacier data and initialize glacier arrays + + ! Note: On start-up, this subroutine counts the glaciers. It should be called before glide_io_createall, + ! which needs to know nglacier to set up glacier output files with the right dimensions. + ! On restart, most of the required glacier arrays are in the restart file, and this subroutine + ! computes a few remaining variables. + ! Note: Glaciers are usually run with a no-ice BC to allow removal of inactive regions. + ! This means that any grid cells adjacent to the global boundary are not handled correctly. + ! The preceding halo update for 'thck' removes ice from these grid cells. + ! Note: If glacier%length_scale_factor /= 1, this subroutine modifies the values of model%numerics%dew, + ! model%numerics%dns, model%geometry%cell_area, and the grid coordinates (x0,y0) and (x1,y1). + ! This is done if the true grid cell dimensions differ from the nominal dimensions on a projected grid. + ! See comments near the top of glissade_glacier_init. + + call glissade_glacier_init(model, model%glacier) + + endif + + ! open all output files + call openall_out(model) + + ! create glide I/O variables + call glide_io_createall(model, model) + + ! initialize glissade components + !TODO - Remove call to init_velo in glissade_initialise? !
Most of what's done in init_velo is needed for SIA only, but still need velowk for call to wvelintg call init_velo(model) @@ -815,8 +852,7 @@ subroutine glissade_initialise(model, evolve_ice) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) ! compute halo for relaxed topography ! Note: See comments above with regard to the halo update for topg. @@ -824,10 +860,11 @@ subroutine glissade_initialise(model, evolve_ice) ! adjacent to or beyond the global boundary. This is an appropriate treatment for ! ice state variables, but not for bed topography and related fields (like relx). !TODO - Is this halo update necessary? - if (model%general%global_bc == GLOBAL_BC_OUTFLOW .or. & - model%general%global_bc == GLOBAL_BC_NO_ICE) then + if (model%general%global_bc == GLOBAL_BC_OUTFLOW) then call parallel_halo_extrapolate(model%isostasy%relx, parallel) - else + elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then + call parallel_halo(model%isostasy%relx, parallel, zero_global_boundary_no_ice_bc = .false.) + else ! other global BCs, including periodic call parallel_halo(model%isostasy%relx, parallel) endif @@ -879,8 +916,7 @@ subroutine glissade_initialise(model, evolve_ice) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) endif ! initial calving @@ -907,7 +943,7 @@ subroutine glissade_initialise(model, evolve_ice) !TODO: Have a single option that is applied with or without glaciers enabled? if (model%options%which_ho_powerlaw_c == HO_POWERLAW_C_CONSTANT .or. 
& - model%options%is_restart == NO_RESTART) then + parallel_is_zero(model%basal_physics%powerlaw_c)) then if (model%options%enable_glaciers .and. & model%glacier%set_powerlaw_c /= GLACIER_POWERLAW_C_CONSTANT) then ! do nothing; see note above @@ -917,15 +953,29 @@ subroutine glissade_initialise(model, evolve_ice) endif ! Initialize coulomb_c - ! If inverting for coulomb_c, we read in the saved coulomb_c field on restart. - if (model%options%which_ho_coulomb_c == HO_COULOMB_C_CONSTANT .or. & - model%options%is_restart == NO_RESTART) then + ! Note: If inverting for coulomb_c, then coulomb_c is initialized here. + ! On restart, however, the saved coulomb_c (or alternatively, + ! coulomb_c_hi and coulomb_c_lo, for the elevation-based option) + ! should have been read from the restart file and is not reset here. + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_CONSTANT) then + + model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const + + else ! either inverting for coulomb_c or reading values from an input file + + if (model%options%elevation_based_coulomb_c) then ! need coulomb_c_hi and coulomb_c_lo - !TODO - Make sure the initialization is correct when reading from an external file in a restart. - if (model%options%elevation_based_coulomb_c) then + if (parallel_is_zero(model%basal_physics%coulomb_c_hi) .or. & + parallel_is_zero(model%basal_physics%coulomb_c_lo)) then - model%basal_physics%coulomb_c_hi = model%basal_physics%coulomb_c_const_hi - model%basal_physics%coulomb_c_lo = model%basal_physics%coulomb_c_const_lo + ! initialize to constants + model%basal_physics%coulomb_c_hi = model%basal_physics%coulomb_c_const_hi + model%basal_physics%coulomb_c_lo = model%basal_physics%coulomb_c_const_lo + + endif + + ! 
Given coulomb_c_hi and coulomb_c_lo, compute coulomb_c based on elevation call glissade_elevation_based_coulomb_c(& model%general%ewn, model%general%nsn, & @@ -937,17 +987,24 @@ subroutine glissade_initialise(model, evolve_ice) model%basal_physics%coulomb_c_bed_hi, & model%basal_physics%coulomb_c) - call parallel_halo(model%basal_physics%coulomb_c, parallel) + else ! coulomb_c not elevation-based + + if (parallel_is_zero(model%basal_physics%coulomb_c)) then + + ! initialize to constant + model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const - if (verbose_inversion) then - call point_diag(model%basal_physics%coulomb_c, 'Initial coulomb_c', itest, jtest, rtest, 7, 7) endif - else - model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const + endif ! elevation-based + + call parallel_halo(model%basal_physics%coulomb_c, parallel) + + if (verbose_inversion) then + call point_diag(model%basal_physics%coulomb_c, 'Initial coulomb_c', itest, jtest, rtest, 7, 7) endif - endif + endif ! coulomb_c options ! Optionally, do initial calculations for inversion ! At the start of the run (but not on restart), this might lead to further thickness adjustments, @@ -1104,6 +1161,7 @@ subroutine glissade_initialise(model, evolve_ice) call glissade_basin_average(& model%general%ewn, model%general%nsn, & + parallel, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & floating_mask * 1.0d0, & ! 
real mask @@ -1155,7 +1213,7 @@ subroutine glissade_tstep(model, time) use glimmer_paramets, only: eps11 use glimmer_physcon, only: scyr - use glide_mask, only: glide_set_mask, calc_iareaf_iareag + use glide_mask, only: glide_set_mask use glissade_mass_balance, only: glissade_prepare_climate_forcing implicit none @@ -1360,15 +1418,7 @@ subroutine glissade_tstep(model, time) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) - - ! --- Calculate global area of ice that is floating and grounded. - !TODO May want to calculate iareaf and iareag in glide_write_diag and remove those calculations here. - - call calc_iareaf_iareag(model%numerics%dew, model%numerics%dns, & - model%geometry%thkmask, & - model%geometry%iareaf, model%geometry%iareag) + model%climate%eus, model%geometry%thkmask) ! ------------------------------------------------------------------------ ! Do the vertical thermal solve if it is time to do so. @@ -1742,8 +1792,6 @@ subroutine glissade_thermal_solve(model, dt) use glissade_therm, only: glissade_therm_driver use glissade_basal_water, only: glissade_calcbwat, glissade_bwat_flux_routing use glissade_masks, only: glissade_get_masks - !WHL - debug - use cism_parallel, only: parallel_reduce_max implicit none @@ -1882,12 +1930,6 @@ subroutine glissade_thermal_solve(model, dt) endwhere endif - !WHL - debug - Set mask = 0 where thck = 0 for dome test - ! An alternative would be to identify cells that have a path through land to the domain edge - where (model%geometry%thck == 0) - bwat_mask = 0 - endwhere - call parallel_halo(bwat_mask, parallel) ! Set the meltwater source for the basal hydrology scheme. 
@@ -1902,15 +1944,6 @@ subroutine glissade_thermal_solve(model, dt) model%basal_hydro%bmlt_hydro = 0.0d0 endwhere - call glissade_calcbwat(& - model%options%which_ho_bwat, & - model%basal_hydro, & - dt, & ! s - model%geometry%thck, & ! m - model%numerics%thklim_temp, & ! m - model%basal_melt%bmlt_ground, & ! m/s - model%basal_hydro%bwat) ! m - ! Compute the steady-state basal water flux based on a flux-routing scheme call glissade_bwat_flux_routing(& @@ -1926,12 +1959,14 @@ subroutine glissade_thermal_solve(model, dt) floating_mask, & model%basal_hydro%bmlt_hydro, & ! m/s model%temper%bpmp - model%temper%btemp_ground, & ! degC - model%basal_hydro%btemp_scale, & ! degC + model%basal_hydro%btemp_flow_scale, & ! degC + model%basal_hydro%btemp_freeze_scale, & ! degC model%basal_hydro%bwatflx, & ! m/s model%basal_hydro%bwat_diag, & ! m model%temper%bhydroflx, & ! W/m2 model%basal_hydro%head, & ! m - model%basal_hydro%grad_head) ! m/m + model%basal_hydro%grad_head, & ! m/m + reprosum_in = model%options%reproducible_sums) ! halo updates (not sure if all are needed) call parallel_halo(model%basal_hydro%bwatflx, parallel) @@ -2203,7 +2238,6 @@ subroutine glissade_thickness_tracer_solve(model) ! pre-transport halo updates for thickness and tracers call parallel_halo(model%geometry%thck, parallel) - call parallel_halo(model%geometry%topg, parallel) call parallel_halo_tracers(model%geometry%tracers, parallel) call parallel_halo_tracers(model%geometry%tracers_usrf, parallel) call parallel_halo_tracers(model%geometry%tracers_lsrf, parallel) @@ -2430,6 +2464,7 @@ subroutine glissade_calving_solve(model, init_calving) use glissade_masks, only: glissade_get_masks, glissade_ocean_connection_mask, & glissade_calving_front_mask use glissade_grounding_line, only: glissade_grounded_fraction + implicit none type(glide_global_type), intent(inout) :: model ! model instance @@ -3041,12 +3076,14 @@ subroutine glissade_isostasy_solve(model) ! 
but the argument is included to be on the safe side. ! TODO: Do we need similar logic for halo updates of relx? - if (model%general%global_bc == GLOBAL_BC_OUTFLOW .or. & - model%general%global_bc == GLOBAL_BC_NO_ICE) then + if (model%general%global_bc == GLOBAL_BC_OUTFLOW) then call parallel_halo_extrapolate(model%geometry%topg, parallel) + elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then + call parallel_halo(model%geometry%topg, parallel, zero_global_boundary_no_ice_bc = .false.) else ! other global BCs, including periodic call parallel_halo(model%geometry%topg, parallel, & - periodic_offset_ew = model%numerics%periodic_offset_ew) + periodic_offset_ew = model%numerics%periodic_offset_ew, & + periodic_offset_ns = model%numerics%periodic_offset_ns) endif ! update the marine connection mask, which depends on topg diff --git a/libglissade/glissade_basal_traction.F90 b/libglissade/glissade_basal_traction.F90 index a5142dab..91ec33a1 100644 --- a/libglissade/glissade_basal_traction.F90 +++ b/libglissade/glissade_basal_traction.F90 @@ -52,7 +52,7 @@ module glissade_basal_traction use glimmer_utils, only: point_diag use glide_types use cism_parallel, only : this_rank, main_task, parallel_type, & - parallel_halo, staggered_parallel_halo, parallel_globalindex, distributed_scatter_var + parallel_halo, staggered_parallel_halo, parallel_globalindex, scatter_var implicit none @@ -157,9 +157,10 @@ subroutine glissade_calcbeta (& big_lambda, & ! bedrock characteristics flwa_basal_stag ! basal flwa interpolated to the staggered grid (Pa^{-n} yr^{-1}) - ! variables for Tsai et al. parameterization - real(dp) :: taub_powerlaw ! basal shear stress given by a power law as in Tsai et al. (2015) - real(dp) :: taub_coulomb ! basal shear stress given by Coulomb friction as in Tsai et al. (2015) + ! variables for mixed power/Coulomb laws + real(dp) :: taub ! basal shear stress + real(dp) :: taub_powerlaw ! basal shear stress given by a power law + real(dp) :: taub_coulomb ! 
basal shear stress given by Coulomb friction ! variables for pseudo-plastic law real(dp) :: q ! exponent for pseudo-plastic law (unitless) @@ -185,6 +186,7 @@ subroutine glissade_calcbeta (& ! Enforce a minimum speed to prevent beta from become very large when velocity is small. speed(:,:) = dsqrt(thisvel(:,:)**2 + othervel(:,:)**2 + smallnum**2) + !TODO - Should this be done only for powerlaw sliding, and not for ZI or PP? ! If beta_powerlaw_umax is set to a nonzero value, then limit the speed to this value. ! Note: The actual ice speed can be greater than umax. This is just a way of shutting off the feedback ! between beta and ice speed (beta down as speed up) when the ice speed is large. @@ -347,7 +349,7 @@ subroutine glissade_calcbeta (& ! The following code sets beta on the full grid as prescribed by Pattyn et al. (2008). ! Allocate a global array on the main task only. - ! On other tasks, allocate a size 0 array, since distributed_scatter_var wants to deallocate on all tasks. + ! On other tasks, allocate a size 0 array, since scatter_var wants to deallocate on all tasks. if (main_task) then allocate(beta_global(parallel%global_ewn, parallel%global_nsn)) else @@ -377,9 +379,9 @@ subroutine glissade_calcbeta (& ! Note: beta_extend has dimensions (ewn,nsn), so it can receive scattered data from beta_global. allocate(beta_extend(ewn, nsn)) beta_extend(:,:) = 0.d0 - call distributed_scatter_var(beta_extend, beta_global, parallel) + call scatter_var(beta_extend, beta_global, parallel) - ! distributed_scatter_var does not update the halo, so do an update here + ! scatter_var does not update the halo, so do an update here call parallel_halo(beta_extend, parallel) ! Copy beta_extend to beta on the local processor. @@ -391,7 +393,7 @@ subroutine glissade_calcbeta (& enddo enddo - ! beta_extend is no longer needed (beta_global is deallocated in distributed_scatter_var) + ! 
beta_extend is no longer needed (beta_global is deallocated in scatter_var) deallocate(beta_extend) case(HO_BABC_BETA_EXTERNAL) ! use beta value from external file @@ -504,9 +506,9 @@ subroutine glissade_calcbeta (& beta = 1.0d8 end where - case(HO_BABC_COULOMB_POWERLAW_SCHOOF) + case(HO_BABC_SCHOOF) - ! Use the basal friction formulation of Schoof (2005), modified following Asay-Davis et al. (2016). + ! Use the basal friction formulation of Schoof (2005), formulated following Asay-Davis et al. (2016). ! This formulation uses a constant value of basal flwa, which allows several Coulomb parameters ! (lambda_max, m_max and flwa_basal) to be combined into a single parameter powerlaw_c, ! as in the Tsai power law below. @@ -576,16 +578,48 @@ subroutine glissade_calcbeta (& ! write(iulog,*) ew, ns, speed(ew,ns), basal_physics%effecpress_stag(ew,ns), beta(ew,ns), beta(ew,ns)*speed(ew,ns) ! enddo - case(HO_BABC_COULOMB_POWERLAW_TSAI) + case(HO_BABC_MODIFIED_SCHOOF) - ! Basal stress representation based on Tsai et al. (2015) - ! The basal stress is the minimum of two values: - ! (1) power law: tau_b = powerlaw_c * |u_b|^(1/powerlaw_m) - ! (2) Coulomb friction: tau_b = coulomb_c * N - ! N = effective pressure = rhoi*g*(H - H_f) - ! H_f = flotation thickness = (rhow/rhoi)*(eus-topg) - ! This value of N is obtained by setting p_ocean_penetration = 1.0 in the config file. - ! The other parameters (powerlaw_c, powerlaw_m and coulomb_c) can also be set in the config file. + ! Modified version of the Schoof law, with a simpler algebraic form + ! The basal stress is given by + ! 1/tau_b = 1/tau_p + 1/tau_c + ! where tau_p = powerlaw_c * |u_b|^(1/powerlaw_m) + ! tau_c = coulomb_c * N + ! N = effective pressure + !
Note: taub = 1/2 of the harmonic mean of tau_p and tau_c + + do ns = 1, nsn-1 + do ew = 1, ewn-1 + + taub_powerlaw = basal_physics%powerlaw_c(ew,ns) * speed(ew,ns)**(1.d0/basal_physics%powerlaw_m) + taub_coulomb = basal_physics%coulomb_c(ew,ns) * basal_physics%effecpress_stag(ew,ns) + + if (taub_coulomb > 0.0d0 .and. taub_powerlaw > 0.0d0) then + taub = 1.0d0 / (1.0d0/taub_powerlaw + 1.0d0/taub_coulomb) + elseif (taub_powerlaw > 0.0d0) then + taub = taub_powerlaw + elseif (taub_coulomb > 0.0d0) then + taub = taub_coulomb + endif + beta(ew,ns) = taub / speed(ew,ns) + + if (verbose_beta .and. present(rtest) .and. present(itest) .and. present(jtest)) then + if (this_rank == rtest .and. ew == itest .and. ns == jtest) then + write(iulog,*) ' ' + write(iulog,'(a38,3i4,4f12.3)') 'rank, i, j, tau_p, tau_c, tau_b, beta:', & + this_rank, ew, ns, taub_powerlaw, taub_coulomb, taub, beta(ew,ns) + endif + endif + enddo ! ew + enddo ! ns + + case(HO_BABC_TSAI) + + ! Basal stress representation based on Tsai et al. (2015) + ! The basal stress is the minimum of two values: + ! (1) power law: tau_p = powerlaw_c * |u_b|^(1/powerlaw_m) + ! (2) Coulomb friction: tau_c = coulomb_c * N + ! N = effective pressure do ns = 1, nsn-1 do ew = 1, ewn-1 @@ -607,18 +641,6 @@ subroutine glissade_calcbeta (& beta(:,:) = beta(:,:) * basal_physics%c_space_factor_stag(:,:) endif - case(HO_BABC_SIMPLE) ! simple pattern; also useful for debugging and test cases - ! (here, a strip of weak bed surrounded by stronger bed to simulate an ice stream) - - beta(:,:) = 1.d4 ! Pa yr/m - - !TODO - Change this loop to work in parallel (set beta on the global grid and scatter to local) - do ns = 5, nsn-5 - do ew = 1, ewn-1 - beta(ew,ns) = 100.d0 ! Pa yr/m - end do - end do - case default ! do nothing @@ -686,11 +708,10 @@ subroutine glissade_calcbeta (& !TODO - Move this halo update to a higher level? call staggered_parallel_halo(beta, parallel) - !WHL - debug if (verbose_beta .and. present(rtest) .and. 
present(itest) .and. present(jtest)) then if (this_rank == rtest) then ew = itest; ns = jtest - write(iulog,*) 'End of calcbeta, r, i, j, speed, f_ground, beta:', & + write(iulog,'(a48,3i4,3f12.5)') 'End of calcbeta, r, i, j, speed, f_ground, beta:', & rtest, ew, ns, speed(ew,ns), f_ground(ew,ns), beta(ew,ns) endif endif diff --git a/libglissade/glissade_basal_water.F90 b/libglissade/glissade_basal_water.F90 index 3b642157..68ef2d3f 100644 --- a/libglissade/glissade_basal_water.F90 +++ b/libglissade/glissade_basal_water.F90 @@ -26,20 +26,33 @@ module glissade_basal_water - use glimmer_global, only: dp + use glimmer_global, only: dp, i8 use glimmer_paramets, only: iulog, eps11, eps08 use glimmer_physcon, only: rhoi, rhow, lhci, grav, scyr use glimmer_log use glimmer_utils, only: point_diag use glide_types - use cism_parallel, only: main_task, this_rank, nhalo, parallel_type, parallel_halo + use cism_parallel, only: main_task, this_rank, nhalo, parallel_type, & + parallel_halo, parallel_global_sum + + !WHL - debug + use glimmer_utils, only: double_to_binary implicit none private public :: glissade_basal_water_init, glissade_calcbwat, glissade_bwat_flux_routing - logical, parameter :: verbose_bwat = .false. +!! logical, parameter :: verbose_bwat = .false. + logical, parameter :: verbose_bwat = .true. + + character(len=64) :: binary_str + + ! two versions of this subroutine; the second supports reproducible sums + interface route_flux_to_margin_or_halo + module procedure route_flux_to_margin_or_halo_real8 + module procedure route_flux_to_margin_or_halo_integer8 + end interface contains @@ -172,10 +185,13 @@ subroutine glissade_bwat_flux_routing(& thklim, & bwat_mask, floating_mask, & bmlt_hydro, & - delta_Tb, btemp_scale, & + delta_Tb, & + btemp_flow_scale, & + btemp_freeze_scale, & bwatflx, bwat_diag, & bhydroflx, & - head, grad_head) + head, grad_head, & + reprosum_in) ! Compute the subglacial water flux and water depth using a steady-state flux routing scheme. ! 
Water is routed down the hydropotential. For routing purposes, assume p_w = p_i (i.e., N = 0). @@ -208,13 +224,20 @@ subroutine glissade_bwat_flux_routing(& delta_Tb ! difference T_pmp - T_bed (degC) real(dp), intent(in) :: & - thklim, & ! minimum ice thickness for basal melt and hydropotential calculations (m) + thklim ! minimum ice thickness for basal melt and hydropotential calculations (m) ! Note: This is typically model%geometry%thklim_temp - btemp_scale ! temperature scale for transition from frozen to thawed bed (degC) + + ! Note: These scales ensure a smooth transition in behavior between frozen and thawed beds. + ! Both scales are computed in a similar way, but they apply to different parts of the algorithm. + ! TODO: Decide whether to keep both scales. Only the flow scale works for reprosums, so we might want + ! to remove the freeze scale. + real(dp), intent(in) :: & + btemp_flow_scale, & ! temperature scale for routing water flow around cells with a frozen bed (deg C) + btemp_freeze_scale ! temperature scale for refreezing water beneath cells with a frozen bed (degC) integer, dimension(nx,ny), intent(in) :: & bwat_mask, & ! mask to identify cells through which basal water is routed; - ! = 0 for floating and ocean cells; cells at global domain edge; + ! = 0 for floating and ocean cells, cells at global domain edge, ! and cells with thck = 0 and forced negative SMB floating_mask ! = 1 if ice is present (thck > thklim) and floating, else = 0 @@ -233,6 +256,11 @@ subroutine glissade_bwat_flux_routing(& head, & ! hydraulic head (m) grad_head ! gradient of hydraulic head (m/m), averaged to cell centers + ! Note: The reprosum option requires (1) D8 routing (each cell routes its flux to one downstream neighbor only) + ! and (2) no temperature-weighted refreezing. + logical, intent(in), optional :: & + reprosum_in ! if true, then do a computation independent of the number of tasks + ! 
Local variables integer :: i, j, p @@ -273,6 +301,8 @@ subroutine glissade_bwat_flux_routing(& real(dp) :: c_flux_to_depth ! proportionality coefficient in Sommers et al., Eq. 6 real(dp) :: Reynolds ! Reynolds number (unitless), = 0 for pure laminar flow + logical :: reprosum ! local version of reprosum_in + integer :: nx_test, ny_test real(dp), dimension(:,:), allocatable :: phi_test integer, dimension(:,:), allocatable :: mask_test @@ -281,12 +311,16 @@ subroutine glissade_bwat_flux_routing(& write(iulog,*) 'In glissade_bwat_flux_routing: rtest, itest, jtest =', rtest, itest, jtest endif + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + ! Uncomment if the following fields are not already up to date in halo cells ! call parallel_halo(thk, parallel) ! call parallel_halo(topg, parallel) call parallel_halo(bmlt_hydro, parallel) - !TODO - Add bfrz? - ! Compute the hydraulic head ! For purposes of flux routing, assume N = 0. @@ -309,7 +343,6 @@ subroutine glissade_bwat_flux_routing(& endif ! Route basal water down the gradient of hydraulic head, giving a water flux - ! TODO - Pass in bfrz_pot, return bfrz? call route_basal_water(& nx, ny, & @@ -320,11 +353,13 @@ subroutine glissade_bwat_flux_routing(& head, & bmlt_hydro, & delta_Tb, & - btemp_scale, & + btemp_flow_scale, & + btemp_freeze_scale, & bwat_mask, & bwatflx, & bwatflx_refreeze, & - lakes) + lakes, & + reprosum) call parallel_halo(bwatflx, parallel) @@ -362,7 +397,11 @@ subroutine glissade_bwat_flux_routing(& bwatflx(:,:) = bwatflx(:,:) / (dx*dy) ! Given bwatflx_refreeze in m^3/s, compute bhydroflx in W/m2. - ! This is the heat flux needed to refreeze the meltwater held in each cell + ! This is the heat flux needed to refreeze the meltwater held in each cell. + ! This heat flux is supplied at the bed during the next thermal solve. + ! If there is more than enough heat to thaw the bed, some meltwater will be returned later + ! instead of refrozen. 
+ bhydroflx(:,:) = bwatflx_refreeze(:,:) * rhoi * lhci / (dx*dy) if (verbose_bwat) then @@ -439,11 +478,13 @@ subroutine route_basal_water(& head, & bmlt_hydro, & delta_Tb, & - btemp_scale, & + btemp_flow_scale, & + btemp_freeze_scale, & bwat_mask, & bwatflx, & bwatflx_refreeze, & - lakes) + lakes, & + reprosum_in) ! Route water from the basal melt field to its destination, recording the water flux along the way. ! Water flow direction is determined according to the gradient of the hydraulic head. @@ -452,11 +493,7 @@ subroutine route_basal_water(& ! This results in the lakes field, which is the difference between the filled head and the original head. ! The method used is by Quinn et. al. (1991). ! - ! Based on code by Jesse Johnson (2005), adapted from the glimmer_routing file by Ian Rutt. - - ! TODO - Pass in bfrz_pot, return bfrz. - - use cism_parallel, only: parallel_global_sum + ! Originally based on code by Jesse Johnson and Ian Rutt in the Glimmer model !WHL - debug use cism_parallel, only: parallel_globalindex, parallel_reduce_max @@ -481,8 +518,9 @@ subroutine route_basal_water(& delta_Tb ! difference T_pmp - T_bed (degC) real(dp), intent(in) :: & - btemp_scale ! temperature scale for transition from frozen to thawed bed (degC) - ! If btemp_scale = 0, assume no temperature dependence + btemp_flow_scale, & ! temperature scale for routing water flow around cells with a frozen bed (deg C) + btemp_freeze_scale ! temperature scale for refreezing water beneath cells with a frozen bed (degC) + ! If scale = 0, assume no temperature dependence real(dp), dimension(nx,ny), intent(inout) :: & head ! hydraulic head (m) @@ -490,19 +528,22 @@ subroutine route_basal_water(& integer, dimension(nx,ny), intent(in) :: & bwat_mask ! mask to identify cells through which basal water is routed; - ! = 1 where ice is present and not floating + ! excludes floating and ocean cells real(dp), dimension(nx,ny), intent(out) :: & bwatflx, & ! 
water flux through a grid cell (m^3/s) bwatflx_refreeze, & ! water flux held for refreezing (m^3/s) lakes ! lakes field, difference between filled and original head + logical, intent(in), optional :: & + reprosum_in ! if true, then do a computation independent of the number of tasks + ! Local variables integer :: nlocal ! number of locally owned cells integer :: count, count_max ! iteration counters - integer :: i, j, k, ii, jj, ip, jp, p - integer :: i1, j1, i2, j2, itmp, jtmp, iglobal, jglobal + integer :: i, j, k, iglobal, jglobal + integer :: ii, jj, imax, jmax logical :: finished ! true when an iterative loop has finished @@ -510,15 +551,14 @@ subroutine route_basal_water(& sorted_ij ! i and j indices of all cells, sorted from low to high values of head real(dp), dimension(-1:1,-1:1,nx,ny) :: & - flux_fraction, & ! fraction of flux from each cell that flows downhill to each of 8 neighbors - bwatflx_halo ! water flux (m^3/s) routed to a neighboring halo cell; routed further in next iteration + flux_fraction ! fraction of flux from each cell that flows downhill to each of 8 neighbors real(dp), dimension(nx,ny) :: & head_filled, & ! head after depressions are filled (m) + btemp_weight_flow, & ! temp-dependent weighting factor, forcing flow around cells with frozen beds + btemp_weight_freeze, & ! temp-dependent weighting factor, favoring refreezing in cells with frozen beds bwatflx_accum, & ! water flux through the cell (m^3/s) accumulated over multiple iterations - bwatflx_refreeze_accum,& ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations - sum_bwatflx_halo, & ! bwatflx summed over the first 2 dimensions in each grid cell - btemp_weight ! temperature-dependent weighting factor, favoring flow where the bed is thawed + bwatflx_refreeze_accum ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations integer, dimension(nx,ny) :: & local_mask, & ! 
= 1 for cells owned by the local processor, else = 0 @@ -533,8 +573,33 @@ subroutine route_basal_water(& err, & ! water conservation error global_flux_sum ! flux sum over all cells in global domain + ! The following i8 variables are for computing reproducible sums + integer(i8), dimension(nx,ny) :: & + bwatflx_int, & ! water flux through a grid cell (m^3/s) + btemp_weight_freeze_int, & ! temp-dependent weighting factor, favoring refreezing in cells with frozen beds + bwatflx_accum_int, & ! water flux through the cell (m^3/s) accumulated over multiple iterations + bwatflx_refreeze_accum_int ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations + + integer(i8), dimension(-1:1,-1:1,nx,ny) :: & + flux_fraction_int ! fraction of flux from each cell that flows downhill to each of 8 neighbors + + real(dp), parameter :: & + factor_bwatflx = 1.d16 ! factor for converting between bwatflx and bwatflx_int; + ! large value desired for water mass conservation + + logical :: reprosum ! local version of reprosum_in + character(len=100) :: message + !WHL - debug + character(len=64) :: binary_str + + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + ! Allocate the sorted_ij array nlocal = parallel%own_ewn * parallel%own_nsn @@ -608,28 +673,44 @@ subroutine route_basal_water(& enddo enddo - ! Compute a temperature-dependent weighting factor for flux routing. - ! This is used in two parts of the code: - ! (1) In subroutine get_flux_fraction, btemp_weight is used to weight potential downstream paths. - ! A small value of btemp_weight means that a cell is less likely to receive water from upstream. - ! (2) When water enters a frozen cell (delta_Tb > 0), btemp_weight is used to determine - ! how much of the flux is refrozen in place rather than passing through. - ! A low value of btemp_weight means that less water passes through. - - btemp_weight = 1.0d0 + ! 
Compute temperature-dependent weighting factors for flux routing. + ! There are two scales with related but distinct functions: + ! (1) In subroutine get_flux_fraction, btemp_flow_scale is used to weigh potential downstream paths. + ! A low value of btemp_weight_flow means that water is less likely to pass through. + ! (2) When water enters a frozen cell (delta_Tb > 0), btemp_freeze_scale determines + ! how much of the flux refreezes in place rather than passing through. + ! A small value of btemp_weight_freeze means that more water refreezes, and less passes through. + ! Note: For reproducible sums, refreezing is not supported; must have btemp_weight_freeze = 1 everywhere. + ! TODO: Possibly remove btemp_freeze_scale and just keep btemp_flow_scale. + + btemp_weight_flow = 1.0d0 + if (btemp_flow_scale > 0.0d0) then + if (.not. reprosum) then + where (bwat_mask == 1) + where (delta_Tb > 0.0d0) + btemp_weight_flow = exp(-delta_Tb/btemp_flow_scale) + endwhere + endwhere + endif + endif - if (btemp_scale > 0.0d0) then - where (bwat_mask == 1) - where (delta_Tb > 0.0d0) - btemp_weight = exp(-delta_Tb/btemp_scale) + btemp_weight_freeze = 1.0d0 + if (btemp_freeze_scale > 0.0d0) then + if (.not. reprosum) then + where (bwat_mask == 1) + where (delta_Tb > 0.0d0) + btemp_weight_freeze = exp(-delta_Tb/btemp_freeze_scale) + endwhere endwhere - endwhere - if (verbose_bwat) then - call point_diag(delta_Tb, 'Tpmp - Tb', itest, jtest, rtest, 7, 7) - call point_diag(btemp_weight, 'btemp_weight', itest, jtest, rtest, 7, 7) endif endif + if (verbose_bwat) then + call point_diag(delta_Tb, 'Tpmp - Tb', itest, jtest, rtest, 7, 7) + call point_diag(btemp_weight_flow, 'btemp_weight_flow', itest, jtest, rtest, 7, 7) + call point_diag(btemp_weight_freeze, 'btemp_weight_freeze', itest, jtest, rtest, 7, 7) + endif + ! Compute the fraction of the incoming flux sent to each downstream neighbor. 
call get_flux_fraction(& @@ -639,18 +720,12 @@ subroutine route_basal_water(& flux_routing_scheme, & sorted_ij, & head, & - btemp_weight, & + btemp_weight_flow, & bwat_mask, & flux_fraction) - ! Initialize bwatflx in locally owned cells with the basal melt, which will be routed downslope. - ! Multiply by area, so units are m^3/s. - ! The halo water flux, bwatflx_halo, holds water routed to halo cells; - ! it will be routed downhill during the next iteration. - ! The accumulated flux, bwatflx_accum, holds the total flux over multiple iterations. - ! Some or all of the water entering a frozen cell can be refrozen in place. - ! The heat flux associated with refreezing is passed to the next thermal solve. - ! If this heat flux is enough to thaw the cell, some of the meltwater is returned later. + ! Initialize bwatflx in locally owned cells. + ! Set to the local melt rate, multiplied by area (so the units are m^3/s). bwatflx = 0.0d0 do j = nhalo+1, ny-nhalo @@ -659,174 +734,132 @@ subroutine route_basal_water(& enddo enddo - ! Initialize other fluxes - bwatflx_halo = 0.0d0 - bwatflx_refreeze = 0.0d0 - bwatflx_accum = 0.0d0 - bwatflx_refreeze_accum = 0.0d0 - ! Compute total input of meltwater (m^3/s) total_flux_in = parallel_global_sum(bwatflx, parallel) - if (verbose_bwat .and. this_rank == rtest) then - write(iulog,*) ' ' write(iulog,*) 'Total input basal melt flux (m^3/s):', total_flux_in +!! call double_to_binary(total_flux_in, binary_str) +!! write(iulog,*) ' Binary string', binary_str endif - ! Loop over locally owned cells, from highest to lowest. - ! During each iteration, there are two possible outcomes for routing: - ! (1) Routed to the ice sheet margin, to a cell with bwat_mask = 0. - ! In this case, the routing of that flux is done. - ! (2) Routed to a halo cell, i.e. a downslope cell on a neighboring processor. - ! In this case, the flux will be routed further downhill on the next iteration. - ! When all the water has been routed to the margin, we are done. 
+ ! Route the water downstream, keeping track of the steady-state flux through each cell. + ! The loop goes from highest to lowest values of head on the local processor. + ! At the end of the loop, all the incoming flux has either been + ! (1) routed to the ice sheet margin, + ! (2) set aside for later refreezing, or + ! (3) routed to a halo cell, from which it will continue downstream on the next iteration. + ! When all the water has been routed to the margin or set aside for refreezing, we are done. - count = 0 ! Note: It is hard to predict how many iterations will be sufficient. - ! With Dinf or FD8, we can have flow back and forth across processor boundaries, + ! With Dinf or FD8 we can have flow back and forth across processor boundaries, ! requiring many iterations to reach the margin. ! For Greenland 4 km, Dinf requires ~20 iterations on 4 cores, and FD8 can require > 40. ! For Antarctica 8 km, FD8 can require > 50. - count_max = 100 - finished = .false. - do while (.not.finished) - - count = count + 1 + ! Initialize the cumulative fluxes + bwatflx_accum = 0.0d0 + bwatflx_refreeze_accum = 0.0d0 - if (verbose_bwat .and. this_rank == rtest) then - write(iulog,*) 'flux routing, count =', count - endif + if (reprosum) then - do k = nlocal, 1, -1 + ! Convert bwatflx to a scaled i8 array + bwatflx_int(:,:) = nint(bwatflx(:,:)*factor_bwatflx, i8) - ! Get i and j indices of current cell - i = sorted_ij(k,1) - j = sorted_ij(k,2) + ! Convert flux_fraction to i8 + ! Note: This will work only for the D8 scheme, where all the flux goes downstream + ! to a single cell. + flux_fraction_int(:,:,:,:) = nint(flux_fraction(:,:,:,:), i8) - ! Route the flux - if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then + ! Convert btemp_weight_freeze to i8 + btemp_weight_freeze_int(:,:) = 1 + ! Note: Can round up to 1 and down to 0 by uncommenting the following line. + ! However, a mix of 1's and 0's leads to oscillations in basal temperature, + ! 
so it is safer to turn off refreezing by setting btemp_weight_freeze = 1 everywhere. +! btemp_weight_freeze_int(:,:) = nint(btemp_weight_freeze(:,:), i8) - ! Distribute the flux to downslope neighbors. - ! Where the bed is frozen, all or part of the flux is refrozen in place instead of being routed downstream. - do jj = -1,1 - do ii = -1,1 - ip = i + ii - jp = j + jj - if (flux_fraction(ii,jj,i,j) > 0.0d0) then - if (halo_mask(ip,jp) == 1) then - bwatflx_halo(ii,jj,i,j) = bwatflx(i,j)*flux_fraction(ii,jj,i,j)*btemp_weight(i,j) - bwatflx_refreeze(i,j) = bwatflx_refreeze(i,j) & - + bwatflx(i,j)*flux_fraction(ii,jj,i,j)*(1.0d0 - btemp_weight(i,j)) - if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then - write(iulog,*) 'Flux to halo, i, j, ii, jj, flux:', & - i, j, ii, jj, bwatflx(i,j)*flux_fraction(ii,jj,i,j) - endif - elseif (local_mask(ip,jp) == 1) then - bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx(i,j)*flux_fraction(ii,jj,i,j)*btemp_weight(i,j) - bwatflx_refreeze(i,j) = bwatflx_refreeze(i,j) & - + bwatflx(i,j)*flux_fraction(ii,jj,i,j)*(1.0d0 - btemp_weight(i,j)) - endif - endif ! flux_fraction > 0 - enddo - enddo - endif ! bwat_mask = 1, bwatflx > 0 - enddo ! loop from high to low + ! Initialize other arrays + bwatflx_accum_int = 0 + bwatflx_refreeze_accum_int = 0 - ! Accumulate bwatflx from the latest iteration, then reset to zero for the next iteration. - bwatflx_accum = bwatflx_accum + bwatflx - bwatflx_refreeze_accum = bwatflx_refreeze_accum + bwatflx_refreeze - bwatflx = 0.0d0 - bwatflx_refreeze = 0.0d0 + endif ! reprosum - if (verbose_bwat .and. this_rank == rtest .and. count <= 2) then - i = itest - j = jtest - write(iulog,*) 'i, j, bwatflx_accum:', i, j, bwatflx_accum(i,j) - endif + count = 0 + count_max = 100 + finished = .false. - ! If bwatflx_halo = 0 everywhere, then we are done. - ! (If the remaining flux is very small (< eps11), discard it to avoid - ! unnecessary extra iterations.) - ! 
If bwatflx_halo remains, then communicate it to neighboring tasks and - ! continue routing on the next iteration. - - do j = 1, ny - do i = 1, nx - sum_bwatflx_halo(i,j) = sum(bwatflx_halo(:,:,i,j)) -! if (verbose_bwat .and. sum_bwatflx_halo(i,j) > eps11 .and. count > 50) then -! write(iulog,*) 'Nonzero bwatflx_halo, count, rank, i, j, sum_bwatflx_halo:', & -! count, this_rank, i, j, sum_bwatflx_halo(i,j) -! call parallel_globalindex(i, j, iglobal, jglobal, parallel) -! write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -! endif - enddo - enddo - global_flux_sum = parallel_global_sum(sum_bwatflx_halo, parallel) + do while (.not.finished) - if (verbose_bwat .and. count <= 2) then - if (this_rank == rtest) then - write(iulog,*) 'Before halo update, sum of bwatflx_halo:', global_flux_sum - endif - call point_diag(sum_bwatflx_halo, 'sum_bwatflx_halo', itest, jtest, rtest, 7, 7) + count = count + 1 + if (verbose_bwat .and. this_rank == rtest) write(iulog,*) 'flux routing, count =', count + if (count > count_max) then + call write_log('Hydrology error: too many iterations in route_basal_water', GM_FATAL) endif - if (global_flux_sum > eps11) then - - finished = .false. + if (reprosum) then + + ! route downstream + ! Note: The fluxes are scaled by factor_bwatflx + + call route_flux_to_margin_or_halo(& + nx, ny, nlocal, & + itest, jtest, rtest, count, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction_int, & + btemp_weight_freeze_int, & + bwatflx_int, & + bwatflx_accum_int, & + bwatflx_refreeze_accum_int, & + finished) - ! Communicate bmltflx_halo to the halo cells of neighboring processors - call parallel_halo(bwatflx_halo(:,:,:,:), parallel) - - ! bmltflx_halo is now available in the halo cells of the local processor. - ! Route downslope to the adjacent locally owned cells. - ! These fluxes will be routed further downslope during the next iteration. - - do j = 2, ny-1 - do i = 2, nx-1 - if (halo_mask(i,j) == 1 .and. 
sum(bwatflx_halo(:,:,i,j)) > 0.0d0) then - do jj = -1,1 - do ii = -1,1 - if (bwatflx_halo(ii,jj,i,j) > 0.0d0) then - ip = i + ii - jp = j + jj - if (local_mask(ip,jp) == 1) then - bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx_halo(ii,jj,i,j) - if (verbose_bwat .and. ip==itest .and. jp==jtest .and. this_rank==rtest & - .and. count <= 2) then - write(iulog,*) 'Nonzero bwatflx from halo, rank, i, j:', & - this_rank, ip, jp, bwatflx_halo(ii,jj,i,j) - endif - endif - endif ! bwatflx_halo > 0 to a local cell - enddo ! ii - enddo ! jj - endif ! bwatflx_halo > 0 from this halo cell - enddo ! i - enddo ! j + if (verbose_bwat .and. this_rank == rtest .and. count <= 2) then + i = itest; j = jtest + write(iulog,*) 'count, rank i, j, bwatflx_accum (m/yr), bwatflx_refreeze_accum:', & + count, rtest, i, j, real(bwatflx_accum_int(i,j),dp)/factor_bwatflx, & + real(bwatflx_refreeze_accum_int(i,j),dp)/factor_bwatflx + endif - ! Reset bwatflx_halo for the next iteration - bwatflx_halo = 0.0d0 + else ! non-reproducible sums + + call route_flux_to_margin_or_halo(& + nx, ny, nlocal, & + itest, jtest, rtest, count, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction, & + btemp_weight_freeze, & + bwatflx, & + bwatflx_accum, & + bwatflx_refreeze_accum, & + finished) - global_flux_sum = parallel_global_sum(bwatflx, parallel) if (verbose_bwat .and. this_rank == rtest .and. count <= 2) then - ! Should be equal to the global sum of bwatflx_halo computed above - write(iulog,*) 'After halo update, sum(bwatflx from halo) =', global_flux_sum - write(iulog,*) ' ' + i = itest; j = jtest + write(iulog,*) 'count, rank i, j, bwatflx_accum(m/yr), bwatflx_refreeze_accum:', & + count, rtest, i, j, bwatflx_accum(i,j) * scyr/(dx*dy), & + bwatflx_refreeze_accum(i,j) * scyr/(dx*dy) endif - else ! bwatflx_halo = 0 everywhere; no fluxes to route to adjacent processors - if (verbose_bwat .and. this_rank == rtest) write(iulog,*) 'Done routing fluxes' - finished = .true. 
- bwatflx = bwatflx_accum - bwatflx_refreeze = bwatflx_refreeze_accum - endif + endif ! reprosum - if (count > count_max) then - call write_log('Hydrology error: too many iterations in route_basal_water', GM_FATAL) - endif + enddo ! finished + + if (reprosum) then + ! Convert fluxes back to real(dp) + bwatflx_accum = real(bwatflx_accum_int, dp) / factor_bwatflx + bwatflx_refreeze_accum = real(bwatflx_refreeze_accum_int, dp) / factor_bwatflx + endif - enddo ! finished routing + ! Copy the accumulated values to the output arrays bwatflx and bwatflx_refreeze + bwatflx = bwatflx_accum + bwatflx_refreeze = bwatflx_refreeze_accum + if (verbose_bwat .and. this_rank == rtest) write(iulog,*) 'Done routing fluxes' ! Identify cells just beyond the ice sheet margin, which can receive from upstream but not send downstream where (bwat_mask == 0 .and. bwatflx > 0.0d0) @@ -847,7 +880,6 @@ subroutine route_basal_water(& write(iulog,*) 'Difference between output and input =', total_flux_out - total_flux_in endif - ! Not sure if a threshold of eps11 is large enough. Increase if needed. if (total_flux_in > 0.0d0) then err = abs(total_flux_in - total_flux_out) @@ -1031,8 +1063,7 @@ subroutine fill_depressions(& ! Continue until no further lowering of phi is possible. At that point, phi = phi_out. ! Note: Setting eps = 0 would result in flat surfaces that would need to be fixed later. - use cism_parallel, only: parallel_reduce_sum - use cism_parallel, only: parallel_globalindex + use cism_parallel, only: parallel_reduce_sum, parallel_globalindex implicit none @@ -1054,7 +1085,7 @@ subroutine fill_depressions(& real(dp), dimension(nx,ny), intent(out) :: & phi ! output field with depressions filled - ! Local variables -------------------------------------- + ! Local variables logical, dimension(nx,ny) :: & known ! = true for cells where the final phi(i,j) is known @@ -1264,7 +1295,7 @@ subroutine sort_heights(& integer, intent(in) :: & nx, ny, & ! 
number of grid cells in each direction nlocal, & ! number of locally owned cells - itest, jtest, rtest ! coordinates of diagnostic point + itest, jtest, rtest ! coordinates of diagnostic point !! not currently used real(dp), dimension(nx,ny), intent(in) :: & phi ! input field, to be sorted from low to high @@ -1307,17 +1338,6 @@ subroutine sort_heights(& call indexx(vect, ind) - if (verbose_bwat .and. this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'Sort from low to high, nlocal =', nlocal - write(iulog,*) 'k, local i and j, ind(k), phi:' - do k = nlocal, nlocal-10, -1 - i = floor(real(ind(k)-1)/real(ny_local)) + 1 + nhalo - j = mod(ind(k)-1,ny_local) + 1 + nhalo - write(iulog,*) k, i, j, ind(k), phi(i,j) - enddo - endif - ! Fill the sorted_ij array with the i and j values of each cell. ! Note: These are the i and j values we would have if there were no halo cells. do k = 1, nlocal @@ -1339,7 +1359,7 @@ subroutine get_flux_fraction(& flux_routing_scheme, & sorted_ij, & head, & - btemp_weight, & + btemp_weight_flow, & bwat_mask, & flux_fraction) @@ -1369,7 +1389,7 @@ subroutine get_flux_fraction(& real(dp), dimension(nx,ny), intent(in) :: & head, & ! hydraulic head (m) - btemp_weight ! temperature-dependent weighting factor, favoring flow where the bed is thawed + btemp_weight_flow ! temperature-dependent weighting factor, forcing flow around cells with frozen beds integer, dimension(nx,ny), intent(in) :: & bwat_mask ! = 1 for cells in the region where basal water fluxes can be nonzero @@ -1428,7 +1448,7 @@ subroutine get_flux_fraction(& jp = j + jj if (ip >= 1 .and. ip <= nx .and. jp > 1 .and. 
jp <= ny) then if (head(ip,jp) < head(i,j)) then - slope(ii,jj) = btemp_weight(ip,jp) * (head(i,j) - head(ip,jp)) / dists(ii,jj) + slope(ii,jj) = btemp_weight_flow(ip,jp) * (head(i,j) - head(ip,jp)) / dists(ii,jj) endif endif endif @@ -1588,6 +1608,390 @@ subroutine get_flux_fraction(& end subroutine get_flux_fraction +!============================================================== + + subroutine route_flux_to_margin_or_halo_real8(& + nx, ny, nlocal, & + itest, jtest, rtest, count, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction, & + btemp_weight_freeze, & + bwatflx, & + bwatflx_accum, & + bwatflx_refreeze_accum, & + finished) + + ! Given the input bwatflx, route the water downstream, keeping track of fluxes along the way. + ! The loop goes from highest to lowest values of 'head' on the local processor. + ! At the end of the loop, all the incoming flux has either been + ! (1) routed to the ice sheet margin; + ! (2) set aside for later refreezing; or + ! (3) routed to a halo cell, from which it continues downstream the next time the subroutine is called. + ! The subroutine is called iteratively until all no water remains in halo cells. + + implicit none + + ! Input/output variables + + integer, intent(in) :: & + nx, ny, & ! number of cells in each direction + nlocal, & ! number of locally owned grid cells on the processor + itest, jtest, rtest, & ! coordinates of diagnostic point + count ! iteration count (diagnostic only) + + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + + integer, dimension(nlocal,2), intent(in) :: & + sorted_ij ! i and j indices of each local cell, sorted low to high + + integer, dimension(nx,ny), intent(in) :: & + local_mask, & ! = 1 for cells owned by the local processor, else = 0 + halo_mask, & ! = 1 for the layer of halo cells adjacent to locally owned cells, else = 0 + bwat_mask ! 
= 1 for cells through which basal water is routed; excludes floating and ocean cells + + real(dp), dimension(-1:1,-1:1,nx,ny), intent(in) :: & + flux_fraction ! fraction of flux from a cell that flows downhill to each of 8 neighbors + ! last two indices identify the source cell; + ! 1st two indices give relative location of receiving cell + + real(dp), dimension(nx,ny), intent(in) :: & + btemp_weight_freeze ! temperature-dependent weighting factor, favoring refreezing at frozen beds + + real(dp), dimension(nx,ny), intent(inout) :: & + bwatflx, & ! on input: water flux (m^3/s) to be routed to the margin or halo + ! on output: flux routed to halo, to be routed further next time + bwatflx_accum, & ! cumulative bwatflx (m/3/s) over multiple iterations + bwatflx_refreeze_accum ! cumulative bwatflx_refreeze (m^s/s) over multiple iterations + + logical, intent(inout) :: & + finished ! initially F; set to T when all water has been routed as far as it can go + + ! Local variables + + integer :: i, j, k + integer :: ii, jj, ip, jp + + real(dp), dimension(-1:1,-1:1,nx,ny):: & + bwatflx_halo ! flux routed to halo cells + ! last two indices identify the source cell; + ! 1st two indices give relative location of receiving cell + + real(dp), dimension(nx,ny) :: & + bwatflx_refreeze, & ! flux (m^3/s) saved for later refreezing; not routed further downstream + sum_bwatflx_halo ! bwatflx_halo summed over the first 2 indices + + real(dp) :: & + flx_thru, & ! flux (m^3/s) that continues downstream + global_halo_sum ! global sum of water flux in halo cells + + ! Initialize fluxes + bwatflx_refreeze = 0.0d0 + bwatflx_halo = 0.0d0 + + ! loop from high to low values on the local processor + do k = nlocal, 1, -1 + + ! Get i and j indices of current cell + i = sorted_ij(k,1) + j = sorted_ij(k,2) + + if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then + + ! Distribute the flux to downstream neighbors. + ! 
Based on the temperature-dependent weighting factor btemp_weight_freeze, all or part of the flux + ! is refrozen in place instead of being routed downstream. + flx_thru = bwatflx(i,j) * btemp_weight_freeze(i,j) + bwatflx_refreeze(i,j) = bwatflx(i,j) * (1.0d0 - btemp_weight_freeze(i,j)) + do jj = -1,1 + do ii = -1,1 + ip = i + ii + jp = j + jj + if (flux_fraction(ii,jj,i,j) > 0.0d0) then + if (halo_mask(ip,jp) == 1) then + bwatflx_halo(ii,jj,i,j) = flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to halo, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + elseif (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to neighbor, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + endif + endif ! flux_fraction > 0 + enddo ! ii + enddo ! jj + endif ! bwat_mask = 1, bwatflx > 0 + enddo ! loop from high to low + + ! Accumulate the fluxes in the output arrays + bwatflx_accum = bwatflx_accum + bwatflx + bwatflx_refreeze_accum = bwatflx_refreeze_accum + bwatflx_refreeze + + ! Compute the total bwatflx in halo cells + do j = 1, ny + do i = 1, nx + sum_bwatflx_halo(i,j) = sum(bwatflx_halo(:,:,i,j)) + enddo + enddo + global_halo_sum = parallel_global_sum(sum_bwatflx_halo, parallel) + + ! If bwatflx_halo = 0 everywhere, then we are done. + ! Where bwatflx_halo is nonzero, communicate it to the neighboring task. + ! It will be routed further downstream the next time this subroutine is called. + + if (global_halo_sum > 0.0d0) then + + if (verbose_bwat .and. 
count <= 2) then + if (this_rank == rtest) write(iulog,*) 'Before halo update, global_halo_sum:', global_halo_sum + call point_diag(sum_bwatflx_halo, 'sum_bwatflx_halo', itest, jtest, rtest, 7, 7) + endif + + ! Reset bwatflx to zero for the halo transfer + bwatflx = 0.0d0 + + ! Communicate bmltflx_halo to the halo cells of neighboring processors + call parallel_halo(bwatflx_halo(:,:,:,:), parallel) + + ! bmltflx_halo is now available in the halo cells of the local processor. + ! Route downslope to the adjacent locally owned cells. + ! These fluxes will be routed further downstream during the next iteration. + + do j = 2, ny-1 + do i = 2, nx-1 + if (halo_mask(i,j) == 1 .and. sum(bwatflx_halo(:,:,i,j)) > 0.0d0) then + do jj = -1,1 + do ii = -1,1 + if (bwatflx_halo(ii,jj,i,j) > 0.0d0) then + ip = i + ii + jp = j + jj + if (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx_halo(ii,jj,i,j) + if (verbose_bwat .and. ip==itest .and. jp==jtest .and. this_rank==rtest .and. count <= 2) then + write(iulog,*) 'Nonzero bwatflx from halo, rank, i, j:', & + this_rank, ip, jp, bwatflx_halo(ii,jj,i,j) + endif + endif + endif ! bwatflx_halo > 0 to a local cell + enddo ! ii + enddo ! jj + endif ! bwatflx_halo > 0 from this halo cell + enddo ! i + enddo ! j + + else + + finished = .true. ! no water in halo cells to route further + + endif ! global_halo_sum > 0 + + end subroutine route_flux_to_margin_or_halo_real8 + +!============================================================== + + subroutine route_flux_to_margin_or_halo_integer8(& + nx, ny, nlocal, & + itest, jtest, rtest, count, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction, & + btemp_weight_freeze, & + bwatflx, & + bwatflx_accum, & + bwatflx_refreeze_accum, & + finished) + + ! Given the input bwatflx, route the water downstream, keeping track of fluxes along the way. + ! The loop goes from highest to lowest values of 'head' on the local processor. + ! 
At the end of the loop, all the incoming flux has either been + ! (1) routed to the ice sheet margin; + ! (2) set aside for later refreezing; or + ! (3) routed to a halo cell, from which it continues downstream the next time the subroutine is called. + ! The subroutine is called iteratively until all no water remains in halo cells. + + implicit none + + ! Input/output variables + + integer, intent(in) :: & + nx, ny, & ! number of cells in each direction + nlocal, & ! number of locally owned grid cells on the processor + itest, jtest, rtest, & ! coordinates of diagnostic point + count ! iteration count (diagnostic only) + + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + + integer, dimension(nlocal,2), intent(in) :: & + sorted_ij ! i and j indices of each local cell, sorted low to high + + integer, dimension(nx,ny), intent(in) :: & + local_mask, & ! = 1 for cells owned by the local processor, else = 0 + halo_mask, & ! = 1 for the layer of halo cells adjacent to locally owned cells, else = 0 + bwat_mask ! = 1 for cells through which basal water is routed; excludes floating and ocean cells + + ! Note: Both flux_fraction and btemp_weight_freeze are constrained to be 0 or 1. + ! This means that the routing is limited to D8 (all the flux goes to one downstream cell), + ! and partial refreezing is not allowed (i.e., btemp_weight_freeze = 1 everywhere). + ! Thus, btemp_weight_freeze is not needed, but I kept it to keep the code similar to the subroutine above. + ! We could make refreezing all-or-nothing (i.e., weights of either 0 or 1), but this leads to + ! oscillations in bed temperature. + ! I thought of rescaling flux_fraction and btemp_weight_freeze to largish i8 integers (e.g., 1000) + ! to keep everything BFB, and then scaling back at the end. The problem is that this subroutine + ! may need to be called repeatedly, and each scaling would lead to larger and larger integers + ! 
that eventually exceed the i8 limit on integer size, ~10^(19). + + integer(i8), dimension(-1:1,-1:1,nx,ny), intent(in) :: & + flux_fraction ! fraction of flux from a cell that flows downhill to each of 8 neighbors + ! last two indices identify the source cell; + ! 1st two indices give relative location of receiving cell + + integer(i8), dimension(nx,ny), intent(in) :: & + btemp_weight_freeze ! temperature-dependent weighting factor, favoring refreezing at frozen beds + + integer(i8), dimension(nx,ny), intent(inout) :: & + bwatflx, & ! on input: water flux (m^3/s * factor_bwatflx) to be routed to the margin or halo + ! on output: flux routed to halo, to be routed further next time + bwatflx_accum, & ! cumulative bwatflx (m^3/s * factor_bwatflx) over multiple iterations + bwatflx_refreeze_accum ! cumulative bwatflx_refreeze (m^3/s * factor_bwatflx) over multiple iterations + + logical, intent(inout) :: & + finished ! initially F; set to T when all water has been routed as far as it can go + + ! Local variables + + integer :: i, j, k + integer :: ii, jj, ip, jp + + ! Note: Some of the local variables are scaled by products of all three scale factors above. + integer(i8), dimension(-1:1,-1:1,nx,ny):: & + bwatflx_halo ! flux routed to halo cells + ! last two indices identify the source cell; + ! 1st two indices give relative location of receiving cell + + integer(i8), dimension(nx,ny) :: & + bwatflx_refreeze, & ! flux saved for later refreezing; not routed further downstream + sum_bwatflx_halo ! bwatflx_halo summed over the first 2 indices + + integer(i8) :: & + flx_thru, & ! flux (m^3/s) that continues downstream + global_halo_sum ! global sum of water flux in halo cells + + real(dp), dimension(nx,ny):: & + bwatflx_dp, bwatflx_halo_dp, bwatflx_refreeze_dp ! temporary dp versions of i8 arrays + + ! Initialize fluxes + bwatflx_halo = 0 + bwatflx_refreeze = 0 + + ! loop from high to low values on the local processor + do k = nlocal, 1, -1 + + ! 
Get i and j indices of current cell + i = sorted_ij(k,1) + j = sorted_ij(k,2) + + if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then + + ! Distribute the flux to downstream neighbors. + ! Note: If btemp_weight_freeze = 1 everwhere, there is no refreezing. + flx_thru = bwatflx(i,j) * btemp_weight_freeze(i,j) + bwatflx_refreeze(i,j) = bwatflx(i,j) * (1 - btemp_weight_freeze(i,j)) + do jj = -1,1 + do ii = -1,1 + ip = i + ii + jp = j + jj + if (flux_fraction(ii,jj,i,j) > 0) then + if (halo_mask(ip,jp) == 1) then + bwatflx_halo(ii,jj,i,j) = flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to halo, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + elseif (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to neighbor, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + endif + endif ! flux_fraction > 0 + enddo ! ii + enddo ! jj + endif ! bwat_mask = 1, bwatflx > 0 + enddo ! loop from high to low + + ! Accumulate the fluxes in the output arrays + bwatflx_accum = bwatflx_accum + bwatflx + bwatflx_refreeze_accum = bwatflx_refreeze_accum + bwatflx_refreeze + + ! Compute the total bwatflx in halo cells + do j = 1, ny + do i = 1, nx + sum_bwatflx_halo(i,j) = sum(bwatflx_halo(:,:,i,j)) + enddo + enddo + global_halo_sum = parallel_global_sum(sum_bwatflx_halo, parallel) + + ! If bwatflx_halo = 0 everywhere, then we are done. + ! Where bwatflx_halo is nonzero, communicate it to the neighboring task. + ! It will be routed further downstream the next time this subroutine is called. + + if (global_halo_sum > 0) then + + if (verbose_bwat .and. 
count <= 2) then + if (this_rank == rtest) write(iulog,*) 'Before halo update, global_halo_sum (m^3/s):', global_halo_sum + endif + + ! Reset bwatflx to zero for the halo transfer + bwatflx = 0 + + ! Communicate bmltflx_halo to the halo cells of neighboring processors + call parallel_halo(bwatflx_halo(:,:,:,:), parallel) + + ! bmltflx_halo is now available in the halo cells of the local processor. + ! Route downslope to the adjacent locally owned cells. + ! These fluxes will be routed further downstream during the next iteration. + do j = 2, ny-1 + do i = 2, nx-1 + if (halo_mask(i,j) == 1 .and. sum(bwatflx_halo(:,:,i,j)) > 0) then + do jj = -1,1 + do ii = -1,1 + if (bwatflx_halo(ii,jj,i,j) > 0) then + ip = i + ii + jp = j + jj + if (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx_halo(ii,jj,i,j) + if (verbose_bwat .and. ip==itest .and. jp==jtest .and. this_rank==rtest .and. count <= 2) then + write(iulog,*) 'Nonzero bwatflx from halo, rank, i, j:', & + this_rank, ip, jp, bwatflx_halo(ii,jj,i,j) + endif + endif + endif ! bwatflx_halo > 0 to a local cell + enddo ! ii + enddo ! jj + endif ! bwatflx_halo > 0 from this halo cell + enddo ! i + enddo ! j + + else + + finished = .true. ! no water in halo cells to route further + + endif ! global_halo_sum > 0 + + end subroutine route_flux_to_margin_or_halo_integer8 + !============================================================== !+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1596,8 +2000,8 @@ end subroutine get_flux_fraction ! They are a GPL-licenced replacement for the Numerical Recipes routine indexx. ! They are not derived from any NR code, but are based on a quicksort routine by ! Michael Lamont (http://linux.wku.edu/~lamonml/kb.html), originally written - ! in C, and issued under the GNU General Public License. The conversion to - ! Fortran 90, and modification to do an index sort was done by Ian Rutt. + ! in C, and issued under the GNU General Public License. 
Ian Rutt did the conversion + ! to Fortran 90 and modified the algorithm to do an index sort. ! !+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ diff --git a/libglissade/glissade_bmlt_float.F90 b/libglissade/glissade_bmlt_float.F90 index 7a0da2d5..227ed6e4 100644 --- a/libglissade/glissade_bmlt_float.F90 +++ b/libglissade/glissade_bmlt_float.F90 @@ -1014,6 +1014,7 @@ subroutine glissade_bmlt_float_thermal_forcing(& call glissade_basin_average(& nx, ny, & + parallel, & ocean_data%nbasin, & ocean_data%basin_number, & thermal_forcing_mask * f_float, & @@ -1024,6 +1025,7 @@ subroutine glissade_bmlt_float_thermal_forcing(& call glissade_basin_average(& nx, ny, & + parallel, & ocean_data%nbasin, & ocean_data%basin_number, & thermal_forcing_mask * f_float, & @@ -1126,6 +1128,7 @@ subroutine glissade_bmlt_float_thermal_forcing(& call glissade_basin_average(& nx, ny, & + parallel, & ocean_data%nbasin, & ocean_data%basin_number, & thermal_forcing_mask * f_float, & diff --git a/libglissade/glissade_calving.F90 b/libglissade/glissade_calving.F90 index f2ad1cb1..04903763 100644 --- a/libglissade/glissade_calving.F90 +++ b/libglissade/glissade_calving.F90 @@ -37,10 +37,9 @@ module glissade_calving use glimmer_utils, only: point_diag use cism_parallel, only: this_rank, main_task, nhalo, & - parallel_halo, parallel_globalindex, & + parallel_halo, parallel_globalindex, parallel_global_sum, & parallel_reduce_sum, parallel_reduce_max, parallel_reduce_log_or - implicit none private @@ -50,7 +49,6 @@ module glissade_calving public :: verbose_calving logical, parameter :: verbose_calving = .false. -!! logical, parameter :: verbose_calving = .true. contains @@ -582,15 +580,7 @@ subroutine glissade_calve_ice(nx, ny, & if (verbose_calving) then call point_diag(cf_length, 'cf_length (m)', itest, jtest, rtest, 7, 7) ! 
Diagnose the total CF length - total_cf_length = 0.0d0 - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - if (calving_front_mask(i,j) == 1) then - total_cf_length = total_cf_length + cf_length(i,j) - endif - enddo - enddo - total_cf_length = parallel_reduce_sum(total_cf_length) + total_cf_length = parallel_global_sum(cf_length, parallel, calving_front_mask) if (this_rank == rtest) then write(iulog,*) 'Total CF length (km)', total_cf_length/1000.d0 endif @@ -871,13 +861,7 @@ subroutine glissade_calve_ice(nx, ny, & endif ! Compute the total ice area and the area of each quadrant - total_ice_area = 0.0d0 - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - total_ice_area = total_ice_area + dx*dy*calving%effective_areafrac(i,j) - enddo - enddo - total_ice_area = parallel_reduce_sum(total_ice_area) + total_ice_area = parallel_global_sum(dx*dy*calving%effective_areafrac, parallel) if (verbose_calving) then if (this_rank == rtest) then diff --git a/libglissade/glissade_glacier.F90 b/libglissade/glissade_glacier.F90 index cce8bdd9..ee9e1657 100644 --- a/libglissade/glissade_glacier.F90 +++ b/libglissade/glissade_glacier.F90 @@ -74,8 +74,8 @@ subroutine glissade_glacier_init(model, glacier) ! Another array, cism_to_rgi_glacier_id, identifies the RGI ID associated with each CISM ID. ! The CISM input file contains the RGI IDs. 
- use cism_parallel, only: distributed_gather_var, distributed_scatter_var, & - parallel_reduce_sum, parallel_reduce_max, parallel_reduce_min, parallel_is_zero, & + use cism_parallel, only: gather_var, scatter_var, & + parallel_global_sum, parallel_reduce_max, parallel_reduce_min, parallel_is_zero, & broadcast, parallel_halo, staggered_parallel_halo, parallel_globalindex type(glide_global_type),intent(inout) :: model @@ -91,7 +91,7 @@ subroutine glissade_glacier_init(model, glacier) integer :: i, j, nc, ng, count integer :: iglobal, jglobal - integer :: ng_west, ng_east, ng_south, ng_north + integer :: ng_ne, ng_nw, ng_se, ng_sw integer :: min_id, max_id real(dp) :: max_glcval real(dp) :: theta_rad ! latitude in radians @@ -113,6 +113,9 @@ subroutine glissade_glacier_init(model, glacier) current_id, & ! current glacier_id from list gid_minval, gid_maxval ! min and max values of glacier_id + integer, dimension(model%general%ewn,model%general%nsn) :: & + glacier_mask ! = 1 for cells with glaciers (glacier_id > 0), else = 0 + type(parallel_type) :: parallel ! info for parallel communication !WHL - debug, for quicksort test @@ -152,9 +155,15 @@ subroutine glissade_glacier_init(model, glacier) if (glacier%scale_area) then - ! Optionally, rescale the grid cell dimensions dew and dns + ! Optionally, rescale the grid cell dimensions and coordinates ! This is answer-changing throughout the code. + ! Note: The global arrays model%general%x1_global, etc., which are written to output files, are not rescaled. + ! These arrays are computed from the input file, which typically ignores the scale factor. 
if (glacier%length_scale_factor /= 1.0d0) then + model%general%x0 = model%general%x0 * glacier%length_scale_factor + model%general%y0 = model%general%y0 * glacier%length_scale_factor + model%general%x1 = model%general%x1 * glacier%length_scale_factor + model%general%y1 = model%general%y1 * glacier%length_scale_factor model%numerics%dew = model%numerics%dew * glacier%length_scale_factor model%numerics%dns = model%numerics%dns * glacier%length_scale_factor dew = model%numerics%dew @@ -194,8 +203,7 @@ subroutine glissade_glacier_init(model, glacier) ! and these arrays should already have the correct dimensions. if (associated(glacier%glacierid)) deallocate(glacier%glacierid) - if (associated(glacier%cism_to_rgi_glacier_id)) & - deallocate(glacier%cism_to_rgi_glacier_id) + if (associated(glacier%cism_to_rgi_glacier_id)) deallocate(glacier%cism_to_rgi_glacier_id) if (associated(glacier%area)) deallocate(glacier%area) if (associated(glacier%volume)) deallocate(glacier%volume) if (associated(glacier%area_init)) deallocate(glacier%area_init) @@ -220,28 +228,16 @@ subroutine glissade_glacier_init(model, glacier) ! Count the number of cells with glaciers ! Loop over locally owned cells - count = 0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - if (glacier%rgi_glacier_id(i,j) > 0) then - count = count + 1 - elseif (glacier%rgi_glacier_id(i,j) < 0) then ! should not happen - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(message,*) 'RGI glacier_id < 0: i, j, value =', & - iglobal, jglobal, glacier%rgi_glacier_id(i,j) - call write_log(message, GM_FATAL) - endif - enddo - enddo - - ncells_glacier = parallel_reduce_sum(count) + glacier_mask = 0 + where (glacier%rgi_glacier_id > 0) glacier_mask = 1 + ncells_glacier = parallel_global_sum(glacier_mask, parallel) ! 
Gather the RGI glacier IDs to the main task if (main_task) allocate(rgi_glacier_id_global(global_ewn, global_nsn)) - call distributed_gather_var(glacier%rgi_glacier_id, rgi_glacier_id_global, parallel) + call gather_var(glacier%rgi_glacier_id, rgi_glacier_id_global, parallel) ! Allocate a global array for the CISM glacier IDs on the main task. - ! Allocate a size 0 array on other tasks; distributed_scatter_var wants arrays allocated on all tasks. + ! Allocate a size 0 array on other tasks; scatter_var wants arrays allocated on all tasks. if (main_task) then allocate(cism_glacier_id_global(global_ewn,global_nsn)) else @@ -387,8 +383,8 @@ subroutine glissade_glacier_init(model, glacier) endif ! main_task ! Scatter cism_glacier_id_global to all processors - ! Note: This global array is deallocated in the distributed_scatter_var subroutine - call distributed_scatter_var(glacier%cism_glacier_id, cism_glacier_id_global, parallel) + ! Note: This global array is deallocated in the scatter_var subroutine + call scatter_var(glacier%cism_glacier_id, cism_glacier_id_global, parallel) call parallel_halo(glacier%cism_glacier_id, parallel) @@ -434,6 +430,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & model%geometry%cell_area, & ! m^2 @@ -543,6 +540,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_2d_to_1d(& ewn, nsn, & + parallel, & nglacier, glacier%cism_glacier_id_init, & model%climate%smb_obs, glacier%smb_obs) @@ -614,6 +612,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & model%geometry%cell_area, & ! m^2 @@ -626,6 +625,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & model%geometry%cell_area, & ! 
m^2 @@ -667,35 +667,25 @@ subroutine glissade_glacier_init(model, glacier) endif call broadcast(glacier%ngdiag, rtest) - ! Define a mask whose value is 1 at vertices along the boundary between two glaciers. - ! At runtime, Cp is set to a large value at masked vertices to reduce flow between glaciers. + ! Define a mask whose value is 1 at vertices that border two different glaciers. + ! At runtime, Cp is set to a large value at these vertices to reduce mass exchange between glaciers. + !TODO: Consider removing the mask. This would allow CISM to reduce basal friction to thin the ice if needed. glacier%boundary_mask(:,:) = 0 - ! Loop over locally owned cells - do j = nhalo, nsn-nhalo - do i = nhalo, ewn-nhalo - ng = glacier%cism_glacier_id_init(i,j) - if (ng > 0) then - ng_west = glacier%cism_glacier_id_init(i-1,j) - ng_east = glacier%cism_glacier_id_init(i+1,j) - ng_south = glacier%cism_glacier_id_init(i,j-1) - ng_north = glacier%cism_glacier_id_init(i,j+1) - if (ng_west > 0 .and. ng_west /= ng) then - glacier%boundary_mask(i-1,j-1) = 1 - glacier%boundary_mask(i-1,j) = 1 - endif - if (ng_east > 0 .and. ng_east /= ng) then - glacier%boundary_mask(i,j-1) = 1 - glacier%boundary_mask(i,j) = 1 - endif - if (ng_south > 0 .and. ng_south /= ng) then - glacier%boundary_mask(i-1,j-1) = 1 - glacier%boundary_mask(i,j-1) = 1 - endif - if (ng_north > 0 .and. ng_north /= ng) then - glacier%boundary_mask(i-1,j) = 1 - glacier%boundary_mask(i,j) = 1 - endif + ! Loop over locally owned vertices + do j = nhalo+1, nsn-nhalo + do i = nhalo+1, ewn-nhalo + ng_ne = glacier%cism_glacier_id_init(i+1,j+1) + ng_nw = glacier%cism_glacier_id_init(i,j+1) + ng_se = glacier%cism_glacier_id_init(i+1,j) + ng_sw = glacier%cism_glacier_id_init(i,j) + if ( (ng_ne > 0 .and. ng_nw > 0 .and. ng_ne /= ng_nw) .or. & + (ng_ne > 0 .and. ng_se > 0 .and. ng_ne /= ng_se) .or. & + (ng_ne > 0 .and. ng_sw > 0 .and. ng_ne /= ng_sw) .or. & + (ng_nw > 0 .and. ng_se > 0 .and. ng_nw /= ng_se) .or. & + (ng_nw > 0 .and. 
ng_sw > 0 .and. ng_nw /= ng_sw) .or. & + (ng_se > 0 .and. ng_sw > 0 .and. ng_se /= ng_sw) ) then + glacier%boundary_mask(i,j) = 1 endif enddo enddo @@ -703,6 +693,7 @@ subroutine glissade_glacier_init(model, glacier) call staggered_parallel_halo(glacier%boundary_mask, parallel) if (verbose_glacier) then + call point_diag(glacier%cism_glacier_id_init, 'cism_glacier_id_init', itest, jtest, rtest, 7, 7) call point_diag(glacier%boundary_mask, 'Glacier boundary mask', itest, jtest, rtest, 7, 7) endif @@ -730,7 +721,7 @@ subroutine glissade_glacier_update(model, glacier) use glissade_grid_operators, only: glissade_stagger use glissade_utils, only: glissade_usrf_to_thck - use cism_parallel, only: parallel_reduce_sum, parallel_global_sum, & + use cism_parallel, only: parallel_global_sum, & parallel_halo, staggered_parallel_halo ! Do glacier inversion (if applicable), update glacier masks, and compute glacier diagnostics. @@ -767,12 +758,13 @@ subroutine glissade_glacier_update(model, glacier) integer :: i, j, ng integer, dimension(model%general%ewn, model%general%nsn) :: & - ice_mask ! = 1 where ice is present (thck > thklim), else = 0 + ice_mask, & ! = 1 where ice is present (thck > thklim), else = 0 + glacier_mask ! temporary mask real(dp), dimension(model%general%ewn, model%general%nsn) :: & thck, & ! ice thickness (m) dthck_dt, & ! rate of change of thickness (m/yr) - cell_area, & ! grid cell area (m^2) + cell_area_uniform, & ! grid cell area defined as dew*dns(m^2) thck_old, & ! saved value of ice thickness (m) artm, & ! artm, baseline or current date snow, & ! snowfall, baseline or current date @@ -872,14 +864,12 @@ subroutine glissade_glacier_update(model, glacier) nglacier = glacier%nglacier ngdiag = glacier%ngdiag + cell_area_uniform = dew*dns ! some unit conversions - !TODO - Use model%geometry%thck without a copy. ! Skip these conversion and use SI units (s instead of yr) in the code. - dt = model%numerics%dt /scyr ! 
s to yr - thck = model%geometry%thck + dt = model%numerics%dt /scyr ! s to yr dthck_dt = model%geometry%dthck_dt * scyr ! m/s to m/yr - cell_area = model%geometry%cell_area ! model units to m^2 ! Accumulate the 2D fields used for mu_star and alpha_snow inversion: snow and Tpos. ! Also accumulate dthck_dt, which is used for powerlaw_c inversion. @@ -1189,23 +1179,23 @@ subroutine glissade_glacier_update(model, glacier) if (glacier%redistribute_advanced_ice) then - thck_old = thck + thck_old = model%geometry%thck call glacier_redistribute_advanced_ice(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & real(glacier_update_interval,dp), & ! yr - dew*dns, & ! m^2 + cell_area_uniform, & ! m^2 glacier%thinning_rate_advanced_ice, & ! m/yr glacier%cism_glacier_id_init, & glacier%smb_glacier_id, & model%climate%smb, & ! m/yr - thck, & ! m - parallel) + model%geometry%thck) ! m glacier%dthck_dt_annmean = glacier%dthck_dt_annmean + & - (thck - thck_old) / real(glacier_update_interval,dp) + (model%geometry%thck - thck_old) / real(glacier_update_interval,dp) endif ! 
redistribute advanced ice @@ -1230,6 +1220,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & glacier%smb_glacier_id_init, & smb_weight_init, & @@ -1249,6 +1240,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & glacier%smb_glacier_id, & smb_weight_current, & @@ -1276,6 +1268,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_invert_mu_star_alpha_snow(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & glacier%smb_glacier_id_init, & @@ -1300,6 +1293,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_invert_mu_star(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & glacier%smb_glacier_id_init, & @@ -1318,13 +1312,13 @@ subroutine glissade_glacier_update(model, glacier) endif ! set_mu_star ! advance/retreat diagnostics - ! Note: This subroutine assumes cell_area = dew*dns for all cells call glacier_area_advance_retreat(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & glacier%cism_glacier_id, & - dew*dns, & + cell_area_uniform, & area_initial, & area_current, & area_advance, & @@ -1383,7 +1377,8 @@ subroutine glissade_glacier_update(model, glacier) ! Interpolate thck to the staggered grid call glissade_stagger(& ewn, nsn, & - thck, stag_thck) + model%geometry%thck, & + stag_thck) ! Interpolate dthck_dt to the staggered grid call glissade_stagger(& @@ -1433,8 +1428,7 @@ subroutine glissade_glacier_update(model, glacier) !------------------------------------------------------------------------- if (verbose_glacier) then - call point_diag(model%geometry%topg, 'topg', itest, jtest, rtest, 7, 7) - call point_diag(thck, 'Before advance_retreat, thck', itest, jtest, rtest, 7, 7) + call point_diag(model%geometry%thck, 'Before advance_retreat, thck', itest, jtest, rtest, 7, 7) endif ! 
Assign nonzero IDs in grid cells where ice has reached the minimum glacier thickness. @@ -1443,17 +1437,17 @@ subroutine glissade_glacier_update(model, glacier) call glacier_advance_retreat(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, & glacier%minthck, & ! m - thck, & ! m + model%geometry%thck, & ! m glacier%snow_annmean, & ! mm/yr w.e. glacier%Tpos_annmean, & ! deg C glacier%mu_star, & ! mm/yr/deg glacier%alpha_snow, & ! unitless glacier%cism_glacier_id_init, & - glacier%cism_glacier_id, & - parallel) + glacier%cism_glacier_id) ! Compute smb_glacier_id, which determines where the SMB is computed. It is the union of ! (1) cism_glacier_id > 0 @@ -1559,7 +1553,7 @@ subroutine glissade_glacier_update(model, glacier) endif ! set_mu_star if (verbose_glacier) then - call point_diag(thck, 'After advance_retreat, thck', itest, jtest, rtest, 7, 7) + call point_diag(model%geometry%thck, 'After advance_retreat, thck', itest, jtest, rtest, 7, 7) call point_diag(glacier%cism_glacier_id_init, 'cism_glacier_id_init', itest, jtest, rtest, 7, 7) call point_diag(glacier%smb_glacier_id_init, 'smb_glacier_id_init', itest, jtest, rtest, 7, 7) call point_diag(glacier%cism_glacier_id, 'New cism_glacier_id', itest, jtest, rtest, 7, 7) @@ -1591,16 +1585,20 @@ subroutine glissade_glacier_update(model, glacier) ! (1) Include only cells that are part of the initial glacier extent call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & + cell_area_uniform, & model%climate%smb, & aar_init) ! (2) Include all cells in the glacier call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & + cell_area_uniform, & model%climate%smb, & aar) @@ -1630,16 +1628,20 @@ subroutine glissade_glacier_update(model, glacier) ! 
(1) Include only cells that are part of the initial glacier extent call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & + cell_area_uniform, & glacier%smb_recent, & aar_init_recent) ! (2) Include all cells in the glacier call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & + cell_area_uniform, & glacier%smb_recent, & aar_recent) @@ -1661,10 +1663,11 @@ subroutine glissade_glacier_update(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & - cell_area, & ! m^2 - thck, & ! m + model%geometry%cell_area, & ! m^2 + model%geometry%thck, & ! m glacier%diagnostic_minthck, & ! m glacier%area, & ! m^2 glacier%volume) ! m^3 @@ -1674,10 +1677,11 @@ subroutine glissade_glacier_update(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & - cell_area, & ! m^2 - thck, & ! m + model%geometry%cell_area, & ! m^2 + model%geometry%thck, & ! m glacier%diagnostic_minthck, & ! m glacier%area_init_extent, & ! m^2 glacier%volume_init_extent) ! m^3 @@ -1701,6 +1705,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & model%geometry%cell_area, & ! m^2 @@ -1718,28 +1723,21 @@ subroutine glissade_glacier_update(model, glacier) if (verbose_glacier) then - ! 
debug - count cells in masks - count_cgii = 0 - count_cgi = 0 - count_sgii = 0 - count_sgi = 0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = glacier%cism_glacier_id_init(i,j) - if (ng == ngdiag) count_cgii = count_cgii + 1 - ng = glacier%cism_glacier_id(i,j) - if (ng == ngdiag) count_cgi = count_cgi + 1 - ng = glacier%smb_glacier_id_init(i,j) - if (ng == ngdiag) count_sgii = count_sgii + 1 - ng = glacier%smb_glacier_id(i,j) - if (ng == ngdiag) count_sgi = count_sgi + 1 - enddo - enddo + glacier_mask = 0 + where (glacier%cism_glacier_id_init == ngdiag) glacier_mask = 1 + count_cgii = parallel_global_sum(glacier_mask, parallel) - count_cgii = parallel_reduce_sum(count_cgii) - count_cgi = parallel_reduce_sum(count_cgi) - count_sgii = parallel_reduce_sum(count_sgii) - count_sgi = parallel_reduce_sum(count_sgi) + glacier_mask = 0 + where (glacier%cism_glacier_id == ngdiag) glacier_mask = 1 + count_cgi = parallel_global_sum(glacier_mask, parallel) + + glacier_mask = 0 + where (glacier%smb_glacier_id_init == ngdiag) glacier_mask = 1 + count_sgii = parallel_global_sum(glacier_mask, parallel) + + glacier_mask = 0 + where (glacier%smb_glacier_id == ngdiag) glacier_mask = 1 + count_sgi = parallel_global_sum(glacier_mask, parallel) if (this_rank == rtest) then write(iulog,*) ' ' @@ -1752,15 +1750,13 @@ subroutine glissade_glacier_update(model, glacier) endif ! glacier_update_inverval - ! Copy fields back to model derived type - model%geometry%thck = thck - end subroutine glissade_glacier_update !**************************************************** subroutine glacier_invert_mu_star(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & smb_glacier_id_init, & @@ -1785,6 +1781,9 @@ subroutine glacier_invert_mu_star(& nglacier, & ! total number of glaciers in the domain ngdiag ! CISM ID of diagnostic glacier + type(parallel_type), intent(in) :: & + parallel ! 
info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & smb_glacier_id_init ! smb_glacier_id based on the initial glacier extent @@ -1862,6 +1861,7 @@ subroutine glacier_invert_mu_star(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -1869,6 +1869,7 @@ subroutine glacier_invert_mu_star(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -1993,6 +1994,7 @@ end subroutine glacier_invert_mu_star subroutine glacier_invert_mu_star_alpha_snow(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & smb_glacier_id_init, & @@ -2023,6 +2025,9 @@ subroutine glacier_invert_mu_star_alpha_snow(& nglacier, & ! total number of glaciers in the domain ngdiag ! CISM ID of diagnostic glacier + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & smb_glacier_id_init ! smb_glacier_id based on the initial glacier extent @@ -2114,6 +2119,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2121,6 +2127,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2128,6 +2135,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2135,6 +2143,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2482,6 +2491,7 @@ subroutine glacier_invert_powerlaw_c(& if (verbose_glacier) then call point_diag(stag_thck, 'stag_thck (m)', itest, jtest, rtest, 7, 7) + call point_diag(stag_thck_target, 'stag_thck_target (m)', itest, jtest, rtest, 7, 
7) call point_diag(stag_dthck, 'stag_thck - stag_thck_target (m)', itest, jtest, rtest, 7, 7) call point_diag(stag_dthck_dt, 'stag_dthck_dt (m/yr)', itest, jtest, rtest, 7, 7) call point_diag(powerlaw_c, 'new powerlaw_c', itest, jtest, rtest, 7, 7) @@ -2533,6 +2543,7 @@ end subroutine glacier_calc_snow subroutine glacier_redistribute_advanced_ice(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & glacier_update_interval, & ! yr @@ -2541,15 +2552,14 @@ subroutine glacier_redistribute_advanced_ice(& cism_glacier_id_init, & smb_glacier_id, & smb, & ! m/yr - thck, & ! m - parallel) + thck) ! m ! Limit glacier advance in the accumulation zone. ! This applies to grid cells that are initially ice-free, into which ice is advected. ! The fix here is to thin the ice in these cells at a prescribed rate and ! redistribute the mass conservatively across the glacier. - use cism_parallel, only: parallel_reduce_sum, parallel_halo + use cism_parallel, only: parallel_halo, parallel_global_sum_patch ! input/output arguments @@ -2559,11 +2569,16 @@ subroutine glacier_redistribute_advanced_ice(& nglacier, & ! number of glaciers ngdiag ! CISM ID of diagnostic glacier + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + real(dp), intent(in) :: & glacier_update_interval, & ! time interval (yr) of the glacier update, typically 1 yr - cell_area, & ! grid cell area (m^2), assumed to be the same for each cell thinning_rate_advanced_ice ! thinning rate (m/yr) where glaciers advance in the accumulation zone + real(dp), dimension(ewn,nsn), intent(in) :: & + cell_area ! grid cell area (m^2) + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id_init, & ! integer glacier ID at the start of the run smb_glacier_id ! integer ID for current glacier cells and adjacent glacier-free cells @@ -2574,14 +2589,10 @@ subroutine glacier_redistribute_advanced_ice(& real(dp), dimension(ewn,nsn), intent(inout) :: & thck ! 
ice thickness (m) - type(parallel_type), intent(in) :: parallel ! info for parallel communication - ! local variables integer :: i, j, ng - real(dp) :: dthck ! thickness change (m) - real(dp), dimension(nglacier) :: & glacier_area_init, & ! glacier area based on cism_glacier_id_init glacier_vol_removed, & ! total volume (m^3) removed from each advanced cells in each glacier @@ -2589,53 +2600,37 @@ subroutine glacier_redistribute_advanced_ice(& glacier_vol_1, & ! volume (m^3) of each glacier before thinning and restribution glacier_vol_2 ! volume (m^3) of each glacier after thinning and restribution + real(dp), dimension(ewn,nsn) :: & + dthck ! thickness removed (m) + + integer, dimension(ewn,nsn) :: & + glacier_id ! temporary glacier ID + + glacier_id = max(cism_glacier_id_init, smb_glacier_id) + ! Compute the total volume of each glacier before limiting advance. ! Note: This includes adjacent glacier-free cells that might have a small nonzero thickness ! (i.e., cism_glacier_id = 0 but smb_glacier_id > 0). - !TODO: Write a sum-over-glaciers subroutine - glacier_vol_1(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = smb_glacier_id(i,j) - if (ng > 0) then - glacier_vol_1(ng) = glacier_vol_1(ng) + cell_area*thck(i,j) - endif - enddo - enddo - glacier_vol_1 = parallel_reduce_sum(glacier_vol_1) + glacier_vol_1 = parallel_global_sum_patch(cell_area*thck, nglacier, glacier_id, parallel) ! compute the area of each glacier over its initial extent - glacier_area_init(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id_init(i,j) - if (ng > 0) then - glacier_area_init(ng) = glacier_area_init(ng) + cell_area - endif - enddo - enddo - glacier_area_init = parallel_reduce_sum(glacier_area_init) + + glacier_area_init = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id_init, parallel) ! Compute thinning in advanced grid cells ! This includes potential advanced cells adjacent to current glacier cells. ! 
Note: Currently, SMB is set to 0 in advanced cells where SMB would be > 0 otherwise. ! The logic below (smb >= 0) ensures that ice in these cells is thinned. - glacier_vol_removed(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - if (cism_glacier_id_init(i,j) == 0 .and. smb_glacier_id(i,j) > 0) then ! advanced cell - if (smb(i,j) >= 0.d0) then ! accumulation zone - ng = smb_glacier_id(i,j) - dthck = min(thinning_rate_advanced_ice*glacier_update_interval, thck(i,j)) - thck(i,j) = thck(i,j) - dthck - glacier_vol_removed(ng) = glacier_vol_removed(ng) + cell_area*dthck - endif - endif - enddo - enddo - glacier_vol_removed = parallel_reduce_sum(glacier_vol_removed) + dthck = 0.0d0 + where (cism_glacier_id_init == 0 .and. smb_glacier_id > 0) ! advanced cell + where (smb >= 0.0d0) ! accumulation zone + dthck = min(thinning_rate_advanced_ice*glacier_update_interval, thck) + thck = thck - dthck + endwhere + endwhere + glacier_vol_removed = parallel_global_sum_patch(cell_area*dthck, nglacier, smb_glacier_id, parallel) ! Assuming conservation of volume, compute the thickness to be added to each glacier. ! Only cells within the initial glacier extent can thicken. @@ -2659,16 +2654,8 @@ subroutine glacier_redistribute_advanced_ice(& call parallel_halo(thck, parallel) ! Compute the volume of each glacier after limiting advance - glacier_vol_2(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = max(cism_glacier_id_init(i,j), smb_glacier_id(i,j)) - if (ng > 0) then - glacier_vol_2(ng) = glacier_vol_2(ng) + cell_area*thck(i,j) - endif - enddo - enddo - glacier_vol_2 = parallel_reduce_sum(glacier_vol_2) + + glacier_vol_2 = parallel_global_sum_patch(cell_area*thck, nglacier, glacier_id, parallel) ! 
conservation check do ng = 1, nglacier @@ -2685,6 +2672,7 @@ end subroutine glacier_redistribute_advanced_ice subroutine glacier_advance_retreat(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, & glacier_minthck, & @@ -2694,8 +2682,7 @@ subroutine glacier_advance_retreat(& mu_star, & alpha_snow, & cism_glacier_id_init, & - cism_glacier_id, & - parallel) + cism_glacier_id) ! Allow glaciers to advance and retreat. ! @@ -2727,6 +2714,9 @@ subroutine glacier_advance_retreat(& itest, jtest, rtest, & ! coordinates of diagnostic cell nglacier ! number of glaciers + type(parallel_type), intent(in) :: & + parallel ! info for diagnostic only + real(dp), intent(in) :: & glacier_minthck ! min ice thickness (m) counted as part of a glacier @@ -2747,8 +2737,6 @@ subroutine glacier_advance_retreat(& integer, dimension(ewn,nsn), intent(inout) :: & cism_glacier_id ! current cism glacier_id, > 0 for glaciated cells - type(parallel_type), intent(in) :: parallel ! diagnostic only - ! local variables integer, dimension(ewn,nsn) :: & @@ -2859,7 +2847,7 @@ subroutine glacier_advance_retreat(& call parallel_halo(cism_glacier_id, parallel) ! Check advanced cells (beyond the initial extent) for problematic glacier IDs. - ! This code protects against glacier 'pirating', which ccan occur when an advanced cell + ! This code protects against glacier 'pirating', which can occur when an advanced cell ! is adjacent to two different glaciers, call them A and B. ! Suppose the cell is fed primarily by glacier A but has the same ID as glacier B, ! and has a more positive SMB as a result of belonging to B rather than A. @@ -3038,13 +3026,14 @@ end subroutine update_smb_glacier_id subroutine glacier_2d_to_1d(& ewn, nsn, & + parallel, & nglacier, cism_glacier_id, & field_2d, glacier_field) ! Given a 2D field, compute the average of the field over each glacier !TODO - Pass in cellarea to compute an area average. 
- use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3052,6 +3041,9 @@ subroutine glacier_2d_to_1d(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id ! integer glacier ID in the range (1, nglacier) @@ -3067,22 +3059,12 @@ subroutine glacier_2d_to_1d(& integer, dimension(nglacier) :: ncells_glacier - ncells_glacier(:) = 0 - glacier_field(:) = 0.0d0 + integer, dimension(ewn,nsn) :: ones ! matrix = 1 everywhere - ! Loop over locally owned cells - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - ncells_glacier(ng) = ncells_glacier(ng) + 1 - glacier_field(ng) = glacier_field(ng) + field_2d(i,j) - endif - enddo - enddo + ones(:,:) = 1 - ncells_glacier = parallel_reduce_sum(ncells_glacier) - glacier_field = parallel_reduce_sum(glacier_field) + ncells_glacier = parallel_global_sum_patch(ones, nglacier, cism_glacier_id, parallel) + glacier_field = parallel_global_sum_patch(field_2d, nglacier, cism_glacier_id, parallel) where (ncells_glacier > 0) glacier_field = glacier_field/ncells_glacier @@ -3094,6 +3076,7 @@ end subroutine glacier_2d_to_1d subroutine glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & glacier_id, weight, & field_2d, glacier_field) @@ -3101,7 +3084,7 @@ subroutine glacier_2d_to_1d_weighted(& ! Given a 2D field, compute the average of the field over each glacier ! Certain grid cells (e.g., at the glacier periphery) can be given weights between 0 and 1. - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3109,6 +3092,9 @@ subroutine glacier_2d_to_1d_weighted(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! 
total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & glacier_id ! integer glacier ID @@ -3123,26 +3109,11 @@ subroutine glacier_2d_to_1d_weighted(& ! local variables - integer :: i, j, ng - real(dp), dimension(nglacier) :: sum_weights - sum_weights(:) = 0.0d0 - glacier_field(:) = 0.0d0 - - ! Loop over locally owned cells - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = glacier_id(i,j) - if (ng > 0) then - sum_weights(ng) = sum_weights(ng) + weight(i,j) - glacier_field(ng) = glacier_field(ng) + weight(i,j) * field_2d(i,j) - endif - enddo - enddo + sum_weights = parallel_global_sum_patch(weight, nglacier, glacier_id, parallel) + glacier_field = parallel_global_sum_patch(weight*field_2d, nglacier, glacier_id, parallel) - sum_weights = parallel_reduce_sum(sum_weights) - glacier_field = parallel_reduce_sum(glacier_field) where (sum_weights > 0.0d0) glacier_field = glacier_field/sum_weights endwhere @@ -3196,12 +3167,13 @@ end subroutine glacier_1d_to_2d subroutine glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, cism_glacier_id, & cell_area, thck, & diagnostic_minthck, & area, volume) - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3209,6 +3181,9 @@ subroutine glacier_area_volume(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id ! integer glacier ID in the range (1, nglacier) @@ -3226,36 +3201,22 @@ subroutine glacier_area_volume(& ! local variables - real(dp), dimension(nglacier) :: & - local_area, local_volume ! area and volume on each processor, before global sum - - integer :: i, j, ng - - ! 
Initialize the output arrays - area(:) = 0.0d0 - volume(:) = 0.0d0 - - ! Initialize local arrays - local_area(:) = 0.0d0 - local_volume(:) = 0.0d0 + real(dp), dimension(ewn,nsn) :: & + diag_area, diag_volume ! area and volume where thck >= diagnostic_minthck ! Compute the area and volume of each glacier. - ! We need parallel sums, since a glacier can lie on two or more processors. + ! Need parallel sums, since a glacier can lie on two or more processors. - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - if (thck(i,j) >= diagnostic_minthck) then - local_area(ng) = local_area(ng) + cell_area(i,j) - local_volume(ng) = local_volume(ng) + cell_area(i,j) * thck(i,j) - endif - endif - enddo - enddo + where(thck >= diagnostic_minthck) + diag_area = cell_area + diag_volume = cell_area*thck + elsewhere + diag_area = 0.0d0 + diag_volume = 0.0d0 + endwhere - area = parallel_reduce_sum(local_area) - volume = parallel_reduce_sum(local_volume) + area = parallel_global_sum_patch(diag_area, nglacier, cism_glacier_id, parallel) + volume = parallel_global_sum_patch(diag_volume, nglacier, cism_glacier_id, parallel) end subroutine glacier_area_volume @@ -3263,6 +3224,7 @@ end subroutine glacier_area_volume subroutine glacier_area_advance_retreat(& ewn, nsn, & + parallel, & nglacier, & cism_glacier_id_init, & cism_glacier_id, & @@ -3272,7 +3234,7 @@ subroutine glacier_area_advance_retreat(& area_advance, & area_retreat) - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! For each glacier, compare the current glacier area (as given by cism_glacier_id) ! to the initial area (given by cism_glacier_id_init). @@ -3287,12 +3249,15 @@ subroutine glacier_area_advance_retreat(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! 
info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id_init, & ! integer glacier ID in the range (1, nglacier), initial value cism_glacier_id ! integer glacier ID in the range (1, nglacier), current value - real(dp), intent(in) :: & - cell_area ! grid cell area = dew*dns (m^2); same for all cells + real(dp), dimension(ewn,nsn), intent(in) :: & + cell_area ! grid cell area (m^2) real(dp), dimension(nglacier), intent(out) :: & area_initial, & ! initial glacier area @@ -3302,69 +3267,38 @@ subroutine glacier_area_advance_retreat(& ! local variables - real(dp), dimension(nglacier) :: & - local_area ! area on each processor, before global sum + integer, dimension(ewn,nsn) :: glacier_id ! temporary glacier ID - integer :: i, j, ng, ngi - - ! Initialize the output arrays - area_initial(:) = 0.0d0 - area_current(:) = 0.0d0 - area_advance(:) = 0.0d0 - area_retreat(:) = 0.0d0 + integer :: ng ! Compute the area of each glacier over the initial and current masks. ! We need parallel sums, since a glacier can lie on two or more processors. - ! init area - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ngi = cism_glacier_id_init(i,j) - if (ngi > 0) then - local_area(ngi) = local_area(ngi) + cell_area - endif - enddo - enddo - area_initial = parallel_reduce_sum(local_area) + area_initial = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id_init, parallel) ! current area - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - local_area(ng) = local_area(ng) + cell_area - endif - enddo - enddo - area_current = parallel_reduce_sum(local_area) + + area_current = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id, parallel) ! 
area where the glacier has advanced - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ngi = cism_glacier_id_init(i,j) - ng = cism_glacier_id(i,j) - if (ngi == 0 .and. ng > 0) then - local_area(ng) = local_area(ng) + cell_area - endif - enddo - enddo - area_advance = parallel_reduce_sum(local_area) + + where (cism_glacier_id_init == 0 .and. cism_glacier_id > 0) + glacier_id = cism_glacier_id + elsewhere + glacier_id = 0 + endwhere + + area_advance = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) ! area where the glacier has retreated - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ngi = cism_glacier_id_init(i,j) - ng = cism_glacier_id(i,j) - if (ngi > 0 .and. ng == 0) then - local_area(ngi) = local_area(ngi) + cell_area - endif - enddo - enddo - area_retreat = parallel_reduce_sum(local_area) + + where (cism_glacier_id_init > 0 .and. cism_glacier_id == 0) + glacier_id = cism_glacier_id_init + elsewhere + glacier_id = 0 + endwhere + + area_retreat = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) ! bug check do ng = 1, nglacier @@ -3382,15 +3316,16 @@ end subroutine glacier_area_advance_retreat subroutine glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & cism_glacier_id, & + cell_area, & smb, & aar) ! Compute the accumulation area ratio (AAR) for each glacier. - ! Note: In this subroutine the grid cell area is assumed equal for all cells. - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3398,9 +3333,15 @@ subroutine glacier_accumulation_area_ratio(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id ! 
integer glacier ID in the range (1, nglacier) + real(dp), dimension(ewn,nsn), intent(in) :: & + cell_area ! grid cell area (m^2) + real(dp), dimension(ewn,nsn), intent(in) :: & smb ! surface mass balance (mm/yr w.e.) @@ -3409,34 +3350,32 @@ subroutine glacier_accumulation_area_ratio(& ! local variables - integer :: i, j, ng +! integer :: i, j, ng real(dp), dimension(nglacier) :: & ablat_area, & ! area of accumulation zone (SMB < 0) accum_area ! area of accumulation zone (SMB > 0) - ! initialize - ablat_area(:) = 0.0d0 - accum_area(:) = 0.0d0 + integer, dimension(ewn,nsn) :: glacier_id ! temporary glacier ID ! Compute the accumulation and ablation area for each glacier ! Note: Grid cells with SMB = 0 are not counted in either zone. - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - if (smb(i,j) > 0.0d0) then - accum_area(ng) = accum_area(ng) + 1.0d0 - elseif (smb(i,j) < 0.0d0) then - ablat_area(ng) = ablat_area(ng) + 1.0d0 - endif - endif - enddo ! i - enddo ! j + where (cism_glacier_id > 0 .and. smb > 0.0d0) + glacier_id = cism_glacier_id + elsewhere + glacier_id = 0 + endwhere + + accum_area = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) + + where (cism_glacier_id > 0 .and. smb < 0.0d0) + glacier_id = cism_glacier_id + elsewhere + glacier_id = 0 + endwhere - accum_area = parallel_reduce_sum(accum_area) - ablat_area = parallel_reduce_sum(ablat_area) + ablat_area = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) ! Compute the AAR for each glacier diff --git a/libglissade/glissade_inversion.F90 b/libglissade/glissade_inversion.F90 index 71a40c7a..54c688b3 100644 --- a/libglissade/glissade_inversion.F90 +++ b/libglissade/glissade_inversion.F90 @@ -47,7 +47,7 @@ module glissade_inversion ! a target ice thickness field. 
!----------------------------------------------------------------------------- - logical, parameter :: verbose_inversion = .false. + logical, parameter :: verbose_inversion = .false. !*********************************************************************** @@ -89,9 +89,6 @@ subroutine glissade_inversion_init(model) f_flotation, & ! flotation function (m) thck_obs ! observed ice thickness, derived from usrf_obs and topg - real(dp), dimension(model%general%ewn, model%general%nsn) :: & - coulomb_c_icegrid ! initial coulomb_c at cell centers based on masks - real(dp) :: h_obs, h_flotation, h_buff ! thck_obs, flotation thickness, and thck_flotation_buffer scaled to m real(dp) :: dh ! h_obs - h_flotation real(dp) :: dh_decimal ! decimal part remaining after subtracting the truncation of dh @@ -226,7 +223,6 @@ subroutine glissade_inversion_init(model) endif endif ! inversion for Cp, Cc or deltaT_ocn - !---------------------------------------------------------------------- ! If inverting for E, then make sure there is a target surface speed, velo_sfc_obs. !---------------------------------------------------------------------- @@ -254,6 +250,9 @@ subroutine glissade_inversion_init(model) !---------------------------------------------------------------------- ! computations specific to powerlaw_c (Cp) and coulomb_c (Cc) inversion + ! Note: Most sliding laws have inversion for Cp or Cc, but not both. + ! The modified Schoof law, however, supports inversion for both. + ! (This could be extended to the Schoof and Tsai laws.) !---------------------------------------------------------------------- if (model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION .or. & @@ -268,7 +267,11 @@ subroutine glissade_inversion_init(model) call point_diag(model%basal_physics%powerlaw_c, 'init_inversion for powerlaw_c', itest, jtest, rtest, 7, 7) endif - elseif (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then + endif ! 
invert for powerlaw_c + + !TODO - Add distinct logic for powerlaw_c_inversion_basin? + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then if (parallel_is_zero(model%basal_physics%coulomb_c)) then ! initialize coulomb_c (for which we will invert) @@ -280,7 +283,9 @@ subroutine glissade_inversion_init(model) 'init_inversion for coulomb_c', itest, jtest, rtest, 7, 7) endif - elseif (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION_BASIN) then + endif ! invert for coulomb_c + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION_BASIN) then !TODO - Should this calculation be done in glissade_initialise? if (parallel_is_zero(model%basal_physics%coulomb_c_lo)) then @@ -311,10 +316,11 @@ subroutine glissade_inversion_init(model) 'init_inversion for basin-scale coulomb_c', itest, jtest, rtest, 7, 7) endif - endif + endif ! invert for coulomb_c_basin !---------------------------------------------------------------------- ! computations specific to flow_enhancement_factor inversion + ! TODO: Remove this inversion option? !---------------------------------------------------------------------- if (model%options%which_ho_flow_enhancement_factor == HO_FLOW_ENHANCEMENT_FACTOR_INVERSION) then @@ -557,7 +563,7 @@ subroutine glissade_inversion_solve(model) call staggered_parallel_halo(stag_thck, parallel) call staggered_parallel_halo(stag_dthck_dt, parallel) - ! Invert for powerlaw_c or coulomb_c + ! Invert for powerlaw_c and/or coulomb_c ! The logic is the same for each; only the max and min values and the in/out field are different. if ( model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION) then @@ -593,7 +599,9 @@ subroutine glissade_inversion_solve(model) call point_diag(model%basal_physics%powerlaw_c, 'New powerlaw_c', itest, jtest, rtest, 7, 7) endif - elseif ( model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then + endif ! 
invert for powerlaw_c + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then if (verbose_inversion .and. this_rank == rtest) then write(iulog,*) ' ' @@ -624,23 +632,18 @@ subroutine glissade_inversion_solve(model) if (verbose_inversion) then call point_diag(model%basal_physics%effecpress_stag, 'effecpress_stag', itest, jtest, rtest, 7, 7, '(f10.1)') - call point_diag(rhoi*grav*stag_thck, 'overburden', itest, jtest, rtest, 7, 7, '(f10.1)') - call point_diag((model%geometry%thck - thck_obs), 'thck - thck_obs (m)', itest, jtest, rtest, 7, 7) call point_diag(model%basal_physics%coulomb_c, 'New coulomb_c', itest, jtest, rtest, 7, 7, '(f10.5)') endif ! verbose_inversion - endif ! invert for powerlaw_c or coulomb_c - - else ! do not invert for powerlaw_c or coulomb_c; just print optional diagnostics + endif ! invert for coulomb_c - if (verbose_inversion) then - call point_diag(model%geometry%f_ground, 'f_ground at vertices', itest, jtest, rtest, 7, 7, '(f10.4)') - call point_diag(model%basal_physics%powerlaw_c, 'powerlaw_c', itest, jtest, rtest, 7, 7, '(f10.2)') - call point_diag(model%basal_physics%coulomb_c, 'coulomb_c', itest, jtest, rtest, 7, 7, '(f10.4)') - endif + elseif (verbose_inversion) then ! not inverting, but print some diagnostic values - endif ! invert for powerlaw_c or coulomb_c + call point_diag(model%geometry%f_ground, 'f_ground at vertices', itest, jtest, rtest, 7, 7, '(f10.4)') + call point_diag(model%basal_physics%powerlaw_c, 'powerlaw_c', itest, jtest, rtest, 7, 7, '(f10.2)') + call point_diag(model%basal_physics%coulomb_c, 'coulomb_c', itest, jtest, rtest, 7, 7, '(f10.4)') + endif ! If inverting for powerlaw_c or coulomb_c at the basin scale, then update it here @@ -699,6 +702,7 @@ subroutine glissade_inversion_solve(model) ewn, nsn, & model%numerics%dew, & ! m model%numerics%dns, & ! 
m + parallel, & itest, jtest, rtest, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & @@ -780,6 +784,7 @@ subroutine glissade_inversion_solve(model) ewn, nsn, & model%numerics%dew, & ! m model%numerics%dns, & ! m + parallel, & itest, jtest, rtest, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & @@ -853,6 +858,7 @@ subroutine glissade_inversion_solve(model) ewn, nsn, & model%numerics%dew, & ! m model%numerics%dns, & ! m + parallel, & itest, jtest, rtest, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & @@ -906,6 +912,7 @@ subroutine glissade_inversion_solve(model) ewn, nsn, & model%numerics%dew, & ! m model%numerics%dns, & ! m + parallel, & itest, jtest, rtest, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & @@ -944,6 +951,7 @@ subroutine glissade_inversion_solve(model) if (model%ocean_data%nbasin > 1) then call glissade_basin_average(& model%general%ewn, model%general%nsn, & + model%parallel, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & floating_mask * 1.0d0, & ! real mask @@ -1256,6 +1264,8 @@ subroutine invert_basal_friction(& if (f_ground(i,j) > 0.0d0) then ! ice is at least partly grounded ! Compute tendency terms based on the thickness target + !TODO: Try putting max(babc_thck_scale, stag_dthck_obs) in the denominator + ! Alex Robinson says this might improve convergence term_thck = -stag_dthck(i,j) / (babc_thck_scale*babc_timescale) term_dHdt = -stag_dthck_dt(i,j) * 2.0d0 / babc_thck_scale @@ -1325,8 +1335,8 @@ subroutine invert_basal_friction(& call point_diag(stag_dthck, 'stag_thck - stag_thck_obs', itest, jtest, rtest, 7, 7) call point_diag(stag_dthck_dt*scyr, 'stag_dthck_dt (m/yr)', itest, jtest, rtest, 7, 7) call point_diag(f_ground, 'f_ground', itest, jtest, rtest, 7, 7) - call point_diag(del2_logc, 'del2(logC)', itest, jtest, rtest, 7, 7, '(e12.3)') - call point_diag(logC, 'logC', itest, jtest, rtest, 7, 7) +!! 
call point_diag(del2_logc, 'del2(logC)', itest, jtest, rtest, 7, 7, '(e12.3)') +!! call point_diag(logC, 'logC', itest, jtest, rtest, 7, 7) call point_diag(dlogc, 'dlogC', itest, jtest, rtest, 7, 7, '(e12.3)') endif @@ -1338,6 +1348,7 @@ subroutine invert_basal_friction_basin(& dt, & nx, ny, & dx, dy, & + parallel, & itest, jtest, rtest, & nbasin, & basin_number, & @@ -1372,6 +1383,9 @@ subroutine invert_basal_friction_basin(& real(dp), intent(in) :: & dx, dy ! grid cell size in each direction (m) + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point @@ -1418,6 +1432,7 @@ subroutine invert_basal_friction_basin(& call get_basin_targets(& nx, ny, & dx, dy, & + parallel, & nbasin, basin_number, & itest, jtest, rtest, & stag_thck, stag_dthck_dt, & @@ -1430,6 +1445,7 @@ subroutine invert_basal_friction_basin(& call glissade_basin_average(& nx, ny, & + parallel, & nbasin, basin_number, & stag_rmask, & friction_c, friction_c_basin) @@ -1495,6 +1511,7 @@ subroutine invert_deltaT_ocn_basin(& dt, & nx, ny, & dx, dy, & + parallel, & itest, jtest, rtest, & nbasin, & basin_number, & @@ -1534,6 +1551,9 @@ subroutine invert_deltaT_ocn_basin(& real(dp), intent(in) :: & dx, dy ! grid cell size in each direction (m) + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, intent(in) :: & itest, jtest, rtest ! 
coordinates of diagnostic point @@ -1592,6 +1612,7 @@ subroutine invert_deltaT_ocn_basin(& call get_basin_targets(& nx, ny, & dx, dy, & + parallel, & nbasin, basin_number, & itest, jtest, rtest, & thck, dthck_dt, & @@ -1610,6 +1631,7 @@ subroutine invert_deltaT_ocn_basin(& call glissade_basin_average(& nx, ny, & + parallel, & nbasin, basin_number, & mask, & deltaT_ocn, deltaT_basin) @@ -2142,6 +2164,7 @@ end subroutine invert_flow_enhancement_factor subroutine get_basin_targets(& nx, ny, & dx, dy, & + parallel, & nbasin, basin_number, & itest, jtest, rtest, & thck, dthck_dt, & @@ -2167,6 +2190,9 @@ subroutine get_basin_targets(& real(dp), intent(in) :: & dx, dy ! grid cell size in each direction (m) + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, intent(in) :: & nbasin ! number of basins @@ -2221,6 +2247,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & cell_area, & @@ -2232,6 +2259,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & thck_target*dx*dy, & @@ -2241,6 +2269,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & thck*dx*dy, & @@ -2250,6 +2279,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & dthck_dt*dx*dy, & diff --git a/libglissade/glissade_mass_balance.F90 b/libglissade/glissade_mass_balance.F90 index 09c1b3ce..fb03dc79 100644 --- a/libglissade/glissade_mass_balance.F90 +++ b/libglissade/glissade_mass_balance.F90 @@ -43,7 +43,7 @@ module glissade_mass_balance use glimmer_utils, only: point_diag use glide_types use cism_parallel, only: this_rank, main_task, nhalo, lhalo, uhalo, & - parallel_halo, parallel_reduce_max, parallel_reduce_sum, parallel_globalindex + parallel_halo, parallel_reduce_max, 
parallel_global_sum, parallel_globalindex implicit none save @@ -68,8 +68,8 @@ subroutine glissade_mass_balance_init(model) ! Initialize some fields related to the surface mass balance - use glimmer_paramets, only: eps11 use glimmer_physcon, only: rhow, rhoi, scyr + use cism_parallel, only: parallel_is_zero ! input/output arguments @@ -77,9 +77,33 @@ subroutine glissade_mass_balance_init(model) ! local variables - real(dp) :: local_maxval, global_maxval character(len=100) :: message - + + ! Initialize artm for the case that we are reading in artm_ref or artm_3d. + ! For some temp_init options, this is needed for correct interior temperatures. + ! Note: Do not call if glaciers are enabled. When running with glaciers, artm is + ! accumulated over year 1 from a forcing file which hasn't been read in yet. + ! TODO: Think about how to initialize glacier temperatures. Currently assume artm = 0, + ! which isn't realistic. + if (model%options%artm_input_function /= ARTM_INPUT_FUNCTION_XY) then + if (.not.model%options%enable_glaciers) then + call downscale_artm(model) + endif + endif + + ! Initialize smb for the case that we are reading in smb_ref or smb_3d. + ! This is not strictly needed, since the SMB will be recomputed before it is used, + ! but can be a helpful diagnostic. + ! Note: Do not call if glaciers are enabled. When running with glaciers, smb is + ! accumulated over year 1 from a forcing file which hasn't been read in yet. + ! TODO - call downscale_smb for the PDD option? + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ .or. & + model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + if (.not.model%options%enable_glaciers) then + call downscale_smb(model) + endif + endif + ! Initialize acab, if SMB (with different units) was read in if (model%options%smb_input == SMB_INPUT_MMYR_WE) then ! Convert units from mm/yr w.e. to m/s ice @@ -87,6 +111,7 @@ subroutine glissade_mass_balance_init(model) endif ! Initialize artm_corrected. 
This is equal to artm, plus any prescribed temperature anomaly. + !TODO - Not sure this is needed model%climate%artm_corrected(:,:) = model%climate%artm(:,:) if (model%options%enable_artm_anomaly) then @@ -96,9 +121,7 @@ subroutine glissade_mass_balance_init(model) ! Note: The artm_anomaly field does not change during the run, ! but it is possible to ramp up the anomaly using artm_anomaly_timescale. - local_maxval = maxval(abs(model%climate%artm_anomaly)) - global_maxval = parallel_reduce_max(local_maxval) - if (global_maxval < eps11) then + if (parallel_is_zero(model%climate%artm_anomaly)) then model%climate%artm_anomaly = model%climate%artm_anomaly_const write(message,*) & 'Setting artm_anomaly = constant value (degC):', model%climate%artm_anomaly_const @@ -109,7 +132,6 @@ subroutine glissade_mass_balance_init(model) endif endif endif - !TODO - Write a short utility function to compute global_maxval of any field. !TODO - Repeat for snow and precip anomalies ! If acab is to be overwritten for some cells, then set overwrite_acab_mask = 1 for these cells. @@ -206,52 +228,9 @@ subroutine glissade_prepare_climate_forcing(model) ! Downscaling of artm_ref to artm (at the ice surface) happens below, followed by the SMB calculation. !------------------------------------------------------- - ! Downscale artm to the current surface elevation if needed. - ! The downscaling options are: - ! (0) artm(x,y); no dependence on surface elevation - ! (1) artm(x,y) + d(artm)/dz(x,y) * dz; artm depends on input field at reference elevation, plus vertical correction - ! (2) artm(x,y,z); artm obtained by linear interpolation between values prescribed at adjacent vertical levels - ! (3) artm(x,y) adjusted with a uniform lapse rate - ! For options (1) - (3), the elevation-dependent artm is computed here. - - if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then - - ! 
compute artm by a lapse-rate correction to the reference value - model%climate%artm(:,:) = model%climate%artm_ref(:,:) + & - (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%artm_gradz(:,:) - - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then - - if (parallel_is_zero(model%climate%artm_3d)) then - write(message,*) 'Error: artm_3d = 0 everywhere with artm_input_function =', model%options%artm_input_function - call write_log(trim(message), GM_FATAL) - endif - - ! Note: With linear_extrapolate_in = T, the values outside the range are obtained by linear extrapolation - ! from the top two or bottom two values. - ! For temperature, which varies roughly linearly with elevation, this is more accurate - ! than simply extending the top and bottom values. - ! This call includes a halo update. - - call glissade_vertical_interpolate(& - ewn, nsn, & - nzatm, model%climate%zatm, & - model%geometry%usrf, & - model%climate%artm_3d, & - model%climate%artm, & - linear_extrapolate_in = .true.) - - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then - - ! compute artm by a lapse-rate correction to artm_ref - ! T_lapse is defined as positive for T decreasing with height - - model%climate%artm(:,:) = model%climate%artm_ref(:,:) - & - (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%t_lapse - - endif ! artm_input_function - - call parallel_halo(model%climate%artm, parallel) + if (model%options%artm_input_function /= ARTM_INPUT_FUNCTION_XY) then + call downscale_artm(model) + endif ! Optionally, add an anomaly to the surface air temperature ! Typically, artm_corrected = artm, but sometimes (e.g., for ISMIP6 forcing experiments), @@ -348,155 +327,10 @@ subroutine glissade_prepare_climate_forcing(model) ! which is passed to the main mass balance driver. !------------------------------------------------------------------------- - ! 
------------------------------------------------------------------------ - ! Depending on the SMB input options, compute model%climate%acab at the ice surface. - ! The options are: - ! (0) SMB(x,y); no dependence on surface elevation - ! (1) SMB(x,y) + dSMB/dz(x,y) * dz; SMB depends on input field at reference elevation, plus vertical correction - ! (2) SMB(x,y,z); SMB obtained by linear interpolation between values prescribed at adjacent vertical levels - ! (3) SMB obtained from precip and artm using a positive-degree scheme - ! - ! Options (1) and (2) require input fields with SMB units of mm/yr w.e. (SMB_INPUT_MMYR_WE) - ! For these options, the elevation-dependent SMB is computed here. - ! ------------------------------------------------------------------------ - - if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then - - ! downscale SMB to the local surface elevation - model%climate%smb(:,:) = model%climate%smb_ref(:,:) + & - (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%smb_gradz(:,:) - - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then - - ! downscale SMB to the local surface elevation - ! Note: With linear_extrapolate_in = F, the values at top and bottom levels are simply extended upward and downward. - ! For SMB, this is safer than linear extrapolation (especially when extrapolating upward). - - if (parallel_is_zero(model%climate%smb_3d)) then - write(message,*) 'Error: smb_3d = 0 everywhere with smb_input_function =', model%options%smb_input_function - call write_log(trim(message), GM_FATAL) - endif - - call glissade_vertical_interpolate(& - ewn, nsn, & - nzatm, model%climate%zatm, & - model%geometry%usrf, & - model%climate%smb_3d, & - model%climate%smb, & - linear_extrapolate_in = .false.) - - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then - - ! Compute SMB using a simple PDD scheme: - ! (1) Partition precip as rain or snow based on the downscaled artm - ! 
(2) Compute ablation based on artm and a degree factor - ! Assume that artm has already been downscaled, if needed, based on artm_input_function. - - ! Note: This is similar to the SMB calculation for glaciers, but that calculation is done in the glacier module. - ! TODO: Put the glacier values of snow_threshold_min and snow_threshold_max in the climate derived type. - - ! compute snow accumulation (mm/yr w.e.) - where (model%climate%artm > model%climate%snow_threshold_max) - model%climate%snow = 0.0d0 ! all precip falls as rain - elsewhere (model%climate%artm < model%climate%snow_threshold_min) - model%climate%snow = model%climate%precip ! all precip falls as snow - elsewhere (model%climate%artm > model%climate%snow_threshold_min) - model%climate%snow = model%climate%precip * (model%climate%snow_threshold_max - model%climate%artm) & - / (model%climate%snow_threshold_max - model%climate%snow_threshold_min) - endwhere - - ! compute ablation (mm/yr w.e.) - ! Note: degree_factor has units of mm/yr w.e./degC to be consistent with other mass-balance variables. - ! It is like mu_star for glaciers. - model%climate%ablation = model%climate%degree_factor * max(model%climate%artm - model%climate%tmlt, 0.0d0) - - ! compute smb (mm/yr w.e.) - model%climate%smb = model%climate%snow - model%climate%ablation - - ! set smb = 0 for open ocean - where (model%geometry%thck == 0.0d0 .and. (model%geometry%topg - model%climate%eus) < 0.0d0) - model%climate%smb = 0.0d0 - endwhere - - endif ! smb_input_function - - ! For the non-default smb_input_function options, make sure that model%climate%smb is nonzero somewhere; else abort. - ! For the default option, do not abort, since idealized tests often have a zero SMB. - - call parallel_halo(model%climate%smb, parallel) - - if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ .or. & - model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ .or. 
& - model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then - if (parallel_is_zero(model%climate%smb)) then - write(message,*) 'Error: smb = 0 everywhere with smb_input_function =', model%options%smb_input_function - call write_log(trim(message), GM_FATAL) - endif + if (model%options%smb_input_function /= SMB_INPUT_FUNCTION_XY) then + call downscale_smb(model) endif - ! optional diagnostics - if (verbose_smb) then - - if (this_rank == rtest) then - write(iulog,*) 'Computing runtime smb with smb_input_function =', model%options%smb_input_function - endif - call point_diag(model%geometry%usrf, 'usrf (m)', itest, jtest, rtest, 7, 7) - - if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY) then - call point_diag(model%climate%smb, 'smb (mm/yr)', itest, jtest, rtest, 7, 7) - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then - call point_diag(model%geometry%usrf - model%climate%usrf_ref, 'usrf - usrf_ref (m)', & - itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb_ref, 'reference smb (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb_gradz, 'smb_gradz (mm/yr per m)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'smb_3d at each level:' - endif - do k = 1, nzatm - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'k =', k - endif - call point_diag(model%climate%smb_3d(k,:,:), 'smb_3d (mm/yr)', itest, jtest, rtest, 7, 7) - enddo - call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then - call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%precip, 'precip (mm/yr)', itest, jtest, rtest, 7, 7) - call 
point_diag(model%climate%snow, 'snow (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%ablation, 'ablation (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb,'smb (mm/yr)', itest, jtest, rtest, 7, 7) - endif ! smb_input_function - - if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY) then - call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then - call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%artm_gradz*1000.d0, 'artm_gradz (deg C per km)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'artm_3d at each level:' - endif - do k = 1, nzatm - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'k =', k - endif - call point_diag(model%climate%artm_3d(k,:,:), 'artm_3d (deg C)', itest, jtest, rtest, 7, 7) - enddo - call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then - call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) - endif ! artm_input_function - - endif ! verbose_smb - ! Compute a corrected smb field that includes any anomalies or correction factors. ! 
initialize @@ -614,13 +448,354 @@ subroutine glissade_prepare_climate_forcing(model) end subroutine glissade_prepare_climate_forcing +!======================================================================= + + subroutine downscale_artm(model) + + use glissade_grid_operators, only: glissade_vertical_interpolate + use cism_parallel, only: parallel_is_zero + + ! input/output arguments + + type(glide_global_type), intent(inout) :: model ! model instance + + ! local variables + + integer :: itest, jtest, rtest ! coordinates of diagnostic cell + integer :: i, j, k + integer :: ewn, nsn + integer :: nzatm ! number of atmosphere levels at which smb_3d and artm_3d are provided + + type(parallel_type) :: parallel ! info for parallel communication + + character(len=100) :: message + + ! initialize + + rtest = -999 + itest = 1 + jtest = 1 + if (this_rank == model%numerics%rdiag_local) then + rtest = model%numerics%rdiag_local + itest = model%numerics%idiag_local + jtest = model%numerics%jdiag_local + endif + + ewn = model%general%ewn + nsn = model%general%nsn + nzatm = model%climate%nzatm + parallel = model%parallel + + ! Downscale artm to the current surface elevation if needed. + ! The downscaling options are: + ! (0) artm(x,y); no dependence on surface elevation + ! (1) artm(x,y) + d(artm)/dz(x,y) * dz; artm depends on input field at reference elevation, plus vertical correction + ! (2) artm(x,y,z); artm obtained by linear interpolation between values prescribed at adjacent vertical levels + ! (3) artm(x,y) adjusted with a uniform lapse rate + ! For options (1) - (3), the elevation-dependent artm is computed here. + + ! Make sure the required input fields are present with nonzero values + + if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then + + if (parallel_is_zero(model%climate%artm_ref) .or. & + parallel_is_zero(model%climate%usrf_ref) .or. 
& + parallel_is_zero(model%climate%artm_gradz)) then + write(message,*) & + 'Error: Must have nonzero artm_ref, artm_gradz and usrf_ref with artm_input_function =', & + model%options%artm_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then + + if (parallel_is_zero(model%climate%artm_3d)) then + write(message,*) & + 'Error: Must have nonzero artm_3d with artm_input_function =', & + model%options%artm_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then + + if (parallel_is_zero(model%climate%artm_ref) .or. & + parallel_is_zero(model%climate%usrf_ref) .or. & + model%climate%t_lapse <= 0.0d0) then + write(message,*) & + 'Error: Must have t_lapse > 0 and nonzero artm_ref, usrf_ref with artm_input_function =', & + model%options%artm_input_function + call write_log(trim(message), GM_FATAL) + endif + + endif + + ! Do the downscaling + + if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then + + ! compute artm by a lapse-rate correction to the reference value + model%climate%artm(:,:) = model%climate%artm_ref(:,:) + & + (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%artm_gradz(:,:) + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then + + ! Note: With linear_extrapolate_in = T, the values outside the range are obtained + ! by linear extrapolation from the top two or bottom two values. + ! For temperature, which varies roughly linearly with elevation, this is more accurate + ! than simply extending the top and bottom values. + + call glissade_vertical_interpolate(& + ewn, nsn, & + nzatm, model%climate%zatm, & + model%geometry%usrf, & + model%climate%artm_3d, & + model%climate%artm, & + linear_extrapolate_in = .true.) + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then + + ! 
compute artm by a lapse-rate correction to artm_ref + ! T_lapse is defined as positive for T decreasing with height + + model%climate%artm(:,:) = model%climate%artm_ref(:,:) - & + (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%t_lapse + + endif ! artm_input_function + + call parallel_halo(model%climate%artm, parallel) + + ! optional diagnostics + + if (verbose_smb) then + if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY) then + call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then + call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%artm_gradz*1000.d0, 'artm_gradz (deg C per km)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'artm_3d at each level:' + endif + do k = 1, nzatm + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'k =', k + endif + call point_diag(model%climate%artm_3d(k,:,:), 'artm_3d (deg C)', itest, jtest, rtest, 7, 7) + enddo + call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then + call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) + endif ! artm_input_function + endif + + end subroutine downscale_artm + +!======================================================================= + + subroutine downscale_smb(model) + + use glissade_grid_operators, only: glissade_vertical_interpolate + use cism_parallel, only: parallel_is_zero + + ! 
input/output arguments + + type(glide_global_type), intent(inout) :: model ! model instance + + ! local variables + + integer :: itest, jtest, rtest ! coordinates of diagnostic cell + integer :: i, j, k + integer :: ewn, nsn + integer :: nzatm ! number of atmosphere levels at which smb_3d and artm_3d are provided + + type(parallel_type) :: parallel ! info for parallel communication + + character(len=100) :: message + + ! initialize + + rtest = -999 + itest = 1 + jtest = 1 + if (this_rank == model%numerics%rdiag_local) then + rtest = model%numerics%rdiag_local + itest = model%numerics%idiag_local + jtest = model%numerics%jdiag_local + endif + + ewn = model%general%ewn + nsn = model%general%nsn + nzatm = model%climate%nzatm + parallel = model%parallel + + ! ------------------------------------------------------------------------ + ! Depending on the SMB input options, compute model%climate%acab at the ice surface. + ! The options are: + ! (0) SMB(x,y); no dependence on surface elevation + ! (1) SMB(x,y) + dSMB/dz(x,y) * dz; SMB depends on input field at reference elevation, plus vertical correction + ! (2) SMB(x,y,z); SMB obtained by linear interpolation between values prescribed at adjacent vertical levels + ! (3) SMB obtained from precip and artm using a positive-degree scheme + ! + ! Options (1) and (2) require input fields with SMB units of mm/yr w.e. (SMB_INPUT_MMYR_WE) + ! For these options, the elevation-dependent SMB is computed here. + ! ------------------------------------------------------------------------ + + ! Make sure the required input fields are present with nonzero values + + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then + + if (parallel_is_zero(model%climate%smb_ref) .or. & + parallel_is_zero(model%climate%usrf_ref) .or. 
& + parallel_is_zero(model%climate%smb_gradz)) then + write(message,*) & + 'Error: Must have nonzero smb_ref, smb_gradz and usrf_ref with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + + if (parallel_is_zero(model%climate%smb_3d)) then + write(message,*) & + 'Error: Must have nonzero smb_3d with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then + + if (parallel_is_zero(model%climate%artm) .or. & + parallel_is_zero(model%climate%precip)) then + write(message,*) & + 'Error: Must have nonzero artm and precip with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + + endif + + ! Do the downscaling + + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then + + ! downscale SMB to the local surface elevation + model%climate%smb(:,:) = model%climate%smb_ref(:,:) + & + (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%smb_gradz(:,:) + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + + ! downscale SMB to the local surface elevation + ! Note: With linear_extrapolate_in = F, the values at top and bottom levels are simply extended upward and downward. + ! For SMB, this is safer than linear extrapolation (especially when extrapolating upward). + + call glissade_vertical_interpolate(& + ewn, nsn, & + nzatm, model%climate%zatm, & + model%geometry%usrf, & + model%climate%smb_3d, & + model%climate%smb, & + linear_extrapolate_in = .false.) + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then + + ! Compute SMB using a simple PDD scheme: + ! (1) Partition precip as rain or snow based on the downscaled artm + ! 
(2) Compute ablation based on artm and a degree factor + ! Assume that artm has already been downscaled, if needed, based on artm_input_function. + + ! Note: This is similar to the SMB calculation for glaciers, but that calculation is done in the glacier module. + ! TODO: Put the glacier values of snow_threshold_min and snow_threshold_max in the climate derived type. + + ! compute snow accumulation (mm/yr w.e.) + where (model%climate%artm > model%climate%snow_threshold_max) + model%climate%snow = 0.0d0 ! all precip falls as rain + elsewhere (model%climate%artm < model%climate%snow_threshold_min) + model%climate%snow = model%climate%precip ! all precip falls as snow + elsewhere (model%climate%artm > model%climate%snow_threshold_min) + model%climate%snow = model%climate%precip * (model%climate%snow_threshold_max - model%climate%artm) & + / (model%climate%snow_threshold_max - model%climate%snow_threshold_min) + endwhere + + ! compute ablation (mm/yr w.e.) + ! Note: degree_factor has units of mm/yr w.e./degC to be consistent with other mass-balance variables. + ! It is like mu_star for glaciers. + model%climate%ablation = model%climate%degree_factor * max(model%climate%artm - model%climate%tmlt, 0.0d0) + + ! compute smb (mm/yr w.e.) + model%climate%smb = model%climate%snow - model%climate%ablation + + ! set smb = 0 for open ocean + where (model%geometry%thck == 0.0d0 .and. (model%geometry%topg - model%climate%eus) < 0.0d0) + model%climate%smb = 0.0d0 + endwhere + + endif ! smb_input_function + + call parallel_halo(model%climate%smb, parallel) + + ! For the non-default smb_input_function options, make sure the SMB is nonzero somewhere. + ! For the default option, do not abort, since idealized tests often have a zero SMB. 
+ + if (model%options%smb_input_function /= SMB_INPUT_FUNCTION_XY) then + if (parallel_is_zero(model%climate%smb)) then + write(message,*) 'Error: smb = 0 everywhere with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + endif + + ! optional diagnostics + + if (verbose_smb) then + + if (this_rank == rtest) then + write(iulog,*) 'Computing runtime smb with smb_input_function =', model%options%smb_input_function + endif + call point_diag(model%geometry%usrf, 'usrf (m)', itest, jtest, rtest, 7, 7) + + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY) then + call point_diag(model%climate%smb, 'smb (mm/yr)', itest, jtest, rtest, 7, 7) + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then + call point_diag(model%geometry%usrf - model%climate%usrf_ref, 'usrf - usrf_ref (m)', & + itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb_ref, 'reference smb (mm/yr)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb_gradz, 'smb_gradz (mm/yr per m)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'smb_3d at each level:' + endif + do k = 1, nzatm + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'k =', k + endif + call point_diag(model%climate%smb_3d(k,:,:), 'smb_3d (mm/yr)', itest, jtest, rtest, 7, 7) + enddo + call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then + call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%precip, 'precip (mm/yr)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%snow, 'snow (mm/yr)', itest, jtest, rtest, 7, 7) + call 
point_diag(model%climate%ablation, 'ablation (mm/yr)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb,'smb (mm/yr)', itest, jtest, rtest, 7, 7) + endif ! smb_input_function + + endif ! verbose_smb + + end subroutine downscale_smb + !======================================================================= subroutine glissade_apply_smb(model) ! Apply the SMB at the upper and lower surfaces, and recompute tracer values. - use glimmer_paramets, only: eps11 use glimmer_physcon, only: rhow, rhoi, scyr use glissade_masks, only: glissade_get_masks use glissade_calving, only: verbose_calving @@ -928,6 +1103,7 @@ subroutine mass_balance_driver(& call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_init, & tracers(:,:,:,:), mtsum_init(:)) @@ -1006,9 +1182,9 @@ subroutine mass_balance_driver(& enddo enddo - sum_acab = parallel_reduce_sum(sum_acab) - sum_bmlt = parallel_reduce_sum(sum_bmlt) - sum_melt_potential = parallel_reduce_sum(sum_melt_potential) + sum_acab = parallel_global_sum(acab*effective_areafrac, parallel) + sum_bmlt = parallel_global_sum(bmlt*effective_areafrac, parallel) + sum_melt_potential = parallel_global_sum(melt_potential, parallel) msum_init = msum_init + (sum_acab - sum_bmlt)*dt @@ -1017,6 +1193,7 @@ subroutine mass_balance_driver(& call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_final, & tracers(:,:,:,:), mtsum_final(:)) diff --git a/libglissade/glissade_transport.F90 b/libglissade/glissade_transport.F90 index b42d6866..9ba618f4 100644 --- a/libglissade/glissade_transport.F90 +++ b/libglissade/glissade_transport.F90 @@ -45,7 +45,7 @@ module glissade_transport use glimmer_log use glissade_remap, only: glissade_horizontal_remap, make_remap_mask, puny use cism_parallel, only: this_rank, main_task, nhalo, lhalo, uhalo, staggered_lhalo, staggered_uhalo, & - parallel_type, parallel_reduce_max, parallel_reduce_sum, parallel_reduce_minloc, & + 
parallel_type, parallel_global_sum, parallel_reduce_max, parallel_reduce_minloc, & parallel_globalindex, broadcast implicit none @@ -445,6 +445,7 @@ subroutine glissade_transport_driver(dt, & call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_init, & tracers(:,:,:,:), mtsum_init(:)) endif @@ -603,6 +604,7 @@ subroutine glissade_transport_driver(dt, & call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_final, & tracers(:,:,:,:), mtsum_final(:)) @@ -661,6 +663,7 @@ subroutine glissade_transport_driver(dt, & call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_final, & tracers(:,:,:,:), mtsum_final(:)) @@ -950,6 +953,7 @@ end subroutine glissade_check_cfl subroutine glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracer, & + parallel, & thck_layer, msum, & tracer, mtsum) @@ -963,6 +967,9 @@ subroutine glissade_sum_mass_and_tracers(& nlyr, &! number of vertical layers ntracer ! number of tracers + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + real(dp), dimension (nx,ny,nlyr), intent(in) :: & thck_layer ! ice layer thickness @@ -977,30 +984,21 @@ subroutine glissade_sum_mass_and_tracers(& ! Local arguments - integer :: i, j, nt - - msum = 0.d0 - if (present(mtsum)) mtsum(:) = 0.d0 + integer :: nt, k - do j = 1+nhalo, ny-nhalo - do i = 1+nhalo, nx-nhalo - - ! accumulate ice mass and mass*tracers - ! (actually, accumulate thickness, assuming rhoi*dx*dy is the same for each cell) - - msum = msum + sum(thck_layer(i,j,:)) - - if (present(mtsum)) then - do nt = 1, ntracer - mtsum(nt) = mtsum(nt) + sum(tracer(i,j,nt,:)*thck_layer(i,j,:)) - enddo - endif - - enddo ! i - enddo ! 
j + msum = 0.0d0 + do k = 1, nlyr + msum = msum + parallel_global_sum(thck_layer(:,:,k), parallel) + enddo - msum = parallel_reduce_sum(msum) - if (present(mtsum)) mtsum = parallel_reduce_sum(mtsum) + if (present(mtsum)) then + mtsum(:) = 0.0d0 + do k = 1, nlyr + do nt = 1, ntracer + mtsum(nt) = mtsum(nt) + parallel_global_sum(tracer(:,:,nt,k)*thck_layer(:,:,k), parallel) + enddo + enddo + endif end subroutine glissade_sum_mass_and_tracers diff --git a/libglissade/glissade_utils.F90 b/libglissade/glissade_utils.F90 index e7e3431f..4f1de07f 100644 --- a/libglissade/glissade_utils.F90 +++ b/libglissade/glissade_utils.F90 @@ -44,7 +44,12 @@ module glissade_utils glissade_basin_sum, glissade_basin_average, & glissade_usrf_to_thck, glissade_thck_to_usrf, & glissade_edge_fluxes, glissade_input_fluxes, & - glissade_rms_error + glissade_rms_error, write_array_to_file + + interface write_array_to_file + module procedure write_array_to_file_real8_2d + module procedure write_array_to_file_real8_3d + end interface contains @@ -516,8 +521,10 @@ end subroutine glissade_adjust_topography !**************************************************** + !TODO - Calls to this subroutine could be replaced by inline calls to parallel_global_sum_patch subroutine glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & rmask, & field_2d, & @@ -527,11 +534,14 @@ subroutine glissade_basin_sum(& ! The sum is taken over grid cells with mask = 1. ! All cells are weighted equally. - use cism_parallel, only: parallel_reduce_sum, nhalo + use cism_parallel, only: parallel_global_sum_patch integer, intent(in) :: & nx, ny !> number of grid cells in each dimension + type(parallel_type), intent(in) :: & + parallel !> info for parallel communication + integer, intent(in) :: & nbasin !> number of basins @@ -546,29 +556,10 @@ subroutine glissade_basin_sum(& real(dp), dimension(nbasin), intent(out) :: & field_basin_sum !> basin-sum output field - ! 
local variables - - integer :: i, j, nb - !TODO - Replace sumcell with sumarea, and pass in cell area. ! Current algorithm assumes all cells with mask = 1 have equal weight. - real(dp), dimension(nbasin) :: & - sumfield_local ! sum of field on local task - - sumfield_local(:) = 0.0d0 - - ! loop over locally owned cells - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - nb = basin_number(i,j) - if (nb >= 1) then - sumfield_local(nb) = sumfield_local(nb) + rmask(i,j)*field_2d(i,j) - endif - enddo - enddo - - field_basin_sum(:) = parallel_reduce_sum(sumfield_local(:)) + field_basin_sum = parallel_global_sum_patch(rmask*field_2d, nbasin, basin_number, parallel) end subroutine glissade_basin_sum @@ -576,6 +567,7 @@ end subroutine glissade_basin_sum subroutine glissade_basin_average(& nx, ny, & + parallel, & nbasin, basin_number, & rmask, & field_2d, & @@ -586,11 +578,14 @@ subroutine glissade_basin_average(& ! All cells are weighted equally. ! Note: This subroutine assumes an input field located at cell centers - use cism_parallel, only: parallel_reduce_sum, nhalo + use cism_parallel, only: parallel_global_sum_patch integer, intent(in) :: & nx, ny !> number of grid cells in each dimension + type(parallel_type), intent(in) :: & + parallel !> info for parallel communication + integer, intent(in) :: & nbasin !> number of basins @@ -607,33 +602,17 @@ subroutine glissade_basin_average(& ! local variables - integer :: i, j, nb + integer :: nb !TODO - Replace sumcell with sumarea, and pass in cell area. ! Current algorithm assumes all cells with mask = 1 have equal weight. real(dp), dimension(nbasin) :: & - summask_local, & ! sum of mask in each basin on local task summask_global, & ! sum of mask in each basin on full domain - sumfield_local, & ! sum of field on local task sumfield_global ! sum of field over full domain - summask_local(:) = 0.0d0 - sumfield_local(:) = 0.0d0 - - ! 
loop over locally owned cells only - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - nb = basin_number(i,j) - if (nb >= 1) then - summask_local(nb) = summask_local(nb) + rmask(i,j) - sumfield_local(nb) = sumfield_local(nb) + rmask(i,j)*field_2d(i,j) - endif - enddo - enddo - - summask_global(:) = parallel_reduce_sum(summask_local(:)) - sumfield_global(:) = parallel_reduce_sum(sumfield_local(:)) + summask_global = parallel_global_sum_patch(rmask, nbasin, basin_number, parallel) + sumfield_global = parallel_global_sum_patch(rmask*field_2d, nbasin, basin_number, parallel) do nb = 1, nbasin if (summask_global(nb) > tiny(0.0d0)) then @@ -708,7 +687,7 @@ subroutine glissade_usrf_to_thck(usrf, topg, eus, thck) ! That is, if topg - eus < 0 (marine-based ice), and if the upper surface is too close ! to sea level to ground the ice, then the ice thickness is chosen to satisfy ! rhoi*H = -rhoo*(topg-eus). - ! Note: usrf, topg, eus and thck must all have the same units (often but not necessarily meters). + ! Note: usrf, topg, eus and thck must all have the same units (usually but not necessarily meters). use glimmer_physcon, only : rhoo, rhoi @@ -971,10 +950,154 @@ subroutine glissade_input_fluxes(& end subroutine glissade_input_fluxes + + ! subroutines belonging to the write_array_to_file interface + subroutine write_array_to_file_real8_2d(arr, fileunit, filename, parallel, write_binary) + + ! Copy the input array into a global array and write all values to an output file. + ! This can be useful for debugging, if we want to find differences between two fields + ! (e.g., in two different runs). + ! This version writes out 64-bit character strings corresponding to the binary representation + ! of each floating-point variable. This can be useful for BFB comparisons. + ! Sometimes, two floating-point variables appear to have the same values in base 10, + ! when the last few bits actually vary. 
+ !TODO - Allow either float or binary output + + use glimmer_utils, only: double_to_binary + use cism_parallel, only: gather_var + + real(dp), dimension(:,:), intent(in) :: arr + integer, intent(in) :: fileunit + character(len=*), intent(in) :: filename + type(parallel_type), intent(in) :: parallel + logical, intent(in), optional :: write_binary + + integer :: i, j + character(len=64) :: binary_str + real(dp), dimension(:,:), allocatable :: arr_global + logical :: binary_output + + if (present(write_binary)) then + binary_output = write_binary + else + binary_output = .false. + endif + + call gather_var(arr, arr_global, parallel) + if (main_task) then + open(unit=fileunit, file=trim(filename), status='replace', position='append') + + if (binary_output) then + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + call double_to_binary(arr_global(i,j), binary_str) + write (fileunit, '(2i6,a4,a64)') i, j, ' ', binary_str + enddo + enddo + else + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + write (fileunit, '(2i6,a4,f24.16)') i, j, ' ', arr_global(i,j) + enddo + enddo + endif + + close(unit=fileunit) + deallocate(arr_global) + endif + + end subroutine write_array_to_file_real8_2d + + + subroutine write_array_to_file_real8_3d(arr, fileunit, filename, parallel, write_binary, cycle_indices) + + ! Copy the input array into a global array and write all values to an output file. + ! This can be useful for debugging, if we want to find differences between two fields + ! (e.g., in two different runs). + ! This version writes out 64-bit character strings corresponding to the binary representation + ! of each floating-point variable. This can be useful for BFB comparisons. + ! Sometimes, two floating-point variables appear to have the same values in base 10, + ! when the last few bits actually vary. 
+ !TODO - Allow either float or binary output + + use glimmer_utils, only: double_to_binary + use cism_parallel, only: gather_var + + real(dp), dimension(:,:,:), intent(in) :: arr ! first two indices are i and j + integer, intent(in) :: fileunit + character(len=*), intent(in) :: filename + type(parallel_type), intent(in) :: parallel + logical, intent(in), optional :: write_binary + logical, intent(in), optional :: cycle_indices ! if true, then index 3->1, 1->2, 2->3 + + integer :: i, j, k, kmax + character(len=64) :: binary_str + real(dp), dimension(:,:,:), allocatable :: arr_global + real(dp), dimension(:,:,:), allocatable :: arr_cycle + logical :: binary_output + logical :: cycle_ind + + if (present(write_binary)) then + binary_output = write_binary + else + binary_output = .false. + endif + + if (present(cycle_indices)) then + cycle_ind = cycle_indices + else + cycle_ind = .false. + endif + + if (cycle_ind) then + allocate(arr_cycle(size(arr,3), size(arr,1), size(arr,2))) + kmax = size(arr,3) + do j = 1, size(arr,2) + do i = 1, size(arr,1) + do k = 1, kmax + arr_cycle(k,i,j) = arr(i,j,k) + enddo + enddo + enddo + call gather_var(arr_cycle, arr_global, parallel) + deallocate(arr_cycle) + else + kmax = size(arr,1) + call gather_var(arr, arr_global, parallel) + endif + + if (main_task) then + open(unit=fileunit, file=trim(filename), status='unknown') + + if (binary_output) then + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + do k = 1, kmax + call double_to_binary(arr_global(k,i,j), binary_str) + write (fileunit, '(3i6,a4,a64)') i, j, k, ' ', binary_str + enddo + enddo + enddo + else + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + do k = 1, kmax + write (fileunit, '(3i6,a4,f24.16)') i, j, k, ' ', arr_global(k,i,j) + enddo + enddo + enddo + endif + + close(unit=fileunit) + deallocate(arr_global) + endif + + end subroutine write_array_to_file_real8_3d + !**************************************************************************** 
!TODO - Other utility subroutines to add here? -! E.g., tridiag; calclsrf; subroutines to zero out tracers +! E.g., calclsrf; subroutines to zero out tracers !**************************************************************************** diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 3b146e44..1ea5b2eb 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -56,11 +56,11 @@ module glissade_velo_higher - use glimmer_global, only: dp + use glimmer_global, only: dp, i8 use glimmer_physcon, only: n_glen, rhoi, rhoo, grav, scyr, pi - use glimmer_paramets, only: iulog, eps08, eps10, eps11 + use glimmer_paramets, only: iulog, eps11, eps10 use glimmer_paramets, only: velo_scale, len_scale ! used for whichefvs = HO_EFVS_FLOWFACT - use glimmer_utils, only: point_diag + use glimmer_utils, only: point_diag, double_to_binary use glimmer_log use glimmer_sparse_type use glimmer_sparse @@ -92,7 +92,9 @@ module glissade_velo_higher use cism_parallel, only: this_rank, main_task, nhalo, tasks, & parallel_type, parallel_halo, staggered_parallel_halo, parallel_globalindex, & parallel_reduce_max, parallel_reduce_sum, not_parallel - + !WHL - debug + use cism_parallel, only: parallel_global_sum_stagger + use cism_reprosum_mod, only: verbose_reprosum implicit none private @@ -225,8 +227,8 @@ module glissade_velo_higher ! logical :: verbose_bfric = .true. logical :: verbose_trilinos = .false. ! logical :: verbose_trilinos = .true. - logical :: verbose_beta = .false. -! logical :: verbose_beta = .true. +! logical :: verbose_beta = .false. + logical :: verbose_beta = .true. logical :: verbose_efvs = .false. ! logical :: verbose_efvs = .true. logical :: verbose_tau = .false. @@ -239,6 +241,8 @@ module glissade_velo_higher ! logical :: verbose_L1L2 = .true. logical :: verbose_diva = .false. ! logical :: verbose_diva = .true. + logical :: verbose_bp = .false. +! logical :: verbose_bp = .true. 
logical :: verbose_glp = .false. ! logical :: verbose_glp = .true. logical :: verbose_picard = .false. @@ -259,6 +263,10 @@ module glissade_velo_higher dphi_dyr_3d_vav, &! vertical avg of dphi_dyr_3d dphi_dzr_3d_vav ! vertical avg of dphi_dzr_3d + !WHL - debug for reprosum + character(len=64) :: binary_str ! string representation of binary number (chain of 0's and 1's) + character(len=64) :: binary_str1, binary_str2 + contains !**************************************************************************** @@ -669,6 +677,9 @@ subroutine glissade_velo_higher_solve(model, & use glide_thck, only: glide_calclsrf use profile, only: t_startf, t_stopf + !WHL - debug + use glissade_utils, only: write_array_to_file + !---------------------------------------------------------------- ! Input-output arguments !---------------------------------------------------------------- @@ -1047,11 +1058,13 @@ subroutine glissade_velo_higher_solve(model, & uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration duvel_2d, dvvel_2d, & ! difference between current and previous velocity solutions uvel_2d_sav, vvel_2d_sav, & ! current best value for velocity solution (smallest residual) - beta_internal_sav ! beta_internal associated with saved velocity + beta_internal_sav, & ! beta_internal associated with saved velocity + beta_eff_x_sav, beta_eff_y_sav real(dp), dimension(:,:,:), allocatable :: & Auu_2d_sav, Auv_2d_sav, & ! assembled matrices associated with (uvel_2d_sav, vvel_2d_sav) - Avu_2d_sav, Avv_2d_sav + Avu_2d_sav, Avv_2d_sav, & + omega_k_sav ! for an accelerated 3D solve: real(dp), dimension(:,:,:), allocatable :: & @@ -1066,6 +1079,15 @@ subroutine glissade_velo_higher_solve(model, & integer :: itest, jtest ! coordinates of diagnostic point integer :: rtest ! 
task number for processor containing diagnostic point + !WHL - debug + real(dp), dimension(nNodeNeighbors_2d) :: & + sum_Auu, sum_Auv, sum_Avu, sum_Avv + real(dp) :: sum_uvel, sum_vvel + real(dp) :: sum_bu, sum_bv + real(dp) :: sum_flwa, sum_flwafact, sum_btrx, sum_btry, sum_stagusrf, sum_stagthck + real(dp) :: sum_betax, sum_betay, sum_omega, sum_stag_omega + real(dp), dimension(:,:), allocatable :: arr_global ! temporary global array + call t_startf('glissade_vhs_init') rtest = -999 itest = 1 @@ -1268,7 +1290,15 @@ subroutine glissade_velo_higher_solve(model, & allocate(vsav_2d(nx-1,ny-1)) allocate(resid_u_2d(nx-1,ny-1)) allocate(resid_v_2d(nx-1,ny-1)) - if (accel_picard) then + else ! 3d + allocate(Auu(nNodeNeighbors_3d,nz,nx-1,ny-1)) + allocate(Auv(nNodeNeighbors_3d,nz,nx-1,ny-1)) + allocate(Avu(nNodeNeighbors_3d,nz,nx-1,ny-1)) + allocate(Avv(nNodeNeighbors_3d,nz,nx-1,ny-1)) + endif + + if (accel_picard) then + if (solve_2d) then allocate(uvel_2d_old(nx-1,ny-1)) allocate(vvel_2d_old(nx-1,ny-1)) allocate(duvel_2d(nx-1,ny-1)) @@ -1279,14 +1309,7 @@ subroutine glissade_velo_higher_solve(model, & allocate(Auv_2d_sav(nx-1,ny-1,nNodeNeighbors_2d)) allocate(Avu_2d_sav(nx-1,ny-1,nNodeNeighbors_2d)) allocate(Avv_2d_sav(nx-1,ny-1,nNodeNeighbors_2d)) - allocate(beta_internal_sav(nx-1,ny-1)) - endif - else - allocate(Auu(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(Auv(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(Avu(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(Avv(nNodeNeighbors_3d,nz,nx-1,ny-1)) - if (accel_picard) then + else ! 
3d allocate(uvel_old(nz,nx-1,ny-1)) allocate(vvel_old(nz,nx-1,ny-1)) allocate(duvel(nz,nx-1,ny-1)) @@ -1297,9 +1320,13 @@ subroutine glissade_velo_higher_solve(model, & allocate(Auv_sav(nNodeNeighbors_3d,nz,nx-1,ny-1)) allocate(Avu_sav(nNodeNeighbors_3d,nz,nx-1,ny-1)) allocate(Avv_sav(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(beta_internal_sav(nx-1,ny-1)) endif - endif + allocate(beta_internal_sav(nx-1,ny-1)) + !Note: The next three are used only for DIVA, but it's simpler to allocate them regardless + allocate(beta_eff_x_sav(nx-1,ny-1)) + allocate(beta_eff_y_sav(nx-1,ny-1)) + allocate(omega_k_sav(nz,nx,ny)) + endif ! accel_picard if (whichapprox == HO_APPROX_DIVA) then !! call parallel_halo(efvs, parallel) ! efvs halo update is in glissade_diagnostic_variable_solve @@ -1585,19 +1612,21 @@ subroutine glissade_velo_higher_solve(model, & ! unique local ID to each such node. !------------------------------------------------------------------------------ -!pw call t_startf('glissade_get_vertex_geom') - call get_vertex_geometry(nx, ny, & - nz, nhalo, & - parallel, & - dx, dy, & - itest, jtest, rtest, & - ice_mask, & - xVertex, yVertex, & - active_cell, active_vertex, & - nNodesSolve, nVerticesSolve, & - nodeID, vertexID, & - iNodeIndex, jNodeIndex, kNodeIndex, & - iVertexIndex, jVertexIndex) + !pw call t_startf('glissade_get_vertex_geom') + call get_vertex_geometry(& + nx, ny, & + nz, nhalo, & + parallel, & + model%general%x0, model%general%y0, & + dx, dy, & + itest, jtest, rtest, & + ice_mask, & + xVertex, yVertex, & + active_cell, active_vertex, & + nNodesSolve, nVerticesSolve, & + nodeID, vertexID, & + iNodeIndex, jNodeIndex, kNodeIndex, & + iVertexIndex, jVertexIndex) !pw call t_stopf('glissade_get_vertex_geom') ! 
Zero out the velocity for inactive vertices @@ -2181,8 +2210,7 @@ subroutine glissade_velo_higher_solve(model, & itest, jtest, rtest, & active_vertex, diva_level_index, & ice_plus_land_mask, & - stag_omega, omega_k, & - beta_internal, & + omega_k, beta_internal, & beta_eff_x, beta_eff_y, & stag_theta_slope_x, stag_theta_slope_y, & stag_diva_slope_factor_x, & @@ -2263,7 +2291,6 @@ subroutine glissade_velo_higher_solve(model, & vbas(:,:) = vvel(nz,:,:) endif -!! if (verbose_beta) then if (verbose_beta .and. counter==1) then if (this_rank == rtest) write(iulog,*) 'Before calcbeta, counter =', counter call point_diag(usrf, 'usrf (m)', itest, jtest, rtest, 7, 7) @@ -2327,6 +2354,36 @@ subroutine glissade_velo_higher_solve(model, & usav_2d(:,:) = uvel_2d(:,:) vsav_2d(:,:) = vvel_2d(:,:) + !WHL - debug - BFB check + sum_uvel = parallel_global_sum_stagger(uvel_2d, parallel) + sum_vvel = parallel_global_sum_stagger(vvel_2d, parallel) + sum_flwa = parallel_global_sum_stagger(flwa, parallel) + sum_flwafact = parallel_global_sum_stagger(flwafact, parallel) + sum_btrx = parallel_global_sum_stagger(btractx, parallel) + sum_btry = parallel_global_sum_stagger(btracty, parallel) + sum_stagusrf = parallel_global_sum_stagger(stagusrf, parallel) + sum_stagthck = parallel_global_sum_stagger(stagthck, parallel) +!! 
if (this_rank == rtest) then + if (0 == 1) then + write(iulog,*) ' ' + call double_to_binary(sum_uvel, binary_str) + write(iulog,*) 'Before assembly: sum_uvel, binary_str:', sum_uvel, binary_str + call double_to_binary(sum_vvel, binary_str) + write(iulog,*) 'Before assembly: sum_vvel, binary_str:', sum_vvel, binary_str + call double_to_binary(sum_flwa, binary_str) + write(iulog,*) 'Before assembly: sum_flwa, binary_str:', sum_flwa, binary_str + call double_to_binary(sum_flwafact, binary_str) + write(iulog,*) 'Before assembly: sum_flwafact, binary_str:', sum_flwafact, binary_str + call double_to_binary(sum_btrx, binary_str) + write(iulog,*) 'Before assembly: sum_btrx, binary_str:', sum_btrx, binary_str + call double_to_binary(sum_btry, binary_str) + write(iulog,*) 'Before assembly: sum_btry, binary_str:', sum_btry, binary_str + call double_to_binary(sum_stagusrf, binary_str) + write(iulog,*) 'Before assembly: sum_stagusrf, binary_str:', sum_stagusrf, binary_str + call double_to_binary(sum_stagthck, binary_str) + write(iulog,*) 'Before assembly: sum_stagthck, binary_str:', sum_stagthck, binary_str + endif + ! Assemble the matrix call assemble_stiffness_matrix_2d(nx, ny, & @@ -2351,6 +2408,45 @@ subroutine glissade_velo_higher_solve(model, & omega_k, omega, & efvs_qp_3d) + !WHL - debug - BFB check + if (verbose_reprosum .and. counter == 1) then +!! if (main_task) write(iulog,*) 'Write out matrices after assemble_stiffness_matrix' +!! call write_array_to_file(Auu_2d, 21, 'global_Auu1', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Auv_2d, 22, 'global_Auv1', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avu_2d, 23, 'global_Avu1', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avv_2d, 24, 'global_Avv1', parallel, write_binary = .true., cycle_indices = .true.) 
+ sum_Auu(:) = parallel_global_sum_stagger(Auu_2d, nNodeNeighbors_2d, parallel) + sum_Auv(:) = parallel_global_sum_stagger(Auv_2d, nNodeNeighbors_2d, parallel) + sum_Avu(:) = parallel_global_sum_stagger(Avu_2d, nNodeNeighbors_2d, parallel) + sum_Avv(:) = parallel_global_sum_stagger(Avv_2d, nNodeNeighbors_2d, parallel) + if (main_task) then + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auu(n), binary_str) + write(iulog,*) n, sum_Auu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auv(n), binary_str) + write(iulog,*) n, sum_Auv(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avu(n), binary_str) + write(iulog,*) n, sum_Avu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avv(n), binary_str) + write(iulog,*) n, sum_Avv(n), binary_str + enddo + endif + endif ! verbose_reprosum + if (whichapprox == HO_APPROX_DIVA) then ! Halo update for omega @@ -2409,8 +2505,8 @@ subroutine glissade_velo_higher_solve(model, & beta_eff_x(:,:) = 0.d0 beta_eff_y(:,:) = 0.d0 - !Note: The 'if' is not strictly needed, since the corrected beta_eff is equal - ! to the uncorrected beta_eff whe slope_factor = 1.0 and theta_slope = 0.0 + !Note: The 'if diva_slope_correction' is not strictly needed, since the more complicated + ! equations reduce to the simpler ones when slope_factor = 1.0 and theta_slope = 0.0. if (diva_slope_correction) then ! compute a larger beta_eff at each vertex based on the slope if (whichbabc == HO_BABC_NO_SLIP) then @@ -2433,8 +2529,8 @@ subroutine glissade_velo_higher_solve(model, & beta_eff_y = 1.d0 / stag_omega endwhere else ! 
slip allowed at bed - beta_eff_x = beta_internal(:,:) / (1.d0 + beta_internal*stag_omega) - beta_eff_y = beta_internal(:,:) / (1.d0 + beta_internal*stag_omega) + beta_eff_x = beta_internal / (1.d0 + beta_internal*stag_omega) + beta_eff_y = beta_internal / (1.d0 + beta_internal*stag_omega) endif endif @@ -2566,6 +2662,18 @@ subroutine glissade_velo_higher_solve(model, & call staggered_parallel_halo(bv_2d(:,:), parallel) call t_stopf('glissade_halo_bxxs') + !WHL - debug - Write all the matrix elements and rhs elements (in binary form) to files + if (verbose_reprosum .and. counter == 1) then +!! if (main_task) write(iulog,*) 'Write out matrices after adding BC' +!! call write_array_to_file(Auu_2d(:,:,5), 30, 'global_Auu2', parallel) ! diagonal terms only +!! call write_array_to_file(Auu_2d, 31, 'global_Auu2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Auv_2d, 32, 'global_Auv2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avu_2d, 33, 'global_Avu2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avv_2d, 34, 'global_Avv2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(bu_2d, 35, 'global_bu2', parallel, write_binary = .true.) +!! call write_array_to_file(bv_2d, 36, 'global_bv2', parallel, write_binary = .true.) + endif + !--------------------------------------------------------------------------- ! Check symmetry of assembled matrix ! 
@@ -2595,6 +2703,41 @@ subroutine glissade_velo_higher_solve(model, & active_vertex, & nNonzeros) + if (verbose_reprosum) then + sum_Auu(:) = parallel_global_sum_stagger(Auu_2d, nNodeNeighbors_2d, parallel) + sum_Auv(:) = parallel_global_sum_stagger(Auv_2d, nNodeNeighbors_2d, parallel) + sum_Avu(:) = parallel_global_sum_stagger(Avu_2d, nNodeNeighbors_2d, parallel) + sum_Avv(:) = parallel_global_sum_stagger(Avv_2d, nNodeNeighbors_2d, parallel) + sum_bu = parallel_global_sum_stagger(bu_2d, parallel) + sum_bv = parallel_global_sum_stagger(bv_2d, parallel) + if (main_task) then + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auu(n), binary_str) + write(iulog,*) n, sum_Auu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auv(n), binary_str) + write(iulog,*) n, sum_Auv(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avu(n), binary_str) + write(iulog,*) n, sum_Avu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avv(n), binary_str) + write(iulog,*) n, sum_Avv(n), binary_str + enddo + endif + endif ! verbose_reprosum + if (write_matrix) then if (counter == 1) then ! first outer iteration only call t_startf('glissade_wrt_mat') @@ -2608,7 +2751,7 @@ subroutine glissade_velo_higher_solve(model, & endif endif ! write_matrix - if (verbose_matrix .and. this_rank==rtest) then + if (verbose_matrix .and. 
main_task) then i = itest j = jtest write(iulog,*) ' ' @@ -2879,6 +3022,7 @@ subroutine glissade_velo_higher_solve(model, & if (solve_2d) then + call t_startf('glissade_resid_vec') call compute_residual_vector_2d(nx, ny, & parallel, & @@ -2895,41 +3039,40 @@ subroutine glissade_velo_higher_solve(model, & call t_startf('glissade_accel_picard') if (accel_picard) then - if (verbose_picard) then - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'Saved L2 norm, new L2 norm:', L2_norm_alpha_sav, L2_norm - endif - call point_diag(resid_u_2d, 'resid_u_2d', itest, jtest, rtest, 7, 7, '(e10.3)') - call point_diag(uvel_2d, 'uvel_2d', itest, jtest, rtest, 7, 7, '(f10.3)') + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'Saved L2 norm, new L2 norm:', L2_norm_alpha_sav, L2_norm endif if (counter >= 2) then - call evaluate_accelerated_picard_2d(nx, ny, & - L2_norm, L2_norm_large, & - L2_norm_alpha_sav, & - alpha_accel, alpha_accel_max, & - gamma_accel, resid_reduction_threshold, & - uvel_2d, vvel_2d, & - Auu_2d, Auv_2d, & - Avu_2d, Avv_2d, & - uvel_2d_old, vvel_2d_old, & - duvel_2d, dvvel_2d, & - uvel_2d_sav, vvel_2d_sav, & - Auu_2d_sav, Auv_2d_sav, & - Avu_2d_sav, Avv_2d_sav, & - beta_internal, beta_internal_sav, & - assembly_is_done) + call evaluate_accelerated_picard_2d(& + whichapprox, rtest, & + L2_norm, L2_norm_large, & + L2_norm_alpha_sav, & + alpha_accel, alpha_accel_max, & + gamma_accel, resid_reduction_threshold, & + uvel_2d, vvel_2d, & + Auu_2d, Auv_2d, & + Avu_2d, Avv_2d, & + uvel_2d_old, vvel_2d_old, & + duvel_2d, dvvel_2d, & + uvel_2d_sav, vvel_2d_sav, & + Auu_2d_sav, Auv_2d_sav, & + Avu_2d_sav, Avv_2d_sav, & + beta_internal, beta_internal_sav, & + beta_eff_x, beta_eff_x_sav, & + beta_eff_y, beta_eff_y_sav, & + omega_k, omega_k_sav, & + assembly_is_done) else ! counter = 1 ! proceed to the matrix solution assembly_is_done = .true. - if (verbose_picard .and. 
main_task) then - write(iulog,*) 'nonlinear counter = 1; continue to matrix solver' - endif + if (verbose_picard .and. this_rank == rtest) & + write(iulog,*) 'nonlinear counter = 1; continue to matrix solver' endif ! counter >= 2 @@ -2970,20 +3113,22 @@ subroutine glissade_velo_higher_solve(model, & if (counter >= 2) then - call evaluate_accelerated_picard_3d(L2_norm, L2_norm_large, & - L2_norm_alpha_sav, & - alpha_accel, alpha_accel_max, & - gamma_accel, resid_reduction_threshold, & - uvel, vvel, & - Auu, Auv, & - Avu, Avv, & - uvel_old, vvel_old, & - duvel, dvvel, & - uvel_sav, vvel_sav, & - Auu_sav, Auv_sav, & - Avu_sav, Avv_sav, & - beta_internal, beta_internal_sav, & - assembly_is_done) + call evaluate_accelerated_picard_3d(& + rtest, & + L2_norm, L2_norm_large, & + L2_norm_alpha_sav, & + alpha_accel, alpha_accel_max, & + gamma_accel, resid_reduction_threshold, & + uvel, vvel, & + Auu, Auv, & + Avu, Avv, & + uvel_old, vvel_old, & + duvel, dvvel, & + uvel_sav, vvel_sav, & + Auu_sav, Auv_sav, & + Avu_sav, Avv_sav, & + beta_internal, beta_internal_sav, & + assembly_is_done) else ! counter = 1 @@ -3012,9 +3157,9 @@ subroutine glissade_velo_higher_solve(model, & ! Optional diagnostics if (verbose_beta .and. counter > 1 .and. mod(counter-1,12)==0) then -!! 
if (verbose_beta) then - call point_diag(log10(max(beta_internal,1.d-99)), 'log_beta', itest, jtest, rtest, 7, 7, '(f10.5)') + if (this_rank == rtest) write(iulog,*) 'Counter =', counter + call point_diag(log10(max(beta_internal,1.d-99)), 'log_beta', itest, jtest, rtest, 7, 7) if (solve_2d) then call point_diag(uvel_2d, 'Mean uvel (m/yr)', itest, jtest, rtest, 7, 7) call point_diag(vvel_2d, 'Mean vvel (m/yr)', itest, jtest, rtest, 7, 7) @@ -3137,7 +3282,8 @@ subroutine glissade_velo_higher_solve(model, & call pcg_solver_standard_3d(nx, ny, & nz, parallel, & - indxA_3d, active_vertex, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -3407,8 +3553,7 @@ subroutine glissade_velo_higher_solve(model, & itest, jtest, rtest, & active_vertex, diva_level_index, & ice_plus_land_mask, & - stag_omega, omega_k, & - beta_internal, & + omega_k, beta_internal, & beta_eff_x, beta_eff_y, & stag_theta_slope_x, stag_theta_slope_y, & stag_diva_slope_factor_x, & @@ -3591,7 +3736,7 @@ subroutine glissade_velo_higher_solve(model, & !WHL - debug !TODO - One diagnostic to write out column velocities for any approximation - if (whichapprox == HO_APPROX_BP .and. this_rank==rtest) then + if (verbose_bp .and. whichapprox == HO_APPROX_BP .and. this_rank==rtest) then write(iulog,*) ' ' i = itest j = jtest @@ -3702,6 +3847,8 @@ subroutine glissade_velo_higher_solve(model, & deallocate(uvel_2d_sav, vvel_2d_sav) deallocate(Auu_2d_sav, Auv_2d_sav, Avu_2d_sav, Avv_2d_sav) deallocate(beta_internal_sav) + deallocate(beta_eff_x_sav, beta_eff_y_sav) + deallocate(omega_k_sav) endif else deallocate(Auu, Auv, Avu, Avv) @@ -3884,6 +4031,7 @@ end subroutine glissade_velo_higher_scale_output subroutine get_vertex_geometry(nx, ny, & nz, nhalo, & parallel, & + x0, y0, & dx, dy, & itest, jtest, rtest, & ice_mask, & @@ -3914,11 +4062,17 @@ subroutine get_vertex_geometry(nx, ny, & nhalo ! number of halo layers type(parallel_type), intent(in) :: & - parallel ! 
info for parallel communication + parallel ! info for parallel communication + + real(dp), dimension(nx-1), intent(in) :: & + x0 ! x coordinates of vertices + + real(dp), dimension(ny-1), intent(in) :: & + y0 ! y coordinates of vertices real(dp), intent(in) :: & - dx, dy ! grid cell length and width (m) - ! assumed to have the same value for each grid cell + dx, dy ! grid cell length and width (m) + ! assumed to have the same value for each grid cell integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point @@ -3970,17 +4124,15 @@ subroutine get_vertex_geometry(nx, ny, & staggered_jhi = parallel%staggered_jhi !---------------------------------------------------------------- - ! Compute the x and y coordinates of each vertex. + ! Copy the x and y coordinates of each vertex from x0 and y0. ! By convention, vertex (i,j) lies at the NE corner of cell(i,j). !---------------------------------------------------------------- - xVertex(:,:) = 0.d0 - yVertex(:,:) = 0.d0 do j = 1, ny-1 - do i = 1, nx-1 - xVertex(i,j) = dx * i - yVertex(i,j) = dy * j - enddo + do i = 1, nx-1 + xVertex(i,j) = x0(i) + yVertex(i,j) = y0(j) + enddo enddo ! Identify the active cells. @@ -4252,7 +4404,8 @@ subroutine load_vector_gravity(nx, ny, & loadv(kNode,iNode,jNode) = loadv(kNode,iNode,jNode) - & rhoi*grav * wqp_3d(p) * detJ/vol0 * dsdy_qp * phi_3d(n,p) - if (verbose_load .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest .and. p==ptest) then + if (verbose_load .and. this_rank==rtest .and. & + i==itest .and. j==jtest .and. k==ktest .and. 
p==ptest) then write(iulog,*) ' ' write(iulog,*) 'n, phi_3d(n), delta(loadu), delta(loadv):', n, phi_3d(n,p), & rhoi*grav*wqp_3d(p)*detJ/vol0 * dsdx_qp * phi_3d(n,p), & @@ -4754,26 +4907,38 @@ end subroutine lateral_shelf_bc !**************************************************************************** - subroutine assemble_stiffness_matrix_3d(nx, ny, & - nz, sigma, & + subroutine assemble_stiffness_matrix_2d(nx, ny, & + nz, & + sigma, stagsigma, & nhalo, & itest, jtest, rtest, & active_cell, & xVertex, yVertex, & - uvel, vvel, & + uvel_2d, vvel_2d, & stagusrf, stagthck, & - flwafact, whichapprox, & + flwa, flwafact, & + whichapprox, & + diva_slope_factor_x, diva_slope_factor_y, & whichefvs, efvs, & efvs_constant, effstrain_min, & Auu, Auv, & - Avu, Avv) + Avu, Avv, & + dusrf_dx, dusrf_dy, & + thck, & + btractx, btracty, & + omega_k, omega, & + efvs_qp_3d) !---------------------------------------------------------------- ! Assemble the stiffness matrix A in the linear system Ax = b. ! This subroutine is called for each nonlinear iteration if ! we are iterating on the effective viscosity. + ! The matrix A can be based on the shallow-shelf approximation or + ! the depth-integrated L1L2 approximation (Schoof and Hindmarsh, 2010). !---------------------------------------------------------------- + use glissade_grid_operators, only: glissade_vertical_average + !---------------------------------------------------------------- ! Input-output arguments !---------------------------------------------------------------- @@ -4781,37 +4946,45 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions nz, & ! number of vertical levels at which velocity is computed - ! Note: the number of elements per column is nz-1 + ! (used for flwafact) nhalo ! number of halo layers real(dp), dimension(nz), intent(in) :: & - sigma ! sigma vertical coordinate + sigma ! 
sigma vertical coordinate + + real(dp), dimension(nz-1), intent(in) :: & + stagsigma ! staggered sigma vertical coordinate integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point logical, dimension(nx,ny), intent(in) :: & - active_cell ! true if cell contains ice and borders a locally owned vertex + active_cell ! true if cell contains ice and borders a locally owned vertex real(dp), dimension(nx-1,ny-1), intent(in) :: & - xVertex, yVertex ! x and y coordinates of vertices + xVertex, yVertex ! x and y coordinates of vertices - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & - uvel, vvel ! velocity components (m/yr) + real(dp), dimension(nx-1,ny-1), intent(in) :: & + uvel_2d, vvel_2d ! 2D velocity components (m/yr) real(dp), dimension(nx-1,ny-1), intent(in) :: & stagusrf, & ! upper surface elevation on staggered grid (m) stagthck ! ice thickness on staggered grid (m) + !TODO - Pass in flwa only, and compute flwafact here? real(dp), dimension(nz-1,nx,ny), intent(in) :: & - flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), + flwa, &! temperature-based flow factor A, Pa^{-n} yr^{-1} + flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), Pa yr^(1/n) ! used to compute the effective viscosity - ! units: Pa yr^(1/n) integer, intent(in) :: & - whichapprox, & ! option for Stokes approximation (BP, SSA, SIA) + whichapprox, & ! option for Stokes approximation (BP, L1L2, SSA, SIA) whichefvs ! option for effective viscosity calculation + real(dp), dimension(nx,ny), intent(in) :: & + diva_slope_factor_x, & ! correction factor for DIVA in x direction, based on theta_slope_x + diva_slope_factor_y ! correction factor for DIVA in y direction, based on theta_slope_y + real(dp), dimension(nz-1,nx,ny), intent(out) :: & efvs ! effective viscosity (Pa yr) @@ -4819,36 +4992,61 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & efvs_constant, & ! constant value of effective viscosity (Pa yr) effstrain_min ! 
minimum value of effective strain rate (yr^-1) for computing viscosity - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(out) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(out) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts Avu, Avv + ! The following optional arguments are used for the L1L2 approximation only + + real(dp), dimension(nx-1,ny-1), intent(in), optional :: & + dusrf_dx, & ! upper surface elevation gradient on staggered grid (m/m) + dusrf_dy ! needed for L1L2 assembly only + + ! The following optional arguments are used for DIVA only + + real(dp), dimension(nx,ny), intent(in), optional :: & + thck ! ice thickness (m) + + real(dp), dimension(nx-1,ny-1), intent(in), optional :: & + btractx, btracty ! components of basal traction (Pa) + + real(dp), dimension(nz,nx,ny), intent(out), optional :: & + omega_k ! single integral, defined by Goldberg (2011) eq. 32 + + real(dp), dimension(nx,ny), intent(out), optional :: & + omega ! double integral, defined by Goldberg (2011) eq. 35 + ! Note: omega here = Goldberg's omega/H + + real(dp), dimension(nz-1,nQuadPoints_2d,nx,ny), intent(inout), optional :: & + efvs_qp_3d ! effective viscosity (Pa yr) + !--------------------------------------------------------- ! Local variables !--------------------------------------------------------- - real(dp), dimension(nQuadPoints_3d) :: & + real(dp), dimension(nQuadPoints_2d) :: & detJ ! determinant of J - real(dp), dimension(nNodesPerElement_3d) :: & - dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis function, evaluated at quad pt + real(dp), dimension(nNodesPerElement_2d) :: & + dphi_dx_2d, dphi_dy_2d, dphi_dz_2d ! derivatives of basis function, evaluated at quad pts + ! set dphi_dz = 0 for 2D problem !---------------------------------------------------------------- - ! Note: Kuu, Kuv, Kvu, and Kvv are 8x8 components of the stiffness matrix - ! for the local element. (The combined stiffness matrix is 16x16.) + ! 
Note: Kuu, Kuv, Kvu, and Kvv are 4x4 components of the stiffness matrix + ! for the local element. (The combined stiffness matrix is 8x8.) ! - ! Once these matrices are formed, their coefficients are summed into the assembled - ! matrices Auu, Auv, Avu, Avv. The A matrices each have as many rows as there are - ! active nodes, but only 27 columns, corresponding to the 27 vertices that belong to - ! the 8 elements sharing a given node. + ! Once these matrices are formed, their coefficients are summed into the global + ! matrices Auu_2d, Auv_2d, Avu_2d, Avv_2d. The global matrices each have as + ! many rows as there are active vertices, but only 9 columns, corresponding to + ! the 9 vertices of the 4 elements sharing a given node. ! ! The native structured PCG solver works with the dense A matrices in the form ! computed here. For the SLAP solver, the terms of the A matrices are put - ! in a sparse matrix during preprocessing. For the Trilinos solver, the terms - ! of the A matrices are passed to Trilinos one row at a time. + ! in a sparse matrix format during preprocessing. For the Trilinos solver, + ! the terms of the A matrices are passed to Trilinos one row at a time. !---------------------------------------------------------------- - real(dp), dimension(nNodesPerElement_3d, nNodesPerElement_3d) :: & ! + real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) :: & ! Kuu, & ! element stiffness matrix, divided into 4 parts as shown below Kuv, & ! Kvu, & ! @@ -4859,25 +5057,36 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & ! ! Kvu may not be needed if matrix is symmetric, but is included for now - real(dp), dimension(nNodesPerElement_3d) :: & - x, y, z, & ! Cartesian coordinates of nodes - u, v, & ! u and v at nodes - s ! upper surface elevation at nodes + real(dp), dimension(nNodesPerElement_2d) :: & + x, y, & ! Cartesian coordinates of vertices + u, v, & ! depth-integrated mean velocity at vertices (m/yr) + h, & ! thickness at vertices (m) + s, & ! 
upper surface elevation at vertices (m) + bx, by, & ! basal traction at vertices (Pa) (DIVA only) + dsdx, dsdy ! upper surface elevation gradient at vertices (m/m) (L1L2 only) - real(dp), dimension(nQuadPoints_3d) :: & - efvs_qp ! effective viscosity at a quad pt + real(dp), dimension(nQuadPoints_2d) :: & + efvs_qp_vertavg ! vertically averaged effective viscosity at a quad pt + + real(dp) :: & + h_qp ! thickness at a quad pt + + real(dp), dimension(nz-1,nQuadPoints_2d) :: & + efvs_qp ! effective viscosity at each layer in a cell column + ! corresponding to a quad pt logical, parameter :: & check_symmetry_element = .true. ! if true, then check symmetry of element matrix - !Note: Can speed up assembly a bit by setting to false for production + + real(dp), dimension(nx,ny) :: & + flwafact_2d ! vertically averaged flow factor integer :: i, j, k, n, p - integer :: iNode, jNode, kNode + integer :: iVertex, jVertex if (verbose_matrix .and. main_task) then write(iulog,*) ' ' - write(iulog,*) 'In assemble_stiffness_matrix_3d' - write(iulog,*) 'itest, jtest, ktest, rtest =', itest, jtest, ktest, rtest + write(iulog,*) 'In assemble_stiffness_matrix_2d' endif ! Initialize effective viscosity @@ -4885,10 +5094,17 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & ! Initialize global stiffness matrix - Auu(:,:,:,:) = 0.d0 - Auv(:,:,:,:) = 0.d0 - Avu(:,:,:,:) = 0.d0 - Avv(:,:,:,:) = 0.d0 + Auu(:,:,:) = 0.d0 + Auv(:,:,:) = 0.d0 + Avu(:,:,:) = 0.d0 + Avv(:,:,:) = 0.d0 + + ! Compute vertical average of flow factor (SSA only) + if (whichapprox == HO_APPROX_SSA) then + call glissade_vertical_average(nx, ny, & + nz, sigma, & + flwafact, flwafact_2d) + endif ! Sum over elements in active cells ! Loop over all cells that border locally owned vertices. @@ -4898,112 +5114,200 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & if (active_cell(i,j)) then - do k = 1, nz-1 ! loop over elements in this column - ! assume k increases from upper surface to bed - - ! 
Initialize element stiffness matrix - Kuu(:,:) = 0.d0 - Kuv(:,:) = 0.d0 - Kvu(:,:) = 0.d0 - Kvv(:,:) = 0.d0 + ! Initialize element stiffness matrix + Kuu(:,:) = 0.d0 + Kuv(:,:) = 0.d0 + Kvu(:,:) = 0.d0 + Kvv(:,:) = 0.d0 - ! compute spatial coordinates, velocity, and upper surface elevation for each node - - do n = 1, nNodesPerElement_3d + ! Compute spatial coordinates, velocity, thickness and surface elevation for each vertex + ! Also compute surface elevation gradient (for L1L2) and basal traction (for DIVA) + do n = 1, nNodesPerElement_2d - ! Determine (k,i,j) for this node - ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). - ! Indices for other nodes are computed relative to this node. - iNode = i + ishift(7,n) - jNode = j + jshift(7,n) - kNode = k + kshift(7,n) + ! Determine (i,j) for this vertex + ! The reason for the '3' is that node 3, in the NE corner of the grid cell, has index (i,j). + ! Indices for other nodes are computed relative to this vertex. + iVertex = i + ishift(3,n) + jVertex = j + jshift(3,n) - x(n) = xVertex(iNode,jNode) - y(n) = yVertex(iNode,jNode) - z(n) = stagusrf(iNode,jNode) - sigma(kNode)*stagthck(iNode,jNode) - u(n) = uvel(kNode,iNode,jNode) - v(n) = vvel(kNode,iNode,jNode) - s(n) = stagusrf(iNode,jNode) + x(n) = xVertex(iVertex,jVertex) + y(n) = yVertex(iVertex,jVertex) + u(n) = uvel_2d(iVertex,jVertex) + v(n) = vvel_2d(iVertex,jVertex) + s(n) = stagusrf(iVertex,jVertex) + h(n) = stagthck(iVertex,jVertex) + if (present(dusrf_dx) .and. present(dusrf_dy)) then ! L1L2 + dsdx(n) = dusrf_dx(iVertex,jVertex) + dsdy(n) = dusrf_dy(iVertex,jVertex) + endif + if (present(btractx) .and. present(btracty)) then ! DIVA + bx(n) = btractx(iVertex,jVertex) + by(n) = btracty(iVertex,jVertex) + endif - if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest .and. 
k==ktest) then - write(iulog,*) ' ' - write(iulog,*) 'i, j, k, n, x, y, z:', i, j, k, n, x(n), y(n), z(n) - write(iulog,*) 's, u, v:', s(n), u(n), v(n) - endif + if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'i, j, n, x, y:', i, j, n, x(n), y(n) + write(iulog,*) 's, h, u, v:', s(n), h(n), u(n), v(n) + if (present(btractx) .and. present(btracty)) write(iulog,*) 'bx, by:', bx(n), by(n) + endif - enddo ! nodes per element + enddo ! vertices per element - ! Loop over quadrature points for this element + ! Loop over quadrature points for this element - do p = 1, nQuadPoints_3d + do p = 1, nQuadPoints_2d - ! Evaluate the derivatives of the element basis functions at this quadrature point. - !WHL - Pass in i, j, k, and p to the following subroutines for debugging. + ! Evaluate the derivatives of the element basis functions at this quadrature point. - call get_basis_function_derivatives_3d(x(:), y(:), z(:), & - dphi_dxr_3d(:,p), dphi_dyr_3d(:,p), dphi_dzr_3d(:,p), & - dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & - detJ(p), & - itest, jtest, rtest, & - i, j, k, p) + call get_basis_function_derivatives_2d(x(:), y(:), & + dphi_dxr_2d(:,p), dphi_dyr_2d(:,p), & + dphi_dx_2d(:), dphi_dy_2d(:), & + detJ(p), & + itest, jtest, rtest, & + i, j, p) -! call t_startf('glissade_effective_viscosity') + dphi_dz_2d(:) = 0.d0 + + if (whichapprox == HO_APPROX_L1L2) then + + ! Compute effective viscosity for each layer at this quadrature point + !TODO - sigma -> stagsigma for L1L2 viscosity? + call compute_effective_viscosity_L1L2(whichefvs, & + efvs_constant, effstrain_min, & + nz, sigma, & + nNodesPerElement_2d, phi_2d(:,p), & + dphi_dx_2d(:), dphi_dy_2d(:), & + u(:), v(:), & + h(:), & + dsdx(:), dsdy(:), & + flwa(:,i,j), flwafact(:,i,j), & + efvs_qp(:,p), & + itest, jtest, rtest, & + i, j, p) + + ! 
Compute vertical average of effective viscosity + efvs_qp_vertavg(p) = 0.d0 + do k = 1, nz-1 + efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p) * (sigma(k+1) - sigma(k)) + enddo + + elseif (whichapprox == HO_APPROX_DIVA) then + + ! Copy efvs_qp from global array to local column array + efvs_qp(:,:) = efvs_qp_3d(:,:,i,j) + + ! Compute effective viscosity for each layer at this quadrature point + ! Note: efvs_qp_3d is intent(inout); old value is used to compute new value + call compute_effective_viscosity_diva(whichefvs, & + efvs_constant, effstrain_min, & + nz, stagsigma, & + nNodesPerElement_2d, phi_2d(:,p), & + dphi_dx_2d(:), dphi_dy_2d(:), & + u(:), v(:), & + bx(:), by(:), & + diva_slope_factor_x(i,j), & + diva_slope_factor_y(i,j), & + h(:), & + flwa(:,i,j), flwafact(:,i,j), & + efvs_qp(:,p), & + itest, jtest, rtest, & + i, j, p) + + if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then + write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(:,p) + endif + + !WHL - Copy local efvs_qp to the global array + efvs_qp_3d(:,:,i,j) = efvs_qp(:,:) + + ! Compute vertical average of effective viscosity + efvs_qp_vertavg(p) = 0.d0 + do k = 1, nz-1 + efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p)*(sigma(k+1) - sigma(k)) + enddo + + else ! SSA + + ! Compute vertically averaged effective viscosity at this quadrature point + !TODO - Why do we pass in dphi_dz_2d here and not elsewhere? call compute_effective_viscosity(whichefvs, whichapprox, & efvs_constant, effstrain_min, & - nNodesPerElement_3d, & - dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & - u(:), v(:), & - flwafact(k,i,j), efvs_qp(p), & + nNodesPerElement_2d, & + dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & + u(:), v(:), & + flwafact_2d(i,j), efvs_qp_vertavg(p), & itest, jtest, rtest, & - i, j, k, p ) -! call t_stopf('glissade_effective_viscosity') + i, j, 1, p) - if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. 
p==ptest) then - write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(p) - endif + ! Copy vertically averaged value to all levels + efvs_qp(:,p) = efvs_qp_vertavg(p) - ! Increment the element stiffness matrix with the contribution from each quadrature point. + endif ! whichapprox -! call t_startf('glissade_compute_element_matrix') - call compute_element_matrix(whichapprox, nNodesPerElement_3d, & - wqp_3d(p), detJ(p), efvs_qp(p), & - dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & - Kuu(:,:), Kuv(:,:), & - Kvu(:,:), Kvv(:,:), & - itest, jtest, rtest, & - i, j, k, p ) -! call t_stopf('glissade_compute_element_matrix') + ! Compute ice thickness at this quadrature point - enddo ! nQuadPoints_3d + h_qp = 0.d0 + do n = 1, nNodesPerElement_2d + h_qp = h_qp + phi_2d(n,p) * h(n) + enddo - ! Compute average of effective viscosity over quad pts - efvs(k,i,j) = 0.d0 + ! Increment the element stiffness matrix with the contribution from each quadrature point. + ! Note: The effective viscosity is multiplied by thickness since the equation to be solved + ! is vertically integrated. - do p = 1, nQuadPoints_3d - efvs(k,i,j) = efvs(k,i,j) + efvs_qp(p) + call compute_element_matrix(whichapprox, nNodesPerElement_2d, & + wqp_2d(p), detJ(p), & + h_qp*efvs_qp_vertavg(p), & + dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & + Kuu(:,:), Kuv(:,:), & + Kvu(:,:), Kvv(:,:), & + itest, jtest, rtest, & + i, j, 1, p ) + + enddo ! nQuadPoints_2d + + if (whichapprox == HO_APPROX_DIVA) then + + ! Compute vertical integrals needed for the 2D solve and 3D velocity reconstruction + call compute_integrals_diva(nz, sigma, & + itest, jtest, rtest, & + thck(i,j), efvs_qp(:,:), & + omega_k(:,i,j), omega(i,j), & + i, j) + + endif + + ! Compute average of effective viscosity over quad points. + ! For L1L2 and DIVA there is a different efvs in each layer. + ! For SSA, simply write the vertical average value to each layer. 
+ + efvs(:,i,j) = 0.d0 + do p = 1, nQuadPoints_2d + do k = 1, nz-1 + efvs(k,i,j) = efvs(k,i,j) + efvs_qp(k,p) enddo - efvs(k,i,j) = efvs(k,i,j) / nQuadPoints_3d - - if (check_symmetry_element) then - call check_symmetry_element_matrix(nNodesPerElement_3d, & - Kuu, Kuv, Kvu, Kvv) - endif + enddo + efvs(:,i,j) = efvs(:,i,j) / nQuadPoints_2d - ! Sum terms of element matrix K into dense assembled matrix A + if (check_symmetry_element) then + call check_symmetry_element_matrix(nNodesPerElement_2d, & + Kuu, Kuv, Kvu, Kvv) + endif - call element_to_global_matrix_3d(nx, ny, nz, & - i, j, k, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & - Avu, Avv) + ! Sum the terms of element matrix K into the dense assembled matrix A - enddo ! nz (loop over elements in this column) + call element_to_global_matrix_2d(nx, ny, & + i, j, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & + Avu, Avv) if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest) then write(iulog,*) ' ' - write(iulog,*) 'Assembled 3D matrix, i, j =', i, j + write(iulog,*) 'Assembled 2D matrix, i, j =', i, j write(iulog,*) 'k, flwafact, efvs:' do k = 1, nz-1 write(iulog,*) k, flwafact(k,i,j), efvs(k,i,j) @@ -5015,42 +5319,30 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & enddo ! i enddo ! 
j - end subroutine assemble_stiffness_matrix_3d + end subroutine assemble_stiffness_matrix_2d !**************************************************************************** - subroutine assemble_stiffness_matrix_2d(nx, ny, & - nz, & - sigma, stagsigma, & + subroutine assemble_stiffness_matrix_3d(nx, ny, & + nz, sigma, & nhalo, & itest, jtest, rtest, & active_cell, & xVertex, yVertex, & - uvel_2d, vvel_2d, & + uvel, vvel, & stagusrf, stagthck, & - flwa, flwafact, & - whichapprox, & - diva_slope_factor_x, diva_slope_factor_y, & + flwafact, whichapprox, & whichefvs, efvs, & efvs_constant, effstrain_min, & Auu, Auv, & - Avu, Avv, & - dusrf_dx, dusrf_dy, & - thck, & - btractx, btracty, & - omega_k, omega, & - efvs_qp_3d) - + Avu, Avv) + !---------------------------------------------------------------- ! Assemble the stiffness matrix A in the linear system Ax = b. ! This subroutine is called for each nonlinear iteration if ! we are iterating on the effective viscosity. - ! The matrix A can be based on the shallow-shelf approximation or - ! the depth-integrated L1L2 approximation (Schoof and Hindmarsh, 2010). !---------------------------------------------------------------- - use glissade_grid_operators, only: glissade_vertical_average - !---------------------------------------------------------------- ! Input-output arguments !---------------------------------------------------------------- @@ -5058,45 +5350,37 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions nz, & ! number of vertical levels at which velocity is computed - ! (used for flwafact) + ! Note: the number of elements per column is nz-1 nhalo ! number of halo layers real(dp), dimension(nz), intent(in) :: & - sigma ! sigma vertical coordinate - - real(dp), dimension(nz-1), intent(in) :: & - stagsigma ! staggered sigma vertical coordinate + sigma ! sigma vertical coordinate integer, intent(in) :: & itest, jtest, rtest ! 
coordinates of diagnostic point logical, dimension(nx,ny), intent(in) :: & - active_cell ! true if cell contains ice and borders a locally owned vertex + active_cell ! true if cell contains ice and borders a locally owned vertex real(dp), dimension(nx-1,ny-1), intent(in) :: & - xVertex, yVertex ! x and y coordinates of vertices + xVertex, yVertex ! x and y coordinates of vertices - real(dp), dimension(nx-1,ny-1), intent(in) :: & - uvel_2d, vvel_2d ! 2D velocity components (m/yr) + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + uvel, vvel ! velocity components (m/yr) real(dp), dimension(nx-1,ny-1), intent(in) :: & stagusrf, & ! upper surface elevation on staggered grid (m) stagthck ! ice thickness on staggered grid (m) - !TODO - Pass in flwa only, and compute flwafact here? real(dp), dimension(nz-1,nx,ny), intent(in) :: & - flwa, &! temperature-based flow factor A, Pa^{-n} yr^{-1} - flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), Pa yr^(1/n) + flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), ! used to compute the effective viscosity + ! units: Pa yr^(1/n) integer, intent(in) :: & - whichapprox, & ! option for Stokes approximation (BP, L1L2, SSA, SIA) + whichapprox, & ! option for Stokes approximation (BP, SSA, SIA) whichefvs ! option for effective viscosity calculation - real(dp), dimension(nx,ny), intent(in) :: & - diva_slope_factor_x, & ! correction factor for DIVA in x direction, based on theta_slope_x - diva_slope_factor_y ! correction factor for DIVA in y direction, based on theta_slope_y - real(dp), dimension(nz-1,nx,ny), intent(out) :: & efvs ! effective viscosity (Pa yr) @@ -5104,61 +5388,36 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & efvs_constant, & ! constant value of effective viscosity (Pa yr) effstrain_min ! 
minimum value of effective strain rate (yr^-1) for computing viscosity - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(out) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(out) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv - - ! The following optional arguments are used for the L1L2 approximation only - - real(dp), dimension(nx-1,ny-1), intent(in), optional :: & - dusrf_dx, & ! upper surface elevation gradient on staggered grid (m/m) - dusrf_dy ! needed for L1L2 assembly only - - ! The following optional arguments are used for DIVA only - - real(dp), dimension(nx,ny), intent(in), optional :: & - thck ! ice thickness (m) - - real(dp), dimension(nx-1,ny-1), intent(in), optional :: & - btractx, btracty ! components of basal traction (Pa) - - real(dp), dimension(nz,nx,ny), intent(out), optional :: & - omega_k ! single integral, defined by Goldberg (2011) eq. 32 - - real(dp), dimension(nx,ny), intent(out), optional :: & - omega ! double integral, defined by Goldberg (2011) eq. 35 - ! Note: omega here = Goldberg's omega/H - - real(dp), dimension(nz-1,nQuadPoints_2d,nx,ny), intent(inout), optional :: & - efvs_qp_3d ! effective viscosity (Pa yr) + Avu, Avv !--------------------------------------------------------- ! Local variables !--------------------------------------------------------- - real(dp), dimension(nQuadPoints_2d) :: & + real(dp), dimension(nQuadPoints_3d) :: & detJ ! determinant of J - real(dp), dimension(nNodesPerElement_2d) :: & - dphi_dx_2d, dphi_dy_2d, dphi_dz_2d ! derivatives of basis function, evaluated at quad pts - ! set dphi_dz = 0 for 2D problem + real(dp), dimension(nNodesPerElement_3d) :: & + dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis function, evaluated at quad pt !---------------------------------------------------------------- - ! Note: Kuu, Kuv, Kvu, and Kvv are 4x4 components of the stiffness matrix - ! for the local element. (The combined stiffness matrix is 8x8.) 
+ ! Note: Kuu, Kuv, Kvu, and Kvv are 8x8 components of the stiffness matrix + ! for the local element. (The combined stiffness matrix is 16x16.) ! - ! Once these matrices are formed, their coefficients are summed into the global - ! matrices Auu_2d, Auv_2d, Avu_2d, Avv_2d. The global matrices each have as - ! many rows as there are active vertices, but only 9 columns, corresponding to - ! the 9 vertices of the 4 elements sharing a given node. + ! Once these matrices are formed, their coefficients are summed into the assembled + ! matrices Auu, Auv, Avu, Avv. The A matrices each have as many rows as there are + ! active nodes, but only 27 columns, corresponding to the 27 vertices that belong to + ! the 8 elements sharing a given node. ! ! The native structured PCG solver works with the dense A matrices in the form ! computed here. For the SLAP solver, the terms of the A matrices are put - ! in a sparse matrix format during preprocessing. For the Trilinos solver, - ! the terms of the A matrices are passed to Trilinos one row at a time. + ! in a sparse matrix during preprocessing. For the Trilinos solver, the terms + ! of the A matrices are passed to Trilinos one row at a time. !---------------------------------------------------------------- - real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) :: & ! + real(dp), dimension(nNodesPerElement_3d, nNodesPerElement_3d) :: & ! Kuu, & ! element stiffness matrix, divided into 4 parts as shown below Kuv, & ! Kvu, & ! @@ -5169,36 +5428,25 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & ! ! Kvu may not be needed if matrix is symmetric, but is included for now - real(dp), dimension(nNodesPerElement_2d) :: & - x, y, & ! Cartesian coordinates of vertices - u, v, & ! depth-integrated mean velocity at vertices (m/yr) - h, & ! thickness at vertices (m) - s, & ! upper surface elevation at vertices (m) - bx, by, & ! basal traction at vertices (Pa) (DIVA only) - dsdx, dsdy ! 
upper surface elevation gradient at vertices (m/m) (L1L2 only) - - real(dp), dimension(nQuadPoints_2d) :: & - efvs_qp_vertavg ! vertically averaged effective viscosity at a quad pt - - real(dp) :: & - h_qp ! thickness at a quad pt + real(dp), dimension(nNodesPerElement_3d) :: & + x, y, z, & ! Cartesian coordinates of nodes + u, v, & ! u and v at nodes + s ! upper surface elevation at nodes - real(dp), dimension(nz-1,nQuadPoints_2d) :: & - efvs_qp ! effective viscosity at each layer in a cell column - ! corresponding to a quad pt + real(dp), dimension(nQuadPoints_3d) :: & + efvs_qp ! effective viscosity at a quad pt logical, parameter :: & check_symmetry_element = .true. ! if true, then check symmetry of element matrix - - real(dp), dimension(nx,ny) :: & - flwafact_2d ! vertically averaged flow factor + !Note: Can speed up assembly a bit by setting to false for production integer :: i, j, k, n, p - integer :: iVertex, jVertex + integer :: iNode, jNode, kNode if (verbose_matrix .and. main_task) then write(iulog,*) ' ' - write(iulog,*) 'In assemble_stiffness_matrix_2d' + write(iulog,*) 'In assemble_stiffness_matrix_3d' + write(iulog,*) 'itest, jtest, ktest, rtest =', itest, jtest, ktest, rtest endif ! Initialize effective viscosity @@ -5206,220 +5454,125 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & ! Initialize global stiffness matrix - Auu(:,:,:) = 0.d0 - Auv(:,:,:) = 0.d0 - Avu(:,:,:) = 0.d0 - Avv(:,:,:) = 0.d0 - - ! Compute vertical average of flow factor (SSA only) - if (whichapprox == HO_APPROX_SSA) then - call glissade_vertical_average(nx, ny, & - nz, sigma, & - flwafact, flwafact_2d) - endif + Auu(:,:,:,:) = 0.d0 + Auv(:,:,:,:) = 0.d0 + Avu(:,:,:,:) = 0.d0 + Avv(:,:,:,:) = 0.d0 ! Sum over elements in active cells ! Loop over all cells that border locally owned vertices. do j = nhalo+1, ny-nhalo+1 do i = nhalo+1, nx-nhalo+1 - - if (active_cell(i,j)) then - - ! 
Initialize element stiffness matrix - Kuu(:,:) = 0.d0 - Kuv(:,:) = 0.d0 - Kvu(:,:) = 0.d0 - Kvv(:,:) = 0.d0 - - ! Compute spatial coordinates, velocity, thickness and surface elevation for each vertex - ! Also compute surface elevation gradient (for L1L2) and basal traction (for DIVA) - do n = 1, nNodesPerElement_2d - ! Determine (i,j) for this vertex - ! The reason for the '3' is that node 3, in the NE corner of the grid cell, has index (i,j). - ! Indices for other nodes are computed relative to this vertex. - iVertex = i + ishift(3,n) - jVertex = j + jshift(3,n) - - x(n) = xVertex(iVertex,jVertex) - y(n) = yVertex(iVertex,jVertex) - u(n) = uvel_2d(iVertex,jVertex) - v(n) = vvel_2d(iVertex,jVertex) - s(n) = stagusrf(iVertex,jVertex) - h(n) = stagthck(iVertex,jVertex) - if (present(dusrf_dx) .and. present(dusrf_dy)) then ! L1L2 - dsdx(n) = dusrf_dx(iVertex,jVertex) - dsdy(n) = dusrf_dy(iVertex,jVertex) - endif - if (present(btractx) .and. present(btracty)) then ! DIVA - bx(n) = btractx(iVertex,jVertex) - by(n) = btracty(iVertex,jVertex) - endif - - if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'i, j, n, x, y:', i, j, n, x(n), y(n) - write(iulog,*) 's, h, u, v:', s(n), h(n), u(n), v(n) - if (present(btractx) .and. present(btracty)) write(iulog,*) 'bx, by:', bx(n), by(n) - endif - - enddo ! vertices per element - - ! Loop over quadrature points for this element - - do p = 1, nQuadPoints_2d - - ! Evaluate the derivatives of the element basis functions at this quadrature point. - - call get_basis_function_derivatives_2d(x(:), y(:), & - dphi_dxr_2d(:,p), dphi_dyr_2d(:,p), & - dphi_dx_2d(:), dphi_dy_2d(:), & - detJ(p), & - itest, jtest, rtest, & - i, j, p) - - dphi_dz_2d(:) = 0.d0 + if (active_cell(i,j)) then - if (whichapprox == HO_APPROX_L1L2) then + do k = 1, nz-1 ! loop over elements in this column + ! assume k increases from upper surface to bed - ! 
Compute effective viscosity for each layer at this quadrature point - !TODO - sigma -> stagsigma for L1L2 viscosity? - call compute_effective_viscosity_L1L2(whichefvs, & - efvs_constant, effstrain_min, & - nz, sigma, & - nNodesPerElement_2d, phi_2d(:,p), & - dphi_dx_2d(:), dphi_dy_2d(:), & - u(:), v(:), & - h(:), & - dsdx(:), dsdy(:), & - flwa(:,i,j), flwafact(:,i,j), & - efvs_qp(:,p), & - itest, jtest, rtest, & - i, j, p) + ! Initialize element stiffness matrix + Kuu(:,:) = 0.d0 + Kuv(:,:) = 0.d0 + Kvu(:,:) = 0.d0 + Kvv(:,:) = 0.d0 - ! Compute vertical average of effective viscosity - efvs_qp_vertavg(p) = 0.d0 - do k = 1, nz-1 - efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p) * (sigma(k+1) - sigma(k)) - enddo + ! compute spatial coordinates, velocity, and upper surface elevation for each node - elseif (whichapprox == HO_APPROX_DIVA) then + do n = 1, nNodesPerElement_3d - ! Copy efvs_qp from global array to local column array - efvs_qp(:,:) = efvs_qp_3d(:,:,i,j) + ! Determine (k,i,j) for this node + ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). + ! Indices for other nodes are computed relative to this node. + iNode = i + ishift(7,n) + jNode = j + jshift(7,n) + kNode = k + kshift(7,n) - ! Compute effective viscosity for each layer at this quadrature point - ! 
Note: efvs_qp_3d is intent(inout); old value is used to compute new value - call compute_effective_viscosity_diva(whichefvs, & - efvs_constant, effstrain_min, & - nz, stagsigma, & - nNodesPerElement_2d, phi_2d(:,p), & - dphi_dx_2d(:), dphi_dy_2d(:), & - u(:), v(:), & - bx(:), by(:), & - diva_slope_factor_x(i,j), & - diva_slope_factor_y(i,j), & - h(:), & - flwa(:,i,j), flwafact(:,i,j), & - efvs_qp(:,p), & - itest, jtest, rtest, & - i, j, p) + x(n) = xVertex(iNode,jNode) + y(n) = yVertex(iNode,jNode) + z(n) = stagusrf(iNode,jNode) - sigma(kNode)*stagthck(iNode,jNode) + u(n) = uvel(kNode,iNode,jNode) + v(n) = vvel(kNode,iNode,jNode) + s(n) = stagusrf(iNode,jNode) - if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then - write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(:,p) + if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then + write(iulog,*) ' ' + write(iulog,*) 'i, j, k, n, x, y, z:', i, j, k, n, x(n), y(n), z(n) + write(iulog,*) 's, u, v:', s(n), u(n), v(n) endif - !WHL - Copy local efvs_qp to the global array - efvs_qp_3d(:,:,i,j) = efvs_qp(:,:) - - ! Compute vertical average of effective viscosity - efvs_qp_vertavg(p) = 0.d0 - do k = 1, nz-1 - efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p)*(sigma(k+1) - sigma(k)) - enddo - - else ! SSA - - ! Compute vertically averaged effective viscosity at this quadrature point - !TODO - Why do we pass in dphi_dz_2d here and not elsewhere? - call compute_effective_viscosity(whichefvs, whichapprox, & - efvs_constant, effstrain_min, & - nNodesPerElement_2d, & - dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & - u(:), v(:), & - flwafact_2d(i,j), efvs_qp_vertavg(p), & - itest, jtest, rtest, & - i, j, 1, p) - - ! Copy vertically averaged value to all levels - efvs_qp(:,p) = efvs_qp_vertavg(p) - - endif ! whichapprox + enddo ! nodes per element - ! Compute ice thickness at this quadrature point + ! 
Loop over quadrature points for this element - h_qp = 0.d0 - do n = 1, nNodesPerElement_2d - h_qp = h_qp + phi_2d(n,p) * h(n) - enddo + do p = 1, nQuadPoints_3d - ! Increment the element stiffness matrix with the contribution from each quadrature point. - ! Note: The effective viscosity is multiplied by thickness since the equation to be solved - ! is vertically integrated. + ! Evaluate the derivatives of the element basis functions at this quadrature point. + !WHL - Pass in i, j, k, and p to the following subroutines for debugging. - call compute_element_matrix(whichapprox, nNodesPerElement_2d, & - wqp_2d(p), detJ(p), & - h_qp*efvs_qp_vertavg(p), & - dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & - Kuu(:,:), Kuv(:,:), & - Kvu(:,:), Kvv(:,:), & - itest, jtest, rtest, & - i, j, 1, p ) + call get_basis_function_derivatives_3d(x(:), y(:), z(:), & + dphi_dxr_3d(:,p), dphi_dyr_3d(:,p), dphi_dzr_3d(:,p), & + dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & + detJ(p), & + itest, jtest, rtest, & + i, j, k, p) - enddo ! nQuadPoints_2d +! call t_startf('glissade_effective_viscosity') + call compute_effective_viscosity(whichefvs, whichapprox, & + efvs_constant, effstrain_min, & + nNodesPerElement_3d, & + dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & + u(:), v(:), & + flwafact(k,i,j), efvs_qp(p), & + itest, jtest, rtest, & + i, j, k, p ) +! call t_stopf('glissade_effective_viscosity') - if (whichapprox == HO_APPROX_DIVA) then + if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then + write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(p) + endif - ! Compute vertical integrals needed for the 2D solve and 3D velocity reconstruction - call compute_integrals_diva(nz, sigma, & - itest, jtest, rtest, & - thck(i,j), efvs_qp(:,:), & - omega_k(:,i,j), omega(i,j), & - i, j) + ! Increment the element stiffness matrix with the contribution from each quadrature point. - endif +! 
call t_startf('glissade_compute_element_matrix') + call compute_element_matrix(whichapprox, nNodesPerElement_3d, & + wqp_3d(p), detJ(p), efvs_qp(p), & + dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & + Kuu(:,:), Kuv(:,:), & + Kvu(:,:), Kvv(:,:), & + itest, jtest, rtest, & + i, j, k, p ) +! call t_stopf('glissade_compute_element_matrix') - ! Compute average of effective viscosity over quad points. - ! For L1L2 and DIVA there is a different efvs in each layer. - ! For SSA, simply write the vertical average value to each layer. + enddo ! nQuadPoints_3d - efvs(:,i,j) = 0.d0 - do p = 1, nQuadPoints_2d - do k = 1, nz-1 - efvs(k,i,j) = efvs(k,i,j) + efvs_qp(k,p) + ! Compute average of effective viscosity over quad pts + efvs(k,i,j) = 0.d0 + + do p = 1, nQuadPoints_3d + efvs(k,i,j) = efvs(k,i,j) + efvs_qp(p) enddo - enddo - efvs(:,i,j) = efvs(:,i,j) / nQuadPoints_2d + efvs(k,i,j) = efvs(k,i,j) / nQuadPoints_3d - if (check_symmetry_element) then - call check_symmetry_element_matrix(nNodesPerElement_2d, & - Kuu, Kuv, Kvu, Kvv) - endif + if (check_symmetry_element) then + call check_symmetry_element_matrix(nNodesPerElement_3d, & + Kuu, Kuv, Kvu, Kvv) + endif - ! Sum the terms of element matrix K into the dense assembled matrix A + ! Sum terms of element matrix K into dense assembled matrix A - call element_to_global_matrix_2d(nx, ny, & - i, j, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & - Avu, Avv) + call element_to_global_matrix_3d(nx, ny, nz, & + i, j, k, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & + Avu, Avv) + + enddo ! nz (loop over elements in this column) if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then write(iulog,*) ' ' - write(iulog,*) 'Assembled 2D matrix, i, j =', i, j + write(iulog,*) 'Assembled 3D matrix, i, j =', i, j write(iulog,*) 'k, flwafact, efvs:' do k = 1, nz-1 write(iulog,*) k, flwafact(k,i,j), efvs(k,i,j) @@ -5431,7 +5584,7 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & enddo ! i enddo ! j - end subroutine assemble_stiffness_matrix_2d + end subroutine assemble_stiffness_matrix_3d !**************************************************************************** @@ -5573,8 +5726,7 @@ subroutine compute_3d_velocity_diva(& itest, jtest, rtest, & active_vertex, diva_level_index, & ice_plus_land_mask, & - stag_omega, omega_k, & - beta, & + omega_k, beta, & beta_eff_x, beta_eff_y, & stag_theta_slope_x, stag_theta_slope_y, & stag_diva_slope_factor_x, & @@ -5624,9 +5776,6 @@ subroutine compute_3d_velocity_diva(& stag_theta_slope_y, & ! slope angle (radians) in y direction at vertices stag_diva_slope_factor_x, & ! slope correction factor in x direction stag_diva_slope_factor_y, & ! slope correction factor in y direction - stag_omega, & ! double integral, defined by Goldberg eq. 35 (m^2/(Pa yr)) - ! already interpolated to staggered grid - ! Note: omega here = Goldberg's omega/H uvel_2d, vvel_2d ! depth-integrated mean velocity; solution of 2D velocity solve (m/yr) real(dp), dimension(nx-1,ny-1), intent(out) :: & @@ -5643,12 +5792,6 @@ subroutine compute_3d_velocity_diva(& stag_omega_k ! single integral, defined by Goldberg eq. 32 (m^2/(Pa yr)) ! interpolated to staggered grid - real(dp), dimension(nx-1,ny-1) :: & - stag_integral ! integral that relates bed velocity to uvel_2d and vvel_2d - ! = stag_omega for diva_level_index = 0 - ! = stag_omega_k(k,:,:) for other values of diva_level_index - - real(dp) :: & slope_correction_x, & ! slope-based correction for vertical shear in x direction slope_correction_y ! 
slope-based correction for vertical shear in y direction @@ -5668,15 +5811,6 @@ subroutine compute_3d_velocity_diva(& stagger_margin_in = 1) enddo - ! Identify the appropriate integral for relating uvel_2d/vvel_2d to the bed velocity - - if (diva_level_index == 0) then ! solved for mean velocity - stag_integral(:,:) = stag_omega(:,:) - else - k = diva_level_index - stag_integral(:,:) = stag_omega_k(k,:,:) - endif - !---------------------------------------------------------------- ! Compute the 3D velocity field !---------------------------------------------------------------- @@ -6092,37 +6226,220 @@ subroutine compute_3d_velocity_L1L2(nx, ny, & endif endif - if (vmask_dirichlet(i,j) == 1) then - vvel(k,i,j) = vvel(nz,i,j) - else - if (include_membrane_stress_in_tau) then - vvel(k,i,j) = vvel(k+1,i,j) + fact * tau_yz(k,i,j) - else ! SIA stress term only - vvel(k,i,j) = vvel(k+1,i,j) + fact * tau_yz_sia(k,i,j) - endif - endif + if (vmask_dirichlet(i,j) == 1) then + vvel(k,i,j) = vvel(nz,i,j) + else + if (include_membrane_stress_in_tau) then + vvel(k,i,j) = vvel(k+1,i,j) + fact * tau_yz(k,i,j) + else ! SIA stress term only + vvel(k,i,j) = vvel(k+1,i,j) + fact * tau_yz_sia(k,i,j) + endif + endif + + if (verbose_L1L2 .and. this_rank==rtest .and. i==itest .and. j==jtest) then + depth = stagsigma(k) * stagthck(i,j) + write(iulog,*) ' ' + write(iulog,*) 'k, depth, fact:', & + k, depth, fact + write(iulog,*) 'tau_xz(i,j): SIA term, membrane term, total:', & + tau_xz_sia(k,i,j), tau_xz(k,i,j) - tau_xz_sia(k,i,j), tau_xz(k,i,j) + write(iulog,*) 'tau_yz(i,j): SIA term, membrane term, total:', & + tau_yz_sia(k,i,j), tau_yz(k,i,j) - tau_yz_sia(k,i,j), tau_yz(k,i,j) + write(iulog,*) 'uvel(k), vvel(k):', uvel(k,i,j), vvel(k,i,j) + endif + + endif + + enddo ! i + enddo ! j + + enddo ! 
k + + end subroutine compute_3d_velocity_L1L2 + +!**************************************************************************** + + subroutine get_basis_function_derivatives_2d(xNode, yNode, & + dphi_dxr_2d, dphi_dyr_2d, & + dphi_dx_2d, dphi_dy_2d, & + detJ, & + itest, jtest, rtest, & + i, j, p) + + !------------------------------------------------------------------ + ! Evaluate the x and y derivatives of 2D element basis functions + ! at a particular quadrature point. + ! + ! Also determine the Jacobian of the transformation between the + ! reference element and the true element. + ! + ! This subroutine should work for any 2D element with any number of nodes. + !------------------------------------------------------------------ + + real(dp), dimension(nNodesPerElement_2d), intent(in) :: & + xNode, yNode, &! nodal coordinates + dphi_dxr_2d, dphi_dyr_2d ! derivatives of basis functions at quad pt + ! wrt x and y in reference element + + real(dp), dimension(nNodesPerElement_2d), intent(out) :: & + dphi_dx_2d, dphi_dy_2d ! derivatives of basis functions at quad pt + ! wrt x and y in true Cartesian coordinates + + real(dp), intent(out) :: & + detJ ! determinant of Jacobian matrix + + real(dp), dimension(2,2) :: & + Jac, &! Jacobian matrix + Jinv ! inverse Jacobian matrix + + integer, intent(in) :: & + itest, jtest, rtest ! coordinates of diagnostic point + + integer, intent(in) :: i, j, p + + integer :: n, row, col + + logical, parameter :: Jac_bug_check = .false. ! set to true for debugging + real(dp), dimension(2,2) :: prod ! Jac * Jinv (should be identity matrix) + + !------------------------------------------------------------------ + ! Compute the Jacobian for the transformation from the reference + ! coordinates to the true coordinates: + ! + ! | | + ! | sum_n{dphi_n/dxr * xn} sum_n{dphi_n/dxr * yn} | + ! J(xr,yr) = | | + ! | sum_n{dphi_n/dyr * xn} sum_n{dphi_n/dyr * yn} | + ! | | + ! + ! where (xn,yn) are the true Cartesian nodal coordinates, + ! 
(xr,yr) are the coordinates of the quad point in the reference element, + ! and sum_n denotes a sum over nodes. + !------------------------------------------------------------------ + + Jac(:,:) = 0.d0 + + if ((verbose_Jac .or. verbose_diva) .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'In get_basis_function_derivatives_2d: i, j, p =', i, j, p + endif + + do n = 1, nNodesPerElement_2d + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'n, x, y:', n, xNode(n), yNode(n) + write(iulog,*) 'dphi_dxr_2d, dphi_dyr_2d:', dphi_dxr_2d(n), dphi_dyr_2d(n) + endif + Jac(1,1) = Jac(1,1) + dphi_dxr_2d(n) * xNode(n) + Jac(1,2) = Jac(1,2) + dphi_dxr_2d(n) * yNode(n) + Jac(2,1) = Jac(2,1) + dphi_dyr_2d(n) * xNode(n) + Jac(2,2) = Jac(2,2) + dphi_dyr_2d(n) * yNode(n) + enddo + + !------------------------------------------------------------------ + ! Compute the determinant and inverse of J + !------------------------------------------------------------------ + + detJ = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1) + + if (abs(detJ) > 0.d0) then + Jinv(1,1) = Jac(2,2)/detJ + Jinv(1,2) = -Jac(1,2)/detJ + Jinv(2,1) = -Jac(2,1)/detJ + Jinv(2,2) = Jac(1,1)/detJ + else + write(iulog,*) 'stopping, det J = 0' + write(iulog,*) 'i, j, p:', i, j, p + write(iulog,*) 'Jacobian matrix:' + write(iulog,*) Jac(1,:) + write(iulog,*) Jac(2,:) + call write_log('Jacobian matrix is singular', GM_FATAL) + endif + + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'Jacobian calc, p =', p + write(iulog,*) 'det J =', detJ + write(iulog,*) ' ' + write(iulog,*) 'Jacobian matrix:' + write(iulog,*) Jac(1,:) + write(iulog,*) Jac(2,:) + write(iulog,*) ' ' + write(iulog,*) 'Inverse matrix:' + write(iulog,*) Jinv(1,:) + write(iulog,*) Jinv(2,:) + write(iulog,*) ' ' + prod = matmul(Jac, Jinv) + write(iulog,*) 'Jac*Jinv:' + write(iulog,*) prod(1,:) + write(iulog,*) prod(2,:) + endif + + ! Optional bug check - Verify that J * Jinv = I + + if (Jac_bug_check) then + prod = matmul(Jac,Jinv) + do col = 1, 2 + do row = 1, 2 + if (abs(prod(row,col) - identity3(row,col)) > 1.d-12) then + write(iulog,*) '2d Jacobian, stopping, Jac * Jinv /= identity' + write(iulog,*) 'rank, i, j, p:', this_rank, i, j, p + write(iulog,*) 'Jac*Jinv:' + write(iulog,*) prod(1,:) + write(iulog,*) prod(2,:) + call write_log('Jacobian matrix was not correctly inverted', GM_FATAL) + endif + enddo + enddo + endif + + !------------------------------------------------------------------ + ! Compute the contribution of this quadrature point to dphi/dx and dphi/dy + ! for each basis function. + ! + ! | dphi_n/dx | | dphi_n/dxr | + ! | | = Jinv * | | + ! | dphi_n/dy | | dphi_n/dyr | + ! + !------------------------------------------------------------------ + + dphi_dx_2d(:) = 0.d0 + dphi_dy_2d(:) = 0.d0 - if (verbose_L1L2 .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then - depth = stagsigma(k) * stagthck(i,j) - write(iulog,*) ' ' - write(iulog,*) 'k, depth, fact:', & - k, depth, fact - write(iulog,*) 'tau_xz(i,j): SIA term, membrane term, total:', & - tau_xz_sia(k,i,j), tau_xz(k,i,j) - tau_xz_sia(k,i,j), tau_xz(k,i,j) - write(iulog,*) 'tau_yz(i,j): SIA term, membrane term, total:', & - tau_yz_sia(k,i,j), tau_yz(k,i,j) - tau_yz_sia(k,i,j), tau_yz(k,i,j) - write(iulog,*) 'uvel(k), vvel(k):', uvel(k,i,j), vvel(k,i,j) - endif + do n = 1, nNodesPerElement_2d + dphi_dx_2d(n) = dphi_dx_2d(n) + Jinv(1,1)*dphi_dxr_2d(n) & + + Jinv(1,2)*dphi_dyr_2d(n) + dphi_dy_2d(n) = dphi_dy_2d(n) + Jinv(2,1)*dphi_dxr_2d(n) & + + Jinv(2,2)*dphi_dyr_2d(n) + enddo - endif + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'dphi_dx_2d:', dphi_dx_2d(:) + write(iulog,*) 'dphi_dy_2d:', dphi_dy_2d(:) + endif - enddo ! i - enddo ! j + if (Jac_bug_check) then - enddo ! k + ! Check that the sum of dphi_dx, etc. 
is close to zero + if (abs( sum(dphi_dx_2d)/maxval(dphi_dx_2d) ) > 1.d-11) then + write(iulog,*) 'stopping, sum over basis functions of dphi_dx > 0' + write(iulog,*) 'dphi_dx_2d =', dphi_dx_2d(:) + write(iulog,*) 'i, j, p =', i, j, p + call write_log('Sum over basis functions of dphi_dx /= 0', GM_FATAL) + endif - end subroutine compute_3d_velocity_L1L2 + if (abs( sum(dphi_dy_2d)/maxval(dphi_dy_2d) ) > 1.d-11) then + write(iulog,*) 'stopping, sum over basis functions of dphi_dy > 0' + write(iulog,*) 'dphi_dy =', dphi_dy_2d(:) + write(iulog,*) 'i, j, p =', i, j, p + call write_log('Sum over basis functions of dphi_dy /= 0', GM_FATAL) + endif + + endif + end subroutine get_basis_function_derivatives_2d + !**************************************************************************** subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, & @@ -6149,7 +6466,7 @@ subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, real(dp), dimension(nNodesPerElement_3d), intent(out) :: & dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis functions at quad pt - ! wrt x, y and z in true Cartesian coordinates + ! wrt x, y and z in true Cartesian coordinates real(dp), intent(out) :: & detJ ! determinant of Jacobian matrix @@ -6276,263 +6593,80 @@ subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, if (Jac_bug_check) then prod = matmul(Jac,Jinv) - do col = 1, 3 - do row = 1, 3 - if (abs(prod(row,col) - identity3(row,col)) > 1.d-11) then - write(iulog,*) 'stopping, Jac * Jinv /= identity' - write(iulog,*) 'i, j, k, p:', i, j, k, p - write(iulog,*) 'Jac*Jinv:' - write(iulog,*) prod(1,:) - write(iulog,*) prod(2,:) - write(iulog,*) prod(3,:) - call write_log('Jacobian matrix was not correctly inverted', GM_FATAL) - endif - enddo - enddo - endif ! Jac_bug_check - - !------------------------------------------------------------------ - ! Compute the contribution of this quadrature point to dphi/dx and dphi/dy - ! for each basis function. - ! - ! 
| dphi_n/dx | | dphi_n/dxr | - ! | | | | - ! | dphi_n/dy | = Jinv * | dphi_n/dyr | - ! | | | | - ! | dphi_n/dz | | dphi_n/dzr | - ! - !------------------------------------------------------------------ - - dphi_dx_3d(:) = 0.d0 - dphi_dy_3d(:) = 0.d0 - dphi_dz_3d(:) = 0.d0 - - do n = 1, nNodesPerElement_3d - dphi_dx_3d(n) = Jinv(1,1)*dphi_dxr_3d(n) & - + Jinv(1,2)*dphi_dyr_3d(n) & - + Jinv(1,3)*dphi_dzr_3d(n) - dphi_dy_3d(n) = Jinv(2,1)*dphi_dxr_3d(n) & - + Jinv(2,2)*dphi_dyr_3d(n) & - + Jinv(2,3)*dphi_dzr_3d(n) - dphi_dz_3d(n) = Jinv(3,1)*dphi_dxr_3d(n) & - + Jinv(3,2)*dphi_dyr_3d(n) & - + Jinv(3,3)*dphi_dzr_3d(n) - enddo - - if (Jac_bug_check) then - - ! Check that the sum of dphi_dx, etc. is close to zero - - if (abs( sum(dphi_dx_3d)/maxval(dphi_dx_3d) ) > 1.d-11) then - write(iulog,*) 'stopping, sum over basis functions of dphi_dx > 0' - write(iulog,*) 'dphi_dx_3d =', dphi_dx_3d(:) - write(iulog,*) 'sum =', sum(dphi_dx_3d) - write(iulog,*) 'i, j, k, p =', i, j, k, p - call write_log('Sum over basis functions of dphi_dx /= 0', GM_FATAL) - endif - - if (abs( sum(dphi_dy_3d)/maxval(dphi_dy_3d) ) > 1.d-11) then - write(iulog,*) 'stopping, sum over basis functions of dphi_dy > 0' - write(iulog,*) 'dphi_dy_3d =', dphi_dy_3d(:) - write(iulog,*) 'sum =', sum(dphi_dy_3d) - write(iulog,*) 'i, j, k, p =', i, j, k, p - call write_log('Sum over basis functions of dphi_dy /= 0', GM_FATAL) - endif - - if (abs( sum(dphi_dz_3d)/maxval(dphi_dz_3d) ) > 1.d-11) then - write(iulog,*) 'stopping, sum over basis functions of dphi_dz > 0' - write(iulog,*) 'dphi_dz_3d =', dphi_dz_3d(:) - write(iulog,*) 'sum =', sum(dphi_dz_3d) - write(iulog,*) 'i, j, k, p =', i, j, k, p - call write_log('Sum over basis functions of dphi_dz /= 0', GM_FATAL) - endif - - endif ! 
Jac_bug_check - - end subroutine get_basis_function_derivatives_3d - -!**************************************************************************** - - subroutine get_basis_function_derivatives_2d(xNode, yNode, & - dphi_dxr_2d, dphi_dyr_2d, & - dphi_dx_2d, dphi_dy_2d, & - detJ, & - itest, jtest, rtest, & - i, j, p) - - !------------------------------------------------------------------ - ! Evaluate the x and y derivatives of 2D element basis functions - ! at a particular quadrature point. - ! - ! Also determine the Jacobian of the transformation between the - ! reference element and the true element. - ! - ! This subroutine should work for any 2D element with any number of nodes. - !------------------------------------------------------------------ - - real(dp), dimension(nNodesPerElement_2d), intent(in) :: & - xNode, yNode, &! nodal coordinates - dphi_dxr_2d, dphi_dyr_2d ! derivatives of basis functions at quad pt - ! wrt x and y in reference element - - real(dp), dimension(nNodesPerElement_2d), intent(out) :: & - dphi_dx_2d, dphi_dy_2d ! derivatives of basis functions at quad pt - ! wrt x and y in true Cartesian coordinates - - real(dp), intent(out) :: & - detJ ! determinant of Jacobian matrix - - real(dp), dimension(2,2) :: & - Jac, &! Jacobian matrix - Jinv ! inverse Jacobian matrix - - integer, intent(in) :: & - itest, jtest, rtest ! coordinates of diagnostic point - - integer, intent(in) :: i, j, p - - integer :: n, row, col - - logical, parameter :: Jac_bug_check = .false. ! set to true for debugging - real(dp), dimension(2,2) :: prod ! Jac * Jinv (should be identity matrix) - - !------------------------------------------------------------------ - ! Compute the Jacobian for the transformation from the reference - ! coordinates to the true coordinates: - ! - ! | | - ! | sum_n{dphi_n/dxr * xn} sum_n{dphi_n/dxr * yn} | - ! J(xr,yr) = | | - ! | sum_n{dphi_n/dyr * xn} sum_n{dphi_n/dyr * yn} | - ! | | - ! - ! 
where (xn,yn) are the true Cartesian nodal coordinates, - ! (xr,yr) are the coordinates of the quad point in the reference element, - ! and sum_n denotes a sum over nodes. - !------------------------------------------------------------------ - - Jac(:,:) = 0.d0 - - if ((verbose_Jac .or. verbose_diva) .and. this_rank==rtest .and. i==itest .and. j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'In get_basis_function_derivatives_2d: i, j, p =', i, j, p - endif - - do n = 1, nNodesPerElement_2d - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'n, x, y:', n, xNode(n), yNode(n) - write(iulog,*) 'dphi_dxr_2d, dphi_dyr_2d:', dphi_dxr_2d(n), dphi_dyr_2d(n) - endif - Jac(1,1) = Jac(1,1) + dphi_dxr_2d(n) * xNode(n) - Jac(1,2) = Jac(1,2) + dphi_dxr_2d(n) * yNode(n) - Jac(2,1) = Jac(2,1) + dphi_dyr_2d(n) * xNode(n) - Jac(2,2) = Jac(2,2) + dphi_dyr_2d(n) * yNode(n) - enddo - - !------------------------------------------------------------------ - ! Compute the determinant and inverse of J - !------------------------------------------------------------------ - - detJ = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1) - - if (abs(detJ) > 0.d0) then - Jinv(1,1) = Jac(2,2)/detJ - Jinv(1,2) = -Jac(1,2)/detJ - Jinv(2,1) = -Jac(2,1)/detJ - Jinv(2,2) = Jac(1,1)/detJ - else - write(iulog,*) 'stopping, det J = 0' - write(iulog,*) 'i, j, p:', i, j, p - write(iulog,*) 'Jacobian matrix:' - write(iulog,*) Jac(1,:) - write(iulog,*) Jac(2,:) - call write_log('Jacobian matrix is singular', GM_FATAL) - endif - - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'Jacobian calc, p =', p - write(iulog,*) 'det J =', detJ - write(iulog,*) ' ' - write(iulog,*) 'Jacobian matrix:' - write(iulog,*) Jac(1,:) - write(iulog,*) Jac(2,:) - write(iulog,*) ' ' - write(iulog,*) 'Inverse matrix:' - write(iulog,*) Jinv(1,:) - write(iulog,*) Jinv(2,:) - write(iulog,*) ' ' - prod = matmul(Jac, Jinv) - write(iulog,*) 'Jac*Jinv:' - write(iulog,*) prod(1,:) - write(iulog,*) prod(2,:) - endif - - ! Optional bug check - Verify that J * Jinv = I - - if (Jac_bug_check) then - prod = matmul(Jac,Jinv) - do col = 1, 2 - do row = 1, 2 - if (abs(prod(row,col) - identity3(row,col)) > 1.d-12) then - write(iulog,*) 'stopping, Jac * Jinv /= identity' - write(iulog,*) 'i, j, p:', i, j, p + do col = 1, 3 + do row = 1, 3 + if (abs(prod(row,col) - identity3(row,col)) > eps10) then + write(iulog,*) '3d Jacobian, stopping, Jac * Jinv /= identity' + write(iulog,*) 'rank, i, j, k, p:', this_rank, i, j, k, p write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) + write(iulog,*) prod(3,:) call write_log('Jacobian matrix was not correctly inverted', GM_FATAL) endif enddo enddo - endif + endif ! Jac_bug_check !------------------------------------------------------------------ ! Compute the contribution of this quadrature point to dphi/dx and dphi/dy ! for each basis function. ! ! | dphi_n/dx | | dphi_n/dxr | - ! | | = Jinv * | | - ! | dphi_n/dy | | dphi_n/dyr | + ! | | | | + ! | dphi_n/dy | = Jinv * | dphi_n/dyr | + ! | | | | + ! | dphi_n/dz | | dphi_n/dzr | ! 
!------------------------------------------------------------------ - dphi_dx_2d(:) = 0.d0 - dphi_dy_2d(:) = 0.d0 + dphi_dx_3d(:) = 0.d0 + dphi_dy_3d(:) = 0.d0 + dphi_dz_3d(:) = 0.d0 - do n = 1, nNodesPerElement_2d - dphi_dx_2d(n) = dphi_dx_2d(n) + Jinv(1,1)*dphi_dxr_2d(n) & - + Jinv(1,2)*dphi_dyr_2d(n) - dphi_dy_2d(n) = dphi_dy_2d(n) + Jinv(2,1)*dphi_dxr_2d(n) & - + Jinv(2,2)*dphi_dyr_2d(n) + do n = 1, nNodesPerElement_3d + dphi_dx_3d(n) = Jinv(1,1)*dphi_dxr_3d(n) & + + Jinv(1,2)*dphi_dyr_3d(n) & + + Jinv(1,3)*dphi_dzr_3d(n) + dphi_dy_3d(n) = Jinv(2,1)*dphi_dxr_3d(n) & + + Jinv(2,2)*dphi_dyr_3d(n) & + + Jinv(2,3)*dphi_dzr_3d(n) + dphi_dz_3d(n) = Jinv(3,1)*dphi_dxr_3d(n) & + + Jinv(3,2)*dphi_dyr_3d(n) & + + Jinv(3,3)*dphi_dzr_3d(n) enddo - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'dphi_dx_2d:', dphi_dx_2d(:) - write(iulog,*) 'dphi_dy_2d:', dphi_dy_2d(:) - endif - if (Jac_bug_check) then ! Check that the sum of dphi_dx, etc. 
is close to zero - if (abs( sum(dphi_dx_2d)/maxval(dphi_dx_2d) ) > 1.d-11) then + + if (abs( sum(dphi_dx_3d)/maxval(dphi_dx_3d) ) > 1.d-11) then write(iulog,*) 'stopping, sum over basis functions of dphi_dx > 0' - write(iulog,*) 'dphi_dx_2d =', dphi_dx_2d(:) - write(iulog,*) 'i, j, p =', i, j, p + write(iulog,*) 'dphi_dx_3d =', dphi_dx_3d(:) + write(iulog,*) 'sum =', sum(dphi_dx_3d) + write(iulog,*) 'i, j, k, p =', i, j, k, p call write_log('Sum over basis functions of dphi_dx /= 0', GM_FATAL) endif - if (abs( sum(dphi_dy_2d)/maxval(dphi_dy_2d) ) > 1.d-11) then + if (abs( sum(dphi_dy_3d)/maxval(dphi_dy_3d) ) > 1.d-11) then write(iulog,*) 'stopping, sum over basis functions of dphi_dy > 0' - write(iulog,*) 'dphi_dy =', dphi_dy_2d(:) - write(iulog,*) 'i, j, p =', i, j, p + write(iulog,*) 'dphi_dy_3d =', dphi_dy_3d(:) + write(iulog,*) 'sum =', sum(dphi_dy_3d) + write(iulog,*) 'i, j, k, p =', i, j, k, p call write_log('Sum over basis functions of dphi_dy /= 0', GM_FATAL) endif - endif + if (abs( sum(dphi_dz_3d)/maxval(dphi_dz_3d) ) > 1.d-11) then + write(iulog,*) 'stopping, sum over basis functions of dphi_dz > 0' + write(iulog,*) 'dphi_dz_3d =', dphi_dz_3d(:) + write(iulog,*) 'sum =', sum(dphi_dz_3d) + write(iulog,*) 'i, j, k, p =', i, j, k, p + call write_log('Sum over basis functions of dphi_dz /= 0', GM_FATAL) + endif - end subroutine get_basis_function_derivatives_2d + endif ! Jac_bug_check + + end subroutine get_basis_function_derivatives_3d !**************************************************************************** @@ -6934,7 +7068,7 @@ subroutine compute_internal_stress (nx, ny, & integer :: i, j, k, n, p integer :: iNode, jNode, kNode - + ! initialize stresses tau_xz (:,:,:) = 0.d0 tau_yz (:,:,:) = 0.d0 @@ -7543,7 +7677,7 @@ subroutine compute_effective_viscosity_diva(whichefvs, efvs, & itest, jtest, rtest, & i, j, p ) - + ! Compute the effective viscosity at each layer of an ice column corresponding ! 
to a particular quadrature point, based on the depth-integrated formulation. ! See Goldberg(2011) for details. @@ -7823,7 +7957,7 @@ subroutine compute_element_matrix(whichapprox, nNodesPerElement, & !---------------------------------------------------------------- integer, intent(in) :: & - whichapprox ! which Stokes approximation to use (BP, SIA, SSA) + whichapprox ! which Stokes approximation to use (SIA, SSA, DIVA, L1L2, BP) integer, intent(in) :: nNodesPerElement ! number of nodes per element @@ -7915,7 +8049,7 @@ subroutine compute_element_matrix(whichapprox, nNodesPerElement, & (2.d0*dphi_dy(nr)*dphi_dx(nc) + dphi_dx(nr)*dphi_dy(nc)) Kvv(nr,nc) = Kvv(nr,nc) + efvs_factor * & - ( 4.d0*dphi_dy(nr)*dphi_dy(nc) + dphi_dx(nr)*dphi_dx(nc) & + ( 4.d0*dphi_dy(nr)*dphi_dy(nc) + dphi_dx(nr)*dphi_dx(nc) & + dphi_dz(nr)*dphi_dz(nc) ) enddo ! nr (rows) @@ -7927,146 +8061,148 @@ end subroutine compute_element_matrix !**************************************************************************** - subroutine element_to_global_matrix_3d(nx, ny, nz, & - iElement, jElement, kElement, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & + subroutine element_to_global_matrix_2d(nx, ny, & + iElement, jElement, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & Avu, Avv) - + ! Sum terms of element matrix K into dense assembled matrix A ! K is partitioned into Kuu, Kuv, Kvu, and Kvv, and similarly for A. integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions integer, intent(in) :: & - iElement, jElement, kElement ! i, j and k indices for this element + iElement, jElement ! i and j indices for this element integer, intent(in) :: & itest, jtest, rtest ! 
coordinates of diagnostic point - real(dp), dimension(nNodesPerElement_3d,nNodesPerElement_3d), intent(in) :: & + real(dp), dimension(nNodesPerElement_2d,nNodesPerElement_2d), intent(in) :: & Kuu, Kuv, Kvu, Kvv ! element matrix - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & Auu, Auv, Avu, Avv ! assembled matrix - integer :: i, j, k, m - integer :: iA, jA, kA + integer :: i, j, m + integer :: iA, jA integer :: n, nr, nc - if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest .and. kElement==ktest) then - write(iulog,*) 'Element i, j, k:', iElement, jElement, kElement + if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest) then + write(iulog,*) 'Element i, j:', iElement, jElement write(iulog,*) 'Rows of Kuu:' - do n = 1, nNodesPerElement_3d + do n = 1, nNodesPerElement_2d write(iulog, '(8e12.4)') Kuu(n,:) enddo endif - !WHL - On a Mac I tried switching the loops to put nc on the outside, but - ! the one with nr on the outside is faster. - do nr = 1, nNodesPerElement_3d ! rows of K + do nr = 1, nNodesPerElement_2d ! rows of K - ! Determine row of A to be incremented by finding (k,i,j) for this node - ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). + ! Determine row of A to be incremented by finding (i,j) for this node + ! The reason for the '3' is that node 3, in the NE corner of this gridcell, has index (i,j). ! Indices for other nodes are computed relative to this node. - i = iElement + ishift(7,nr) - j = jElement + jshift(7,nr) - k = kElement + kshift(7,nr) + i = iElement + ishift(3,nr) + j = jElement + jshift(3,nr) - do nc = 1, nNodesPerElement_3d ! columns of K + do nc = 1, nNodesPerElement_2d ! columns of K ! Determine column of A to be incremented - kA = kshift(nr,nc) ! k index of A into which K(m,n) is summed - iA = ishift(nr,nc) ! 
similarly for i and j indices + iA = ishift(nr,nc) ! similarly for i and j indices jA = jshift(nr,nc) ! these indices can take values -1, 0 and 1 - m = indxA_3d(iA,jA,kA) + m = indxA_2d(iA,jA) ! Increment A - Auu(m,k,i,j) = Auu(m,k,i,j) + Kuu(nr,nc) - Auv(m,k,i,j) = Auv(m,k,i,j) + Kuv(nr,nc) - Avu(m,k,i,j) = Avu(m,k,i,j) + Kvu(nr,nc) - Avv(m,k,i,j) = Avv(m,k,i,j) + Kvv(nr,nc) + Auu(i,j,m) = Auu(i,j,m) + Kuu(nr,nc) + Auv(i,j,m) = Auv(i,j,m) + Kuv(nr,nc) + Avu(i,j,m) = Avu(i,j,m) + Kvu(nr,nc) + Avv(i,j,m) = Avv(i,j,m) + Kvv(nr,nc) - enddo ! nc + if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest) then + write(iulog,*) 'Increment Auu, element i, j, nr, nc =', iElement, jElement, nr, nc + write(iulog,*) ' i, j, m, Kuu, new Auu:', i, j, m, Kuu(nr,nc), Auu(i,j,m) +!! write(iulog,*) 'Increment Avv, element i, j, nr, nc =', iElement, jElement, nr, nc +!! write(iulog,*) ' i, j, m, Kvv, new Avv:', i, j, m, Kvv(nr,nc), Avv(i,j,m) + endif + enddo ! nc enddo ! nr - end subroutine element_to_global_matrix_3d - + end subroutine element_to_global_matrix_2d + !**************************************************************************** - subroutine element_to_global_matrix_2d(nx, ny, & - iElement, jElement, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & + subroutine element_to_global_matrix_3d(nx, ny, nz, & + iElement, jElement, kElement, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & Avu, Avv) - + ! Sum terms of element matrix K into dense assembled matrix A ! K is partitioned into Kuu, Kuv, Kvu, and Kvv, and similarly for A. integer, intent(in) :: & - nx, ny ! horizontal grid dimensions + nx, ny, & ! horizontal grid dimensions + nz ! number of vertical levels where velocity is computed integer, intent(in) :: & - iElement, jElement ! i and j indices for this element + iElement, jElement, kElement ! i, j and k indices for this element integer, intent(in) :: & itest, jtest, rtest ! 
coordinates of diagnostic point - real(dp), dimension(nNodesPerElement_2d,nNodesPerElement_2d), intent(in) :: & + real(dp), dimension(nNodesPerElement_3d,nNodesPerElement_3d), intent(in) :: & Kuu, Kuv, Kvu, Kvv ! element matrix - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) :: & Auu, Auv, Avu, Avv ! assembled matrix - integer :: i, j, m - integer :: iA, jA + integer :: i, j, k, m + integer :: iA, jA, kA integer :: n, nr, nc - if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest) then - write(iulog,*) 'Element i, j:', iElement, jElement + if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest .and. kElement==ktest) then + write(iulog,*) 'Element i, j, k:', iElement, jElement, kElement write(iulog,*) 'Rows of Kuu:' - do n = 1, nNodesPerElement_2d + do n = 1, nNodesPerElement_3d write(iulog, '(8e12.4)') Kuu(n,:) enddo endif - do nr = 1, nNodesPerElement_2d ! rows of K + !WHL - On a Mac I tried switching the loops to put nc on the outside, but + ! the one with nr on the outside is faster. + do nr = 1, nNodesPerElement_3d ! rows of K - ! Determine row of A to be incremented by finding (i,j) for this node - ! The reason for the '3' is that node 3, in the NE corner of this gridcell, has index (i,j). + ! Determine row of A to be incremented by finding (k,i,j) for this node + ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). ! Indices for other nodes are computed relative to this node. - i = iElement + ishift(3,nr) - j = jElement + jshift(3,nr) + i = iElement + ishift(7,nr) + j = jElement + jshift(7,nr) + k = kElement + kshift(7,nr) - do nc = 1, nNodesPerElement_2d ! columns of K + do nc = 1, nNodesPerElement_3d ! columns of K ! Determine column of A to be incremented + kA = kshift(nr,nc) ! k index of A into which K(m,n) is summed iA = ishift(nr,nc) ! 
similarly for i and j indices jA = jshift(nr,nc) ! these indices can take values -1, 0 and 1 - m = indxA_2d(iA,jA) + m = indxA_3d(iA,jA,kA) ! Increment A - Auu(i,j,m) = Auu(i,j,m) + Kuu(nr,nc) - Auv(i,j,m) = Auv(i,j,m) + Kuv(nr,nc) - Avu(i,j,m) = Avu(i,j,m) + Kvu(nr,nc) - Avv(i,j,m) = Avv(i,j,m) + Kvv(nr,nc) - - if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest) then - write(iulog,*) 'Increment Auu, element i, j, nr, nc =', iElement, jElement, nr, nc - write(iulog,*) ' i, j, m, Kuu, new Auu:', i, j, m, Kuu(nr,nc), Auu(i,j,m) - endif + Auu(m,k,i,j) = Auu(m,k,i,j) + Kuu(nr,nc) + Auv(m,k,i,j) = Auv(m,k,i,j) + Kuv(nr,nc) + Avu(m,k,i,j) = Avu(m,k,i,j) + Kvu(nr,nc) + Avv(m,k,i,j) = Avv(m,k,i,j) + Kvv(nr,nc) enddo ! nc + enddo ! nr - end subroutine element_to_global_matrix_2d + end subroutine element_to_global_matrix_3d !**************************************************************************** !WHL, May 2025: @@ -8078,7 +8214,7 @@ end subroutine element_to_global_matrix_2d ! Set diva_slope_correction = F to reproduce older results. !TODO - Call this subroutine for both 2D and 3D solvers. - ! First need to switch the index order for 3D matrices. + ! First would need to switch the index order for 3D matrices. subroutine basal_sliding_bc_2d(nx, ny, & nNeighbors, nhalo, & parallel, & @@ -8091,7 +8227,6 @@ subroutine basal_sliding_bc_2d(nx, ny, & whichassemble_beta, & Auu, Avv) - !------------------------------------------------------------------------ ! Increment the Auu and Avv matrices with basal traction terms. ! Do a surface integral over all basal faces that contain at least one node with a stress BC. @@ -8164,6 +8299,7 @@ subroutine basal_sliding_bc_2d(nx, ny, & dphi_dx_2d, dphi_dy_2d, dphi_dz_2d ! derivatives of basis functions, evaluated at quad pts real(dp) :: & + increment, & ! incremental change in matrix element beta_qp, & ! beta evaluated at quadrature point detJ ! determinant of Jacobian for the transformation ! 
between the reference element and true element @@ -8171,24 +8307,23 @@ subroutine basal_sliding_bc_2d(nx, ny, & real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) :: & Kuu, Kvv ! components of element matrix associated with basal sliding - if (verbose_basal) then - call point_diag(beta, 'beta', itest, jtest, rtest, 7, 7, '(f10.0)') - endif - if (whichassemble_beta == HO_ASSEMBLE_BETA_LOCAL) then - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(0,0,0) - else ! 2D problem - m = indxA_2d(0,0) - endif - + m = indxA_2d(0,0) ! Sum over active vertices + !WHL, 12/13/25: The following minor change makes the results independent of processor count. + ! Without the commented code, the 1-core result differs from the 4-core result + ! for some cells near the 4-core processor boundary (i = 211 in a GrIS run). + ! The difference is in the last of 64 bits. + ! I don't know why the original results differ or why the change fixes it. do j = 1, ny-1 do i = 1, nx-1 if (active_vertex(i,j)) then - Auu(i,j,m) = Auu(i,j,m) + dx*dy/vol0 * beta(i,j) - Avv(i,j,m) = Avv(i,j,m) + dx*dy/vol0 * beta(i,j) +! Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) +! Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) + increment = (dx*dy/vol0) * beta(i,j) + Auu(i,j,m) = Auu(i,j,m) + increment + Avv(i,j,m) = Avv(i,j,m) + increment endif ! active_vertex enddo ! i enddo ! j @@ -8329,20 +8464,6 @@ subroutine basal_sliding_bc_2d(nx, ny, & endif ! whichassemble_beta - if (verbose_basal .and. this_rank==rtest) then - i = itest - j = jtest - if (nNeighbors == nNodeNeighbors_3d) then ! 
3D problem - m = indxA_3d(0,0,0) - else - m = indxA_2d(0,0) - endif - write(iulog,*) ' ' - write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m - write(iulog,*) 'New Auu diagonal:', Auu(i,j,m) - write(iulog,*) 'New Avv diagonal:', Avv(i,j,m) - endif - end subroutine basal_sliding_bc_2d !**************************************************************************** @@ -8455,11 +8576,7 @@ subroutine basal_sliding_bc_2d_diva(& if (whichassemble_beta == HO_ASSEMBLE_BETA_LOCAL) then - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(0,0,0) - else ! 2D problem - m = indxA_2d(0,0) - endif + m = indxA_2d(0,0) ! Average the lower ice surface elevation to vertices call glissade_stagger(& @@ -8481,16 +8598,16 @@ subroutine basal_sliding_bc_2d_diva(& call parallel_halo(theta_basal_slope_y, parallel) if (verbose_basal) then - call point_diag(theta_basal_slope_x*180.d0/pi, 'theta_basal_slope_x (deg)', itest, jtest, rtest, 7, 7, '(f10.0)') - call point_diag(theta_basal_slope_y*180.d0/pi, 'theta_basal_slope_y (deg)', itest, jtest, rtest, 7, 7, '(f10.0)') + call point_diag(theta_basal_slope_x*180.d0/pi, 'theta_basal_slope_x (deg)', itest, jtest, rtest, 7, 7) + call point_diag(theta_basal_slope_y*180.d0/pi, 'theta_basal_slope_y (deg)', itest, jtest, rtest, 7, 7) endif ! Sum over active vertices do j = 1, ny-1 do i = 1, nx-1 if (active_vertex(i,j)) then - Auu(i,j,m) = Auu(i,j,m) + dx*dy/vol0 * beta_eff_x(i,j) / cos(theta_basal_slope_x(i,j)) - Avv(i,j,m) = Avv(i,j,m) + dx*dy/vol0 * beta_eff_y(i,j) / cos(theta_basal_slope_y(i,j)) + Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta_eff_x(i,j) / cos(theta_basal_slope_x(i,j)) + Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta_eff_y(i,j) / cos(theta_basal_slope_y(i,j)) endif ! active_vertex enddo ! i enddo ! j @@ -8627,20 +8744,6 @@ subroutine basal_sliding_bc_2d_diva(& endif ! whichassemble_beta - if (verbose_basal .and. this_rank==rtest) then - i = itest - j = jtest - if (nNeighbors == nNodeNeighbors_3d) then ! 
3D problem - m = indxA_3d(0,0,0) - else - m = indxA_2d(0,0) - endif - write(iulog,*) ' ' - write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m - write(iulog,*) 'New Auu diagonal:', Auu(i,j,m) - write(iulog,*) 'New Avv diagonal:', Avv(i,j,m) - endif - end subroutine basal_sliding_bc_2d_diva !**************************************************************************** @@ -8962,8 +9065,8 @@ end subroutine basal_sliding_bc_3d !**************************************************************************** - subroutine dirichlet_boundary_conditions_3d(nx, ny, & - nz, nhalo, & + subroutine dirichlet_boundary_conditions_2d(nx, ny, & + nhalo, & active_vertex, & umask_dirichlet, vmask_dirichlet, & uvel, vvel, & @@ -8984,183 +9087,158 @@ subroutine dirichlet_boundary_conditions_3d(nx, ny, & integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions - nz, & ! number of vertical levels where velocity is computed nhalo ! number of halo layers logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! true for active vertices (vertices of active cells) - integer, dimension(nz,nx-1,ny-1), intent(in) :: & - umask_dirichlet, &! Dirichlet mask for u velocity (if true, u is prescribed) - vmask_dirichlet ! Dirichlet mask for v velocity (if true, v is prescribed) + integer, dimension(nx-1,ny-1), intent(in) :: & + umask_dirichlet, &! Dirichlet mask for velocity (if true, u is prescribed) + vmask_dirichlet ! Dirichlet mask for velocity (if true, v is prescribed) - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & uvel, vvel ! velocity components - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts Avu, Avv - real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1), intent(inout) :: & bu, bv ! 
assembled load vector, divided into 2 parts !---------------------------------------------------------------- ! Local variables !---------------------------------------------------------------- - integer :: i, j, k ! Cartesian indices of nodes - integer :: iA, jA, kA ! i, j, and k offsets of neighboring nodes - integer :: m + integer :: i, j ! Cartesian indices of nodes + integer :: iA, jA ! i and j offsets of neighboring nodes + integer :: m, mm ! Loop over all vertices that border locally owned vertices. - ! For outflow BC, OK to skip vertices outside the global domain (i < nhalo or j < nhalo). + ! Locally owned vertices are (staggered_ilo:staggered_ihi, staggered_jlo_staggered_jhi). + ! OK to skip vertices outside the global domain (i < nhalo or j < nhalo). ! Note: Need nhalo >= 2 so as not to step out of bounds. - do j = nhalo, ny-nhalo+1 - do i = nhalo, nx-nhalo+1 - if (active_vertex(i,j)) then - do k = 1, nz + do jA = -1,1 + do iA = -1,1 + m = indxA_2d(iA,jA) + mm = indxA_2d(-iA,-jA) - if (umask_dirichlet(k,i,j) == 1) then + do j = nhalo, ny-nhalo+1 + do i = nhalo, nx-nhalo+1 + if (active_vertex(i,j)) then + + if (umask_dirichlet(i,j) == 1) then ! set the rhs to the prescribed velocity - bu(k,i,j) = uvel(k,i,j) + bu(i,j) = uvel(i,j) - ! loop through matrix values in the rows associated with this node + ! loop through matrix values in the rows associated with this vertex ! (Auu contains one row, Avu contains a second row) - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 - - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then - if (iA==0 .and. jA==0 .and. kA==0) then ! main diagonal + if ( (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then - ! Set Auu = 1 on the main diagonal - ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix - ! 
Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0)
-                            m = indxA_3d(0,0,0)
-                            Auu(m,k,i,j) = 1.d0
-                            Auv(m,k,i,j) = 0.d0
-                            Avu(m,k,i,j) = 0.d0
+                      if (iA==0 .and. jA==0) then   ! main diagonal

-                            !TODO - Set bu above, outside iA/jA loop
-                            ! Set the rhs to the prescribed velocity, forcing u = prescribed uvel for this vertex
-!!                            bu(k,i,j) = uvel(k,i,j)
-
-                         else   ! not on the diagonal
+                         ! Set Auu = 1 on the main diagonal
+                         ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix
+                         ! Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0)
+                         Auu(i,j,indxA_2d(0,0)) = 1.d0
+                         Auv(i,j,indxA_2d(0,0)) = 0.d0
+                         Avu(i,j,indxA_2d(0,0)) = 0.d0

-                            ! Zero out non-diagonal matrix terms in the rows associated with this node
-                            m = indxA_3d(iA,jA,kA)
-                            Auu(m, k, i, j) = 0.d0
-                            Auv(m, k, i, j) = 0.d0
+                      else   ! not on the diagonal

-                            ! Shift terms associated with this velocity to the rhs.
-                            ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix.
-                            m = indxA_3d(-iA,-jA,-kA)
+                         ! Zero out non-diagonal matrix terms in the row associated with this vertex
+                         Auu(i,j,m) = 0.d0
+                         Auv(i,j,m) = 0.d0

-                            if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then
-                               ! Move (Auu term) * uvel to rhs
-                               bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auu(m, k+kA, i+iA, j+jA) * uvel(k,i,j)
-                               Auu(m, k+kA, i+iA, j+jA) = 0.d0
-                            endif
+                         ! Shift terms associated with this velocity to the rhs.
+                         ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix.
+                         ! Recall mm = indxA_2d(-iA,-jA)

-                            if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then
-                               ! Move (Avu term) * uvel to rhs
-                               bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avu(m, k+kA, i+iA, j+jA) * uvel(k,i,j)
-                               Avu(m, k+kA, i+iA, j+jA) = 0.d0
-                            endif
+                         if (umask_dirichlet(i+iA, j+jA) /= 1) then
+                            ! Move (Auu term) * uvel to rhs
+                            bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auu(i+iA, j+jA, mm) * uvel(i,j)
+                            Auu(i+iA, j+jA, mm) = 0.d0
+                         endif

-                         endif    ! 
on the diagonal + if (vmask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Avu term) * uvel to rhs + bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avu(i+iA, j+jA, mm) * uvel(i,j) + Avu(i+iA, j+jA, mm) = 0.d0 + endif - endif ! i+iA, j+jA, and k+kA in bounds + endif ! on the diagonal - enddo ! kA - enddo ! iA - enddo ! jA + endif ! i+iA and j+jA in bounds - endif ! umask_dirichlet + endif ! umask_dirichlet - if (vmask_dirichlet(k,i,j) == 1) then + if (vmask_dirichlet(i,j) == 1) then ! set the rhs to the prescribed velocity - bv(k,i,j) = vvel(k,i,j) - - ! loop through matrix values in the rows associated with this node - ! (Auu contains one row, Avu contains a second row) - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 + bv(i,j) = vvel(i,j) - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then + ! loop through matrix values in the rows associated with this vertex + ! (Auv contains one row, Avv contains a second row) - if (iA==0 .and. jA==0 .and. kA==0) then ! main diagonal + if ( (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then - ! Set Avv = 1 on the main diagonal - ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix - ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) - m = indxA_3d(0,0,0) + if (iA==0 .and. jA==0) then ! main diagonal - Auv(m,k,i,j) = 0.d0 - Avu(m,k,i,j) = 0.d0 - Avv(m,k,i,j) = 1.d0 + ! Set Avv = 1 on the main diagonal + ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix + ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) + Auv(i,j,indxA_2d(0,0)) = 0.d0 + Avu(i,j,indxA_2d(0,0)) = 0.d0 + Avv(i,j,indxA_2d(0,0)) = 1.d0 - !TODO - Set bv above, outside iA/jA loop - ! Set the rhs to the prescribed velocity, forcing v = prescribed vvel for this node -!! bv(k,i,j) = vvel(k,i,j) - - else ! not on the diagonal + else ! not on the diagonal - ! 
Zero out non-diagonal matrix terms in the rows associated with this node - m = indxA_3d(iA,jA,kA) - Avu(m, k, i, j) = 0.d0 - Avv(m, k, i, j) = 0.d0 + ! Zero out non-diagonal matrix terms in the rows associated with this vertex + Avu(i,j,m) = 0.d0 + Avv(i,j,m) = 0.d0 - ! Shift terms associated with this velocity to the rhs. - ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. - m = indxA_3d(-iA,-jA,-kA) + ! Shift terms associated with this velocity to the rhs. + ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. + ! Recall mm = indxA_2d(-iA,-jA) - if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then - ! Move (Auv term) * vvel to rhs - bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) - Auv(m, k+kA, i+iA, j+jA) = 0.d0 - endif + if (umask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Auv term) * vvel to rhs + bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auv(i+iA, j+jA, mm) * vvel(i,j) + Auv(i+iA, j+jA, mm) = 0.d0 + endif - if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then - ! Move (Avv term) * vvel to rhs - bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) - Avv(m, k+kA, i+iA, j+jA) = 0.d0 - endif + if (vmask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Avv term) * vvel to rhs + bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avv(i+iA, j+jA, mm) * vvel(i,j) + Avv(i+iA, j+jA, mm) = 0.d0 + endif - endif ! on the diagonal + endif ! on the diagonal - endif ! i+iA, j+jA, and k+kA in bounds + endif ! i+iA and j+jA in bounds - enddo ! kA - enddo ! iA - enddo ! jA + endif ! vmask_dirichlet - endif ! vmask_dirichlet + endif ! active_vertex + enddo ! i + enddo ! j - enddo ! k - endif ! active_vertex - enddo ! i - enddo ! j + enddo ! iA + enddo ! 
jA - end subroutine dirichlet_boundary_conditions_3d + end subroutine dirichlet_boundary_conditions_2d !**************************************************************************** - subroutine dirichlet_boundary_conditions_2d(nx, ny, & - nhalo, & + subroutine dirichlet_boundary_conditions_3d(nx, ny, & + nz, nhalo, & active_vertex, & umask_dirichlet, vmask_dirichlet, & uvel, vvel, & @@ -9181,157 +9259,182 @@ subroutine dirichlet_boundary_conditions_2d(nx, ny, & integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions + nz, & ! number of vertical levels where velocity is computed nhalo ! number of halo layers logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! true for active vertices (vertices of active cells) - integer, dimension(nx-1,ny-1), intent(in) :: & - umask_dirichlet, &! Dirichlet mask for velocity (if true, u is prescribed) - vmask_dirichlet ! Dirichlet mask for velocity (if true, v is prescribed) + integer, dimension(nz,nx-1,ny-1), intent(in) :: & + umask_dirichlet, &! Dirichlet mask for u velocity (if true, u is prescribed) + vmask_dirichlet ! Dirichlet mask for v velocity (if true, v is prescribed) - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & uvel, vvel ! velocity components - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts Avu, Avv - real(dp), dimension(nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & bu, bv ! assembled load vector, divided into 2 parts !---------------------------------------------------------------- ! Local variables !---------------------------------------------------------------- - integer :: i, j ! Cartesian indices of nodes - integer :: iA, jA ! i and j offsets of neighboring nodes - integer :: m, mm + integer :: i, j, k ! 
Cartesian indices of nodes + integer :: iA, jA, kA ! i, j, and k offsets of neighboring nodes + integer :: m ! Loop over all vertices that border locally owned vertices. - ! Locally owned vertices are (staggered_ilo:staggered_ihi, staggered_jlo_staggered_jhi). - ! OK to skip vertices outside the global domain (i < nhalo or j < nhalo). + ! For outflow BC, OK to skip vertices outside the global domain (i < nhalo or j < nhalo). ! Note: Need nhalo >= 2 so as not to step out of bounds. - do jA = -1,1 - do iA = -1,1 - m = indxA_2d(iA,jA) - mm = indxA_2d(-iA,-jA) - - do j = nhalo, ny-nhalo+1 - do i = nhalo, nx-nhalo+1 - if (active_vertex(i,j)) then + do j = nhalo, ny-nhalo+1 + do i = nhalo, nx-nhalo+1 + if (active_vertex(i,j)) then + do k = 1, nz - if (umask_dirichlet(i,j) == 1) then + if (umask_dirichlet(k,i,j) == 1) then ! set the rhs to the prescribed velocity - bu(i,j) = uvel(i,j) + bu(k,i,j) = uvel(k,i,j) - ! loop through matrix values in the rows associated with this vertex + ! loop through matrix values in the rows associated with this node ! (Auu contains one row, Avu contains a second row) + do kA = -1,1 + do jA = -1,1 + do iA = -1,1 + + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then - if ( (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then + if (iA==0 .and. jA==0 .and. kA==0) then ! main diagonal - if (iA==0 .and. jA==0) then ! main diagonal + ! Set Auu = 1 on the main diagonal + ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix + ! Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0) + m = indxA_3d(0,0,0) + Auu(m,k,i,j) = 1.d0 + Auv(m,k,i,j) = 0.d0 + Avu(m,k,i,j) = 0.d0 - ! Set Auu = 1 on the main diagonal - ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix - ! 
Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0) - Auu(i,j,indxA_2d(0,0)) = 1.d0 - Auv(i,j,indxA_2d(0,0)) = 1.d0 - Avu(i,j,indxA_2d(0,0)) = 1.d0 + !TODO - Set bu above, outside iA/jA loop + ! Set the rhs to the prescribed velocity, forcing u = prescribed uvel for this vertex +!! bu(k,i,j) = uvel(k,i,j) + + else ! not on the diagonal - else ! not on the diagonal + ! Zero out non-diagonal matrix terms in the rows associated with this node + m = indxA_3d(iA,jA,kA) + Auu(m, k, i, j) = 0.d0 + Auv(m, k, i, j) = 0.d0 - ! Zero out non-diagonal matrix terms in the row associated with this vertex - Auu(i,j,m) = 0.d0 - Auv(i,j,m) = 0.d0 + ! Shift terms associated with this velocity to the rhs. + ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. + m = indxA_3d(-iA,-jA,-kA) - ! Shift terms associated with this velocity to the rhs. - ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. - ! Recall mm = indxA_2d(-iA,-jA) + if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then + ! Move (Auu term) * uvel to rhs + bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auu(m, k+kA, i+iA, j+jA) * uvel(k,i,j) + Auu(m, k+kA, i+iA, j+jA) = 0.d0 + endif - if (umask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Auu term) * uvel to rhs - bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auu(i+iA, j+jA, mm) * uvel(i,j) - Auu(i+iA, j+jA, mm) = 0.d0 - endif + if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then + ! Move (Avu term) * uvel to rhs + bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avu(m, k+kA, i+iA, j+jA) * uvel(k,i,j) + Avu(m, k+kA, i+iA, j+jA) = 0.d0 + endif - if (vmask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Avu term) * uvel to rhs - bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avu(i+iA, j+jA, mm) * uvel(i,j) - Avu(i+iA, j+jA, mm) = 0.d0 - endif + endif ! on the diagonal - endif ! on the diagonal + endif ! i+iA, j+jA, and k+kA in bounds - endif ! i+iA and j+jA in bounds + enddo ! kA + enddo ! iA + enddo ! 
jA - endif ! umask_dirichlet + endif ! umask_dirichlet - if (vmask_dirichlet(i,j) == 1) then + if (vmask_dirichlet(k,i,j) == 1) then ! set the rhs to the prescribed velocity - bv(i,j) = vvel(i,j) + bv(k,i,j) = vvel(k,i,j) - ! loop through matrix values in the rows associated with this vertex - ! (Auv contains one row, Avv contains a second row) + ! loop through matrix values in the rows associated with this node + ! (Auu contains one row, Avu contains a second row) + do kA = -1,1 + do jA = -1,1 + do iA = -1,1 - if ( (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then - if (iA==0 .and. jA==0) then ! main diagonal + if (iA==0 .and. jA==0 .and. kA==0) then ! main diagonal - ! Set Avv = 1 on the main diagonal - ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix - ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) - Auv(i,j,indxA_2d(0,0)) = 0.d0 - Avu(i,j,indxA_2d(0,0)) = 0.d0 - Avv(i,j,indxA_2d(0,0)) = 1.d0 + ! Set Avv = 1 on the main diagonal + ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix + ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) + m = indxA_3d(0,0,0) - else ! not on the diagonal + Auv(m,k,i,j) = 0.d0 + Avu(m,k,i,j) = 0.d0 + Avv(m,k,i,j) = 1.d0 - ! Zero out non-diagonal matrix terms in the rows associated with this vertex - Avu(i,j,m) = 0.d0 - Avv(i,j,m) = 0.d0 + !TODO - Set bv above, outside iA/jA loop + ! Set the rhs to the prescribed velocity, forcing v = prescribed vvel for this node +!! bv(k,i,j) = vvel(k,i,j) + + else ! not on the diagonal - ! Shift terms associated with this velocity to the rhs. - ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. - ! Recall mm = indxA_2d(-iA,-jA) + ! 
Zero out non-diagonal matrix terms in the rows associated with this node + m = indxA_3d(iA,jA,kA) + Avu(m, k, i, j) = 0.d0 + Avv(m, k, i, j) = 0.d0 - if (umask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Auv term) * vvel to rhs - bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auv(i+iA, j+jA, mm) * vvel(i,j) - Auv(i+iA, j+jA, mm) = 0.d0 - endif + ! Shift terms associated with this velocity to the rhs. + ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. + m = indxA_3d(-iA,-jA,-kA) - if (vmask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Avv term) * vvel to rhs - bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avv(i+iA, j+jA, mm) * vvel(i,j) - Avv(i+iA, j+jA, mm) = 0.d0 - endif + if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then + ! Move (Auv term) * vvel to rhs + bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) + Auv(m, k+kA, i+iA, j+jA) = 0.d0 + endif - endif ! on the diagonal + if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then + ! Move (Avv term) * vvel to rhs + bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) + Avv(m, k+kA, i+iA, j+jA) = 0.d0 + endif - endif ! i+iA and j+jA in bounds + endif ! on the diagonal - endif ! vmask_dirichlet + endif ! i+iA, j+jA, and k+kA in bounds - endif ! active_vertex - enddo ! i - enddo ! j + enddo ! kA + enddo ! iA + enddo ! jA - enddo ! iA - enddo ! jA + endif ! vmask_dirichlet - end subroutine dirichlet_boundary_conditions_2d + enddo ! k + endif ! active_vertex + enddo ! i + enddo ! j + + end subroutine dirichlet_boundary_conditions_3d !**************************************************************************** - subroutine compute_residual_vector_3d(nx, ny, nz, & + subroutine compute_residual_vector_2d(nx, ny, & parallel, & itest, jtest, rtest, & active_vertex, & @@ -9345,9 +9448,12 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & ! Compute the residual vector Ax - b and its L2 norm. ! 
This subroutine assumes that the matrix is stored in structured (x/y/z) format. + !---------------------------------------------------------------- + ! Input/output arguments + !---------------------------------------------------------------- + integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions (for scalars) type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -9358,25 +9464,25 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 1st dimension = 3 (node and its nearest neighbors in x, y and z direction) - ! other dimensions = (z,x,y) indices + ! 3rd dimension = 9 (node and its nearest neighbors in x and y directions) + ! 1st and 2nd dimensions = (x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & uvel, vvel ! u and v components of velocity (m/yr) - real(dp), dimension(nz,nx-1,ny-1), intent(out) :: & + real(dp), dimension(nx-1,ny-1), intent(out) :: & resid_u, & ! residual vector, divided into 2 parts - resid_v ! + resid_v real(dp), intent(out) :: & L2_norm ! L2 norm of residual vector, |Ax - b| @@ -9384,12 +9490,17 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & real(dp), intent(out), optional :: & L2_norm_relative ! 
L2 norm of residual vector relative to rhs, |Ax - b| / |b| - real(dp), dimension(nz,nx-1,ny-1) :: & - resid_sq ! resid_u^2 + resid_v^2 + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- + + real(dp), dimension(nx-1,ny-1) :: & + worku, workv, & ! work arrays for global sums + resid_sq ! resid_u^2 + resid_v^2 real(dp) :: my_max_resid, global_max_resid - integer :: i, j, k, iA, jA, kA, m, iglobal, jglobal + integer :: i, j, iA, jA, m, iglobal, jglobal real(dp) :: L2_norm_rhs ! L2 norm of rhs vector, |b| @@ -9404,75 +9515,50 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & ! Compute u and v components of A*x - resid_u(:,:,:) = 0.d0 - resid_v(:,:,:) = 0.d0 + resid_u(:,:) = 0.d0 + resid_v(:,:) = 0.d0 - !TODO - Replace the following by a call to matvec_multiply_structured_3d ! Loop over locally owned vertices - - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - - if (active_vertex(i,j)) then - - do k = 1, nz - - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 - - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then - - m = indxA_3d(iA,jA,kA) - - resid_u(k,i,j) = resid_u(k,i,j) & - + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & - + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - - resid_v(k,i,j) = resid_v(k,i,j) & - + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & - + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - - endif ! in bounds - - enddo ! kA - enddo ! iA - enddo ! jA - - enddo ! k - - endif ! active_vertex - - enddo ! i - enddo ! j + do jA = -1,1 + do iA = -1,1 + m = indxA_2d(iA,jA) + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + if ( (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. 
j+jA <= ny-1) ) then + resid_u(i,j) = resid_u(i,j) & + + Auu(i,j,m)*uvel(i+iA,j+jA) & + + Auv(i,j,m)*vvel(i+iA,j+jA) + resid_v(i,j) = resid_v(i,j) & + + Avu(i,j,m)*uvel(i+iA,j+jA) & + + Avv(i,j,m)*vvel(i+iA,j+jA) + endif ! in bounds + endif ! active_vertex + enddo ! i + enddo ! j + enddo ! iA + enddo ! jA ! Subtract b to get A*x - b - ! Sum up squared L2 norm as we go - - L2_norm = 0.d0 - resid_sq(:,:,:) = 0.0d0 + worku(:,:) = 0.0d0 + workv(:,:) = 0.0d0 ! Loop over locally owned vertices - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - do k = 1, nz - resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j) - resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j) - resid_sq(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) + resid_v(k,i,j)*resid_v(k,i,j) - L2_norm = L2_norm + resid_sq(k,i,j) - enddo ! k - endif ! active vertex - enddo ! i - enddo ! j + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + resid_u(i,j) = resid_u(i,j) - bu(i,j) + resid_v(i,j) = resid_v(i,j) - bv(i,j) + worku(i,j) = resid_u(i,j)*resid_u(i,j) + workv(i,j) = resid_v(i,j)*resid_v(i,j) + endif ! active vertex + enddo ! i + enddo ! j ! Take global sum, then take square root - L2_norm = parallel_reduce_sum(L2_norm) + L2_norm = parallel_global_sum_stagger(worku, parallel, workv) L2_norm = sqrt(L2_norm) if (verbose_residual) then @@ -9480,28 +9566,28 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & if (this_rank==rtest) then i = itest j = jtest - k = ktest - write(iulog,*) 'In compute_residual_vector_3d: task, i, j, k =', this_rank, i, j, k - write(iulog, '(a16, 2f13.7, 2e13.5)') & - ' u, v, ru, rv: ', uvel(k,i,j), vvel(k,i,j), resid_u(k,i,j), resid_v(k,i,j) + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' ' + write(iulog,*) 'In compute_residual_vector_2d: test ig, jg =', i, j + write(iulog, '(a15, 2f12.5, 2e13.5)') & + ' u, v, ru, rv:', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j) endif ! 
Compute max value of (squared) residual on this task. ! If this task owns the vertex with the global max residual, then print a diagnostic message. + resid_sq(:,:) = worku(:,:) + workv(:,:) my_max_resid = maxval(resid_sq) global_max_resid = parallel_reduce_max(my_max_resid) if (abs((my_max_resid - global_max_resid)/global_max_resid) < 1.0d-6) then do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - do k = 1, nz - if (abs((resid_sq(k,i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog, '(a24, 2i6, i4, 2e13.5, e16.8)') 'ig, jg, k, ru, rv, rmax:', & - iglobal, jglobal, k, resid_u(k,i,j), resid_v(k,i,j), sqrt(global_max_resid) - write(iulog,*) ' ' - endif - enddo + if (abs((resid_sq(i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, global rmax:', & + iglobal, jglobal, resid_u(i,j), resid_v(i,j), sqrt(global_max_resid) + write(iulog,*) ' ' + endif enddo enddo endif @@ -9510,35 +9596,36 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & if (present(L2_norm_relative)) then ! compute L2_norm relative to rhs - L2_norm_rhs = 0.d0 + worku(:,:) = 0.0d0 + workv(:,:) = 0.0d0 + ! Loop over locally owned vertices do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - do k = 1, nz - L2_norm_rhs = L2_norm_rhs + bu(k,i,j)*bu(k,i,j) + bv(k,i,j)*bv(k,i,j) - enddo ! k - endif ! active vertex - enddo ! i - enddo ! j + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + worku(i,j) = bu(i,j)*bu(i,j) + workv(i,j) = bv(i,j)*bv(i,j) + endif ! active vertex + enddo ! i + enddo ! j ! 
Take global sum, then take square root - L2_norm_rhs = parallel_reduce_sum(L2_norm_rhs) + L2_norm_rhs = parallel_global_sum_stagger(worku, parallel, workv) L2_norm_rhs = sqrt(L2_norm_rhs) - if (L2_norm_rhs > 0.d0) then + if (L2_norm_rhs > 0.0d0) then L2_norm_relative = L2_norm / L2_norm_rhs else - L2_norm_relative = 0.d0 + L2_norm_relative = 0.0d0 endif endif - end subroutine compute_residual_vector_3d + end subroutine compute_residual_vector_2d !**************************************************************************** - subroutine compute_residual_vector_2d(nx, ny, & + subroutine compute_residual_vector_3d(nx, ny, nz, & parallel, & itest, jtest, rtest, & active_vertex, & @@ -9552,8 +9639,13 @@ subroutine compute_residual_vector_2d(nx, ny, & ! Compute the residual vector Ax - b and its L2 norm. ! This subroutine assumes that the matrix is stored in structured (x/y/z) format. + !---------------------------------------------------------------- + ! Input/output variables + !---------------------------------------------------------------- + integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) + nx, ny, & ! horizontal grid dimensions (for scalars) + nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -9564,25 +9656,25 @@ subroutine compute_residual_vector_2d(nx, ny, & logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 3rd dimension = 9 (node and its nearest neighbors in x and y directions) - ! 1st and 2nd dimensions = (x,y) indices + ! 1st dimension = 3 (node and its nearest neighbors in x, y and z direction) + ! other dimensions = (z,x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! 
| - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & uvel, vvel ! u and v components of velocity (m/yr) - real(dp), dimension(nx-1,ny-1), intent(out) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(out) :: & resid_u, & ! residual vector, divided into 2 parts - resid_v + resid_v ! real(dp), intent(out) :: & L2_norm ! L2 norm of residual vector, |Ax - b| @@ -9590,12 +9682,17 @@ subroutine compute_residual_vector_2d(nx, ny, & real(dp), intent(out), optional :: & L2_norm_relative ! L2 norm of residual vector relative to rhs, |Ax - b| / |b| - real(dp), dimension(nx-1,ny-1) :: & + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- + + real(dp), dimension(nz,nx-1,ny-1) :: & + worku, workv, & ! work arrays for global sums resid_sq ! resid_u^2 + resid_v^2 real(dp) :: my_max_resid, global_max_resid - integer :: i, j, iA, jA, m, iglobal, jglobal + integer :: i, j, k, iA, jA, kA, m, iglobal, jglobal real(dp) :: L2_norm_rhs ! L2 norm of rhs vector, |b| @@ -9610,54 +9707,62 @@ subroutine compute_residual_vector_2d(nx, ny, & ! Compute u and v components of A*x - resid_u(:,:) = 0.d0 - resid_v(:,:) = 0.d0 + resid_u(:,:,:) = 0.d0 + resid_v(:,:,:) = 0.d0 ! Loop over locally owned vertices - do jA = -1,1 - do iA = -1,1 - m = indxA_2d(iA,jA) - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - if ( (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then - resid_u(i,j) = resid_u(i,j) & - + Auu(i,j,m)*uvel(i+iA,j+jA) & - + Auv(i,j,m)*vvel(i+iA,j+jA) - resid_v(i,j) = resid_v(i,j) & - + Avu(i,j,m)*uvel(i+iA,j+jA) & - + Avv(i,j,m)*vvel(i+iA,j+jA) - endif ! in bounds - endif ! 
active_vertex - enddo ! i - enddo ! j - enddo ! iA - enddo ! jA + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + do kA = -1,1 + do jA = -1,1 + do iA = -1,1 + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then + + m = indxA_3d(iA,jA,kA) + + resid_u(k,i,j) = resid_u(k,i,j) & + + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & + + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) + + resid_v(k,i,j) = resid_v(k,i,j) & + + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & + + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) + endif ! in bounds + enddo ! kA + enddo ! iA + enddo ! jA + enddo ! k + endif ! active_vertex + enddo ! i + enddo ! j ! Subtract b to get A*x - b - ! Sum up squared L2 norm as we go - L2_norm = 0.d0 - resid_sq(:,:) = 0.0d0 + worku(:,:,:) = 0.0d0 + workv(:,:,:) = 0.0d0 ! Loop over locally owned vertices - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - resid_u(i,j) = resid_u(i,j) - bu(i,j) - resid_v(i,j) = resid_v(i,j) - bv(i,j) - resid_sq(i,j) = resid_u(i,j)*resid_u(i,j) + resid_v(i,j)*resid_v(i,j) - L2_norm = L2_norm + resid_sq(i,j) - endif ! active vertex - enddo ! i + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j) + resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j) + worku(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) + workv(k,i,j) = resid_v(k,i,j)*resid_v(k,i,j) + enddo ! k + endif ! active vertex + enddo ! i enddo ! j ! 
Take global sum, then take square root - - L2_norm = parallel_reduce_sum(L2_norm) + L2_norm = parallel_global_sum_stagger(worku, parallel, workv) L2_norm = sqrt(L2_norm) if (verbose_residual) then @@ -9665,9 +9770,10 @@ subroutine compute_residual_vector_2d(nx, ny, & if (this_rank==rtest) then i = itest j = jtest - write(iulog,*) 'In compute_residual_vector_2d: task, i, j =', this_rank, i, j + k = ktest + write(iulog,*) 'In compute_residual_vector_3d: task, i, j, k =', this_rank, i, j, k write(iulog, '(a16, 2f13.7, 2e13.5)') & - ' u, v, ru, rv: ', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j) + ' u, v, ru, rv: ', uvel(k,i,j), vvel(k,i,j), resid_u(k,i,j), resid_v(k,i,j) endif ! Compute max value of (squared) residual on this task. @@ -9678,12 +9784,14 @@ subroutine compute_residual_vector_2d(nx, ny, & if (abs((my_max_resid - global_max_resid)/global_max_resid) < 1.0d-6) then do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs((resid_sq(i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, rmax:', & - iglobal, jglobal, resid_u(i,j), resid_v(i,j), sqrt(global_max_resid) - write(iulog,*) ' ' - endif + do k = 1, nz + if (abs((resid_sq(k,i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog, '(a24, 2i6, i4, 2e13.5, e16.8)') 'ig, jg, k, ru, rv, rmax:', & + iglobal, jglobal, k, resid_u(k,i,j), resid_v(k,i,j), sqrt(global_max_resid) + write(iulog,*) ' ' + endif + enddo enddo enddo endif @@ -9692,48 +9800,60 @@ subroutine compute_residual_vector_2d(nx, ny, & if (present(L2_norm_relative)) then ! 
compute L2_norm relative to rhs - L2_norm_rhs = 0.d0 + worku(:,:,:) = 0.0d0 + workv(:,:,:) = 0.0d0 do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - L2_norm_rhs = L2_norm_rhs + bu(i,j)*bu(i,j) + bv(i,j)*bv(i,j) - endif ! active vertex - enddo ! i + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + worku(k,i,j) = bu(k,i,j)*bu(k,i,j) + workv(k,i,j) = bv(k,i,j)*bv(k,i,j) + enddo ! k + endif ! active vertex + enddo ! i enddo ! j ! Take global sum, then take square root - L2_norm_rhs = parallel_reduce_sum(L2_norm_rhs) + L2_norm_rhs = parallel_global_sum_stagger(worku, parallel, workv) L2_norm_rhs = sqrt(L2_norm_rhs) - if (L2_norm_rhs > 0.d0) then + if (L2_norm_rhs > 0.0d0) then L2_norm_relative = L2_norm / L2_norm_rhs else - L2_norm_relative = 0.d0 + L2_norm_relative = 0.0d0 endif endif - end subroutine compute_residual_vector_2d + end subroutine compute_residual_vector_3d !**************************************************************************** - subroutine evaluate_accelerated_picard_3d(& + subroutine evaluate_accelerated_picard_2d(& + whichapprox, rtest, & L2_norm, L2_norm_large, & L2_norm_alpha_sav, & alpha_accel, alpha_accel_max, & gamma_accel, resid_reduction_threshold, & - uvel, vvel, & - Auu, Auv, & - Avu, Avv, & - uvel_old, vvel_old, & - duvel, dvvel, & - uvel_sav, vvel_sav, & - Auu_sav, Auv_sav, & - Avu_sav, Avv_sav, & + uvel_2d, vvel_2d, & + Auu_2d, Auv_2d, & + Avu_2d, Avv_2d, & + uvel_2d_old, vvel_2d_old, & + duvel_2d, dvvel_2d, & + uvel_2d_sav, vvel_2d_sav, & + Auu_2d_sav, Auv_2d_sav, & + Avu_2d_sav, Avv_2d_sav, & beta_internal, beta_internal_sav, & + beta_eff_x, beta_eff_x_sav, & + beta_eff_y, beta_eff_y_sav, & + omega_k, omega_k_sav, & assembly_is_done) + integer, intent(in) :: & + rtest, & ! rank for diagnostic point + whichapprox ! which Stokes approximation to use (SIA, SSA, DIVA, L1L2, BP) + real(dp), intent(in) :: & L2_norm, & ! 
latest value of L2 norm of residual L2_norm_large, & ! large value for re-initializing the L2 norm @@ -9746,24 +9866,33 @@ subroutine evaluate_accelerated_picard_3d(& L2_norm_alpha_sav ! value of L2 norm of residual, given the previous alpha_accel real(dp), dimension(:,:), intent(inout) :: & + uvel_2d, vvel_2d, & ! latest guess for the velocity solution + uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration + duvel_2d, dvvel_2d, & ! difference between old velocity solution and latest solution + uvel_2d_sav, vvel_2d_sav, & ! best velocity solution so far, based on the residual norm beta_internal, & ! beta_internal as a function of uvel_2d and vvel_2d beta_internal_sav ! beta_internal as a function of uvel_2d_sav and vvel_2d_sav + real(dp), dimension(:,:), intent(inout) :: & + beta_eff_x, beta_eff_x_sav, & + beta_eff_y, beta_eff_y_sav + real(dp), dimension(:,:,:), intent(inout) :: & - uvel, vvel, & ! latest guess for the velocity solution - uvel_old, vvel_old, & ! velocity solution from previous nonlinear iteration - duvel, dvvel, & ! difference between old velocity solution and latest solution - uvel_sav, vvel_sav ! best velocity solution so far, based on the residual norm + omega_k, omega_k_sav - real(dp), dimension(:,:,:,:), intent(inout) :: & - Auu, Auv, & ! latest assembled matrices as a function of uvel_2d and vvel_2d - Avu, Avv, & - Auu_sav, Auv_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav - Avu_sav, Avv_sav + real(dp), dimension(:,:,:), intent(inout) :: & + Auu_2d, Auv_2d, & ! latest assembled matrices as a function of uvel_2d and vvel_2d + Avu_2d, Avv_2d, & + Auu_2d_sav, Auv_2d_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav + Avu_2d_sav, Avv_2d_sav logical, intent(inout) :: & assembly_is_done ! if true, then accept the current assembled matrices and proceed to solution + if (verbose_picard .and. 
this_rank == rtest) then + write(iulog,*) 'Evaluate Picard acceleration: L2_norm_sav, L2_norm:', L2_norm_alpha_sav, L2_norm + endif + if (L2_norm < resid_reduction_threshold*L2_norm_alpha_sav .and. & alpha_accel + gamma_accel <= alpha_accel_max) then @@ -9771,39 +9900,39 @@ subroutine evaluate_accelerated_picard_3d(& ! ("Substantially" is defined by the factor resid_reduction_threshold < 1.) ! Save the latest values of the solver inputs - - uvel_sav = uvel - vvel_sav = vvel - Auu_sav = Auu - Auv_sav = Auv - Avu_sav = Avu - Avv_sav = Avv + uvel_2d_sav = uvel_2d + vvel_2d_sav = vvel_2d + Auu_2d_sav = Auu_2d + Auv_2d_sav = Auv_2d + Avu_2d_sav = Avu_2d + Avv_2d_sav = Avv_2d beta_internal_sav = beta_internal + if (whichapprox == HO_APPROX_DIVA) then + beta_eff_x_sav = beta_eff_x + beta_eff_y_sav = beta_eff_y + omega_k_sav = omega_k + endif ! Increase alpha_accel and see if the residual keeps getting smaller. ! If not, we will back off to the saved values above. alpha_accel = alpha_accel + gamma_accel L2_norm_alpha_sav = L2_norm - if (verbose_picard .and. main_task) then - write(iulog,*) 'Keep going, alpha =', alpha_accel + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Increase alpha to', alpha_accel endif ! Since assembly_is_done = F, we now return to the start of the loop: - ! do while (.not.assembly_is_done) + ! do while (.not.assembly_is_done) elseif (L2_norm < L2_norm_alpha_sav) then ! The residual norm decreased only a little (or we have reached alpha_accel_max). ! Call it good and move on to the solver. - if (verbose_picard .and. main_task) then - write(iulog,*) 'Hold, alpha =', alpha_accel - endif - ! Save this velocity as the starting point for the next nonlinear iteration - uvel_old = uvel - vvel_old = vvel + uvel_2d_old = uvel_2d + vvel_2d_old = vvel_2d ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 @@ -9812,26 +9941,30 @@ subroutine evaluate_accelerated_picard_3d(& ! 
proceed to the matrix solution assembly_is_done = .true. - else + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Hold alpha at', alpha_accel + write(iulog,*) ' Continue to matrix solver' + endif + else ! The residual is larger than the previous value. ! Switch back to the previously saved velocity and matrix with the lower residual. - uvel = uvel_sav - vvel = vvel_sav - Auu = Auu_sav - Auv = Auv_sav - Avu = Avu_sav - Avv = Avv_sav + uvel_2d = uvel_2d_sav + vvel_2d = vvel_2d_sav + Auu_2d = Auu_2d_sav + Auv_2d = Auv_2d_sav + Avu_2d = Avu_2d_sav + Avv_2d = Avv_2d_sav beta_internal = beta_internal_sav + if (whichapprox == HO_APPROX_DIVA) then + beta_eff_x = beta_eff_x_sav + beta_eff_y = beta_eff_y_sav + omega_k = omega_k_sav + endif ! Save this velocity as the starting point for the next nonlinear iteration - uvel_old = uvel - vvel_old = vvel - - if (verbose_picard .and. main_task) then - write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel - write(iulog,*) 'Continue to matrix solver' - endif + uvel_2d_old = uvel_2d + vvel_2d_old = vvel_2d ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 @@ -9840,31 +9973,35 @@ subroutine evaluate_accelerated_picard_3d(& ! proceed to the matrix solution assembly_is_done = .true. - endif ! L2_norm of residual has reduced + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Reduce alpha to', alpha_accel - gamma_accel + write(iulog,*) 'Continue to matrix solver' + endif - end subroutine evaluate_accelerated_picard_3d + endif ! 
L2_norm of residual has reduced + end subroutine evaluate_accelerated_picard_2d + !**************************************************************************** - subroutine evaluate_accelerated_picard_2d(& - nx, ny, & + subroutine evaluate_accelerated_picard_3d(& + rtest, & L2_norm, L2_norm_large, & L2_norm_alpha_sav, & alpha_accel, alpha_accel_max, & gamma_accel, resid_reduction_threshold, & - uvel_2d, vvel_2d, & - Auu_2d, Auv_2d, & - Avu_2d, Avv_2d, & - uvel_2d_old, vvel_2d_old, & - duvel_2d, dvvel_2d, & - uvel_2d_sav, vvel_2d_sav, & - Auu_2d_sav, Auv_2d_sav, & - Avu_2d_sav, Avv_2d_sav, & + uvel, vvel, & + Auu, Auv, & + Avu, Avv, & + uvel_old, vvel_old, & + duvel, dvvel, & + uvel_sav, vvel_sav, & + Auu_sav, Auv_sav, & + Avu_sav, Avv_sav, & beta_internal, beta_internal_sav, & assembly_is_done) - integer, intent(in) :: & - nx, ny ! number of grid cells in each direction + integer, intent(in) :: rtest ! rank for diagnostic point real(dp), intent(in) :: & L2_norm, & ! latest value of L2 norm of residual @@ -9877,19 +10014,21 @@ subroutine evaluate_accelerated_picard_2d(& alpha_accel, & ! factor for extending the vector (duvel, dvvel) to reduce the residual L2_norm_alpha_sav ! value of L2 norm of residual, given the previous alpha_accel - real(dp), dimension(nx-1,ny-1), intent(inout) :: & - uvel_2d, vvel_2d, & ! latest guess for the velocity solution - uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration - duvel_2d, dvvel_2d, & ! difference between old velocity solution and latest solution - uvel_2d_sav, vvel_2d_sav, & ! best velocity solution so far, based on the residual norm + real(dp), dimension(:,:), intent(inout) :: & beta_internal, & ! beta_internal as a function of uvel_2d and vvel_2d beta_internal_sav ! beta_internal as a function of uvel_2d_sav and vvel_2d_sav - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & - Auu_2d, Auv_2d, & ! 
latest assembled matrices as a function of uvel_2d and vvel_2d - Avu_2d, Avv_2d, & - Auu_2d_sav, Auv_2d_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav - Avu_2d_sav, Avv_2d_sav + real(dp), dimension(:,:,:), intent(inout) :: & + uvel, vvel, & ! latest guess for the velocity solution + uvel_old, vvel_old, & ! velocity solution from previous nonlinear iteration + duvel, dvvel, & ! difference between old velocity solution and latest solution + uvel_sav, vvel_sav ! best velocity solution so far, based on the residual norm + + real(dp), dimension(:,:,:,:), intent(inout) :: & + Auu, Auv, & ! latest assembled matrices as a function of uvel_2d and vvel_2d + Avu, Avv, & + Auu_sav, Auv_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav + Avu_sav, Avv_sav logical, intent(inout) :: & assembly_is_done ! if true, then accept the current assembled matrices and proceed to solution @@ -9902,12 +10041,12 @@ subroutine evaluate_accelerated_picard_2d(& ! Save the latest values of the solver inputs - uvel_2d_sav = uvel_2d - vvel_2d_sav = vvel_2d - Auu_2d_sav = Auu_2d - Auv_2d_sav = Auv_2d - Avu_2d_sav = Avu_2d - Avv_2d_sav = Avv_2d + uvel_sav = uvel + vvel_sav = vvel + Auu_sav = Auu + Auv_sav = Auv + Avu_sav = Avu + Avv_sav = Avv beta_internal_sav = beta_internal ! Increase alpha_accel and see if the residual keeps getting smaller. @@ -9915,8 +10054,8 @@ subroutine evaluate_accelerated_picard_2d(& alpha_accel = alpha_accel + gamma_accel L2_norm_alpha_sav = L2_norm - if (verbose_picard .and. main_task) then - write(iulog,*) 'Keep going, alpha =', alpha_accel + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Increase alpha to', alpha_accel endif ! Since assembly_is_done = F, we now return to the start of the loop: @@ -9932,8 +10071,8 @@ subroutine evaluate_accelerated_picard_2d(& endif ! 
Save this velocity as the starting point for the next nonlinear iteration - uvel_2d_old = uvel_2d - vvel_2d_old = vvel_2d + uvel_old = uvel + vvel_old = vvel ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 @@ -9942,26 +10081,26 @@ subroutine evaluate_accelerated_picard_2d(& ! proceed to the matrix solution assembly_is_done = .true. + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Hold alpha at', alpha_accel + write(iulog,*) ' Continue to matrix solver' + endif + else ! The residual is larger than the previous value. ! Switch back to the previously saved velocity and matrix with the lower residual. - uvel_2d = uvel_2d_sav - vvel_2d = vvel_2d_sav - Auu_2d = Auu_2d_sav - Auv_2d = Auv_2d_sav - Avu_2d = Avu_2d_sav - Avv_2d = Avv_2d_sav + uvel = uvel_sav + vvel = vvel_sav + Auu = Auu_sav + Auv = Auv_sav + Avu = Avu_sav + Avv = Avv_sav beta_internal = beta_internal_sav ! Save this velocity as the starting point for the next nonlinear iteration - uvel_2d_old = uvel_2d - vvel_2d_old = vvel_2d - - if (verbose_picard .and. main_task) then - write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel - write(iulog,*) 'Continue to matrix solver' - endif + uvel_old = uvel + vvel_old = vvel ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 @@ -9970,9 +10109,140 @@ subroutine evaluate_accelerated_picard_2d(& ! proceed to the matrix solution assembly_is_done = .true. + if (verbose_picard .and. main_task) then + write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel + write(iulog,*) 'Continue to matrix solver' + endif + endif ! 
L2_norm of residual has reduced - end subroutine evaluate_accelerated_picard_2d + end subroutine evaluate_accelerated_picard_3d + +!**************************************************************************** + + subroutine compute_residual_velocity_2d(whichresid, parallel, & + uvel, vvel, & + usav, vsav, & + resid_velo) + + integer, intent(in) :: & + whichresid ! option for method to use when calculating residual + + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + + real(dp), dimension(:,:), intent(in) :: & + uvel, vvel, & ! current guess for velocity + usav, vsav ! previous guess for velocity + + real(dp), intent(out) :: & + resid_velo ! quantity related to velocity convergence + + integer :: & + imaxdiff, jmaxdiff ! location of maximum speed difference + ! currently computed but not used + + integer :: i, j, count + + real(dp) :: & + speed, & ! current guess for ice speed + oldspeed, & ! previous guess for ice speed + diffspeed ! abs(speed-oldspeed) + + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + + ! Compute a residual quantity based on convergence of the velocity field. + + ! options for residual calculation method, as specified in configuration file + ! case(0): use max of abs( vel_old - vel ) / vel ) + ! case(1): use max of abs( vel_old - vel ) / vel ) but ignore basal vels + ! case(2): use mean of abs( vel_old - vel ) / vel ) + ! case(3): use max of abs( vel_old - vel ) / vel ) (in addition to L2 norm) + + resid_velo = 0.d0 + imaxdiff = 0 + jmaxdiff = 0 + + select case (whichresid) + + case(HO_RESID_MAXU_NO_UBAS) ! max speed difference, excluding the bed + + ! 
Loop over locally owned vertices + + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) + if (speed /= 0.d0) then + oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + if (diffspeed > resid_velo) then + resid_velo = diffspeed + imaxdiff = i + jmaxdiff = j + endif + endif + enddo + enddo + + ! take global max + resid_velo = parallel_reduce_max(resid_velo) + + case(HO_RESID_MEANU) ! mean relative speed difference + + count = 0 + + ! Loop over locally owned vertices + + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) + if (speed /= 0.d0) then + count = count+1 + oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + resid_velo = resid_velo + diffspeed + endif + enddo + enddo + + if (count > 0) resid_velo = resid_velo / count + + !TODO - Need to convert the mean residual to a global value. + ! (Or simply remove this case, which is rarely if ever used) + call not_parallel(__FILE__, __LINE__) + + case default ! max speed difference, including basal speeds + ! (case HO_RESID_MAXU or HO_RESID_L2NORM) + + ! 
Loop over locally owned vertices + + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) + if (speed /= 0.d0) then + oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + if (diffspeed > resid_velo) then + resid_velo = diffspeed + imaxdiff = i + jmaxdiff = j + endif + endif + enddo + enddo + + resid_velo = parallel_reduce_max(resid_velo) + + end select + + end subroutine compute_residual_velocity_2d !**************************************************************************** @@ -10068,7 +10338,7 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & if (speed /= 0.d0) then count = count+1 oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) + diffspeed = abs((oldspeed - speed)/speed) resid_velo = resid_velo + diffspeed endif enddo @@ -10112,129 +10382,66 @@ end subroutine compute_residual_velocity_3d !**************************************************************************** - subroutine compute_residual_velocity_2d(whichresid, parallel, & - uvel, vvel, & - usav, vsav, & - resid_velo) + subroutine count_nonzeros_2d(nx, ny, & + parallel, & + Auu, Auv, & + Avu, Avv, & + active_vertex, & + nNonzeros) + + !---------------------------------------------------------------- + ! Input-output arguments + !---------------------------------------------------------------- integer, intent(in) :: & - whichresid ! option for method to use when calculating residual + nx, ny ! number of grid cells in each direction type(parallel_type), intent(in) :: & parallel ! info for parallel communication - real(dp), dimension(:,:), intent(in) :: & - uvel, vvel, & ! current guess for velocity - usav, vsav ! previous guess for velocity - - real(dp), intent(out) :: & - resid_velo ! quantity related to velocity convergence - - integer :: & - imaxdiff, jmaxdiff ! location of maximum speed difference - ! 
currently computed but not used - - integer :: i, j, count - - real(dp) :: & - speed, & ! current guess for ice speed - oldspeed, & ! previous guess for ice speed - diffspeed ! abs(speed-oldspeed) - - integer :: & - staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid - staggered_jlo, staggered_jhi - - staggered_ilo = parallel%staggered_ilo - staggered_ihi = parallel%staggered_ihi - staggered_jlo = parallel%staggered_jlo - staggered_jhi = parallel%staggered_jhi - - ! Compute a residual quantity based on convergence of the velocity field. - - ! options for residual calculation method, as specified in configuration file - ! case(0): use max of abs( vel_old - vel ) / vel ) - ! case(1): use max of abs( vel_old - vel ) / vel ) but ignore basal vels - ! case(2): use mean of abs( vel_old - vel ) / vel ) - ! case(3): use max of abs( vel_old - vel ) / vel ) (in addition to L2 norm) - - resid_velo = 0.d0 - imaxdiff = 0 - jmaxdiff = 0 - - select case (whichresid) - - case(HO_RESID_MAXU_NO_UBAS) ! max speed difference, excluding the bed - - ! Loop over locally owned vertices - - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) - if (speed /= 0.d0) then - oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - if (diffspeed > resid_velo) then - resid_velo = diffspeed - imaxdiff = i - jmaxdiff = j - endif - endif - enddo - enddo - - ! take global max - resid_velo = parallel_reduce_max(resid_velo) - - case(HO_RESID_MEANU) ! mean relative speed difference + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & + Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts + Avu, Avv - count = 0 + logical, dimension(nx-1,ny-1), intent(in) :: & + active_vertex ! true for vertices of active cells - ! Loop over locally owned vertices + integer, intent(out) :: & + nNonzeros ! 
number of nonzero matrix elements - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) - if (speed /= 0.d0) then - count = count+1 - oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - resid_velo = resid_velo + diffspeed - endif - enddo - enddo + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- - if (count > 0) resid_velo = resid_velo / count + integer :: i, j, m - !TODO - Need to convert the mean residual to a global value. - ! (Or simply remove this case, which is rarely if ever used) - call not_parallel(__FILE__, __LINE__) + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi - case default ! max speed difference, including basal speeds - ! (case HO_RESID_MAXU or HO_RESID_L2NORM) + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi - ! Loop over locally owned vertices + nNonzeros = 0 + do m = 1, nNodeNeighbors_2d do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) - if (speed /= 0.d0) then - oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - if (diffspeed > resid_velo) then - resid_velo = diffspeed - imaxdiff = i - jmaxdiff = j - endif - endif - enddo - enddo + if (active_vertex(i,j)) then + if (Auu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + if (Auv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + if (Avu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + if (Avv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + endif ! active_vertex + enddo ! i + enddo ! j + enddo ! 
m - resid_velo = parallel_reduce_max(resid_velo) - - end select + nNonzeros = parallel_reduce_sum(nNonzeros) - end subroutine compute_residual_velocity_2d + end subroutine count_nonzeros_2d !**************************************************************************** @@ -10308,38 +10515,112 @@ end subroutine count_nonzeros_3d !**************************************************************************** - subroutine count_nonzeros_2d(nx, ny, & - parallel, & - Auu, Auv, & - Avu, Avv, & - active_vertex, & - nNonzeros) + subroutine check_symmetry_element_matrix(nNodesPerElement, & + Kuu, Kuv, Kvu, Kvv) - !---------------------------------------------------------------- - ! Input-output arguments - !---------------------------------------------------------------- + !------------------------------------------------------------------ + ! Check that the element stiffness matrix is symmetric. + ! This is true provided that (1) Kuu = (Kuu)^T + ! (2) Kvv = (Kvv)^T + ! (3) Kuv = (Kvu)^T + ! This subroutine works for either 2D or 3D elements. + ! A symmetry check should not be needed for production runs with a well-tested code, + ! but is included for now to help with debugging. + !------------------------------------------------------------------ + + integer, intent(in) :: nNodesPerElement ! number of nodes per element + + real(dp), dimension(nNodesPerElement, nNodesPerElement), intent(in) :: & + Kuu, Kuv, Kvu, Kvv ! component of element stiffness matrix + ! + ! Kuu | Kuv + ! _____|____ + ! Kvu | Kvv + ! | + + integer :: i, j + + ! make sure Kuu = (Kuu)^T + + do j = 1, nNodesPerElement + do i = j, nNodesPerElement + if (abs(Kuu(i,j) - Kuu(j,i)) > eps11) then + write(iulog,*) 'Kuu is not symmetric' + write(iulog,*) 'i, j, Kuu(i,j), Kuu(j,i):', i, j, Kuu(i,j), Kuu(j,i) + stop + endif + enddo + enddo + + ! 
check that Kvv = (Kvv)^T + + do j = 1, nNodesPerElement + do i = j, nNodesPerElement + if (abs(Kvv(i,j) - Kvv(j,i)) > eps11) then + write(iulog,*) 'Kvv is not symmetric' + write(iulog,*) 'i, j, Kvv(i,j), Kvv(j,i):', i, j, Kvv(i,j), Kvv(j,i) + stop + endif + enddo + enddo + + ! Check that Kuv = (Kvu)^T + + do j = 1, nNodesPerElement + do i = 1, nNodesPerElement + if (abs(Kuv(i,j) - Kvu(j,i)) > eps11) then + write(iulog,*) 'Kuv /= (Kvu)^T' + write(iulog,*) 'i, j, Kuv(i,j), Kvu(j,i):', i, j, Kuv(i,j), Kvu(j,i) + stop + endif + enddo + enddo + + end subroutine check_symmetry_element_matrix + +!**************************************************************************** + + subroutine check_symmetry_assembled_matrix_2d(nx, ny, & + parallel, & + active_vertex, & + Auu, Auv, Avu, Avv) + + !------------------------------------------------------------------ + ! Check that the assembled stiffness matrix is symmetric. + ! This is true provided that (1) Auu = (Auu)^T + ! (2) Avv = (Avv)^T + ! (3) Auv = (Avu)^T + ! The A matrices are assembled in a dense fashion to save storage + ! and preserve the i/j/k structure of the grid. + ! + ! There can be small differences from perfect symmetry due to roundoff error. + ! These differences are fixed provided they are small enough. + !------------------------------------------------------------------ integer, intent(in) :: & - nx, ny ! number of grid cells in each direction + nx, ny ! horizontal grid dimensions type(parallel_type), intent(in) :: & parallel ! info for parallel communication - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & - Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv + logical, dimension(nx-1,ny-1), intent(in) :: & + active_vertex ! T for columns (i,j) where velocity is computed, else F - logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! 
true for vertices of active cells + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + Auu, Auv, Avu, Avv ! components of assembled stiffness matrix + ! + ! Auu | Auv + ! _____|____ + ! | + ! Avu | Avv - integer, intent(out) :: & - nNonzeros ! number of nonzero matrix elements + integer :: i, j, iA, jA, m, mm, iglobal, jglobal - !---------------------------------------------------------------- - ! Local variables - !---------------------------------------------------------------- + real(dp) :: val1, val2 ! values of matrix coefficients - integer :: i, j, m + real(dp) :: maxdiff, global_maxdiff, diag_entry, avg_val + + integer :: rmax, imax, jmax, mmax integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -10350,89 +10631,158 @@ subroutine count_nonzeros_2d(nx, ny, & staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - nNonzeros = 0 + ! Check matrix for symmetry + + ! Here we correct for small differences from symmetry due to roundoff error. + ! The maximum departure from symmetry is set to be a small fraction + ! of the diagonal entry for the row. + ! If the departure from symmetry is larger than this, then the model prints a warning + ! and/or aborts. + + maxdiff = 0.d0 + rmax = 0; imax = 0; jmax = 0; mmax = 0 + + ! Loop over locally owned vertices. + ! Each active vertex is associate with 2*nz matrix rows belonging to this processor. + + do jA = -1, 1 + do iA = -1, 1 + m = indxA_2d( iA, jA) + mm = indxA_2d(-iA,-jA) - do m = 1, nNodeNeighbors_2d do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi if (active_vertex(i,j)) then - if (Auu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - if (Auv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - if (Avu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - if (Avv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - endif ! active_vertex - enddo ! i - enddo ! j - enddo ! 
m - - nNonzeros = parallel_reduce_sum(nNonzeros) - end subroutine count_nonzeros_2d + ! Check Auu and Auv for symmetry + diag_entry = Auu(i,j,indxA_2d(0,0)) -!**************************************************************************** + !WHL - debug + if (diag_entry /= diag_entry) then + write(iulog,*) 'WARNING: Diagonal NaN: i, j =', i, j + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif - subroutine check_symmetry_element_matrix(nNodesPerElement, & - Kuu, Kuv, Kvu, Kvv) + ! Check that Auu = Auu^T + val1 = Auu(i, j, m ) ! value of Auu(row,col) + val2 = Auu(i+iA, j+jA, mm) ! value of Auu(col,row) + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + !WHL - Here and below, I commented out the code to average asymmetric values. + ! The hope is that the asymmetries are too small to matter. + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Auu(i, j, m ) = avg_val +! Auu(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Auu is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Auu(row,col), Auu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif + endif ! val2 /= val1 - !------------------------------------------------------------------ - ! Check that the element stiffness matrix is symmetric. - ! This is true provided that (1) Kuu = (Kuu)^T - ! (2) Kvv = (Kvv)^T - ! (3) Kuv = (Kvu)^T - ! This subroutine works for either 2D or 3D elements. - ! A symmetry check should not be needed for production runs with a well-tested code, - ! 
but is included for now to help with debugging. - !------------------------------------------------------------------ + ! Check that Auv = (Avu)^T + val1 = Auv(i, j, m ) ! value of Auv(row,col) + val2 = Avu(i+iA, j+jA, mm) ! value of Avu(col,row) + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Auv(i, j, m ) = avg_val +! Avu(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Auv is not equal to (Avu)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Auv(row,col), Avu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif + endif ! val2 /= val1 - integer, intent(in) :: nNodesPerElement ! number of nodes per element + ! Now check Avu and Avv + diag_entry = Avv(i,j,indxA_2d(0,0)) - real(dp), dimension(nNodesPerElement, nNodesPerElement), intent(in) :: & - Kuu, Kuv, Kvu, Kvv ! component of element stiffness matrix - ! - ! Kuu | Kuv - ! _____|____ - ! Kvu | Kvv - ! | + ! check that Avv = (Avv)^T + val1 = Avv(i, j, m ) ! value of Avv(row,col) + val2 = Avv(i+iA, j+jA, mm) ! value of Avv(col,row) - integer :: i, j + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Avv(i, j, m ) = avg_val +! 
Avv(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Avv is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Avv(row,col), Avv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif - ! make sure Kuu = (Kuu)^T + endif ! val2 /= val1 - do j = 1, nNodesPerElement - do i = j, nNodesPerElement - if (abs(Kuu(i,j) - Kuu(j,i)) > eps10) then - write(iulog,*) 'Kuu is not symmetric' - write(iulog,*) 'i, j, Kuu(i,j), Kuu(j,i):', i, j, Kuu(i,j), Kuu(j,i) - stop - endif - enddo - enddo + ! Check that Avu = (Auv)^T + val1 = Avu(i, j, m ) ! value of Avu(row,col) + val2 = Auv(i+iA, j+jA, mm) ! value of Auv(col,row) - ! check that Kvv = (Kvv)^T + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Avu(i, j, m ) = avg_val +! Auv(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif - do j = 1, nNodesPerElement - do i = j, nNodesPerElement - if (abs(Kvv(i,j) - Kvv(j,i)) > eps10) then - write(iulog,*) 'Kvv is not symmetric' - write(iulog,*) 'i, j, Kvv(i,j), Kvv(j,i):', i, j, Kvv(i,j), Kvv(j,i) - stop - endif - enddo - enddo + endif ! val2 /= val1 - ! Check that Kuv = (Kvu)^T + endif ! active_vertex + enddo ! i + enddo ! j + enddo ! iA + enddo ! 
jA - do j = 1, nNodesPerElement - do i = 1, nNodesPerElement - if (abs(Kuv(i,j) - Kvu(j,i)) > eps10) then - write(iulog,*) 'Kuv /= (Kvu)^T' - write(iulog,*) 'i, j, Kuv(i,j), Kvu(j,i):', i, j, Kuv(i,j), Kvu(j,i) - stop - endif - enddo - enddo + if (verbose_matrix) then + global_maxdiff = parallel_reduce_max(maxdiff) + if (global_maxdiff > 0.0d0 .and. maxdiff == global_maxdiff) then + ! maxdiff is on this processor; compute and broadcast the global index + call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) + write(iulog,*) 'Max asymmetry =', global_maxdiff + write(iulog,*) ' ig, jg, m =', iglobal, jglobal, mmax + endif + endif - end subroutine check_symmetry_element_matrix + end subroutine check_symmetry_assembled_matrix_2d !**************************************************************************** @@ -10476,12 +10826,14 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & real(dp) :: val1, val2 ! values of matrix coefficients - real(dp) :: maxdiff, diag_entry, avg_val + real(dp) :: maxdiff, global_maxdiff, diag_entry, avg_val integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + integer :: rmax, imax, jmax, kmax, mmax + staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo @@ -10496,6 +10848,7 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & ! and/or aborts. maxdiff = 0.d0 + rmax = 0; imax = 0; jmax = 0; kmax = 0; mmax = 0 ! Loop over locally owned vertices. ! Each active vertex is associate with 2*nz matrix rows belonging to this processor. @@ -10534,12 +10887,15 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif ! 
if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Auu( m, k, i, j ) = avg_val Auu(mm, k+kA,i+iA,j+jA) = avg_val @@ -10561,12 +10917,15 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Auv( m, k, i, j ) = avg_val Avu(mm, k+kA,i+iA,j+jA) = avg_val @@ -10616,12 +10975,15 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Avv( m, k, i, j ) = avg_val Avv(mm, k+kA,i+iA,j+jA) = avg_val @@ -10641,278 +11003,93 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & val1 = Avu( m, k, i, j) ! value of Avu(row,col) val2 = Auv(mm, k+kA, i+iA, j+jA) ! value of Auv(col,row) - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif if (val2 /= val1) then - ! if difference is small, then fix the asymmetry by averaging values - ! 
else print a warning and abort - - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Avu( m, k, i, j ) = avg_val - Auv(mm, k+kA,i+iA,j+jA) = avg_val - else - write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, k, iA, jA, kA =', & - this_rank, i, j, k, iA, jA, kA - write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - - endif ! val2 /= val1 - - endif ! k+kA in bounds - - enddo ! kA - enddo ! iA - enddo ! jA - - enddo ! k - endif ! active_vertex - enddo ! i - enddo ! j - - if (verbose_matrix) maxdiff = parallel_reduce_max(maxdiff) - - if (verbose_matrix .and. main_task) then - write(iulog,*) ' ' - write(iulog,*) 'Max difference from symmetry =', maxdiff - endif - - end subroutine check_symmetry_assembled_matrix_3d - -!**************************************************************************** - - subroutine check_symmetry_assembled_matrix_2d(nx, ny, & - parallel, & - active_vertex, & - Auu, Auv, Avu, Avv) - - !------------------------------------------------------------------ - ! Check that the assembled stiffness matrix is symmetric. - ! This is true provided that (1) Auu = (Auu)^T - ! (2) Avv = (Avv)^T - ! (3) Auv = (Avu)^T - ! The A matrices are assembled in a dense fashion to save storage - ! and preserve the i/j/k structure of the grid. - ! - ! There can be small differences from perfect symmetry due to roundoff error. - ! These differences are fixed provided they are small enough. - !------------------------------------------------------------------ - - integer, intent(in) :: & - nx, ny ! horizontal grid dimensions - - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication - - logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! 
T for columns (i,j) where velocity is computed, else F - - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & - Auu, Auv, Avu, Avv ! components of assembled stiffness matrix - ! - ! Auu | Auv - ! _____|____ - ! | - ! Avu | Avv - - integer :: i, j, iA, jA, m, mm, iglobal, jglobal - - real(dp) :: val1, val2 ! values of matrix coefficients - - real(dp) :: maxdiff, diag_entry, avg_val - - integer :: & - staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid - staggered_jlo, staggered_jhi - - staggered_ilo = parallel%staggered_ilo - staggered_ihi = parallel%staggered_ihi - staggered_jlo = parallel%staggered_jlo - staggered_jhi = parallel%staggered_jhi - - ! Check matrix for symmetry - - ! Here we correct for small differences from symmetry due to roundoff error. - ! The maximum departure from symmetry is set to be a small fraction - ! of the diagonal entry for the row. - ! If the departure from symmetry is larger than this, then the model prints a warning - ! and/or aborts. - - maxdiff = 0.d0 - - ! Loop over locally owned vertices. - ! Each active vertex is associate with 2*nz matrix rows belonging to this processor. - - do jA = -1, 1 - do iA = -1, 1 - m = indxA_2d( iA, jA) - mm = indxA_2d(-iA,-jA) - - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - - ! Check Auu and Auv for symmetry - diag_entry = Auu(i,j,indxA_2d(0,0)) - - !WHL - debug - if (diag_entry /= diag_entry) then - write(iulog,*) 'WARNING: Diagonal NaN: i, j =', i, j - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - - ! Check that Auu = Auu^T - val1 = Auu(i, j, m ) ! value of Auu(row,col) - val2 = Auu(i+iA, j+jA, mm) ! value of Auu(col,row) - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) - ! if difference is small, then fix the asymmetry by averaging values - ! 
else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Auu(i, j, m ) = avg_val - Auu(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Auu is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Auu(row,col), Auu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - endif ! val2 /= val1 - - ! Check that Auv = (Avu)^T - val1 = Auv(i, j, m ) ! value of Auv(row,col) - val2 = Avu(i+iA, j+jA, mm) ! value of Avu(col,row) - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) - ! if difference is small, then fix the asymmetry by averaging values - ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Auv(i, j, m ) = avg_val - Avu(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Auv is not equal to (Avu)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Auv(row,col), Avu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - endif ! val2 /= val1 - - ! Now check Avu and Avv - diag_entry = Avv(i,j,indxA_2d(0,0)) - - ! check that Avv = (Avv)^T - val1 = Avv(i, j, m ) ! value of Avv(row,col) - val2 = Avv(i+iA, j+jA, mm) ! value of Avv(col,row) - - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) - - ! if difference is small, then fix the asymmetry by averaging values - ! 
else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Avv(i, j, m ) = avg_val - Avv(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Avv is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Avv(row,col), Avv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - - endif ! val2 /= val1 - - ! Check that Avu = (Auv)^T - val1 = Avu(i, j, m ) ! value of Avu(row,col) - val2 = Auv(i+iA, j+jA, mm) ! value of Auv(col,row) - - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort - if (val2 /= val1) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then + avg_val = 0.5d0 * (val1 + val2) + Avu( m, k, i, j ) = avg_val + Auv(mm, k+kA,i+iA,j+jA) = avg_val + else + write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, k, iA, jA, kA =', & + this_rank, i, j, k, iA, jA, kA + write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif - ! if difference is small, then fix the asymmetry by averaging values - ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Avu(i, j, m ) = avg_val - Auv(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif + endif ! 
val2 /= val1 - endif ! val2 /= val1 + endif ! k+kA in bounds - endif ! active_vertex - enddo ! i - enddo ! j - enddo ! iA - enddo ! jA + enddo ! kA + enddo ! iA + enddo ! jA - if (verbose_matrix) maxdiff = parallel_reduce_max(maxdiff) + enddo ! k + endif ! active_vertex + enddo ! i + enddo ! j - if (verbose_matrix .and. main_task) then - write(iulog,*) ' ' - write(iulog,*) 'Max difference from symmetry =', maxdiff + if (verbose_matrix) then + global_maxdiff = parallel_reduce_max(maxdiff) + if (global_maxdiff > 0.0d0 .and. maxdiff == global_maxdiff) then + ! maxdiff is on this processor; compute and broadcast the global index + call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) + write(iulog,*) 'Max asymmetry =', global_maxdiff + write(iulog,*) ' i, j, ig, jg, k, m =', imax, jmax, iglobal, jglobal, kmax, mmax + endif endif - end subroutine check_symmetry_assembled_matrix_2d + end subroutine check_symmetry_assembled_matrix_3d !**************************************************************************** - subroutine write_matrix_elements_3d(nx, ny, nz, & - nNodesSolve, nodeID, & - iNodeIndex, jNodeIndex, & - kNodeIndex, & - Auu, Auv, & - Avu, Avv, & - bu, bv) + subroutine write_matrix_elements_2d(nx, ny, & + nVerticesSolve, vertexID, & + iVertexIndex, jVertexIndex, & + Auu, Auv, & + Avu, Avv, & + bu, bv) ! Write matrix elements to text files. ! Note: Does not work when running on more than one task. integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions - nz, & ! number of vertical levels at which velocity is computed - nNodesSolve ! number of nodes where we solve for velocity + nVerticesSolve ! number of vertices where we solve for velocity - integer, dimension(nz,nx-1,ny-1), intent(in) :: & - nodeID ! ID for each node + integer, dimension(nx-1,ny-1), intent(in) :: & + vertexID ! ID for each vertex integer, dimension(:), intent(in) :: & - iNodeIndex, jNodeIndex, kNodeIndex ! i, j and k indices of active nodes + iVertexIndex, jVertexIndex ! 
i and j indices of active vertices - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv ! 1st dimension = node and its nearest neighbors in x, y and z direction - ! other dimensions = (k,i,j) indices + Avu, Avv ! 1st dimension = vertex and its nearest neighbors in x and y direction + ! other dimensions = (i,j) indices - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts ! Local variables integer :: rowA, colA - integer :: i, j, k, m, iA, jA, kA + integer :: i, j, m, iA, jA - real(dp), dimension(nNodesSolve, nNodesSolve) :: & + real(dp), dimension(nVerticesSolve, nVerticesSolve) :: & Auu_val, Auv_val, Avu_val, Avv_val ! dense matrices - real(dp), dimension(nNodesSolve) :: nonzeros + real(dp), dimension(nVerticesSolve) :: nonzeros if (tasks > 1) then call write_log('Error: Cannot write matrix elements to files when tasks > 1', GM_FATAL) @@ -10923,35 +11100,29 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & Avu_val(:,:) = 0.d0 Avv_val(:,:) = 0.d0 - do rowA = 1, nNodesSolve - - i = iNodeIndex(rowA) - j = jNodeIndex(rowA) - k = kNodeIndex(rowA) + do rowA = 1, nVerticesSolve - do kA = -1, 1 + i = iVertexIndex(rowA) + j = jVertexIndex(rowA) do jA = -1, 1 do iA = -1, 1 - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & + if ( (i+iA >= 1 .and. i+iA <= nx-1) & .and. & (j+jA >= 1 .and. j+jA <= ny-1) ) then - colA = nodeID(k+kA, i+iA, j+jA) ! ID for neighboring node - m = indxA_3d(iA,jA,kA) + colA = vertexID(i+iA, j+jA) ! 
ID for neighboring vertex + m = indxA_2d(iA,jA) if (colA > 0) then - Auu_val(rowA, colA) = Auu(m,k,i,j) - Auv_val(rowA, colA) = Auv(m,k,i,j) - Avu_val(rowA, colA) = Avu(m,k,i,j) - Avv_val(rowA, colA) = Avv(m,k,i,j) + Auu_val(rowA, colA) = Auu(i,j,m) + Auv_val(rowA, colA) = Auv(i,j,m) + Avu_val(rowA, colA) = Avu(i,j,m) + Avv_val(rowA, colA) = Avv(i,j,m) endif - endif ! i+iA, j+jA, and k+kA in bounds + endif ! i+iA and j+jA in bounds - enddo ! kA enddo ! iA enddo ! jA @@ -10960,9 +11131,9 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & !WHL - bug check write(iulog,*) ' ' write(iulog,*) 'nonzeros per row:' - do rowA = 1, nNodesSolve + do rowA = 1, nVerticesSolve nonzeros(rowA) = 0 - do colA = 1, nNodesSolve + do colA = 1, nVerticesSolve if (abs(Auu_val(rowA,colA)) > 1.d-11) then nonzeros(rowA) = nonzeros(rowA) + 1 endif @@ -10979,12 +11150,12 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & open(unit=12, file='Avu.'//matrix_label, status='unknown') open(unit=13, file='Avv.'//matrix_label, status='unknown') - do rowA = 1, nNodesSolve + do rowA = 1, nVerticesSolve write(10,'(i6)',advance='no') rowA write(11,'(i6)',advance='no') rowA write(12,'(i6)',advance='no') rowA write(13,'(i6)',advance='no') rowA - do colA = 1, nNodesSolve + do colA = 1, nVerticesSolve write(10,'(e16.8)',advance='no') Auu_val(rowA,colA) write(11,'(e16.8)',advance='no') Auv_val(rowA,colA) write(12,'(e16.8)',advance='no') Avu_val(rowA,colA) @@ -11006,57 +11177,58 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & ! 
write load vectors to file open(unit=14, file='bu.'//matrix_label, status='unknown') open(unit=15, file='bv.'//matrix_label, status='unknown') - do rowA = 1, nNodesSolve - i = iNodeIndex(rowA) - j = jNodeIndex(rowA) - k = kNodeIndex(rowA) - write(14,'(i6, e16.8)') rowA, bu(k,i,j) - write(15,'(i6, e16.8)') rowA, bv(k,i,j) + do rowA = 1, nVerticesSolve + i = iVertexIndex(rowA) + j = jVertexIndex(rowA) + write(14,'(i6, e16.8)') rowA, bu(i,j) + write(15,'(i6, e16.8)') rowA, bv(i,j) enddo close(14) close(15) - end subroutine write_matrix_elements_3d - + end subroutine write_matrix_elements_2d + !**************************************************************************** - subroutine write_matrix_elements_2d(nx, ny, & - nVerticesSolve, vertexID, & - iVertexIndex, jVertexIndex, & - Auu, Auv, & - Avu, Avv, & - bu, bv) + subroutine write_matrix_elements_3d(nx, ny, nz, & + nNodesSolve, nodeID, & + iNodeIndex, jNodeIndex, & + kNodeIndex, & + Auu, Auv, & + Avu, Avv, & + bu, bv) ! Write matrix elements to text files. ! Note: Does not work when running on more than one task. integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions - nVerticesSolve ! number of vertices where we solve for velocity + nz, & ! number of vertical levels at which velocity is computed + nNodesSolve ! number of nodes where we solve for velocity - integer, dimension(nx-1,ny-1), intent(in) :: & - vertexID ! ID for each vertex + integer, dimension(nz,nx-1,ny-1), intent(in) :: & + nodeID ! ID for each node integer, dimension(:), intent(in) :: & - iVertexIndex, jVertexIndex ! i and j indices of active vertices + iNodeIndex, jNodeIndex, kNodeIndex ! i, j and k indices of active nodes - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv ! 1st dimension = vertex and its nearest neighbors in x and y direction - ! 
other dimensions = (i,j) indices + Avu, Avv ! 1st dimension = node and its nearest neighbors in x, y and z direction + ! other dimensions = (k,i,j) indices - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts ! Local variables integer :: rowA, colA - integer :: i, j, m, iA, jA + integer :: i, j, k, m, iA, jA, kA - real(dp), dimension(nVerticesSolve, nVerticesSolve) :: & + real(dp), dimension(nNodesSolve, nNodesSolve) :: & Auu_val, Auv_val, Avu_val, Avv_val ! dense matrices - real(dp), dimension(nVerticesSolve) :: nonzeros + real(dp), dimension(nNodesSolve) :: nonzeros if (tasks > 1) then call write_log('Error: Cannot write matrix elements to files when tasks > 1', GM_FATAL) @@ -11067,29 +11239,35 @@ subroutine write_matrix_elements_2d(nx, ny, & Avu_val(:,:) = 0.d0 Avv_val(:,:) = 0.d0 - do rowA = 1, nVerticesSolve + do rowA = 1, nNodesSolve - i = iVertexIndex(rowA) - j = jVertexIndex(rowA) + i = iNodeIndex(rowA) + j = jNodeIndex(rowA) + k = kNodeIndex(rowA) + + do kA = -1, 1 do jA = -1, 1 do iA = -1, 1 - if ( (i+iA >= 1 .and. i+iA <= nx-1) & + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & .and. & (j+jA >= 1 .and. j+jA <= ny-1) ) then - colA = vertexID(i+iA, j+jA) ! ID for neighboring vertex - m = indxA_2d(iA,jA) + colA = nodeID(k+kA, i+iA, j+jA) ! ID for neighboring node + m = indxA_3d(iA,jA,kA) if (colA > 0) then - Auu_val(rowA, colA) = Auu(i,j,m) - Auv_val(rowA, colA) = Auv(i,j,m) - Avu_val(rowA, colA) = Avu(i,j,m) - Avv_val(rowA, colA) = Avv(i,j,m) + Auu_val(rowA, colA) = Auu(m,k,i,j) + Auv_val(rowA, colA) = Auv(m,k,i,j) + Avu_val(rowA, colA) = Avu(m,k,i,j) + Avv_val(rowA, colA) = Avv(m,k,i,j) endif - endif ! i+iA and j+jA in bounds + endif ! i+iA, j+jA, and k+kA in bounds + enddo ! kA enddo ! iA enddo ! 
jA @@ -11098,9 +11276,9 @@ subroutine write_matrix_elements_2d(nx, ny, & !WHL - bug check write(iulog,*) ' ' write(iulog,*) 'nonzeros per row:' - do rowA = 1, nVerticesSolve + do rowA = 1, nNodesSolve nonzeros(rowA) = 0 - do colA = 1, nVerticesSolve + do colA = 1, nNodesSolve if (abs(Auu_val(rowA,colA)) > 1.d-11) then nonzeros(rowA) = nonzeros(rowA) + 1 endif @@ -11117,12 +11295,12 @@ subroutine write_matrix_elements_2d(nx, ny, & open(unit=12, file='Avu.'//matrix_label, status='unknown') open(unit=13, file='Avv.'//matrix_label, status='unknown') - do rowA = 1, nVerticesSolve + do rowA = 1, nNodesSolve write(10,'(i6)',advance='no') rowA write(11,'(i6)',advance='no') rowA write(12,'(i6)',advance='no') rowA write(13,'(i6)',advance='no') rowA - do colA = 1, nVerticesSolve + do colA = 1, nNodesSolve write(10,'(e16.8)',advance='no') Auu_val(rowA,colA) write(11,'(e16.8)',advance='no') Auv_val(rowA,colA) write(12,'(e16.8)',advance='no') Avu_val(rowA,colA) @@ -11144,100 +11322,18 @@ subroutine write_matrix_elements_2d(nx, ny, & ! write load vectors to file open(unit=14, file='bu.'//matrix_label, status='unknown') open(unit=15, file='bv.'//matrix_label, status='unknown') - do rowA = 1, nVerticesSolve - i = iVertexIndex(rowA) - j = jVertexIndex(rowA) - write(14,'(i6, e16.8)') rowA, bu(i,j) - write(15,'(i6, e16.8)') rowA, bv(i,j) + do rowA = 1, nNodesSolve + i = iNodeIndex(rowA) + j = jNodeIndex(rowA) + k = kNodeIndex(rowA) + write(14,'(i6, e16.8)') rowA, bu(k,i,j) + write(15,'(i6, e16.8)') rowA, bv(k,i,j) enddo close(14) close(15) - end subroutine write_matrix_elements_2d - -!**************************************************************************** - !TODO - Either delete this subroutine, or switch the indices. Not currently used. - subroutine compress_3d_to_2d(nx, ny, nz, & - Auu, Auv, & - Avu, Avv, & - bu, bv, & - Auu_2d, Auv_2d, & - Avu_2d, Avv_2d, & - bu_2d, bv_2d) - - !---------------------------------------------------------------- - ! 
Form the 2D matrix and rhs by combining terms from the 3D matrix and rhs. - ! This combination is based on the assumption of no vertical shear; - ! i.e., uvel and vvel have the same value at each level in a given column. - !---------------------------------------------------------------- - - !---------------------------------------------------------------- - ! Input-output arguments - !---------------------------------------------------------------- - - integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions - nz ! number of vertical levels where velocity is computed - - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & - Auu, Auv, & ! assembled 3D stiffness matrix, divided into 4 parts - Avu, Avv - - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & - bu, bv ! assembled 3D rhs vector, divided into 2 parts - - real(dp), dimension(nNodeNeighbors_2d,nx-1,ny-1), intent(out) :: & - Auu_2d, Auv_2d, &! assembled 2D (SSA) stiffness matrix, divided into 4 parts - Avu_2d, Avv_2d - - real(dp), dimension(nx-1,ny-1), intent(out) :: & - bu_2d, bv_2d ! assembled 2D (SSA) rhs vector, divided into 2 parts - - !---------------------------------------------------------------- - ! Local variables - !---------------------------------------------------------------- - - integer :: i, j, k, iA, jA, kA, m, m2 - - ! Initialize 2D matrix and rhs - - Auu_2d(:,:,:) = 0.d0 - Auv_2d(:,:,:) = 0.d0 - Avu_2d(:,:,:) = 0.d0 - Avv_2d(:,:,:) = 0.d0 - bu_2d(:,:) = 0.d0 - bv_2d(:,:) = 0.d0 - - ! Form 2D matrix and rhs - - do j = 1, ny-1 - do i = 1, nx-1 - do k = 1, nz - - ! matrix - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 - m = indxA_3d(iA,jA,kA) - m2 = indxA_2d(iA,jA) - Auu_2d(m2,i,j) = Auu_2d(m2,i,j) + Auu(m,k,i,j) - Auv_2d(m2,i,j) = Auv_2d(m2,i,j) + Auv(m,k,i,j) - Avu_2d(m2,i,j) = Avu_2d(m2,i,j) + Avu(m,k,i,j) - Avv_2d(m2,i,j) = Avv_2d(m2,i,j) + Avv(m,k,i,j) - enddo ! iA - enddo ! jA - enddo ! kA - - ! 
rhs - bu_2d(i,j) = bu_2d(i,j) + bu(k,i,j) - bv_2d(i,j) = bv_2d(i,j) + bv(k,i,j) - - enddo ! k - enddo ! i - enddo ! j - - end subroutine compress_3d_to_2d - + end subroutine write_matrix_elements_3d + !**************************************************************************** end module glissade_velo_higher diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 23c9ceaf..57d1cebe 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -48,28 +48,25 @@ module glissade_velo_higher_pcg use glimmer_log use profile, only: t_startf, t_stopf use cism_parallel, only: this_rank, main_task, & - parallel_type, staggered_parallel_halo, parallel_reduce_sum, & - parallel_global_sum_staggered + parallel_type, staggered_parallel_halo, parallel_global_sum_stagger implicit none private - public :: pcg_solver_standard_3d, pcg_solver_standard_2d, & - pcg_solver_chrongear_3d, pcg_solver_chrongear_2d, & + public :: pcg_solver_standard_2d, pcg_solver_standard_3d, & + pcg_solver_chrongear_2d, pcg_solver_chrongear_3d, & matvec_multiply_structured_3d logical, parameter :: verbose_pcg = .false. logical, parameter :: verbose_tridiag = .false. -!! logical, parameter :: verbose_pcg = .true. -!! logical, parameter :: verbose_tridiag = .true. contains !**************************************************************************** - subroutine pcg_solver_standard_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + subroutine pcg_solver_standard_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -84,50 +81,25 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! to solve the equation $Ax=b$. ! Convergence is checked every {\em linear_solve_ncheck} steps. ! - ! It is based on the barotropic solver in the POP ocean model - ! (author Phil Jones, LANL). Input and output arrays are located - ! 
on a structured (i,j,k) grid as defined in the glissade_velo_higher - ! module. The global matrix is sparse, but its nonzero elements - ! are stored in four dense matrices called Auu, Avv, Auv, and Avu. - ! Each matrix has 3x3x3 = 27 potential nonzero elements per - ! node (i,j,k). + ! It is similar to subroutine pcg_solver_standard_3d, but modified + ! to solve for x and y at a single horizontal level, as in the + ! shallow-shelf approximation. See the comments in that subroutine + ! (above) for more details on data structure and solver methods. + ! + ! Input and output arrays are located on a structured (i,j) grid + ! as defined in the glissade_velo_higher module. The global matrix + ! is sparse, but its nonzero element are stored in four dense matrices + ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential + ! nonzero elements per node (i,j). ! ! The current preconditioning options are ! (0) no preconditioning ! (1) diagonal preconditioning - ! (2) preconditioning using a physics-based SIA solver - ! - ! For the dome test case with higher-order dynamics, option (2) is best. - ! - ! Here is a schematic of the method implemented below for solving Ax = b: - ! - ! halo_update(x0) - ! r0 = b - A*x0 - ! d0 = 0 - ! eta0 = 1 ! - ! while (not converged) - ! solve Mz = r for z - ! eta1 = (r,z) - ! beta = eta1/eta0 - ! d = z + beta*d - ! halo_update(d) - ! eta0 = eta1 - ! q = Ad - ! eta2 = (d,q) - ! alpha = eta1/eta2 - ! x = x + alpha*d - ! r = r - alpha*q (or occasionally, r = b - Ax) - ! Check for convergence: err = sqrt(r,r)/sqrt(b,b) < tolerance - ! end while - ! - ! where x = solution (initial value = x0) - ! d = conjugate direction vector (initial value = d0) - ! r = residual vector (initial value = r0) - ! M = preconditioning matrix - ! (r,z) = dot product of vectors r and z - ! and similarly for (d,q) - ! + ! The SIA-based preconditioning optional is not available for a 2D solve. + ! + ! 
TODO: Add a tridiagonal preconditioning option to this subroutine, + ! as for subroutine pcg_solver_chrongear_2d. !--------------------------------------------------------------- !--------------------------------------------------------------- @@ -135,65 +107,61 @@ subroutine pcg_solver_standard_3d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) type(parallel_type), intent(in) :: & parallel ! info for parallel communication - integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & - indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 + integer, dimension(-1:1,-1:1), intent(in) :: & + indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! T for columns (i,j) where velocity is computed, else F + active_vertex ! T for vertices (i,j) where velocity is computed, else F - real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1,9), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) - ! other dimensions = (z,x,y) indices + ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) + ! 1st and 2nd dimensions = (x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! 
= 0 for no preconditioning ! = 1 for diagonal preconditioning (best option for SSA-dominated flow) - ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, k ! grid indices - integer :: iA, jA, kA ! grid offsets ranging from -1 to 1 - integer :: m ! matrix element index + integer :: i, j, ii, jj ! grid indices integer :: iter ! iteration counter real(dp) :: & @@ -202,7 +170,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & beta ! eta1/eta0 = term in expression for new direction vector ! vectors (each of these is split into u and v components) - real(dp), dimension(nz,nx-1,ny-1) :: & + real(dp), dimension(nx-1,ny-1) :: & Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector @@ -215,118 +183,262 @@ subroutine pcg_solver_standard_3d(nx, ny, & L2_rhs ! L2 norm of rhs vector b ! 
solver converges when L2_resid/L2_rhs < tolerance - real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & - Muu, Mvv ! simplified SIA matrices for preconditioning + ! tridiagonal matrix elements + real(dp), dimension(:,:), allocatable :: & + Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning + Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning + + real(dp), dimension(:,:), allocatable :: & + omega_u, omega_v, & ! work arrays for tridiagonal solve + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v + + real(dp), dimension(:,:), allocatable :: & + b_u, b_v, x_u, x_v + + real(dp), dimension(:,:), allocatable :: & + gather_data_row, & ! arrays for gathering data from every task on a row or column + gather_data_col + + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + + integer :: & + tasks_row, & ! number of tasks per row and column for tridiagonal solve + tasks_col + + logical :: first_time ! true on the first subroutine call (iter = 1), false thereafter + + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + !TODO - Apply the following for tridiag PCs? +! integer, parameter :: & +! maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, +! ! which generally leads to faster convergence than diagonal preconditioning + + !WHL - debug + integer :: iu_max, ju_max, iv_max, jv_max + real(dp) :: ru_max, rv_max + + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + + tasks_row = parallel%tasks_row + tasks_col = parallel%tasks_col if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (standard)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond endif + ! 
Compute array sizes for locally owned vertices + ilocal = staggered_ihi - staggered_ilo + 1 + jlocal = staggered_jhi - staggered_jlo + 1 + ! Set up matrices for preconditioning - !TODO - Add tridiagonal options call t_startf("pcg_precond_init") - if (precond == HO_PRECOND_DIAG) then - - call setup_preconditioner_diag_3d(nx, ny, & - nz, indxA_3d, & - Auu, Avv, & - Adiagu, Adiagv) + if (precond == HO_PRECOND_NONE) then ! no preconditioner - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - i = itest - j = jtest - write(iulog,*) 'i, j, r =', i, j, this_rank - write(iulog,*) 'Auu diag =', Adiagu(:,i,j) - write(iulog,*) 'Avu diag =', Adiagv(:,i,j) + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Using no preconditioner' endif - !TODO - Create a separate setup for tridiag_local - ! For this setup: Pass in Auu and Avv - ! Return Adiag/subdiag/supdiag for u and v in halo - ! Return omega and denom in halo - ! Then M*z = r can compute z in halo + elseif (precond == HO_PRECOND_DIAG) then - elseif (precond == HO_PRECOND_SIA) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Using diagonal matrix for preconditioning' + endif ! verbose_pcg - call setup_preconditioner_sia_3d(nx, ny, & - nz, indxA_3d, & - Auu, Avv, & - Muu, Mvv) + call setup_preconditioner_diag_2d(nx, ny, & + indxA_2d, & + Auu, Avv, & + Adiagu, Adiagv) - else ! no preconditioner + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Using no preconditioner' + !WHL - debug + if (verbose_tridiag .and. this_rank==rtest) then + i = itest + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'r, i, j =', this_rank, i, j + write(iulog,*) 'Auu =', Auu(i,j,:) + write(iulog,*) 'Avv =', Avv(i,j,:) endif - endif ! 
precond + allocate(Adiag_u (nx-1,ny-1)) + allocate(Asubdiag_u(nx-1,ny-1)) + allocate(Asupdiag_u(nx-1,ny-1)) + allocate(omega_u (nx-1,ny-1)) + allocate(denom_u (nx-1,ny-1)) - call t_stopf("pcg_precond_init") + allocate(Adiag_v (nx-1,ny-1)) + allocate(Asubdiag_v(nx-1,ny-1)) + allocate(Asupdiag_v(nx-1,ny-1)) + allocate(omega_v (nx-1,ny-1)) + allocate(denom_v (nx-1,ny-1)) - ! Compute initial residual and initialize the direction vector d - ! Note: The matrix A must be complete for all rows corresponding to locally - ! owned vertices, and x must have the correct values in - ! halo vertices bordering the locally owned vertices. - ! Then y = Ax will be correct for locally owned vertices. + call setup_preconditioner_tridiag_local_2d(& + nx, ny, & + parallel, indxA_2d, & + itest, jtest, rtest, & + Auu, Avv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v) - ! Halo update for x (initial guess for velocity solution) + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - call t_startf("pcg_halo_init") - call staggered_parallel_halo(xu, parallel) - call staggered_parallel_halo(xv, parallel) - call t_stopf("pcg_halo_init") + ! Allocate tridiagonal matrices + ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. - ! 
Compute A*x (use z as a temp vector for A*x) + allocate(Adiag_u (ilocal,jlocal)) + allocate(Asubdiag_u(ilocal,jlocal)) + allocate(Asupdiag_u(ilocal,jlocal)) + allocate(omega_u(ilocal,jlocal)) + allocate(denom_u(ilocal,jlocal)) + allocate(xuh_u(ilocal,jlocal)) + allocate(xlh_u(ilocal,jlocal)) + allocate(b_u(ilocal,jlocal)) + allocate(x_u(ilocal,jlocal)) - call t_startf("pcg_matmult_init") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & - Auu, Auv, & - Avu, Avv, & - xu, xv, & - zu, zv) - call t_stopf("pcg_matmult_init") + allocate(Adiag_v (jlocal,ilocal)) + allocate(Asubdiag_v(jlocal,ilocal)) + allocate(Asupdiag_v(jlocal,ilocal)) + allocate(omega_v(jlocal,ilocal)) + allocate(denom_v(jlocal,ilocal)) + allocate(xuh_v(jlocal,ilocal)) + allocate(xlh_v(jlocal,ilocal)) + allocate(b_v(jlocal,ilocal)) + allocate(x_v(jlocal,ilocal)) - ! Compute the initial residual r(0) = b - Ax(0) - ! This will be correct for locally owned vertices. + ! These two matrices are for gathering data from all tasks on a given row or column. + allocate(gather_data_row(8*tasks_row,jlocal)) + allocate(gather_data_col(8*tasks_col,ilocal)) + gather_data_row = 0.0d0 + gather_data_col = 0.0d0 - call t_startf("pcg_vecupdate_init") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) - call t_stopf("pcg_vecupdate_init") + ! Compute the entries of the tridiagonal matrices - ! Initialize scalars and vectors + ! Extract tridiagonal matrix entries from Auu + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements + Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements + Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the u solve in each matrix row + call setup_preconditioner_tridiag_global_2d(& + ilocal, jlocal, & +!! 
itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u) + + ! Extract tridiagonal matrix entries from Avv + + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements + Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements + Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the v solve in each matrix column + ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + + call setup_preconditioner_tridiag_global_2d(& + jlocal, ilocal, & +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v) + + endif ! precond + + if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' + + call t_stopf("pcg_precond_init") + + ! Compute initial residual and initialize the direction vector d + ! Note: The matrix A must be complete for all rows corresponding to locally + ! owned vertices, and x must have the correct values in + ! halo vertices bordering the locally owned vertices. + ! Then y = Ax will be correct for locally owned vertices. + + ! Halo update for x (initial guess for velocity solution) + + call t_startf("pcg_halo_init") + call staggered_parallel_halo(xu, parallel) + call staggered_parallel_halo(xv, parallel) + call t_stopf("pcg_halo_init") + + ! 
Compute A*x (use z as a temp vector for A*x) + + call t_startf("pcg_matmult_init") + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & + Auu, Auv, & + Avu, Avv, & + xu, xv, & + zu, zv) + call t_stopf("pcg_matmult_init") + + ! Compute the initial residual r(0) = b - Ax(0) + ! This will be correct for locally owned vertices. + + call t_startf("pcg_vecupdate_init") + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) + call t_stopf("pcg_vecupdate_init") + + ! Initialize scalars and vectors niters = maxiters eta0 = 1.d0 - du(:,:,:) = 0.d0 - dv(:,:,:) = 0.d0 + du(:,:) = 0.d0 + dv(:,:) = 0.d0 - zu(:,:,:) = 0.d0 - zv(:,:,:) = 0.d0 + zu(:,:) = 0.d0 + zv(:,:) = 0.d0 ! Compute the L2 norm of the RHS vectors ! (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - work0u(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) - work0v(:,:,:) = bv(:,:,:)*bv(:,:,:) + work0u(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) + work0v(:,:) = bv(:,:)*bv(:,:) call t_stopf("pcg_dotprod") ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - L2_rhs, & - work0u, work0v) + L2_rhs = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_init") ! take square root @@ -337,44 +449,173 @@ subroutine pcg_solver_standard_3d(nx, ny, & iter_loop: do iter = 1, maxiters + if (verbose_pcg .and. main_task) then +! write(iulog,*) 'iter =', iter + endif + call t_startf("pcg_precond") ! Compute PC(r) = solution z of Mz = r if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:,:) = ru(:,:,:) ! PC(r) = r - zv(:,:,:) = rv(:,:,:) ! PC(r) = r + zu(:,:) = ru(:,:) ! PC(r) = r + zv(:,:) = rv(:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG) then ! 
diagonal preconditioning do j = 1, ny-1 do i = 1, nx-1 - do k = 1, nz - if (Adiagu(k,i,j) /= 0.d0) then - zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A + if (Adiagu(i,j) /= 0.d0) then + zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(k,i,j) = 0.d0 + zu(i,j) = 0.d0 endif - if (Adiagv(k,i,j) /= 0.d0) then - zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + if (Adiagv(i,j) /= 0.d0) then + zv(i,j) = rv(i,j) / Adiagv(i,j) else - zv(k,i,j) = 0.d0 + zv(i,j) = 0.d0 endif - enddo ! k enddo ! i enddo ! j - elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning + elseif(precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Muu, ru, zu) ! solve Muu*zu = ru for zu + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest + write(iulog,*) 'Residual:' + write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) + write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Mvv, rv, zv) ! solve Mvv*zv = rv for zv + if (verbose_pcg .and. main_task) then + write(iulog,*) 'call tridiag_solver_local_2d' + endif + + ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) + !TODO - Test a local solver that can compute zu and zv in the halo + ! (to avoid the halo update below) + + call tridiag_solver_local_2d(& + nx, ny, & + parallel, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + ru, rv, & ! right hand side + zu, zv) ! solution + + !WHL - debug + if (verbose_pcg .and. 
this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + + ! Use a global tridiagonal solver to find an approximate solution of A*z = r + if (iter == 1) then + first_time = .true. + else + first_time = .false. + endif + + ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo + + ! Solve M*z = r, where M is a global tridiagonal matrix + + call tridiag_solver_global_2d(& + ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + first_time, & + gather_data_row) + + ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo + + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo + + call tridiag_solver_global_2d(& + jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column + !! 
itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + first_time, & + gather_data_col) + + ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + endif ! precond call t_stopf("pcg_precond") @@ -382,23 +623,19 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute the dot product eta1 = (r, PC(r)) call t_startf("pcg_dotprod") - work0u(:,:,:) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r, PC(r)) - work0v(:,:,:) = rv(:,:,:)*zv(:,:,:) + work0u(:,:) = ru(:,:)*zu(:,:) ! terms of dot product (r, PC(r)) + work0v(:,:) = rv(:,:)*zv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - eta1, & - work0u, work0v) + eta1 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") !WHL - If the SIA solver has failed due to singular matrices, ! then eta1 will be NaN. if (eta1 /= eta1) then ! eta1 is NaN - call write_log('PCG solver has failed, alpha = NaN', GM_FATAL) + call write_log('PCG solver has failed, eta1 = NaN', GM_FATAL) endif ! Update the conjugate direction vector d @@ -406,13 +643,13 @@ subroutine pcg_solver_standard_3d(nx, ny, & beta = eta1/eta0 call t_startf("pcg_vecupdate") - du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! 
d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - ! Initially eta0 = 1 - ! For n >=2, eta0 = old eta1 + du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:) = zv(:,:) + beta*dv(:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! where beta_(i+1) = -------------------- + ! (r_i, PC(r_i)) + ! Initially eta0 = 1 + ! For n >=2, eta0 = old eta1 call t_stopf("pcg_vecupdate") ! Halo update for d @@ -426,9 +663,9 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! This is the one matvec multiply required for each iteration call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -442,16 +679,12 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute the dot product eta2 = (d, A*d) call t_startf("pcg_dotprod") - work0u(:,:,:) = du(:,:,:) * qu(:,:,:) ! terms of dot product (d, Ad) - work0v(:,:,:) = dv(:,:,:) * qv(:,:,:) + work0u(:,:) = du(:,:) * qu(:,:) ! terms of dot product (d, Ad) + work0v(:,:) = dv(:,:) * qv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - eta2, & - work0u, work0v) + eta2 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") ! Compute alpha @@ -469,11 +702,11 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute the new solution and residual call t_startf("pcg_vecupdate") - xu(:,:,:) = xu(:,:,:) + alpha * du(:,:,:) ! new solution, x_(i+1) = x_i + alpha*d - xv(:,:,:) = xv(:,:,:) + alpha * dv(:,:,:) + xu(:,:) = xu(:,:) + alpha * du(:,:) ! new solution, x_(i+1) = x_i + alpha*d + xv(:,:) = xv(:,:) + alpha * dv(:,:) - ru(:,:,:) = ru(:,:,:) - alpha * qu(:,:,:) ! 
new residual, r_(i+1) = r_i - alpha*(Ad) - rv(:,:,:) = rv(:,:,:) - alpha * qv(:,:,:) + ru(:,:) = ru(:,:) - alpha * qu(:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) + rv(:,:) = rv(:,:) - alpha * qv(:,:) call t_stopf("pcg_vecupdate") ! Check for convergence every linear_solve_ncheck iterations. @@ -483,7 +716,6 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! For convergence check, use r = b - Ax if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then -!! if (mod(iter, linear_solve_ncheck) == 0 .or. iter == linear_solve_ncheck/2) then ! Halo update for x @@ -495,9 +727,9 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -507,23 +739,19 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute residual r = b - Ax call t_startf("pcg_vecupdate") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") ! Compute squared L2 norm of (r, r) call t_startf("pcg_dotprod") - work0u(:,:,:) = ru(:,:,:)*ru(:,:,:) ! terms of dot product (r, r) - work0v(:,:,:) = rv(:,:,:)*rv(:,:,:) + work0u(:,:) = ru(:,:)*ru(:,:) ! terms of dot product (r, r) + work0v(:,:) = rv(:,:)*rv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - L2_resid, & - work0u, work0v) + L2_resid = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_resid") ! take square root @@ -532,36 +760,74 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! compute normalized error err = L2_resid/L2_rhs - if (verbose_pcg .and. main_task) then -! write(iulog,*) ' ' -! 
write(iulog,*) 'iter, L2_resid, error =', iter, L2_resid, err + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + ru_max = 0.d0 + rv_max = 0.d0 + iu_max = 0 + ju_max = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (abs(ru(i,j)) > ru_max) then + ru_max = ru(i,j) + iu_max = i + ju_max = j + endif + if (abs(rv(i,j)) > rv_max) then + rv_max = rv(i,j) + iv_max = i + jv_max = j + endif + enddo + enddo + write(iulog,*) 'r, i, j, ru_max:', this_rank, iu_max, ju_max, ru_max + write(iulog,*) 'r, i, j, rv_max:', this_rank, iv_max, jv_max, rv_max endif + ! If converged, then exit the loop. + ! Note: Without good preconditioning, convergence can be slow, + ! but the solution after maxiters might be good enough. + if (err < tolerance) then niters = iter + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver has converged, iter =', niters + write(iulog,*) ' ' + endif exit iter_loop - endif + elseif (iter == maxiters) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance + write(iulog,*) ' ' + endif + endif endif ! linear_solve_ncheck enddo iter_loop -!WHL - Without good preconditioning, convergence can be slow, but the solution after maxiters might be good enough. - - if (niters == maxiters) then - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Glissade PCG solver not converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance - endif - endif
Clean up + if (allocated(Adiag_u)) deallocate(Adiag_u, Adiag_v) + if (allocated(Asubdiag_u)) deallocate(Asubdiag_u, Asubdiag_v) + if (allocated(Asupdiag_u)) deallocate(Asupdiag_u, Asupdiag_v) + if (allocated(omega_u)) deallocate(omega_u, omega_v) + if (allocated(denom_u)) deallocate(denom_u, denom_v) + if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) + if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(b_u)) deallocate(b_u, b_v) + if (allocated(x_u)) deallocate(x_u, x_v) + if (allocated(gather_data_row)) deallocate(gather_data_row) + if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_standard_3d + end subroutine pcg_solver_standard_2d !**************************************************************************** - subroutine pcg_solver_standard_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + subroutine pcg_solver_standard_3d(nx, ny, & + nz, parallel, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -576,25 +842,50 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! to solve the equation $Ax=b$. ! Convergence is checked every {\em linear_solve_ncheck} steps. ! - ! It is similar to subroutine pcg_solver_standard_3d, but modified - ! to solve for x and y at a single horizontal level, as in the - ! shallow-shelf approximation. See the comments in that subroutine - ! (above) for more details on data structure and solver methods. - ! - ! Input and output arrays are located on a structured (i,j) grid - ! as defined in the glissade_velo_higher module. The global matrix - ! is sparse, but its nonzero element are stored in four dense matrices - ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential - ! nonzero elements per node (i,j). + ! It is based on the barotropic solver in the POP ocean model + ! (author Phil Jones, LANL). Input and output arrays are located + ! on a structured (i,j,k) grid as defined in the glissade_velo_higher + ! module. 
The global matrix is sparse, but its nonzero elements + ! are stored in four dense matrices called Auu, Avv, Auv, and Avu. + ! Each matrix has 3x3x3 = 27 potential nonzero elements per + ! node (i,j,k). ! ! The current preconditioning options are ! (0) no preconditioning ! (1) diagonal preconditioning - ! - ! The SIA-based preconditioning optional is not available for a 2D solve. + ! (2) preconditioning using a physics-based SIA solver ! - ! TODO: Add a tridiagonal preconditioning option to this subroutine, - ! as for subroutine pcg_solver_chrongear_2d. + ! For the dome test case with higher-order dynamics, option (2) is best. + ! + ! Here is a schematic of the method implemented below for solving Ax = b: + ! + ! halo_update(x0) + ! r0 = b - A*x0 + ! d0 = 0 + ! eta0 = 1 + ! + ! while (not converged) + ! solve Mz = r for z + ! eta1 = (r,z) + ! beta = eta1/eta0 + ! d = z + beta*d + ! halo_update(d) + ! eta0 = eta1 + ! q = Ad + ! eta2 = (d,q) + ! alpha = eta1/eta2 + ! x = x + alpha*d + ! r = r - alpha*q (or occasionally, r = b - Ax) + ! Check for convergence: err = sqrt(r,r)/sqrt(b,b) < tolerance + ! end while + ! + ! where x = solution (initial value = x0) + ! d = conjugate direction vector (initial value = d0) + ! r = residual vector (initial value = r0) + ! M = preconditioning matrix + ! (r,z) = dot product of vectors r and z + ! and similarly for (d,q) + ! !--------------------------------------------------------------- !--------------------------------------------------------------- @@ -602,62 +893,70 @@ subroutine pcg_solver_standard_2d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) + nx, ny, & ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) + nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & - parallel ! 
info for parallel communication + parallel ! info for parallel communication integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 + integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & + indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 + logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! T for vertices (i,j) where velocity is computed, else F + active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nx-1,ny-1,9), intent(in) :: & + real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) - ! 1st and 2nd dimensions = (x,y) indices + ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) + ! other dimensions = (z,x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! = 1 for diagonal preconditioning (best option for SSA-dominated flow) + ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! 
tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j ! grid indices - integer :: iter ! iteration counter + integer :: i, j, k ! grid indices + integer :: iA, jA, kA ! grid offsets ranging from -1 to 1 + integer :: m ! matrix element index + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + integer :: iter ! iteration counter real(dp) :: & eta0, eta1, eta2, &! scalar inner product results @@ -665,7 +964,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & beta ! eta1/eta0 = term in expression for new direction vector ! vectors (each of these is split into u and v components) - real(dp), dimension(nx-1,ny-1) :: & + real(dp), dimension(nz,nx-1,ny-1) :: & Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector @@ -678,86 +977,243 @@ subroutine pcg_solver_standard_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector b ! solver converges when L2_resid/L2_rhs < tolerance - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Using native PCG solver (standard)' - write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond - endif + real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & + Muu, Mvv ! simplified SIA matrices for preconditioning - ! Set up matrices for preconditioning + ! arrays for tridiagonal preconditioning + ! 
Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above + real(dp), dimension(:,:), allocatable :: & + Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning + Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning - !TODO - Add tridiagonal option + real(dp), dimension(:,:), allocatable :: & + omega_u, omega_v, & ! work arrays for tridiagonal solve + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Using diagonal matrix for preconditioning' - endif ! verbose_pcg + ! Note: These two matrices are global in the EW and NS dimensions, respectively. + ! Each holds 8 pieces of information for each task on each row or column. + ! Since only 2 of these 8 pieces of information change from one iteration to the next, + ! it is more efficient to gather the remaining information once and pass the arrays + ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d + ! and gather all the information every time the subroutine is called. + ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? - call t_startf("pcg_precond_init") - call setup_preconditioner_diag_2d(nx, ny, & - indxA_2d, & - Auu, Avv, & - Adiagu, Adiagv) - call t_stopf("pcg_precond_init") + real(dp), dimension(:,:), allocatable :: & + gather_data_row, & ! arrays for gathering data from every task on a row or column + gather_data_col - ! Compute initial residual and initialize the direction vector d - ! Note: The matrix A must be complete for all rows corresponding to locally - ! owned vertices, and x must have the correct values in - ! halo vertices bordering the locally owned vertices. - ! Then y = Ax will be correct for locally owned vertices. + integer :: & + tasks_row, & ! number of tasks per row and column for tridiagonal solve + tasks_col - ! 
Halo update for x (initial guess for velocity solution) + logical :: first_time ! true on the first subroutine call (iter = 1), false thereafter - call t_startf("pcg_halo_init") - call staggered_parallel_halo(xu, parallel) - call staggered_parallel_halo(xv, parallel) - call t_stopf("pcg_halo_init") + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi - ! Compute A*x (use z as a temp vector for A*x) + !WHL - debug + integer :: iu_max, ju_max, iv_max, jv_max + real(dp) :: ru_max, rv_max - call t_startf("pcg_matmult_init") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & - Auu, Auv, & - Avu, Avv, & - xu, xv, & - zu, zv) - call t_stopf("pcg_matmult_init") + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi - ! Compute the initial residual r(0) = b - Ax(0) - ! This will be correct for locally owned vertices. + tasks_row = parallel%tasks_row + tasks_col = parallel%tasks_col - call t_startf("pcg_vecupdate_init") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) - call t_stopf("pcg_vecupdate_init") + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Using native PCG solver (standard)' + write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond + endif - ! Initialize scalars and vectors + ! Compute array sizes for locally owned vertices + ilocal = staggered_ihi - staggered_ilo + 1 + jlocal = staggered_jhi - staggered_jlo + 1 - niters = maxiters - eta0 = 1.d0 + ! Set up matrices for preconditioning - du(:,:) = 0.d0 - dv(:,:) = 0.d0 + call t_startf("pcg_precond_init") - zu(:,:) = 0.d0 - zv(:,:) = 0.d0 + if (precond == HO_PRECOND_NONE) then ! no preconditioner - ! Compute the L2 norm of the RHS vectors - ! (Goal is to obtain L2_resid/L2_rhs < tolerance) + if (verbose_pcg .and. 
this_rank == rtest) then + write(iulog,*) 'Using no preconditioner' + endif - call t_startf("pcg_dotprod") - work0u(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) - work0v(:,:) = bv(:,:)*bv(:,:) - call t_stopf("pcg_dotprod") + elseif (precond == HO_PRECOND_DIAG) then - ! find global sum of the squared L2 norm + call setup_preconditioner_diag_3d(nx, ny, & + nz, indxA_3d, & + Auu, Avv, & + Adiagu, Adiagv) + + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + i = itest + j = jtest + write(iulog,*) 'i, j, r =', i, j, this_rank + write(iulog,*) 'Auu diag =', Adiagu(:,i,j) + write(iulog,*) 'Avv diag =', Adiagv(:,i,j) + endif + + elseif (precond == HO_PRECOND_SIA) then + + call setup_preconditioner_sia_3d(nx, ny, & + nz, indxA_3d, & + Auu, Avv, & + Muu, Mvv) + + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' + do i = staggered_ihi, staggered_ilo, -1 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) + enddo + enddo ! i + endif + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + ! Allocate tridiagonal preconditioning matrices + allocate(Adiag_u (nx-1,ny-1)) + allocate(Asubdiag_u(nx-1,ny-1)) + allocate(Asupdiag_u(nx-1,ny-1)) + allocate(omega_u (nx-1,ny-1)) + allocate(denom_u (nx-1,ny-1)) + + allocate(Adiag_v (nx-1,ny-1)) + allocate(Asubdiag_v(nx-1,ny-1)) + allocate(Asupdiag_v(nx-1,ny-1)) + allocate(omega_v (nx-1,ny-1)) + allocate(denom_v (nx-1,ny-1)) + + ! Compute arrays for tridiagonal preconditioning + + call setup_preconditioner_tridiag_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v) + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + + ! Allocate tridiagonal preconditioning matrices + ! 
Note: (i,j) indices are switched for the A_v matrices to reduce striding. + allocate(Adiag_u (ilocal,jlocal)) + allocate(Asubdiag_u(ilocal,jlocal)) + allocate(Asupdiag_u(ilocal,jlocal)) + allocate(omega_u(ilocal,jlocal)) + allocate(denom_u(ilocal,jlocal)) + allocate(xuh_u(ilocal,jlocal)) + allocate(xlh_u(ilocal,jlocal)) + + allocate(Adiag_v (jlocal,ilocal)) + allocate(Asubdiag_v(jlocal,ilocal)) + allocate(Asupdiag_v(jlocal,ilocal)) + allocate(omega_v(jlocal,ilocal)) + allocate(denom_v(jlocal,ilocal)) + allocate(xuh_v(jlocal,ilocal)) + allocate(xlh_v(jlocal,ilocal)) + + ! These two matrices are for gathering data from all tasks on a given row or column. + allocate(gather_data_row(8*tasks_row,jlocal)) + allocate(gather_data_col(8*tasks_col,ilocal)) + gather_data_row = 0.0d0 + gather_data_col = 0.0d0 + + call setup_preconditioner_tridiag_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + ilocal, jlocal, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v) + + endif ! precond + + call t_stopf("pcg_precond_init") + + ! Compute initial residual and initialize the direction vector d + ! Note: The matrix A must be complete for all rows corresponding to locally + ! owned vertices, and x must have the correct values in + ! halo vertices bordering the locally owned vertices. + ! Then y = Ax will be correct for locally owned vertices. + + ! Halo update for x (initial guess for velocity solution) + + call t_startf("pcg_halo_init") + call staggered_parallel_halo(xu, parallel) + call staggered_parallel_halo(xv, parallel) + call t_stopf("pcg_halo_init") + + ! 
Compute A*x (use z as a temp vector for A*x) + + call t_startf("pcg_matmult_init") + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & + Auu, Auv, & + Avu, Avv, & + xu, xv, & + zu, zv) + call t_stopf("pcg_matmult_init") + + ! Compute the initial residual r(0) = b - Ax(0) + ! This will be correct for locally owned vertices. + + call t_startf("pcg_vecupdate_init") + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + call t_stopf("pcg_vecupdate_init") + + ! Initialize scalars and vectors + + niters = maxiters + eta0 = 1.d0 + + du(:,:,:) = 0.d0 + dv(:,:,:) = 0.d0 + + zu(:,:,:) = 0.d0 + zv(:,:,:) = 0.d0 + + ! Compute the L2 norm of the RHS vectors + ! (Goal is to obtain L2_resid/L2_rhs < tolerance) + + call t_startf("pcg_dotprod") + work0u(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) + work0v(:,:,:) = bv(:,:,:)*bv(:,:,:) + call t_stopf("pcg_dotprod") + + ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - L2_rhs, & - work0u, work0v) + L2_rhs = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_init") ! take square root @@ -768,36 +1224,92 @@ subroutine pcg_solver_standard_2d(nx, ny, & iter_loop: do iter = 1, maxiters - if (verbose_pcg .and. main_task) then - write(iulog,*) 'iter =', iter - endif - call t_startf("pcg_precond") ! Compute PC(r) = solution z of Mz = r if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:) = ru(:,:) ! PC(r) = r - zv(:,:) = rv(:,:) ! PC(r) = r + zu(:,:,:) = ru(:,:,:) ! PC(r) = r + zv(:,:,:) = rv(:,:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning do j = 1, ny-1 do i = 1, nx-1 - if (Adiagu(i,j) /= 0.d0) then - zu(i,j) = ru(i,j) / Adiagu(i,j) ! 
PC(r), where PC is formed from diagonal elements of A + do k = 1, nz + if (Adiagu(k,i,j) /= 0.d0) then + zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(i,j) = 0.d0 + zu(k,i,j) = 0.d0 endif - if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) + if (Adiagv(k,i,j) /= 0.d0) then + zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) else - zv(i,j) = 0.d0 + zv(k,i,j) = 0.d0 endif + enddo ! k enddo ! i enddo ! j + elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning + + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Muu, ru, zu) ! solve Muu*zu = ru for zu + + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Mvv, rv, zv) ! solve Mvv*zv = rv for zv + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + ! Use a local tridiagonal solver to find an approximate solution of A*z = r + + call tridiag_solver_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + + ! Use a global tridiagonal solver to find an approximate solution of A*z = r + + if (iter == 1) then + first_time = .true. + else + first_time = .false. + endif + + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + first_time, & + ru, rv, & ! 3D residual + zu, zv) ! 
approximate solution of Az = r + endif ! precond call t_stopf("pcg_precond") @@ -805,23 +1317,19 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute the dot product eta1 = (r, PC(r)) call t_startf("pcg_dotprod") - work0u(:,:) = ru(:,:)*zu(:,:) ! terms of dot product (r, PC(r)) - work0v(:,:) = rv(:,:)*zv(:,:) + work0u(:,:,:) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r, PC(r)) + work0v(:,:,:) = rv(:,:,:)*zv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - eta1, & - work0u, work0v) + eta1 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") !WHL - If the SIA solver has failed due to singular matrices, ! then eta1 will be NaN. if (eta1 /= eta1) then ! eta1 is NaN - call write_log('PCG solver has failed, eta1 = NaN', GM_FATAL) + call write_log('PCG solver has failed, alpha = NaN', GM_FATAL) endif ! Update the conjugate direction vector d @@ -829,13 +1337,13 @@ subroutine pcg_solver_standard_2d(nx, ny, & beta = eta1/eta0 call t_startf("pcg_vecupdate") - du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:) = zv(:,:) + beta*dv(:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - ! Initially eta0 = 1 - ! For n >=2, eta0 = old eta1 + du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! where beta_(i+1) = -------------------- + ! (r_i, PC(r_i)) + ! Initially eta0 = 1 + ! For n >=2, eta0 = old eta1 call t_stopf("pcg_vecupdate") ! Halo update for d @@ -849,9 +1357,9 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! 
This is the one matvec multiply required for each iteration call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -865,16 +1373,12 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute the dot product eta2 = (d, A*d) call t_startf("pcg_dotprod") - work0u(:,:) = du(:,:) * qu(:,:) ! terms of dot product (d, Ad) - work0v(:,:) = dv(:,:) * qv(:,:) + work0u(:,:,:) = du(:,:,:) * qu(:,:,:) ! terms of dot product (d, Ad) + work0v(:,:,:) = dv(:,:,:) * qv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - eta2, & - work0u, work0v) + eta2 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") ! Compute alpha @@ -892,13 +1396,26 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute the new solution and residual call t_startf("pcg_vecupdate") - xu(:,:) = xu(:,:) + alpha * du(:,:) ! new solution, x_(i+1) = x_i + alpha*d - xv(:,:) = xv(:,:) + alpha * dv(:,:) + xu(:,:,:) = xu(:,:,:) + alpha * du(:,:,:) ! new solution, x_(i+1) = x_i + alpha*d + xv(:,:,:) = xv(:,:,:) + alpha * dv(:,:,:) - ru(:,:) = ru(:,:) - alpha * qu(:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) - rv(:,:) = rv(:,:) - alpha * qv(:,:) + ru(:,:,:) = ru(:,:,:) - alpha * qu(:,:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) + rv(:,:,:) = rv(:,:,:) - alpha * qv(:,:,:) call t_stopf("pcg_vecupdate") + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' + do i = itest-3, itest+3 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i + endif + ! Check for convergence every linear_solve_ncheck iterations. ! 
Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -906,7 +1423,11 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! For convergence check, use r = b - Ax if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then -!! if (mod(iter, linear_solve_ncheck) == 0 .or. iter == linear_solve_ncheck/2) then + + if (verbose_pcg .and. main_task) then + write(iulog,*) ' ' + write(iulog,*) 'Check convergence, iter =', iter + endif ! Halo update for x @@ -918,9 +1439,9 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -930,23 +1451,19 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute residual r = b - Ax call t_startf("pcg_vecupdate") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate") ! Compute squared L2 norm of (r, r) call t_startf("pcg_dotprod") - work0u(:,:) = ru(:,:)*ru(:,:) ! terms of dot product (r, r) - work0v(:,:) = rv(:,:)*rv(:,:) + work0u(:,:,:) = ru(:,:,:)*ru(:,:,:) ! terms of dot product (r, r) + work0v(:,:,:) = rv(:,:,:)*rv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - L2_resid, & - work0u, work0v) + L2_resid = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_resid") ! take square root @@ -955,32 +1472,75 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! compute normalized error err = L2_resid/L2_rhs + if (verbose_pcg .and. 
main_task) then + write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err + endif + + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + ru_max = 0.d0 + rv_max = 0.d0 + iu_max = 0 + ju_max = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (sum(abs(ru(:,i,j))) > ru_max) then + ru_max = sum(abs(ru(:,i,j))) + iu_max = i + ju_max = j + endif + if (sum(abs(rv(:,i,j))) > rv_max) then + rv_max = sum(abs(rv(:,i,j))) + iv_max = i + jv_max = j + endif + enddo + enddo + write(iulog,*) 'r, i, j, ru_max:', this_rank, iu_max, ju_max, ru_max + write(iulog,*) 'r, i, j, rv_max:', this_rank, iv_max, jv_max, rv_max + endif + + ! If converged, then exit the loop. + ! Note: Without good preconditioning, convergence can be slow, + ! but the solution after maxiters_chrongear might be good enough. + if (err < tolerance) then niters = iter + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver has converged, iter =', niters + write(iulog,*) ' ' + endif exit iter_loop - endif + elseif (iter == maxiters) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance + write(iulog,*) ' ' + endif + endif endif ! linear_solve_ncheck enddo iter_loop -!WHL - Without good preconditioning, convergence can be slow, but the solution after maxiters might be good enough. - - if (niters == maxiters) then - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Glissade PCG solver not converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance - endif - endif + ! 
Clean up + if (allocated(Adiag_u)) deallocate(Adiag_u, Adiag_v) + if (allocated(Asubdiag_u)) deallocate(Asubdiag_u, Asubdiag_v) + if (allocated(Asupdiag_u)) deallocate(Asupdiag_u, Asupdiag_v) + if (allocated(omega_u)) deallocate(omega_u, omega_v) + if (allocated(denom_u)) deallocate(denom_u, denom_v) + if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) + if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(gather_data_row)) deallocate(gather_data_row) + if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_standard_2d + end subroutine pcg_solver_standard_3d !**************************************************************************** - subroutine pcg_solver_chrongear_3d(nx, ny, & - nz, parallel, & - indxA_2d, indxA_3d, & - active_vertex, & + subroutine pcg_solver_chrongear_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -992,104 +1552,26 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !--------------------------------------------------------------- ! This subroutine uses a Chronopoulos-Gear preconditioned conjugate-gradient - ! algorithm to solve the equation $Ax=b$. - ! - ! It is based on the Chronopoulos-Gear PCG solver in the POP ocean model - ! (author Frank Bryan, NCAR). It is a rearranged conjugate gradient solver - ! that reduces the number of global reductions per iteration from two to one - ! (not counting the convergence check). Convergence is checked every - ! {\em linear_solve_ncheck} steps. - ! - ! References are: - ! - ! Chronopoulos, A.T., A Class of Parallel Iterative Methods Implemented on Multiprocessors, - ! Ph.D. thesis, Technical Report UIUCDCS-R-86-1267, Department of Computer Science, - ! University of Illinois, Urbana, Illinois, pp. 1-116, 1986. - ! - ! Chronopoulos, A.T., and C.W. Gear. s-step iterative methods - ! for symmetric linear systems. J. Comput. Appl. Math., 25(2), - ! 153-168, 1989. - ! - ! Dongarra, J. and V. Eijkhout. LAPACK Working Note 159. 
- ! Finite-choice algorithm optimization in conjugate gradients. - ! Tech. Rep. ut-cs-03-502. Computer Science Department. - ! University of Tennessee, Knoxville. 2003. + ! algorithm to solve the equation $Ax=b$. (See references in subroutine above.) ! - ! D Azevedo, E.F., V.L. Eijkhout, and C.H. Romine. LAPACK Working - ! Note 56. Conjugate gradient algorithms with reduced - ! synchronization overhead on distributed memory multiprocessors. - ! Tech. Rep. CS-93-185. Computer Science Department. - ! University of Tennessee, Knoxville. 1993. - !--------------------------------------------------------------- + ! It is similar to subroutine pcg_solver_chrongear_3d, but modified + ! to solve for x and y at a single horizontal level, as in the + ! shallow-shelf approximation. See the comments in that subroutine + ! (above) for more details on data structure and solver methods. ! - ! The input and output arrays are located on a structured (i,j,k) grid - ! as defined in the glissade_velo_higher module. - ! The global matrix is sparse, but its nonzero elements are stored in - ! four dense matrices called Auu, Avv, Auv, and Avu. - ! Each matrix has 3x3x3 = 27 potential nonzero elements per node (i,j,k). + ! Input and output arrays are located on a structured (i,j) grid + ! as defined in the glissade_velo_higher module. The global matrix + ! is sparse, but its nonzero element are stored in four dense matrices + ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential + ! nonzero elements per node (i,j). ! - ! The current preconditioning options are + ! The current preconditioning options for the solver are ! (0) no preconditioning ! (1) diagonal preconditioning - ! (2) preconditioning using a physics-based SIA solver - ! - ! For the dome test case with higher-order dynamics, option (2) is best. - ! - ! Here is a schematic of the method implemented below for solving Ax = b: - ! - ! Set up preconditioner M - ! work0 = (b,b) - ! bb = global_sum(work0) - ! - ! 
First pass of algorithm: - ! halo_update(x) - ! r = b - A*x - ! halo_update(r) - ! solve Mz = r for z - ! work(1) = (r,z) - ! d = z - ! q = A*d - ! work(2) = (d,q) - ! halo_update(q) - ! rho_old = global_sum(work(1)) - ! sigma = global_sum(work(2)) - ! alpha = rho_old/sigma - ! x = x + alpha*d - ! r = r - alpha*q - ! - ! Iterative loop: - ! while (not converged) - ! solve Mz = r for z - ! Az = A*z - ! work(1) = (r,z) - ! work(2) = (Az,z) - ! halo_update(Az) - ! rho = global_sum(work(1)) - ! delta = global_sum(work(2)) - ! beta = rho/rho_old - ! sigma = delta - beta^2 * sigma - ! alpha = rho/sigma - ! rho_old = rho - ! d = z + beta*d - ! q = Az + beta*q - ! x = x + alpha*d - ! r = r - alpha*q - ! if (time to check convergence) then - ! r = b - A*x - ! work0 = (r,r) - ! halo_update(r) - ! rr = global_sum(work0) - ! if (sqrt(r,r)/sqrt(b,b) < tolerance) exit - ! endif - ! end while + ! (3) local tridiagonal preconditioning + ! (4) global tridiagonal preconditioning + ! The SIA-based preconditioning option is not available for a 2D solve. ! - ! where x = solution vector - ! d = conjugate direction vector - ! r = residual vector - ! M = preconditioning matrix - ! (r,z) = dot product of vectors r and z - ! and similarly for (Az,z), etc. - ! !--------------------------------------------------------------- !--------------------------------------------------------------- @@ -1097,70 +1579,63 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + parallel ! info for parallel communication integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! 
maps relative (x,y) coordinates to an index between 1 and 9 - integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & - indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 - logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & - Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) - ! other dimensions = (z,x,y) indices + real(dp), dimension(nx-1,ny-1,9), intent(in) :: & + Auu, Auv, & ! four components of assembled matrix + Avu, Avv ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) + ! 1st and 2nd dimensions = (x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! = 1 for diagonal preconditioning (best option for SSA-dominated flow) - ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! 
error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, k, m ! grid indices - integer :: ii, jj - integer :: ilocal, jlocal ! number of locally owned vertices in each direction - integer :: iter ! iteration counter - integer :: maxiters_chrongear ! max number of linear iterations before quitting + integer :: i, j, ii, jj ! grid indices + integer :: m ! matrix element index + integer :: iter ! iteration counter real(dp) :: & alpha, &! rho/sigma = term in expression for new residual and solution @@ -1174,8 +1649,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & gsum ! result of global sum for dot products ! vectors (each of these is split into u and v components) - real(dp), dimension(nz,nx-1,ny-1) :: & - Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv + real(dp), dimension(nx-1,ny-1) :: & ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector zu, zv, &! solution of Mz = r @@ -1183,22 +1657,21 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & Azu, Azv, &! result of matvec multiply A*z worku, workv ! intermediate results - real(dp), dimension(nz,nx-1,ny-1,2) :: & + real(dp), dimension(nx-1,ny-1,2) :: & work2u, work2v ! intermediate results real(dp) :: & rr, &! dot product (r,r) bb, &! dot product (b,b) L2_resid, &! L2 norm of residual = sqrt(r,r) - L2_rhs ! L2 norm of rhs vector = sqrt(b,b) - ! solver is converged when L2_resid/L2_rhs < tolerance - - real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & - Muu, Mvv ! simplified SIA matrices for preconditioning + L2_rhs ! L2 norm of rhs vector = sqrt(b,b) + ! 
solver is converged when L2_resid/L2_rhs < tolerance - ! arrays for tridiagonal preconditioning - ! Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above + ! diagonal matrix elements + real(dp), dimension(nx-1,ny-1) :: & + Adiagu, Adiagv ! diagonal terms of matrices Auu and Avv + ! tridiagonal matrix elements real(dp), dimension(:,:), allocatable :: & Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning @@ -1212,18 +1685,14 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & real(dp), dimension(:,:), allocatable :: & b_u, b_v, x_u, x_v - ! Note: These two matrices are global in the EW and NS dimensions, respectively. - ! Each holds 8 pieces of information for each task on each row or column. - ! Since only 2 of these 8 pieces of information change from one iteration to the next, - ! it is more efficient to gather the remaining information once and pass the arrays - ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d - ! and gather all the information every time the subroutine is called. - ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? - real(dp), dimension(:,:), allocatable :: & gather_data_row, & ! arrays for gathering data from every task on a row or column gather_data_col + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + + integer :: maxiters_chrongear ! max number of linear iterations before quitting + integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi @@ -1232,9 +1701,17 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & tasks_row, & ! number of tasks per row and column for tridiagonal solve tasks_col + integer, parameter :: & + maxiters_tridiag = 100 ! 
max number of linear iterations for tridiagonal preconditioning, + ! which generally leads to faster convergence than diagonal preconditioning + + !WHL - debug + real(dp) :: usum, usum_global, vsum, vsum_global + !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max + real(dp) :: sum_temp staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi @@ -1244,22 +1721,17 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & tasks_row = parallel%tasks_row tasks_col = parallel%tasks_col - ! Note: maxiters_tridiag commented out here, because the BP tridiagonal solver - ! tends not to converge as well as the 2D version. - ! TODO: Make maxiters a config option. - ! Set the maximum number of linear iterations. ! Typically allow up to 200 iterations with diagonal preconditioning, but only 100 ! with tridiagonal, which usually converges faster. - !TODO - Test whether maxiters_tridiag (currently = 100) is sufficient for convergence with 3D solver -!! if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. precond == HO_PRECOND_TRIDIAG_GLOBAL) then -!! maxiters_chrongear = maxiters_tridiag -!! else + if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. precond == HO_PRECOND_TRIDIAG_GLOBAL) then + maxiters_chrongear = maxiters_tridiag + else maxiters_chrongear = maxiters -!! endif + endif - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (Chronopoulos-Gear)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters_chrongear, precond endif @@ -1268,26 +1740,20 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ilocal = staggered_ihi - staggered_ilo + 1 jlocal = staggered_jhi - staggered_jlo + 1 - !WHL - debug - if (verbose_pcg .and. 
this_rank == rtest) then - write(iulog,*) 'stag_ihi, stag_ilo, ilocal:', staggered_ihi, staggered_ilo, ilocal - write(iulog,*) 'stag_jhi, stag_jlo, jlocal:', staggered_jhi, staggered_jlo, jlocal - endif - !---- Set up matrices for preconditioning call t_startf("pcg_precond_init") if (precond == HO_PRECOND_NONE) then ! no preconditioner - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using no preconditioner' endif elseif (precond == HO_PRECOND_DIAG) then - call setup_preconditioner_diag_3d(nx, ny, & - nz, indxA_3d, & + call setup_preconditioner_diag_2d(nx, ny, & + indxA_2d, & Auu, Avv, & Adiagu, Adiagv) @@ -1296,32 +1762,22 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & i = itest j = jtest write(iulog,*) 'i, j, r =', i, j, this_rank - write(iulog,*) 'Auu diag =', Adiagu(:,i,j) - write(iulog,*) 'Avu diag =', Adiagv(:,i,j) + write(iulog,*) 'Au diag =', Adiagu(i,j) + write(iulog,*) 'Av diag =', Adiagv(i,j) endif - elseif (precond == HO_PRECOND_SIA) then - - call setup_preconditioner_sia_3d(nx, ny, & - nz, indxA_3d, & - Auu, Avv, & - Muu, Mvv) + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - if (verbose_pcg .and. this_rank == rtest) then + !WHL - debug + if (verbose_tridiag .and. this_rank==rtest) then + i = itest j = jtest write(iulog,*) ' ' - write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' - do i = staggered_ihi, staggered_ilo, -1 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) - enddo - enddo ! i + write(iulog,*) 'r, i, j =', this_rank, i, j + write(iulog,*) 'Auu =', Auu(i,j,:) + write(iulog,*) 'Avv =', Avv(i,j,:) endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - - ! Allocate tridiagonal preconditioning matrices allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) allocate(Asupdiag_u(nx-1,ny-1)) @@ -1334,26 +1790,22 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & allocate(omega_v (nx-1,ny-1)) allocate(denom_v (nx-1,ny-1)) - ! 
Compute arrays for tridiagonal preconditioning - - call setup_preconditioner_tridiag_local_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - indxA_2d, indxA_3d, & - itest, jtest, rtest, & - Auu, Avv, & - Muu, Mvv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & + call setup_preconditioner_tridiag_local_2d(& + nx, ny, & + parallel, indxA_2d, & + itest, jtest, rtest, & + Auu, Avv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & denom_u, denom_v) - + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! Allocate tridiagonal preconditioning matrices - ! Note: (i,j) indices are switched for the A_v matrices to reduce striding. + ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. + allocate(Adiag_u (ilocal,jlocal)) allocate(Asubdiag_u(ilocal,jlocal)) allocate(Asupdiag_u(ilocal,jlocal)) @@ -1361,6 +1813,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & allocate(denom_u(ilocal,jlocal)) allocate(xuh_u(ilocal,jlocal)) allocate(xlh_u(ilocal,jlocal)) + allocate(b_u(ilocal,jlocal)) + allocate(x_u(ilocal,jlocal)) allocate(Adiag_v (jlocal,ilocal)) allocate(Asubdiag_v(jlocal,ilocal)) @@ -1369,6 +1823,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & allocate(denom_v(jlocal,ilocal)) allocate(xuh_v(jlocal,ilocal)) allocate(xlh_v(jlocal,ilocal)) + allocate(b_v(jlocal,ilocal)) + allocate(x_v(jlocal,ilocal)) ! These two matrices are for gathering data from all tasks on a given row or column. allocate(gather_data_row(8*tasks_row,jlocal)) @@ -1376,63 +1832,93 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & gather_data_row = 0.0d0 gather_data_col = 0.0d0 - ! Compute arrays for tridiagonal preconditioning + ! 
Compute the entries of the tridiagonal matrices - call setup_preconditioner_tridiag_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - indxA_2d, indxA_3d, & - ilocal, jlocal, & - itest, jtest, rtest, & - Auu, Avv, & - Muu, Mvv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v) + ! Extract tridiagonal matrix entries from Auu + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements + Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements + Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the u solve in each matrix row + call setup_preconditioner_tridiag_global_2d(& + ilocal, jlocal, & +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u) + + ! Extract tridiagonal matrix entries from Avv + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements + Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements + Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the v solve in each matrix column + ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + + call setup_preconditioner_tridiag_global_2d(& + jlocal, ilocal, & +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! 
itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v) endif ! precond + !WHL - debug + if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' + call t_stopf("pcg_precond_init") !---- Initialize scalars and vectors niters = maxiters_chrongear - ru(:,:,:) = 0.d0 - rv(:,:,:) = 0.d0 - du(:,:,:) = 0.d0 - dv(:,:,:) = 0.d0 - zu(:,:,:) = 0.d0 - zv(:,:,:) = 0.d0 - qu(:,:,:) = 0.d0 - qv(:,:,:) = 0.d0 - Azu(:,:,:) = 0.d0 - Azv(:,:,:) = 0.d0 - worku(:,:,:) = 0.d0 - workv(:,:,:) = 0.d0 - work2u(:,:,:,:) = 0.d0 - work2v(:,:,:,:) = 0.d0 + ru(:,:) = 0.d0 + rv(:,:) = 0.d0 + du(:,:) = 0.d0 + dv(:,:) = 0.d0 + zu(:,:) = 0.d0 + zv(:,:) = 0.d0 + qu(:,:) = 0.d0 + qv(:,:) = 0.d0 + Azu(:,:) = 0.d0 + Azv(:,:) = 0.d0 + worku(:,:) = 0.d0 + workv(:,:) = 0.d0 + work2u(:,:,:) = 0.d0 + work2v(:,:,:) = 0.d0 !---- Compute the L2 norm of the RHS vectors !---- (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - worku(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) - workv(:,:,:) = bv(:,:,:)*bv(:,:,:) + worku(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) + workv(:,:) = bv(:,:)*bv(:,:) call t_stopf("pcg_dotprod") ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - bb, & - worku, workv) + bb = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_init") ! take square root @@ -1458,9 +1944,9 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_init") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -1471,8 +1957,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- This is correct for locally owned nodes. 
call t_startf("pcg_vecupdate") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") !---- Halo update for residual @@ -1488,103 +1974,169 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! From here on, call timers with 'iter' suffix because this can be considered the first iteration call t_startf("pcg_precond_iter") - if (precond == HO_PRECOND_NONE) then ! no preconditioning + if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:,:) = ru(:,:,:) ! PC(r) = r - zv(:,:,:) = rv(:,:,:) ! PC(r) = r + zu(:,:) = ru(:,:) ! PC(r) = r + zv(:,:) = rv(:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + ! Solve Mz = r, where M is a diagonal matrix do j = 1, ny-1 do i = 1, nx-1 - do k = 1, nz - if (Adiagu(k,i,j) /= 0.d0) then - zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A + if (Adiagu(i,j) /= 0.d0) then + zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(k,i,j) = 0.d0 + zu(i,j) = 0.d0 endif - if (Adiagv(k,i,j) /= 0.d0) then - zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + if (Adiagv(i,j) /= 0.d0) then + zv(i,j) = rv(i,j) / Adiagv(i,j) else - zv(k,i,j) = 0.d0 + zv(i,j) = 0.d0 endif - enddo ! k enddo ! i enddo ! j - elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then +! i = itest +! write(iulog,*) ' ' +! write(iulog,*) 'zv solve with diagonal precond, this_rank, i =', this_rank, i +! write(iulog,*) 'j, active, Adiagv, rv, zv, xv:' +! do j = staggered_jhi, staggered_jlo, -1 +! write(iulog,'(i4, l4, 2f12.3, e12.3, f12.3)') j, active_vertex(i,j), Adiagv(i,j), rv(i,j), zv(i,j), xv(i,j) +! enddo + endif + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! 
local + + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest + write(iulog,*) 'Residual:' + write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) + write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif + + if (verbose_pcg .and. main_task) then + write(iulog,*) 'call tridiag_solver_local_2d' + endif - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Muu, ru, zu) ! solve Muu*zu = ru for zu + ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Mvv, rv, zv) ! solve Mvv*zv = rv for zv + !TODO - Test a local solver that can compute zu and zv in the halo + ! (to avoid the halo update below) + + call tridiag_solver_local_2d(nx, ny, & + parallel, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + ru, rv, & ! right hand side + zu, zv) ! solution !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest - write(iulog,*) 'Standard SIA PC:' write(iulog,*) ' ' - write(iulog,*) 'i, zu_sia(1), zu_sia(nz):' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(i4, 2f16.10)') i, zu(1,i,j), zu(nz,i,j) - enddo - enddo ! i - write(iulog,*) ' ' - write(iulog,*) 'i, zv_sia(1), zv_sia(nz):' - do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(i4, 2f16.10)') i, zv(1,i,j), zv(nz,i,j) - enddo - enddo ! 
i + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) - ! Use a local tridiagonal solver to find an approximate solution of A*z = r + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - call tridiag_solver_local_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - Muu, Mvv, & ! entries of SIA matrix - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + ! Solve M*z = r, where M is a global tridiagonal matrix - ! Use a global tridiagonal solver to find an approximate solution of A*z = r + call tridiag_solver_global_2d(ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + .true., & ! first_time + gather_data_row) - call tridiag_solver_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - ilocal, jlocal, & - tasks_row, tasks_col, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! 
entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v, & - Muu, Mvv, & ! entries of SIA matrix - gather_data_row, gather_data_col, & - .true., & ! first_time = T (first iteration) - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo + + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo + + call tridiag_solver_global_2d(jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + .true., & ! first_time + gather_data_col) + + ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) endif ! 
precond @@ -1593,23 +2145,23 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute intermediate result for dot product (r,z) call t_startf("pcg_dotprod") - work2u(:,:,:,1) = ru(:,:,:) * zu(:,:,:) - work2v(:,:,:,1) = rv(:,:,:) * zv(:,:,:) + work2u(:,:,1) = ru(:,:) * zu(:,:) + work2v(:,:,1) = rv(:,:) * zv(:,:) call t_stopf("pcg_dotprod") !---- Compute the conjugate direction vector d !---- Since z is correct in halo, so is d - du(:,:,:) = zu(:,:,:) - dv(:,:,:) = zv(:,:,:) + du(:,:) = zu(:,:) + dv(:,:) = zv(:,:) !---- Compute q = A*d - !---- q is correct for locally owned nodes + !---- q is correct for locally owned nodes, provided d extends one layer into the halo call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -1619,18 +2171,14 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute intermediate result for dot product (d,q) = (d,Ad) call t_startf("pcg_dotprod") - work2u(:,:,:,2) = du(:,:,:) * qu(:,:,:) - work2v(:,:,:,2) = dv(:,:,:) * qv(:,:,:) + work2u(:,:,2) = du(:,:) * qu(:,:) + work2v(:,:,2) = dv(:,:) * qv(:,:) call t_stopf("pcg_dotprod") !---- Find global sums of (r,z) and (d,q) call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! nflds = 2 call t_stopf("pcg_glbsum_iter") !---- Halo update for q @@ -1654,150 +2202,241 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- These are correct in halo call t_startf("pcg_vecupdate") - xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) - xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) + xu(:,:) = xu(:,:) + alpha*du(:,:) + xv(:,:) = xv(:,:) + alpha*dv(:,:) - ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) ! 
q = A*d - rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) + ru(:,:) = ru(:,:) - alpha*qu(:,:) ! q = A*d + rv(:,:) = rv(:,:) - alpha*qv(:,:) call t_stopf("pcg_vecupdate") !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest - write(iulog,*) ' ' - write(iulog,*) 'alpha =', alpha - write(iulog,*) 'iter = 1: i, k, xu, xv, ru, rv:' - do i = staggered_ilo, staggered_ihi -!! do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) - enddo - enddo +!! write(iulog,*) ' ' +!! write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' + do i = itest-3, itest+3 +!! write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) + enddo ! i endif + !--------------------------------------------------------------- ! Iterate to solution !--------------------------------------------------------------- iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'iter =', iter + if (verbose_pcg .and. main_task) then +! write(iulog,*) 'iter =', iter endif - !---- Compute PC(r) = solution z of Mz = r - !---- z is correct in halo + !---- Compute PC(r) = solution z of Mz = r + !---- z is correct in halo + + call t_startf("pcg_precond_iter") + + if (precond == HO_PRECOND_NONE) then ! no preconditioning + + zu(:,:) = ru(:,:) ! PC(r) = r + zv(:,:) = rv(:,:) ! PC(r) = r + + elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + + do j = 1, ny-1 + do i = 1, nx-1 + if (Adiagu(i,j) /= 0.d0) then + zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A + else + zu(i,j) = 0.d0 + endif + if (Adiagv(i,j) /= 0.d0) then + zv(i,j) = rv(i,j) / Adiagv(i,j) + else + zv(i,j) = 0.d0 + endif + enddo ! i + enddo ! j + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! tridiagonal preconditioning with local solve + + ! 
Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) + + !TODO - Test a local solver that can compute zu and zv in the halo + ! (to avoid the halo update below) + + !WHL - debug + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest +! write(iulog,*) 'Residual:' +! write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) +! write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) +! write(iulog,*) ' ' +! write(iulog,*) 'jtest =', jtest +! write(iulog,*) 'i, ru, rv:' +! do i = staggered_ihi, staggered_ilo, -1 +! write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) +! enddo + endif + + + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + write(iulog,*) ' call tridiag_solver_local_2d' + endif + + call tridiag_solver_local_2d(nx, ny, & + parallel, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + ru, rv, & ! right hand side + zu, zv) ! solution + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) - call t_startf("pcg_precond_iter") + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif - if (precond == HO_PRECOND_NONE) then ! no preconditioning + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - zu(:,:,:) = ru(:,:,:) ! PC(r) = r - zv(:,:,:) = rv(:,:,:) ! PC(r) = r + !WHL - debug + if (verbose_tridiag .and. 
this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif - elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning + ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) - do j = 1, ny-1 - do i = 1, nx-1 - do k = 1, nz - if (Adiagu(k,i,j) /= 0.d0) then - zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A - else - zu(k,i,j) = 0.d0 - endif - if (Adiagv(k,i,j) /= 0.d0) then - zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) - else - zv(k,i,j) = 0.d0 - endif - enddo ! k - enddo ! i - enddo ! j + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo - elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'Before global tridiag PC u solve, r, j =', rtest, jtest + write(iulog,*) ' ' + write(iulog,*) 'i, Adiag_u, Asubdiag_u, Asupdiag_u, b_u:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4e16.8)') i, Adiag_u(i,j), Asubdiag_u(i,j), Asupdiag_u(i,j), b_u(i,j) + enddo + endif - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Muu, ru, zu) ! solve Muu*zu = ru for zu + call tridiag_solver_global_2d(ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + .false., & ! first_time + gather_data_row) - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Mvv, rv, zv) ! solve Mvv*zv = rv for zv + ! 
convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) - ! Use a local tridiagonal solver to find an approximate solution of A*z = r + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo - call tridiag_solver_local_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - Muu, Mvv, & ! entries of SIA matrix - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + call tridiag_solver_global_2d(jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + .false., & ! first_time + gather_data_col) - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) - ! Use a global tridiagonal solver to find an approximate solution of A*z = r + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo - call tridiag_solver_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - ilocal, jlocal, & - tasks_row, tasks_col, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! 
entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v, & - Muu, Mvv, & ! entries of SIA matrix - gather_data_row, gather_data_col, & - .false., & ! first_time = F (iteration 2+) - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) endif ! precond - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) write(iulog,*) 'L1' - call t_stopf("pcg_precond_iter") !---- Compute Az = A*z !---- This is the one matvec multiply required per iteration - !---- Az is correct for local owned nodes and needs a halo update (below) + !---- Az is correct for locally owned nodes and needs a halo update (below) call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & zu, zv, & Azu, Azv) call t_stopf("pcg_matmult_iter") - !---- Compute intermediate results for the dot products (r,z) and (Az,z) call t_startf("pcg_dotprod") - work2u(:,:,:,1) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r,z) - work2v(:,:,:,1) = rv(:,:,:)*zv(:,:,:) + work2u(:,:,1) = ru(:,:)*zu(:,:) ! terms of dot product (r,z) + work2v(:,:,1) = rv(:,:)*zv(:,:) - work2u(:,:,:,2) = Azu(:,:,:)*zu(:,:,:) ! terms of dot product (A*z,z) - work2v(:,:,:,2) = Azv(:,:,:)*zv(:,:,:) + work2u(:,:,2) = Azu(:,:)*zu(:,:) ! terms of dot product (A*z,z) + work2v(:,:,2) = Azv(:,:)*zv(:,:) call t_stopf("pcg_dotprod") ! Take the global sums of (r,z) and (Az,z) @@ -1805,15 +2444,10 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! 
this is the one MPI global reduction per iteration. call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! nflds = 2 call t_stopf("pcg_glbsum_iter") !---- Halo update for Az - !---- This is the one halo update required per iteration call t_startf("pcg_halo_iter") call staggered_parallel_halo(Azu, parallel) @@ -1832,7 +2466,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (alpha /= alpha) then ! alpha is NaN !! write(iulog,*) 'rho, sigma, alpha:', rho, sigma, alpha - call write_log('Chron-Gear PCG solver has failed, alpha = NaN', GM_FATAL) + call write_log('Chron_Gear PCG solver has failed, alpha = NaN', GM_FATAL) endif !---- Update d and q @@ -1840,37 +2474,25 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & call t_startf("pcg_vecupdate") - du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - qu(:,:,:) = Azu(:,:,:) + beta*qu(:,:,:) - qv(:,:,:) = Azv(:,:,:) + beta*qv(:,:,:) + du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:) = zv(:,:) + beta*dv(:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! where beta_(i+1) = -------------------- + ! (r_i, PC(r_i)) + qu(:,:) = Azu(:,:) + beta*qu(:,:) + qv(:,:) = Azv(:,:) + beta*qv(:,:) !---- Update solution and residual !---- These are correct in halo - xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) - xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) + xu(:,:) = xu(:,:) + alpha*du(:,:) + xv(:,:) = xv(:,:) + alpha*dv(:,:) - ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) - rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) + ru(:,:) = ru(:,:) - alpha*qu(:,:) + rv(:,:) = rv(:,:) - alpha*qv(:,:) call t_stopf("pcg_vecupdate") - if (verbose_pcg .and. 
this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'i, k, xu, xv, ru, rv:' - do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(i4, 4f16.10)') i, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) - enddo - enddo ! i - endif - ! Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -1879,17 +2501,21 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) ' ' - write(iulog,*) 'Check convergence, iter =', iter + write(iulog,*) ' check convergence, iter =', iter endif !---- Compute z = A*x (use z as a temp vector for A*x) + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(xu, parallel) +!! call staggered_parallel_halo(xv, parallel) + call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -1898,33 +2524,41 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute residual r = b - A*x + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(bu, parallel) +!! call staggered_parallel_halo(bv, parallel) + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(zu, parallel) +!! 
call staggered_parallel_halo(zv, parallel) + call t_startf("pcg_vecupdate") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(ru, parallel) +!! call staggered_parallel_halo(rv, parallel) + !---- Compute dot product (r, r) call t_startf("pcg_dotprod") - worku(:,:,:) = ru(:,:,:)*ru(:,:,:) - workv(:,:,:) = rv(:,:,:)*rv(:,:,:) + worku(:,:) = ru(:,:)*ru(:,:) + workv(:,:) = rv(:,:)*rv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - rr, & - worku, workv) + rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") L2_resid = sqrt(rr) ! L2 norm of residual err = L2_resid/L2_rhs ! normalized error - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err endif + !WHL - debug if (verbose_pcg .and. this_rank == rtest) then ru_max = 0.d0 rv_max = 0.d0 @@ -1932,13 +2566,13 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs(sum(ru(:,i,j))) > ru_max) then - ru_max = sum(ru(:,i,j)) + if (abs(ru(i,j)) > ru_max) then + ru_max = ru(i,j) iu_max = i ju_max = j endif - if (abs(sum(rv(:,i,j))) > rv_max) then - rv_max = sum(rv(:,i,j)) + if (abs(rv(i,j)) > rv_max) then + rv_max = rv(i,j) iv_max = i jv_max = j endif @@ -1954,15 +2588,15 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (err < tolerance) then niters = iter - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. 
main_task) then write(iulog,*) 'Glissade PCG solver has converged, iter =', niters write(iulog,*) ' ' endif exit iter_loop - elseif (niters == maxiters_chrongear) then - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) 'Glissade PCG solver not converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance + elseif (iter == maxiters_chrongear) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance write(iulog,*) ' ' endif endif @@ -1986,14 +2620,19 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(b_u)) deallocate(b_u, b_v) + if (allocated(x_u)) deallocate(x_u, x_v) + if (allocated(gather_data_row)) deallocate(gather_data_row) + if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_chrongear_3d + end subroutine pcg_solver_chrongear_2d !**************************************************************************** - - subroutine pcg_solver_chrongear_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + + subroutine pcg_solver_chrongear_3d(nx, ny, & + nz, parallel, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -2005,26 +2644,104 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !--------------------------------------------------------------- ! This subroutine uses a Chronopoulos-Gear preconditioned conjugate-gradient - ! algorithm to solve the equation $Ax=b$. (See references in subroutine above.) + ! algorithm to solve the equation $Ax=b$. ! - ! It is similar to subroutine pcg_solver_chrongear_3d, but modified - ! to solve for x and y at a single horizontal level, as in the - ! shallow-shelf approximation. See the comments in that subroutine - ! 
(above) for more details on data structure and solver methods. + ! It is based on the Chronopoulos-Gear PCG solver in the POP ocean model + ! (author Frank Bryan, NCAR). It is a rearranged conjugate gradient solver + ! that reduces the number of global reductions per iteration from two to one + ! (not counting the convergence check). Convergence is checked every + ! {\em linear_solve_ncheck} steps. ! - ! Input and output arrays are located on a structured (i,j) grid - ! as defined in the glissade_velo_higher module. The global matrix - ! is sparse, but its nonzero element are stored in four dense matrices - ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential - ! nonzero elements per node (i,j). + ! References are: ! - ! The current preconditioning options for the solver are + ! Chronopoulos, A.T., A Class of Parallel Iterative Methods Implemented on Multiprocessors, + ! Ph.D. thesis, Technical Report UIUCDCS-R-86-1267, Department of Computer Science, + ! University of Illinois, Urbana, Illinois, pp. 1-116, 1986. + ! + ! Chronopoulos, A.T., and C.W. Gear. s-step iterative methods + ! for symmetric linear systems. J. Comput. Appl. Math., 25(2), + ! 153-168, 1989. + ! + ! Dongarra, J. and V. Eijkhout. LAPACK Working Note 159. + ! Finite-choice algorithm optimization in conjugate gradients. + ! Tech. Rep. ut-cs-03-502. Computer Science Department. + ! University of Tennessee, Knoxville. 2003. + ! + ! D Azevedo, E.F., V.L. Eijkhout, and C.H. Romine. LAPACK Working + ! Note 56. Conjugate gradient algorithms with reduced + ! synchronization overhead on distributed memory multiprocessors. + ! Tech. Rep. CS-93-185. Computer Science Department. + ! University of Tennessee, Knoxville. 1993. + !--------------------------------------------------------------- + ! + ! The input and output arrays are located on a structured (i,j,k) grid + ! as defined in the glissade_velo_higher module. + ! The global matrix is sparse, but its nonzero elements are stored in + ! 
four dense matrices called Auu, Avv, Auv, and Avu. + ! Each matrix has 3x3x3 = 27 potential nonzero elements per node (i,j,k). + ! + ! The current preconditioning options are ! (0) no preconditioning ! (1) diagonal preconditioning - ! (3) local tridiagonal preconditioning - ! (4) global tridiagonal preconditioning - ! The SIA-based preconditioning option is not available for a 2D solve. + ! (2) preconditioning using a physics-based SIA solver + ! + ! For the dome test case with higher-order dynamics, option (2) is best. ! + ! Here is a schematic of the method implemented below for solving Ax = b: + ! + ! Set up preconditioner M + ! work0 = (b,b) + ! bb = global_sum(work0) + ! + ! First pass of algorithm: + ! halo_update(x) + ! r = b - A*x + ! halo_update(r) + ! solve Mz = r for z + ! work(1) = (r,z) + ! d = z + ! q = A*d + ! work(2) = (d,q) + ! halo_update(q) + ! rho_old = global_sum(work(1)) + ! sigma = global_sum(work(2)) + ! alpha = rho_old/sigma + ! x = x + alpha*d + ! r = r - alpha*q + ! + ! Iterative loop: + ! while (not converged) + ! solve Mz = r for z + ! Az = A*z + ! work(1) = (r,z) + ! work(2) = (Az,z) + ! halo_update(Az) + ! rho = global_sum(work(1)) + ! delta = global_sum(work(2)) + ! beta = rho/rho_old + ! sigma = delta - beta^2 * sigma + ! alpha = rho/sigma + ! rho_old = rho + ! d = z + beta*d + ! q = Az + beta*q + ! x = x + alpha*d + ! r = r - alpha*q + ! if (time to check convergence) then + ! r = b - A*x + ! work0 = (r,r) + ! halo_update(r) + ! rr = global_sum(work0) + ! if (sqrt(r,r)/sqrt(b,b) < tolerance) exit + ! endif + ! end while + ! + ! where x = solution vector + ! d = conjugate direction vector + ! r = residual vector + ! M = preconditioning matrix + ! (r,z) = dot product of vectors r and z + ! and similarly for (Az,z), etc. + ! 
!--------------------------------------------------------------- !--------------------------------------------------------------- @@ -2032,8 +2749,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) + nx, ny, & ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) + nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -2041,54 +2759,60 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 + integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & + indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 + logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nx-1,ny-1,9), intent(in) :: & - Auu, Auv, & ! four components of assembled matrix - Avu, Avv ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) - ! 1st and 2nd dimensions = (x,y) indices + real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & + Auu, Auv, Avu, Avv ! four components of assembled matrix + ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) + ! other dimensions = (z,x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! 
= 1 for diagonal preconditioning (best option for SSA-dominated flow) + ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j ! grid indices - integer :: m ! matrix element index - integer :: iter ! iteration counter + integer :: i, j, k, m ! grid indices + integer :: ii, jj + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + integer :: iter ! iteration counter + integer :: maxiters_chrongear ! max number of linear iterations before quitting real(dp) :: & alpha, &! rho/sigma = term in expression for new residual and solution @@ -2101,26 +2825,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & real(dp), dimension(2) :: & gsum ! result of global sum for dot products - ! diagonal matrix elements - real(dp), dimension(nx-1,ny-1) :: & - Adiagu, Adiagv ! diagonal terms of matrices Auu and Avv - - ! tridiagonal matrix elements - real(dp), dimension(:,:), allocatable :: & - Asubdiag_u, Adiag_u, Asupdiag_u, & ! 
matrix entries from Auu for tridiagonal preconditioning - Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning - - real(dp), dimension(:,:), allocatable :: & - omega_u, omega_v, & ! work arrays for tridiagonal solve - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v - - real(dp), dimension(:,:), allocatable :: & - b_u, b_v, x_u, x_v - ! vectors (each of these is split into u and v components) - real(dp), dimension(nx-1,ny-1) :: & + real(dp), dimension(nz,nx-1,ny-1) :: & + Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector zu, zv, &! solution of Mz = r @@ -2128,7 +2835,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & Azu, Azv, &! result of matvec multiply A*z worku, workv ! intermediate results - real(dp), dimension(nx-1,ny-1,2) :: & + real(dp), dimension(nz,nx-1,ny-1,2) :: & work2u, work2v ! intermediate results real(dp) :: & @@ -2138,14 +2845,33 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector = sqrt(b,b) ! solver is converged when L2_resid/L2_rhs < tolerance + real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & + Muu, Mvv ! simplified SIA matrices for preconditioning + + ! arrays for tridiagonal preconditioning + ! Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above + real(dp), dimension(:,:), allocatable :: & - gather_data_row, & ! arrays for gathering data from every task on a row or column - gather_data_col + Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning + Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning - integer :: ilocal, jlocal ! number of locally owned vertices in each direction + real(dp), dimension(:,:), allocatable :: & + omega_u, omega_v, & ! work arrays for tridiagonal solve + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v - integer :: ii, jj - integer :: maxiters_chrongear ! 
max number of linear iterations before quitting + ! Note: These two matrices are global in the EW and NS dimensions, respectively. + ! Each holds 8 pieces of information for each task on each row or column. + ! Since only 2 of these 8 pieces of information change from one iteration to the next, + ! it is more efficient to gather the remaining information once and pass the arrays + ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d + ! and gather all the information every time the subroutine is called. + ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? + + real(dp), dimension(:,:), allocatable :: & + gather_data_row, & ! arrays for gathering data from every task on a row or column + gather_data_col integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -2155,13 +2881,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & tasks_row, & ! number of tasks per row and column for tridiagonal solve tasks_col - integer, parameter :: & - maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, - ! which generally leads to faster convergence than diagonal preconditioning - - !WHL - debug - real(dp) :: usum, usum_global, vsum, vsum_global - !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max @@ -2174,15 +2893,20 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & tasks_row = parallel%tasks_row tasks_col = parallel%tasks_col + ! Note: maxiters_tridiag commented out here, because the BP tridiagonal solver + ! tends not to converge as well as the 2D version. + ! TODO: Make maxiters a config option. + ! Set the maximum number of linear iterations. ! Typically allow up to 200 iterations with diagonal preconditioning, but only 100 ! with tridiagonal, which usually converges faster. - if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. 
precond == HO_PRECOND_TRIDIAG_GLOBAL) then - maxiters_chrongear = maxiters_tridiag - else + !TODO - Test whether maxiters_tridiag (currently = 100) is sufficient for convergence with 3D solver +!! if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. precond == HO_PRECOND_TRIDIAG_GLOBAL) then +!! maxiters_chrongear = maxiters_tridiag +!! else maxiters_chrongear = maxiters - endif +!! endif if (verbose_pcg .and. this_rank == rtest) then write(iulog,*) 'Using native PCG solver (Chronopoulos-Gear)' @@ -2205,8 +2929,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & elseif (precond == HO_PRECOND_DIAG) then - call setup_preconditioner_diag_2d(nx, ny, & - indxA_2d, & + call setup_preconditioner_diag_3d(nx, ny, & + nz, indxA_3d, & Auu, Avv, & Adiagu, Adiagv) @@ -2215,22 +2939,32 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & i = itest j = jtest write(iulog,*) 'i, j, r =', i, j, this_rank - write(iulog,*) 'Au diag =', Adiagu(i,j) - write(iulog,*) 'Av diag =', Adiagv(i,j) + write(iulog,*) 'Auu diag =', Adiagu(:,i,j) + write(iulog,*) 'Avv diag =', Adiagv(:,i,j) endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + elseif (precond == HO_PRECOND_SIA) then - !WHL - debug - if (verbose_tridiag .and. this_rank==rtest) then - i = itest - j = jtest + call setup_preconditioner_sia_3d(nx, ny, & + nz, indxA_3d, & + Auu, Avv, & + Muu, Mvv) + + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' + do i = staggered_ihi, staggered_ilo, -1 write(iulog,*) ' ' - write(iulog,*) 'r, i, j =', this_rank, i, j - write(iulog,*) 'Auu =', Auu(i,j,:) - write(iulog,*) 'Avv =', Avv(i,j,:) - endif + do k = 1, nz + write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) + enddo + enddo ! i + endif + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + ! 
Allocate tridiagonal preconditioning matrices allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) allocate(Asupdiag_u(nx-1,ny-1)) @@ -2243,22 +2977,26 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(omega_v (nx-1,ny-1)) allocate(denom_v (nx-1,ny-1)) - call setup_preconditioner_tridiag_local_2d(& - nx, ny, & - parallel, indxA_2d, & - itest, jtest, rtest, & - Auu, Avv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v) + ! Compute arrays for tridiagonal preconditioning + call setup_preconditioner_tridiag_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v) + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - ! Allocate tridiagonal matrices - ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. - + ! Allocate tridiagonal preconditioning matrices + ! Note: (i,j) indices are switched for the A_v matrices to reduce striding. allocate(Adiag_u (ilocal,jlocal)) allocate(Asubdiag_u(ilocal,jlocal)) allocate(Asupdiag_u(ilocal,jlocal)) @@ -2266,8 +3004,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(denom_u(ilocal,jlocal)) allocate(xuh_u(ilocal,jlocal)) allocate(xlh_u(ilocal,jlocal)) - allocate(b_u(ilocal,jlocal)) - allocate(x_u(ilocal,jlocal)) allocate(Adiag_v (jlocal,ilocal)) allocate(Asubdiag_v(jlocal,ilocal)) @@ -2276,100 +3012,66 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(denom_v(jlocal,ilocal)) allocate(xuh_v(jlocal,ilocal)) allocate(xlh_v(jlocal,ilocal)) - allocate(b_v(jlocal,ilocal)) - allocate(x_v(jlocal,ilocal)) - - ! Compute the entries of the tridiagonal matrices - - ! 
Extract tridiagonal matrix entries from Auu - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements - Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements - Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements - enddo - enddo - - ! compute work arrays for the u solve in each matrix row - call setup_preconditioner_tridiag_global_2d(& - ilocal, jlocal, & -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u) - ! Extract tridiagonal matrix entries from Avv - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements - Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements - Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements - enddo - enddo + ! These two matrices are for gathering data from all tasks on a given row or column. + allocate(gather_data_row(8*tasks_row,jlocal)) + allocate(gather_data_col(8*tasks_col,ilocal)) + gather_data_row = 0.0d0 + gather_data_col = 0.0d0 - ! compute work arrays for the v solve in each matrix column - ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + ! Compute arrays for tridiagonal preconditioning - call setup_preconditioner_tridiag_global_2d(& - jlocal, ilocal, & -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! 
itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v) + call setup_preconditioner_tridiag_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + ilocal, jlocal, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v) endif ! precond - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) write(iulog,*) 'Done in PC setup' - call t_stopf("pcg_precond_init") !---- Initialize scalars and vectors niters = maxiters_chrongear - ru(:,:) = 0.d0 - rv(:,:) = 0.d0 - du(:,:) = 0.d0 - dv(:,:) = 0.d0 - zu(:,:) = 0.d0 - zv(:,:) = 0.d0 - qu(:,:) = 0.d0 - qv(:,:) = 0.d0 - Azu(:,:) = 0.d0 - Azv(:,:) = 0.d0 - worku(:,:) = 0.d0 - workv(:,:) = 0.d0 - work2u(:,:,:) = 0.d0 - work2v(:,:,:) = 0.d0 + ru(:,:,:) = 0.d0 + rv(:,:,:) = 0.d0 + du(:,:,:) = 0.d0 + dv(:,:,:) = 0.d0 + zu(:,:,:) = 0.d0 + zv(:,:,:) = 0.d0 + qu(:,:,:) = 0.d0 + qv(:,:,:) = 0.d0 + Azu(:,:,:) = 0.d0 + Azv(:,:,:) = 0.d0 + worku(:,:,:) = 0.d0 + workv(:,:,:) = 0.d0 + work2u(:,:,:,:) = 0.d0 + work2v(:,:,:,:) = 0.d0 !---- Compute the L2 norm of the RHS vectors !---- (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - worku(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) - workv(:,:) = bv(:,:)*bv(:,:) + worku(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) + workv(:,:,:) = bv(:,:,:)*bv(:,:,:) call t_stopf("pcg_dotprod") ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - bb, & - worku, workv) + bb = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_init") ! take square root @@ -2380,6 +3082,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! 
First pass of algorithm !--------------------------------------------------------------- + iter = 1 + ! Note: The matrix A must be complete for all rows corresponding to locally ! owned nodes, and x must have the correct values in ! halo nodes bordering the locally owned nodes. @@ -2395,9 +3099,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_init") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -2408,8 +3112,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- This is correct for locally owned nodes. call t_startf("pcg_vecupdate") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate") !---- Halo update for residual @@ -2425,177 +3129,81 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! From here on, call timers with 'iter' suffix because this can be considered the first iteration call t_startf("pcg_precond_iter") - if (precond == HO_PRECOND_NONE) then ! no preconditioning + if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:) = ru(:,:) ! PC(r) = r - zv(:,:) = rv(:,:) ! PC(r) = r + zu(:,:,:) = ru(:,:,:) ! PC(r) = r + zv(:,:,:) = rv(:,:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning - ! Solve Mz = r, where M is a diagonal matrix do j = 1, ny-1 do i = 1, nx-1 - if (Adiagu(i,j) /= 0.d0) then - zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A - else - zu(i,j) = 0.d0 - endif - if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) + do k = 1, nz + if (Adiagu(k,i,j) /= 0.d0) then + zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! 
PC(r), where PC is formed from diagonal elements of A else - zv(i,j) = 0.d0 - endif - enddo ! i - enddo ! j - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - i = itest -! write(iulog,*) ' ' -! write(iulog,*) 'zv solve with diagonal precond, this_rank, i =', this_rank, i -! write(iulog,*) 'j, active, Adiagv, rv, zv, xv:' -! do j = staggered_jhi, staggered_jlo, -1 -! write(iulog,'(i4, l4, 2f12.3, e12.3, f12.3)') j, active_vertex(i,j), Adiagv(i,j), rv(i,j), zv(i,j), xv(i,j) -! enddo - endif - - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local - - if (verbose_tridiag .and. this_rank == rtest) then - i = itest - j = jtest - write(iulog,*) 'Residual:' - write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) - write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) - write(iulog,*) ' ' - write(iulog,*) 'jtest =', jtest - write(iulog,*) 'i, ru, rv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) - enddo - endif - - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) 'call tridiag_solver_local_2d' - endif - - ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - - !TODO - Test a local solver that can compute zu and zv in the halo - ! (to avoid the halo update below) - - call tridiag_solver_local_2d(nx, ny, & - parallel, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - ru, rv, & ! right hand side - zu, zv) ! solution - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) - enddo + zu(k,i,j) = 0.d0 endif + if (Adiagv(k,i,j) /= 0.d0) then + zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + else + zv(k,i,j) = 0.d0 + endif + enddo ! k + enddo ! i + enddo ! 
j - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) - - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - - ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - b_u(i,j) = ru(ii,jj) - enddo - enddo - - ! Initialize the array for gathering information on each row of tasks - allocate(gather_data_row(8*tasks_row,jlocal)) - gather_data_row = 0.0d0 - - ! Solve M*z = r, where M is a global tridiagonal matrix - - call tridiag_solver_global_2d(ilocal, jlocal, & - parallel, tasks_row, & - 'row', & ! tridiagonal solve for each row -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u, & - b_u, x_u, & - .true., & ! first_time - gather_data_row) + elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning - ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) - zu(:,:) = 0.0d0 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - zu(ii,jj) = x_u(i,j) - enddo - enddo + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Muu, ru, zu) ! solve Muu*zu = ru for zu - ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Mvv, rv, zv) ! solve Mvv*zv = rv for zv - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - b_v(j,i) = rv(ii,jj) - enddo - enddo + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - ! 
Initialize the array for gathering information on each column of tasks - allocate(gather_data_col(8*tasks_col,ilocal)) - gather_data_col = 0.0d0 + ! Use a local tridiagonal solver to find an approximate solution of A*z = r - call tridiag_solver_global_2d(jlocal, ilocal, & - parallel, tasks_col, & - 'col', & ! tridiagonal solve for each column -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v, & - b_v, x_v, & - .true., & ! first_time - gather_data_col) + call tridiag_solver_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r - ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - zv(:,:) = 0.0d0 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - zv(ii,jj) = x_v(j,i) - enddo - enddo + ! Use a global tridiagonal solver to find an approximate solution of A*z = r - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! 
entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + .true., & ! first_time = T (first iteration) + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r endif ! precond @@ -2604,60 +3212,42 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute intermediate result for dot product (r,z) call t_startf("pcg_dotprod") - work2u(:,:,1) = ru(:,:) * zu(:,:) - work2v(:,:,1) = rv(:,:) * zv(:,:) + work2u(:,:,:,1) = ru(:,:,:) * zu(:,:,:) + work2v(:,:,:,1) = rv(:,:,:) * zv(:,:,:) call t_stopf("pcg_dotprod") !---- Compute the conjugate direction vector d !---- Since z is correct in halo, so is d - du(:,:) = zu(:,:) - dv(:,:) = zv(:,:) + du(:,:,:) = zu(:,:,:) + dv(:,:,:) = zv(:,:,:) !---- Compute q = A*d - !---- q is correct for locally owned nodes, provided d extends one layer into the halo + !---- q is correct for locally owned nodes call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & qu, qv) call t_stopf("pcg_matmult_iter") - !WHL - debug - usum = sum(qu(staggered_ilo:staggered_ihi,staggered_jlo:staggered_jhi)) - usum_global = parallel_reduce_sum(usum) - vsum = sum(qv(staggered_ilo:staggered_ihi,staggered_jlo:staggered_jhi)) - vsum_global = parallel_reduce_sum(vsum) - - if (verbose_pcg .and. this_rank == rtest) then -!! 
write(iulog,*) 'Prep: sum(qu), sum(qv) =', usum_global, vsum_global - endif - !---- Compute intermediate result for dot product (d,q) = (d,Ad) call t_startf("pcg_dotprod") - work2u(:,:,2) = du(:,:) * qu(:,:) - work2v(:,:,2) = dv(:,:) * qv(:,:) + work2u(:,:,:,2) = du(:,:,:) * qu(:,:,:) + work2v(:,:,:,2) = dv(:,:,:) * qv(:,:,:) call t_stopf("pcg_dotprod") !---- Find global sums of (r,z) and (d,q) call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) call t_stopf("pcg_glbsum_iter") - if (verbose_pcg .and. this_rank == rtest) then -!! write(iulog,*) 'Prep: gsum(1), gsum(2) =', gsum(1), gsum(2) - endif - !---- Halo update for q call t_startf("pcg_halo_iter") @@ -2679,227 +3269,112 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- These are correct in halo call t_startf("pcg_vecupdate") - xu(:,:) = xu(:,:) + alpha*du(:,:) - xv(:,:) = xv(:,:) + alpha*dv(:,:) - - ru(:,:) = ru(:,:) - alpha*qu(:,:) ! q = A*d - rv(:,:) = rv(:,:) - alpha*qv(:,:) - call t_stopf("pcg_vecupdate") - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' -!! do i = itest-3, itest+3 - do i = staggered_ilo, staggered_ihi - write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) - enddo ! i - endif - - !--------------------------------------------------------------- - ! Iterate to solution - !--------------------------------------------------------------- - - iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'iter =', iter - endif - - !---- Compute PC(r) = solution z of Mz = r - !---- z is correct in halo - - call t_startf("pcg_precond_iter") - - if (precond == HO_PRECOND_NONE) then ! no preconditioning - - zu(:,:) = ru(:,:) ! 
PC(r) = r - zv(:,:) = rv(:,:) ! PC(r) = r - - elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning - - do j = 1, ny-1 - do i = 1, nx-1 - if (Adiagu(i,j) /= 0.d0) then - zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A - else - zu(i,j) = 0.d0 - endif - if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) - else - zv(i,j) = 0.d0 - endif - enddo ! i - enddo ! j - - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! tridiagonal preconditioning with local solve - - ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - - !TODO - Test a local solver that can compute zu and zv in the halo - ! (to avoid the halo update below) - - !WHL - debug - if (verbose_tridiag .and. this_rank == rtest) then - i = itest - j = jtest -! write(iulog,*) 'Residual:' -! write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) -! write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) -! write(iulog,*) ' ' -! write(iulog,*) 'jtest =', jtest -! write(iulog,*) 'i, ru, rv:' -! do i = staggered_ihi, staggered_ilo, -1 -! write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) -! enddo - endif - - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) ' call tridiag_solver_local_2d' - endif - - call tridiag_solver_local_2d(nx, ny, & - parallel, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - ru, rv, & ! right hand side - zu, zv) ! solution - - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) - - if (verbose_pcg .and. 
this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) - enddo - endif - - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - - !WHL - debug - if (verbose_tridiag .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'jtest =', jtest - write(iulog,*) 'i, ru, rv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) - enddo - endif - - ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) + xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - b_u(i,j) = ru(ii,jj) - enddo - enddo + ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) ! q = A*d + rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) + call t_stopf("pcg_vecupdate") - !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest write(iulog,*) ' ' - write(iulog,*) 'Before global tridiag PC u solve, r, j =', rtest, jtest - write(iulog,*) ' ' - write(iulog,*) 'i, Adiag_u, Asubdiag_u, Asupdiag_u, b_u:' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' do i = itest-3, itest+3 - write(iulog,'(i4, 4e16.8)') i, Adiag_u(i,j), Asubdiag_u(i,j), Asupdiag_u(i,j), b_u(i,j) - enddo + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i endif + !--------------------------------------------------------------- + ! Iterate to solution + !--------------------------------------------------------------- - call tridiag_solver_global_2d(ilocal, jlocal, & - parallel, tasks_row, & - 'row', & ! tridiagonal solve for each row -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! 
jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u, & - b_u, x_u, & - .false., & ! first_time - gather_data_row) + iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) - zu(:,:) = 0.0d0 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - zu(ii,jj) = x_u(i,j) - enddo - enddo + !---- Compute PC(r) = solution z of Mz = r + !---- z is correct in halo - ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + call t_startf("pcg_precond_iter") - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - b_v(j,i) = rv(ii,jj) - enddo - enddo + if (precond == HO_PRECOND_NONE) then ! no preconditioning - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'Before global tridiag PC v solve, r, j =', rtest, jtest - write(iulog,*) ' ' - write(iulog,*) 'i, Adiag_v, Asubdiag_v, Asupdiag_v, b_v:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4e16.8)') i, Adiag_v(i,j), Asubdiag_v(i,j), Asupdiag_v(i,j), b_v(i,j) - enddo - endif + zu(:,:,:) = ru(:,:,:) ! PC(r) = r + zv(:,:,:) = rv(:,:,:) ! PC(r) = r - call tridiag_solver_global_2d(jlocal, ilocal, & - parallel, tasks_col, & - 'col', & ! tridiagonal solve for each column -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v, & - b_v, x_v, & - .false., & ! first_time - gather_data_col) + elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning - ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + do j = 1, ny-1 + do i = 1, nx-1 + do k = 1, nz + if (Adiagu(k,i,j) /= 0.d0) then + zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! 
PC(r), where PC is formed from diagonal elements of A + else + zu(k,i,j) = 0.d0 + endif + if (Adiagv(k,i,j) /= 0.d0) then + zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + else + zv(k,i,j) = 0.d0 + endif + enddo ! k + enddo ! i + enddo ! j - zv(:,:) = 0.0d0 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - zv(ii,jj) = x_v(j,i) - enddo - enddo + elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Muu, ru, zu) ! solve Muu*zu = ru for zu + + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Mvv, rv, zv) ! solve Mvv*zv = rv for zv + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + ! Use a local tridiagonal solver to find an approximate solution of A*z = r + + call tridiag_solver_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + + ! Use a global tridiagonal solver to find an approximate solution of A*z = r + + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! 
entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + .false., & ! first_time = F (iteration 2+) + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r endif ! precond @@ -2907,26 +3382,27 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute Az = A*z !---- This is the one matvec multiply required per iteration - !---- Az is correct for locally owned nodes and needs a halo update (below) + !---- Az is correct for local owned nodes and needs a halo update (below) call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & zu, zv, & Azu, Azv) call t_stopf("pcg_matmult_iter") + !---- Compute intermediate results for the dot products (r,z) and (Az,z) call t_startf("pcg_dotprod") - work2u(:,:,1) = ru(:,:)*zu(:,:) ! terms of dot product (r,z) - work2v(:,:,1) = rv(:,:)*zv(:,:) + work2u(:,:,:,1) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r,z) + work2v(:,:,:,1) = rv(:,:,:)*zv(:,:,:) - work2u(:,:,2) = Azu(:,:)*zu(:,:) ! terms of dot product (A*z,z) - work2v(:,:,2) = Azv(:,:)*zv(:,:) + work2u(:,:,:,2) = Azu(:,:,:)*zu(:,:,:) ! terms of dot product (A*z,z) + work2v(:,:,:,2) = Azv(:,:,:)*zv(:,:,:) call t_stopf("pcg_dotprod") ! Take the global sums of (r,z) and (Az,z) @@ -2934,14 +3410,11 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! this is the one MPI global reduction per iteration. 
call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) call t_stopf("pcg_glbsum_iter") !---- Halo update for Az + !---- This is the one halo update required per iteration call t_startf("pcg_halo_iter") call staggered_parallel_halo(Azu, parallel) @@ -2960,7 +3433,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (alpha /= alpha) then ! alpha is NaN !! write(iulog,*) 'rho, sigma, alpha:', rho, sigma, alpha - call write_log('Chron_Gear PCG solver has failed, alpha = NaN', GM_FATAL) + call write_log('Chron-Gear PCG solver has failed, alpha = NaN', GM_FATAL) endif !---- Update d and q @@ -2968,33 +3441,36 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & call t_startf("pcg_vecupdate") - du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:) = zv(:,:) + beta*dv(:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - qu(:,:) = Azu(:,:) + beta*qu(:,:) - qv(:,:) = Azv(:,:) + beta*qv(:,:) + du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! where beta_(i+1) = -------------------- + ! (r_i, PC(r_i)) + qu(:,:,:) = Azu(:,:,:) + beta*qu(:,:,:) + qv(:,:,:) = Azv(:,:,:) + beta*qv(:,:,:) !---- Update solution and residual !---- These are correct in halo - xu(:,:) = xu(:,:) + alpha*du(:,:) - xv(:,:) = xv(:,:) + alpha*dv(:,:) + xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) + xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) - ru(:,:) = ru(:,:) - alpha*qu(:,:) - rv(:,:) = rv(:,:) - alpha*qv(:,:) + ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) + rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) call t_stopf("pcg_vecupdate") - !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest - write(iulog,*) 'i, xu, xv, ru, rv:' -!! 
do i = staggered_ihi, staggered_ilo, -1 + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) - enddo + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i endif ! Check for convergence every linear_solve_ncheck iterations. @@ -3004,19 +3480,18 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! For convergence check, use r = b - Ax if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then -!! if (mod(iter, linear_solve_ncheck) == 0 .or. iter == linear_solve_ncheck/2) then - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) ' ' - write(iulog,*) ' check convergence, iter =', iter + write(iulog,*) 'Check convergence, iter =', iter endif !---- Compute z = A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -3026,32 +3501,32 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute residual r = b - A*x call t_startf("pcg_vecupdate") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate") !---- Compute dot product (r, r) call t_startf("pcg_dotprod") - worku(:,:) = ru(:,:)*ru(:,:) - workv(:,:) = rv(:,:)*rv(:,:) + worku(:,:,:) = ru(:,:,:)*ru(:,:,:) + workv(:,:,:) = rv(:,:,:)*rv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - rr, & - worku, workv) + rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") - L2_resid = sqrt(rr) ! 
L2 norm of residual - err = L2_resid/L2_rhs ! normalized error + ! take square root + L2_resid = sqrt(rr) - if (verbose_pcg .and. this_rank == rtest) then + ! compute normalized error + err = L2_resid/L2_rhs + + if (verbose_pcg .and. main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err endif + !WHL - debug if (verbose_pcg .and. this_rank == rtest) then ru_max = 0.d0 rv_max = 0.d0 @@ -3059,13 +3534,13 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs(ru(i,j)) > ru_max) then - ru_max = ru(i,j) + if (sum(abs(ru(:,i,j))) > ru_max) then + ru_max = sum(abs(ru(:,i,j))) iu_max = i ju_max = j endif - if (abs(rv(i,j)) > rv_max) then - rv_max = rv(i,j) + if (sum(abs(rv(:,i,j))) > rv_max) then + rv_max = sum(abs(rv(:,i,j))) iv_max = i jv_max = j endif @@ -3081,14 +3556,16 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (err < tolerance) then niters = iter - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Glissade PCG solver has converged, iter =', niters + write(iulog,*) ' ' endif exit iter_loop - elseif (niters == maxiters_chrongear) then - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) 'Glissade PCG solver not converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance + elseif (iter == maxiters_chrongear) then + if (verbose_pcg .and. 
main_task) then + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance + write(iulog,*) ' ' endif endif @@ -3111,12 +3588,10 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) - if (allocated(b_u)) deallocate(b_u, b_v) - if (allocated(x_u)) deallocate(x_u, x_v) if (allocated(gather_data_row)) deallocate(gather_data_row) if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_chrongear_2d + end subroutine pcg_solver_chrongear_3d !**************************************************************************** @@ -3957,6 +4432,7 @@ subroutine setup_preconditioner_tridiag_global_2d(ilocal, jlocal, & !WHL - debug if (verbose_tridiag .and. this_rank == rtest) then + write(iulog,*) write(iulog,*) 'In setup_preconditioner_tridiag_global_2d: itest, jtest, rtest =', itest, jtest, rtest endif @@ -4637,9 +5113,8 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & first_time, gather_data) use glimmer_utils, only: tridiag - use cism_parallel, only: distributed_gather_var_row, distributed_gather_var_col, & - distributed_gather_all_var_row, distributed_gather_all_var_col, & - distributed_scatter_var_row, distributed_scatter_var_col + use cism_parallel, only: gather_var_row, gather_var_col, & + gather_all_var_row, gather_all_var_col, scatter_var_row, scatter_var_col integer, intent(in) :: & ilocal, jlocal ! 
size of input/output arrays; number of locally owned vertices in each direction @@ -4864,11 +5339,11 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & if (tridiag_solver_flag == 'row') then call t_startf("pcg_tridiag_gather_row") - call distributed_gather_all_var_row(outdata, gather_data2, parallel) + call gather_all_var_row(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_row") elseif (tridiag_solver_flag == 'col') then call t_startf("pcg_tridiag_gather_col") - call distributed_gather_all_var_col(outdata, gather_data2, parallel) + call gather_all_var_col(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_col") endif @@ -4879,11 +5354,11 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & if (tridiag_solver_flag == 'row') then call t_startf("pcg_tridiag_gather_row") - call distributed_gather_var_row(outdata, gather_data2, parallel) + call gather_var_row(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_row") elseif (tridiag_solver_flag == 'col') then call t_startf("pcg_tridiag_gather_col") - call distributed_gather_var_col(outdata, gather_data2, parallel) + call gather_var_col(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_col") endif @@ -4998,11 +5473,11 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & if (tridiag_solver_flag == 'row') then call t_startf("pcg_tridiag_scatter_row") - call distributed_scatter_var_row(local_coeffs, global_coeffs, parallel) + call scatter_var_row(local_coeffs, global_coeffs, parallel) call t_stopf ("pcg_tridiag_scatter_row") elseif (tridiag_solver_flag == 'col') then call t_startf("pcg_tridiag_scatter_col") - call distributed_scatter_var_col(local_coeffs, global_coeffs, parallel) + call scatter_var_col(local_coeffs, global_coeffs, parallel) call t_stopf ("pcg_tridiag_scatter_col") endif diff --git a/utils/build/generate_ncvars.py b/utils/build/generate_ncvars.py index 993c2c0a..e791670d 100755 --- a/utils/build/generate_ncvars.py +++ 
b/utils/build/generate_ncvars.py @@ -193,9 +193,9 @@ def write(self,vars): self.print_warning() for l in self.infile.readlines(): for token in self.handletoken: - if l.find(token) is not -1: + if l.find(token) != -1: break - if l.find(token) is not -1: + if l.find(token) != -1: for v in vars.keys(): self.handletoken[token](vars[v]) else: @@ -267,9 +267,9 @@ def write(self,vars): for k in module.keys(): l = l.replace(k.upper(),module[k]) for token in self.handletoken: - if l.find(token) is not -1: + if l.find(token) != -1: break - if l.find(token) is not -1: + if l.find(token) != -1: for v in vars.keys(): self.handletoken[token](vars[v]) elif '!GENVAR_DIMS!' in l: