diff --git a/libglide/glide.F90 b/libglide/glide.F90 index 69f7f5e3..aca02953 100644 --- a/libglide/glide.F90 +++ b/libglide/glide.F90 @@ -384,8 +384,7 @@ subroutine glide_initialise(model) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) ! calculate lower and upper ice surface call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus,model%geometry%lsrf) @@ -503,16 +502,10 @@ subroutine glide_init_state_diagnostic(model, evolve_ice) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) endif ! calving_init - ! Compute total areas of grounded and floating ice - call calc_iareaf_iareag(model%numerics%dew, model%numerics%dns, & - model%geometry%thkmask, & - model%geometry%iareaf, model%geometry%iareag) - ! ------------------------------------------------------------------------ ! ***Part 2: Calculate geometry related fields ! ------------------------------------------------------------------------ @@ -874,13 +867,10 @@ subroutine glide_tstep_p2(model) call glide_prof_start(model,model%glide_prof%ice_mask2) - !TODO - Calculate area and vol separately from glide_set_mask? 
- call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) call glide_prof_stop(model,model%glide_prof%ice_mask2) @@ -909,14 +899,7 @@ subroutine glide_tstep_p2(model) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) - endif ! oldglide = F - - if (.not. oldglide) then ! calculate area of floating and grounded ice - call calc_iareaf_iareag(model%numerics%dew, model%numerics%dns, & - model%geometry%thkmask, & - model%geometry%iareaf, model%geometry%iareag) + model%climate%eus, model%geometry%thkmask) endif ! oldglide = F ! ------------------------------------------------------------------------ diff --git a/libglide/glide_diagnostics.F90 b/libglide/glide_diagnostics.F90 index e49ebb2f..a4571ed1 100644 --- a/libglide/glide_diagnostics.F90 +++ b/libglide/glide_diagnostics.F90 @@ -39,12 +39,14 @@ module glide_diagnostics use cism_parallel, only: this_rank, main_task, lhalo, uhalo, nhalo, & parallel_type, broadcast, & parallel_localindex, parallel_globalindex, & - parallel_reduce_sum, parallel_reduce_max, & + parallel_global_sum, parallel_reduce_max, & parallel_reduce_maxloc, parallel_reduce_minloc, & parallel_is_zero implicit none + logical, parameter :: verbose_diagnostics = .false. + contains subroutine glide_write_diagnostics (model, time, & @@ -64,8 +66,6 @@ subroutine glide_write_diagnostics (model, time, & ! local arguments - logical, parameter :: verbose_diagnostics = .false. - ! debug if (main_task .and. verbose_diagnostics) then write(iulog,*) ' ' @@ -231,8 +231,12 @@ subroutine glide_write_diag (model, time) integer, dimension(model%general%ewn,model%general%nsn) :: & ice_mask, & ! 
= 1 where ice is present with thck > minthck, else = 0 floating_mask, & ! = 1 where ice is present and floating, else = 0 + grounded_mask, & ! = 1 where ice is present and grounded, else = 0 glacier_ice_mask ! = 1 where glacier ice is present, initially and/or currently + integer, dimension(model%general%ewn-1,model%general%nsn-1) :: & + stag_ice_mask ! staggered mask; = 1 if ice_mask = 1 for any of the four neighbors + real(dp), dimension(model%general%upn) :: & temp_diag, & ! Note: sfc temp not included if temps are staggered ! (use artm instead) @@ -273,8 +277,14 @@ subroutine glide_write_diag (model, time) velo_ew_ubound, velo_ns_ubound ! upper bounds for velocity variables real(dp), dimension(model%general%ewn, model%general%nsn) :: & - velo_sfc, & ! surface ice speed - thck_obs ! observed ice thickness, derived from usrf_obs and topg + mass_above_flotation,& ! ice mass above flotation (kg) + thck_obs ! observed ice thickness (m), derived from usrf_obs and topg + + real(dp), dimension(model%general%ewn-1, model%general%nsn-1) :: & + velo_sfc ! surface ice speed (m/s) + + real(dp), dimension(:,:,:), allocatable :: & + local_energy ! internal energy (J) per layer in a column character(len=100) :: message @@ -337,12 +347,26 @@ subroutine glide_write_diag (model, time) ice_mask(i,j) = 1 if (model%geometry%topg(i,j) - model%climate%eus < (-rhoi/rhoo)*model%geometry%thck(i,j)) then floating_mask(i,j) = 1 + grounded_mask(i,j) = 0 else floating_mask(i,j) = 0 + grounded_mask(i,j) = 1 endif else ice_mask(i,j) = 0 floating_mask(i,j) = 0 + grounded_mask(i,j) = 0 + endif + enddo + enddo + + do j = 1, nsn-1 + do i = 1, ewn-1 + if (ice_mask(i,j+1) == 1 .or. ice_mask(i+1,j+1) == 1 .or. & + ice_mask(i,j) == 1 .or. ice_mask(i+1,j) == 1) then + stag_ice_mask(i,j) = 1 + else + stag_ice_mask(i,j) = 0 endif enddo enddo @@ -358,109 +382,75 @@ subroutine glide_write_diag (model, time) call write_log(' ') ! 
total ice area (m^2) - - tot_area = 0.d0 - tot_area_ground = 0.d0 - tot_area_float = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - if (ice_mask(i,j) == 1) then - tot_area = tot_area + cell_area(i,j) - if (floating_mask(i,j) == 1) then - tot_area_float = tot_area_float + cell_area(i,j) - else - tot_area_ground = tot_area_ground + cell_area(i,j) - endif - endif - enddo - enddo - - tot_area = parallel_reduce_sum(tot_area) - tot_area_ground = parallel_reduce_sum(tot_area_ground) - tot_area_float = parallel_reduce_sum(tot_area_float) + tot_area = parallel_global_sum(cell_area, parallel, ice_mask) + tot_area_float = parallel_global_sum(cell_area, parallel, floating_mask) + tot_area_ground = parallel_global_sum(cell_area, parallel, grounded_mask) ! total ice volume (m^3) - - tot_volume = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - if (ice_mask(i,j) == 1) then - tot_volume = tot_volume + model%geometry%thck(i,j) * cell_area(i,j) - endif - enddo - enddo - tot_volume = parallel_reduce_sum(tot_volume) + tot_volume = parallel_global_sum(model%geometry%thck*cell_area, parallel, ice_mask) ! total ice mass (kg) tot_mass = tot_volume * rhoi ! total ice mass above flotation (kg) - tot_mass_above_flotation = 0.d0 - - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - if (ice_mask(i,j) == 1) then - if (floating_mask(i,j) == 0) then ! grounded ice - if (model%geometry%topg(i,j) - model%climate%eus < 0.0d0) then ! grounded below sea level - thck_floating = (-rhoo/rhoi) * (model%geometry%topg(i,j) - model%climate%eus) ! exactly floating - thck_above_flotation = model%geometry%thck(i,j) - thck_floating - tot_mass_above_flotation = tot_mass_above_flotation & - + thck_above_flotation * cell_area(i,j) - else ! grounded above sea level - tot_mass_above_flotation = tot_mass_above_flotation & - + model%geometry%thck(i,j) * cell_area(i,j) - endif + mass_above_flotation = 0.0d0 + do j = 1, nsn + do i = 1, ewn + if (ice_mask(i,j) == 1 .and. 
floating_mask(i,j) == 0) then + if (model%geometry%topg(i,j) - model%climate%eus < 0.0d0) then ! grounded below sea level + thck_floating = (-rhoo/rhoi) * (model%geometry%topg(i,j) - model%climate%eus) ! exactly floating + mass_above_flotation(i,j) = (model%geometry%thck(i,j) - thck_floating) * cell_area(i,j) + else ! grounded above sea level + mass_above_flotation(i,j) = model%geometry%thck(i,j) * cell_area(i,j) endif endif enddo enddo - - tot_mass_above_flotation = tot_mass_above_flotation * rhoi ! convert from m^3 to kg - tot_mass_above_flotation = parallel_reduce_sum(tot_mass_above_flotation) + mass_above_flotation = mass_above_flotation * rhoi + tot_mass_above_flotation = parallel_global_sum(mass_above_flotation, parallel) ! total ice energy relative to T = 0 deg C (J) - - tot_energy = 0.d0 - if (size(model%temper%temp,1) == upn+1) then ! temps are staggered in vertical - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo + local_energy = 0.0d0 + if (size(model%temper%temp,1) == upn+1) then ! temps are staggered in vertical, located at layer centers + allocate(local_energy(model%general%upn-1, model%general%ewn, model%general%nsn)) + do j = 1, nsn + do i = 1, ewn if (ice_mask(i,j) == 1) then - do k = 1, upn-1 - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & - *(model%numerics%sigma(k+1) - model%numerics%sigma(k)) + do k = 1, upn-1 ! (upn-1) layers + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * (model%numerics%sigma(k+1) - model%numerics%sigma(k)) enddo endif enddo enddo - - else ! temps are unstaggered in vertical - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo + else ! temps are unstaggered in vertical, located at layer interfaces + allocate(local_energy(model%general%upn, model%general%ewn, model%general%nsn)) + do j = 1, nsn + do i = 1, ewn if (ice_mask(i,j) == 1) then - ! 
upper half-layer, T = upper sfc temp - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(1,i,j) * cell_area(i,j) & - * 0.5d0 * model%numerics%sigma(2) - do k = 2, upn-1 - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & - * 0.5d0*(model%numerics%sigma(k+1) - model%numerics%sigma(k-1)) + k = 1 ! top interface; assign this temperature to the top half layer + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * 0.5d0 * model%numerics%sigma(k+1) + do k = 2, upn-1 ! interior layers + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * 0.5d0*(model%numerics%sigma(k+1) - model%numerics%sigma(k-1)) enddo - ! lower half-layer, T = lower sfc temp - tot_energy = tot_energy + & - model%geometry%thck(i,j) * model%temper%temp(upn,i,j) * cell_area(i,j) & - * 0.5d0 * (1.0d0 - model%numerics%sigma(upn-1)) + k = upn ! bottom interface; assign this temperature to the bottom half layer + local_energy(k,i,j) = & + model%geometry%thck(i,j) * model%temper%temp(k,i,j) * cell_area(i,j) & + * 0.5d0 * (1.0d0 - model%numerics%sigma(k-1)) endif enddo enddo endif - - tot_energy = tot_energy * rhoi * shci - tot_energy = parallel_reduce_sum(tot_energy) + local_energy = local_energy * rhoi * shci + tot_energy = parallel_global_sum(local_energy, parallel, ice_mask) + deallocate(local_energy) ! mean thickness - if (tot_area > eps) then mean_thck = tot_volume/tot_area else @@ -468,7 +458,6 @@ subroutine glide_write_diag (model, time) endif ! mean temperature - if (tot_volume > eps) then mean_temp = tot_energy/ (rhoi*shci*tot_volume) else @@ -490,19 +479,11 @@ subroutine glide_write_diag (model, time) if (model%options%whichdycore == DYCORE_GLISSADE) then ! 
total surface accumulation/ablation rate (m^3/yr ice) - - tot_acab = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_acab = tot_acab + model%climate%acab_applied(i,j) * cell_area(i,j) - enddo - enddo - - tot_acab = tot_acab * scyr ! convert to m^3/yr - tot_acab = parallel_reduce_sum(tot_acab) + tot_acab = parallel_global_sum(model%climate%acab_applied*cell_area, parallel) + tot_acab = tot_acab * scyr ! convert from m^3/s to m^3/yr ! total surface mass balance flux (kg/s) - tot_smb_flux = tot_acab * rhoi / scyr ! convert m^3/yr to kg/s + tot_smb_flux = tot_acab * rhoi / scyr ! convert m^3/yr to kg/s ! mean accumulation/ablation rate (m/yr) ! Note: This will be only approximate if some ice has melted completely during the time step @@ -513,15 +494,8 @@ subroutine glide_write_diag (model, time) endif ! total basal melting rate (positive for ice loss) - tot_bmlt = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_bmlt = tot_bmlt + model%basal_melt%bmlt_applied(i,j) * cell_area(i,j) - enddo - enddo - - tot_bmlt = tot_bmlt * scyr ! convert to m^3/yr - tot_bmlt = parallel_reduce_sum(tot_bmlt) + tot_bmlt = parallel_global_sum(model%basal_melt%bmlt_applied*cell_area, parallel) + tot_bmlt = tot_bmlt * scyr ! convert from m^3/s to m^3/yr ! total basal mass balance (kg/s, positive for freeze-on, negative for melt) tot_bmb_flux = -tot_bmlt * rhoi / scyr ! convert m^3/yr to kg/s @@ -536,14 +510,7 @@ subroutine glide_write_diag (model, time) ! total calving rate (m^3/yr ice) ! Note: calving%calving_rate has units of m/yr ice - - tot_calving = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_calving = tot_calving + model%calving%calving_rate(i,j) * cell_area(i,j) ! m^3/yr ice - enddo - enddo - tot_calving = parallel_reduce_sum(tot_calving) + tot_calving = parallel_global_sum(model%calving%calving_rate*cell_area, parallel) ! 
total calving mass balance flux (kg/s, negative for ice loss by calving) tot_calving_flux = -tot_calving * rhoi / scyr ! convert m^3/yr to kg/s @@ -559,15 +526,10 @@ subroutine glide_write_diag (model, time) ! total grounding line mass balance flux (< 0 by definition) ! Note: At this point, gl_flux_east and gl_flux_north are already dimensionalized in kg/m/s, ! so tot_gl_flux will have units of kg/s - - tot_gl_flux = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_gl_flux = tot_gl_flux - abs(model%geometry%gl_flux_east(i,j)) * model%numerics%dns & - - abs(model%geometry%gl_flux_north(i,j)) * model%numerics%dew - enddo - enddo - tot_gl_flux = parallel_reduce_sum(tot_gl_flux) + tot_gl_flux = parallel_global_sum(abs(model%geometry%gl_flux_east) * model%numerics%dns & + + abs(model%geometry%gl_flux_north) * model%numerics%dew, & + parallel) + tot_gl_flux = -tot_gl_flux ! negative by definition ! total rate of change of ice mass (kg/s) ! Note: dthck_dt has units of m/s @@ -575,14 +537,8 @@ subroutine glide_write_diag (model, time) ! in successive time steps, instead of summing over dthck_dt. ! Note that dthck_dt does not account for global outflow fluxes (i.e., removal of ice ! near the global boundary in halo updates). - tot_dmass_dt = 0.d0 - do j = lhalo+1, nsn-uhalo - do i = lhalo+1, ewn-uhalo - tot_dmass_dt = tot_dmass_dt + model%geometry%dthck_dt(i,j) * cell_area(i,j) - enddo - enddo + tot_dmass_dt = parallel_global_sum(model%geometry%dthck_dt*cell_area, parallel) tot_dmass_dt = tot_dmass_dt * rhoi ! convert to kg/s - tot_dmass_dt = parallel_reduce_sum(tot_dmass_dt) ! mass conservation error ! Note: For most runs, this should be close to zero. @@ -957,8 +913,8 @@ subroutine glide_write_diag (model, time) + model%velocity%vvel(1,:,:)**2) call glissade_rms_error(& - ewn, nsn, & - ice_mask, & + ewn-1, nsn-1, & + stag_ice_mask, & parallel, & velo_sfc * scyr, & ! m/yr model%velocity%velo_sfc_obs * scyr, & ! 
m/yr @@ -1310,48 +1266,57 @@ subroutine glide_write_diag (model, time) ng = model%glacier%ngdiag - write(message,'(a35,i14)') 'Diagnostic glacier index (RGI) ', & - model%glacier%cism_to_rgi_glacier_id(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + if (ng > 0) then - write(message,'(a35,i14)') 'Diagnostic glacier index (CISM) ', ng - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,i14)') 'Diagnostic glacier index (RGI) ', & + model%glacier%cism_to_rgi_glacier_id(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier area_init (km^2) ', & - model%glacier%area_init(ng) / 1.0d6 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,i14)') 'Diagnostic glacier index (CISM) ', ng + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier area (km^2) ', & - model%glacier%area(ng) / 1.0d6 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier area_init (km^2) ', & + model%glacier%area_init(ng) / 1.0d6 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier area_init_extent (km^2) ', & - model%glacier%area_init_extent(ng) / 1.0d6 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier area (km^2) ', & + model%glacier%area(ng) / 1.0d6 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier volume (km^3) ', & - model%glacier%volume(ng) / 1.0d9 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier area_init_extent (km^2) ', & + model%glacier%area_init_extent(ng) / 1.0d6 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier volume_init (km^3) ', & - model%glacier%volume_init(ng) / 1.0d9 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier volume (km^3) ', & + 
model%glacier%volume(ng) / 1.0d9 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'Glacier volume_init_extent (km^3) ', & - model%glacier%volume_init_extent(ng) / 1.0d9 - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier volume_init (km^3) ', & + model%glacier%volume_init(ng) / 1.0d9 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'mu_star (mm/yr w.e./deg C) ', & - model%glacier%mu_star(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'Glacier volume_init_extent (km^3) ', & + model%glacier%volume_init_extent(ng) / 1.0d9 + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'alpha_snow ', & - model%glacier%alpha_snow(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'mu_star (mm/yr w.e./deg C) ', & + model%glacier%mu_star(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) - write(message,'(a35,f14.6)') 'beta_artm (deg C) ', & - model%glacier%beta_artm(ng) - call write_log(trim(message), type = GM_DIAGNOSTIC) + write(message,'(a35,f14.6)') 'alpha_snow ', & + model%glacier%alpha_snow(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) + + write(message,'(a35,f14.6)') 'beta_artm (deg C) ', & + model%glacier%beta_artm(ng) + call write_log(trim(message), type = GM_DIAGNOSTIC) + + else ! glacier ID = 0 + + write(message,'(a35,i14)') 'Diagnostic glacier index (CISM) ', ng + call write_log(trim(message), type = GM_DIAGNOSTIC) + + endif call write_log(' ') diff --git a/libglide/glide_mask.F90 b/libglide/glide_mask.F90 index c08dfd17..1e4ef585 100644 --- a/libglide/glide_mask.F90 +++ b/libglide/glide_mask.F90 @@ -41,14 +41,11 @@ module glide_mask contains -!TODO - Remove iarea and ivol calculations? They are now computed in glide_write_diag.. - !TODO - Write a new subroutine (in addition to glide_set_mask) to compute mask for staggered grid? 
! This subroutine is now called from glissade_velo_driver with stagthck and stagtopg ! as input arguments. - subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol, & - exec_serial, parallel) + subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask) use glide_types use glimmer_physcon, only : rhoi, rhoo @@ -60,14 +57,9 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol integer, intent(in) :: ewn, nsn ! Grid size real(dp), intent(in) :: eus ! Sea level integer, dimension(:,:), intent(inout) :: mask ! Output mask - real(dp), intent(inout), optional :: ivol, iarea ! Area and volume of ice - - logical, optional :: exec_serial !JEFF If executing in serial in MPI program. - type(parallel_type), optional :: parallel ! info for parallel communication ! local variables integer ew,ns - logical :: exec_serial_flag !Note - This array may not be needed, at least in parallel. @@ -76,21 +68,8 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol integer, dimension(0:ewn+1,0:nsn+1) :: maskWithBounds; - !TODO - What is the exec_serial option? Is it still needed? - - !JEFF Handle exec_serial optional parameter - if ( present(exec_serial) ) then - exec_serial_flag = exec_serial - else - ! Default to off - exec_serial_flag = .FALSE. - endif - mask = 0 - if (present(iarea)) iarea = 0.d0 - if (present(ivol)) ivol = 0.d0 - !Note - This mask is confusing. Wondering if we should replace it by a series of logical masks. ! Would need the following: @@ -138,10 +117,6 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol mask = ior(mask, GLIDE_MASK_LAND) ! GLIDE_MASK_LAND = 4 endwhere - if (present(iarea) .and. present(ivol)) then - call get_area_vol(thck, numerics%dew, numerics%dns, numerics%thklim, iarea, ivol, exec_serial_flag) - end if - !TODO - Replace the following with a halo call for 'mask', with appropriate global BC? 
maskWithBounds = 0 @@ -185,14 +160,9 @@ subroutine glide_set_mask(numerics, thck, topg, ewn, nsn, eus, mask, iarea, ivol end do end do - !JEFF Don't call halo update if running in serial mode - !WHL - I think the halo update will now work in serial mode. - if (.NOT. exec_serial_flag .and. present(parallel)) then - call parallel_halo(mask, parallel) - endif - end subroutine glide_set_mask + subroutine augment_kinbc_mask(mask, kinbcmask) ! Augments the Glide mask with the location of kinematic (dirichlet) boundary @@ -217,83 +187,8 @@ subroutine augment_kinbc_mask(mask, kinbcmask) endwhere end subroutine augment_kinbc_mask - subroutine get_area_vol(thck, dew, dns, thklim, iarea, ivol, exec_serial) - - implicit none - real(dp), dimension(:,:) :: thck - real(dp) :: dew, dns, thklim - real(dp) :: iarea, ivol, sum(2) - logical :: exec_serial - - integer :: i,j - - do i = 1+lhalo, size(thck,1)-uhalo - do j = 1+lhalo, size(thck,2)-uhalo - if (thck(i,j) > thklim ) then - iarea = iarea + 1 - ivol = ivol + thck(i,j) - end if - end do - end do - - iarea = iarea * dew * dns - ivol = ivol * dew * dns - - if (.NOT. exec_serial) then - sum(1) = iarea - sum(2) = ivol - sum = parallel_reduce_sum(sum) - iarea = sum(1) - ivol = sum(2) - endif - - end subroutine get_area_vol - - subroutine calc_iareaf_iareag(dew, dns, mask, iareaf, iareag, exec_serial) - - implicit none - real(dp), intent(in) :: dew, dns - real(dp), intent(out) :: iareaf, iareag - integer, dimension(:,:), intent(in) :: mask - logical, optional :: exec_serial ! If executing in serial in MPI program. - - integer :: i,j - logical :: exec_serial_flag - real(dp) :: sum(2) - - !TODO - exec_serial option may not be needed - if ( present(exec_serial) ) then - exec_serial_flag = exec_serial - else - ! Default to off - exec_serial_flag = .FALSE. 
- endif - - iareaf = 0.d0 - iareag = 0.d0 - - !loop over locally owned scalars - do j = 1+lhalo, size(mask,2)-uhalo - do i = 1+lhalo, size(mask,1)-uhalo - if (GLIDE_IS_FLOAT(mask(i,j))) then - iareaf = iareaf + dew * dns - else if(GLIDE_IS_GROUND_OR_GNDLINE(mask(i,j))) then - iareag = iareag + dew * dns - end if - end do - end do - - if (.NOT. exec_serial_flag) then - sum(1) = iareaf - sum(2) = iareag - sum = parallel_reduce_sum(sum) - iareaf = sum(1) - iareag = sum(2) - endif - - end subroutine calc_iareaf_iareag - subroutine glide_marine_margin_normal(thck, mask, marine_bc_normal, & + subroutine glide_marine_margin_normal(thck, mask, marine_bc_normal, & exec_serial, parallel) !TODO - Remove subroutine glide_marine_margin_normal? Old PBJ routine. diff --git a/libglide/glide_nc_custom.F90 b/libglide/glide_nc_custom.F90 index a672203a..bd464a4f 100644 --- a/libglide/glide_nc_custom.F90 +++ b/libglide/glide_nc_custom.F90 @@ -116,7 +116,8 @@ subroutine glide_nc_filldvars(outfile, model) ! This does not work, in general, when computing on active blocks only, because the local versions ! of model%general%x1 may not span the global domain. ! The revised code calls parallel_put_var to write (x0,y0) and (x1,y1) to the output file. - ! This assumes that x1_global and y1_global were read from the input file and saved in a global array. + ! This assumes that x1_global and y1_global were read from the input file and saved in a global array + ! (e.g., in subroutine glide_io_read). status = parallel_inq_varid(NCO%id,'x1',varid) status = parallel_put_var(NCO%id,varid,model%general%x1_global) @@ -126,8 +127,8 @@ subroutine glide_nc_filldvars(outfile, model) status = parallel_put_var(NCO%id,varid,model%general%y1_global) call nc_errorhandle(__FILE__,__LINE__,status) - ! create the x0 and y0 grids from x1 and y1 - + ! create the x0 and y0 grids from x1 and y1; + ! 
this does not require model%general%x0_global and y0_global to have been filled status = parallel_inq_varid(NCO%id,'x0',varid) do i = 1, global_ewn-1 x0_global(i) = (model%general%x1_global(i) + model%general%x1_global(i+1)) / 2.0d0 diff --git a/libglide/glide_setup.F90 b/libglide/glide_setup.F90 index 1f5fd0a2..e25fcc3e 100644 --- a/libglide/glide_setup.F90 +++ b/libglide/glide_setup.F90 @@ -907,6 +907,7 @@ subroutine handle_ho_options(section, model) call GetValue(section, 'linear_solve_ncheck', model%options%linear_solve_ncheck) call GetValue(section, 'linear_maxiters', model%options%linear_maxiters) call GetValue(section, 'linear_tolerance', model%options%linear_tolerance) + call GetValue(section, 'reproducible_sums', model%options%reproducible_sums) end subroutine handle_ho_options @@ -1128,9 +1129,9 @@ subroutine print_options(model) 'power law ', & 'Coulomb friction law w/ effec press ', & 'Schoof friction law ', & + 'modified Schoof friction law ', & 'min of Coulomb stress and power-law stress (Tsai)', & 'power law using effective pressure ', & - 'simple pattern of beta ', & 'till yield stress (Picard) ' /) character(len=*), dimension(0:1), parameter :: ho_whichbeta_limit = (/ & @@ -1828,8 +1829,8 @@ subroutine print_options(model) if (model%options%use_c_space_factor) then if (model%options%which_ho_babc == HO_BABC_COULOMB_FRICTION .or. & - model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_SCHOOF .or. & - model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_TSAI) then + model%options%which_ho_babc == HO_BABC_SCHOOF .or. & + model%options%which_ho_babc == HO_BABC_TSAI) then write(message,*) 'Multiplying beta by C_space_factor' call write_log(message) else @@ -1878,18 +1879,19 @@ subroutine print_options(model) ! Inversion options - ! Note: Inversion for Cp is currently supported for the Schoof sliding law, Tsai law, and basic power law + ! 
Note: Inversion for Cp is supported for the basic power law plus the Schoof and Tsai laws if (model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION .or. & model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION_BASIN) then - if (model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_SCHOOF .or. & - model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_TSAI .or. & - model%options%which_ho_babc == HO_BABC_POWERLAW) then + if (model%options%which_ho_babc == HO_BABC_POWERLAW .or. & + model%options%which_ho_babc == HO_BABC_SCHOOF .or. & + model%options%which_ho_babc == HO_BABC_MODIFIED_SCHOOF .or. & + model%options%which_ho_babc == HO_BABC_TSAI) then ! inversion for Cp is supported else call write_log('Error, Cp inversion is not supported for this basal BC option') write(message,*) 'Cp inversion is supported for these options: ', & - HO_BABC_COULOMB_POWERLAW_SCHOOF, HO_BABC_COULOMB_POWERLAW_TSAI, HO_BABC_POWERLAW + HO_BABC_POWERLAW, HO_BABC_SCHOOF, HO_BABC_MODIFIED_SCHOOF, HO_BABC_TSAI call write_log(message, GM_FATAL) endif endif @@ -1899,12 +1901,13 @@ subroutine print_options(model) model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION_BASIN) then if (model%options%which_ho_babc == HO_BABC_ZOET_IVERSON .or. & - model%options%which_ho_babc == HO_BABC_PSEUDO_PLASTIC) then + model%options%which_ho_babc == HO_BABC_PSEUDO_PLASTIC .or. & + model%options%which_ho_babc == HO_BABC_MODIFIED_SCHOOF) then ! 
inversion for Cc is supported else call write_log('Error, Cc inversion is not supported for this basal BC option') write(message,*) 'Cc inversion is supported for these options: ', & - HO_BABC_ZOET_IVERSON, HO_BABC_PSEUDO_PLASTIC + HO_BABC_ZOET_IVERSON, HO_BABC_PSEUDO_PLASTIC, HO_BABC_MODIFIED_SCHOOF call write_log(message, GM_FATAL) endif endif @@ -1970,9 +1973,6 @@ subroutine print_options(model) model%basal_hydro%ho_flux_routing_scheme >= size(ho_flux_routing_scheme)) then call write_log('Error, HO flux routing scheme out of range', GM_FATAL) end if - write(message,*) 'ho_flux_routing_scheme : ',model%basal_hydro%ho_flux_routing_scheme, & - ho_flux_routing_scheme(model%basal_hydro%ho_flux_routing_scheme) - call write_log(message) endif write(message,*) 'ho_whicheffecpress : ',model%options%which_ho_effecpress, & @@ -2183,6 +2183,10 @@ subroutine print_options(model) write(message,*) 'linear_tolerance : ',model%options%linear_tolerance call write_log(message) + if (model%options%reproducible_sums) then + call write_log('Global sums will be reproducible') + endif + end if ! DYCORE_GLISSADE if (model%options%whichdycore == DYCORE_GLISSADE .and. & @@ -2195,8 +2199,15 @@ subroutine print_options(model) if (model%options%whichdycore == DYCORE_GLISSADE .and. & (model%options%which_ho_sparse == HO_SPARSE_PCG_STANDARD .or. & model%options%which_ho_sparse == HO_SPARSE_PCG_CHRONGEAR) ) then + if (model%options%reproducible_sums) then + if (model%options%which_ho_precond == HO_PRECOND_TRIDIAG_LOCAL .or. 
& + model%options%which_ho_precond == HO_PRECOND_TRIDIAG_GLOBAL) then + call write_log ('Tridiagonal preconditioners are not supported with reproducible sums.') + call write_log ('Please choose a different preconditioner (e.g., diagonal)', GM_FATAL) + endif + endif write(message,*) 'ho_whichprecond : ',model%options%which_ho_precond, & - ho_whichprecond(model%options%which_ho_precond) + ho_whichprecond(model%options%which_ho_precond) call write_log(message) if (model%options%which_ho_precond < 0 .or. model%options%which_ho_precond >= size(ho_whichprecond)) then call write_log('Error, glissade preconditioner out of range', GM_FATAL) @@ -2694,6 +2705,9 @@ subroutine print_parameters(model) if (model%options%which_ho_babc == HO_BABC_BETA_CONSTANT) then write(message,*) 'uniform beta (Pa yr/m) : ',model%basal_physics%ho_beta_const call write_log(message) + elseif (model%options%which_ho_babc == HO_BABC_BETA_LARGE) then + write(message,*) 'large beta (Pa yr/m) : ',model%basal_physics%ho_beta_large + call write_log(message) elseif (model%options%which_ho_babc == HO_BABC_BETA_BPMP) then write(message,*) 'large (frozen) beta (Pa yr/m) : ',model%basal_physics%ho_beta_large call write_log(message) @@ -2762,7 +2776,7 @@ subroutine print_parameters(model) call write_log(message) write(message,*) 'bed bump wavelength for Coulomb friction law : ', model%basal_physics%coulomb_bump_wavelength call write_log(message) - elseif (model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_SCHOOF) then + elseif (model%options%which_ho_babc == HO_BABC_SCHOOF) then ! Note: The Schoof law typically uses a spatially variable powerlaw_c. ! If so, the value written here is just the initial value. 
write(message,*) 'Cc for Schoof Coulomb law : ', model%basal_physics%coulomb_c_const @@ -2775,7 +2789,24 @@ subroutine print_parameters(model) call write_log(message) write(message,*) 'm exponent for Schoof power law : ', model%basal_physics%powerlaw_m call write_log(message) - elseif (model%options%which_ho_babc == HO_BABC_COULOMB_POWERLAW_TSAI) then + elseif (model%options%which_ho_babc == HO_BABC_MODIFIED_SCHOOF) then + ! Note: This law supports inversion for both Cc and Cp. + ! When inverting, the values here are just the initial values. + write(message,*) 'Cc for modified Schoof law : ', model%basal_physics%coulomb_c_const + call write_log(message) + write(message,*) 'Max Cc : ', model%basal_physics%coulomb_c_max + call write_log(message) + write(message,*) 'Min Cc : ', model%basal_physics%coulomb_c_min + call write_log(message) + write(message,*) 'Cp for modified Schoof law, Pa (m/yr)^(-1/3) : ', model%basal_physics%powerlaw_c_const + call write_log(message) + write(message,*) 'Max Cp : ', model%basal_physics%powerlaw_c_max + call write_log(message) + write(message,*) 'Min Cp : ', model%basal_physics%powerlaw_c_min + call write_log(message) + write(message,*) 'm exponent for power law : ', model%basal_physics%powerlaw_m + call write_log(message) + elseif (model%options%which_ho_babc == HO_BABC_TSAI) then ! Note: The Tsai law typically uses a spatially variable powerlaw_c. ! If so, the value written here is just the initial value. write(message,*) 'Cc for Tsai Coulomb law : ', model%basal_physics%coulomb_c_const @@ -2877,6 +2908,9 @@ subroutine print_parameters(model) write(message,*) 'coulomb_c min : ', & model%basal_physics%coulomb_c_min call write_log(message) + write(message,*) 'coulomb_c const : ', & + model%basal_physics%coulomb_c_const + call write_log(message) write(message,*) 'thickness scale (m) for C_c inversion : ', & model%inversion%babc_thck_scale call write_log(message) @@ -3305,7 +3339,8 @@ subroutine handle_basal_hydro(section, model) ! 
flux routing call GetValue(section, 'ho_flux_routing_scheme', model%basal_hydro%ho_flux_routing_scheme) call GetValue(section, 'const_source', model%basal_hydro%const_source) - call GetValue(section, 'btemp_scale', model%basal_hydro%btemp_scale) + call GetValue(section, 'btemp_flow_scale', model%basal_hydro%btemp_flow_scale) + call GetValue(section, 'btemp_freeze_scale', model%basal_hydro%btemp_freeze_scale) ! effective pressure options and parameters call GetValue(section, 'effecpress_delta', model%basal_hydro%effecpress_delta) @@ -3380,6 +3415,19 @@ subroutine print_basal_hydro(model) model%basal_hydro%ho_flux_routing_scheme >= size(ho_flux_routing_scheme)) then call write_log('Error, HO flux routing scheme out of range', GM_FATAL) end if + if (model%options%reproducible_sums) then + if (model%basal_hydro%ho_flux_routing_scheme /= HO_FLUX_ROUTING_D8) then + write(message,*) 'With reproducible sums, only D8 flux-routing is supported; switching to D8' + model%basal_hydro%ho_flux_routing_scheme = HO_FLUX_ROUTING_D8 + call write_log(message) + endif + if (model%basal_hydro%btemp_freeze_scale > 0.0d0) then + write(message,*) 'With reproducible sums, the flux-routing does not support refreezing;' // & + ' setting btemp_freeze_scale = 0' + call write_log(message) + model%basal_hydro%btemp_freeze_scale = 0.0d0 + endif + endif ! 
reproducible sums write(message,*) 'ho_flux_routing_scheme : ',model%basal_hydro%ho_flux_routing_scheme, & ho_flux_routing_scheme(model%basal_hydro%ho_flux_routing_scheme) call write_log(message) @@ -3387,8 +3435,14 @@ subroutine print_basal_hydro(model) write(message,*) 'constant melt source at the bed (m/yr): ', model%basal_hydro%const_source call write_log(message) endif - if (model%basal_hydro%btemp_scale > 0.0d0) then - write(message,*) 'temp scale (deg C) for frz/thaw transition: ', model%basal_hydro%btemp_scale + if (model%basal_hydro%btemp_flow_scale > 0.0d0) then + write(message,*) 'temp scale (deg C) for flow around frozen bed: ', & + model%basal_hydro%btemp_flow_scale + call write_log(message) + endif + if (model%basal_hydro%btemp_freeze_scale > 0.0d0) then + write(message,*) 'temp scale (deg C) for refreezing at the bed: ', & + model%basal_hydro%btemp_freeze_scale call write_log(message) endif if (model%options%which_ho_effecpress == HO_EFFECPRESS_BWAT) then @@ -3995,12 +4049,12 @@ subroutine define_glide_restart_variables(model, model_id) ! basal sliding option select case (options%which_ho_babc) !WHL - Removed effecpress as a restart variable; it is recomputed with each velocity solve. -!! case (HO_BABC_POWERLAW, HO_BABC_COULOMB_FRICTION, HO_BABC_COULOMB_POWERLAW_SCHOOF) +!! case (HO_BABC_POWERLAW, HO_BABC_COULOMB_FRICTION, HO_BABC_SCHOOF) !! ! These friction laws need effective pressure !! call glide_add_to_restart_variable_list('effecpress', model_id) !! case(HO_BABC_COULOMB_POWERLAW_TSAI) !! call glide_add_to_restart_variable_list('effecpress', model_id) - case (HO_BABC_COULOMB_FRICTION, HO_BABC_COULOMB_POWERLAW_SCHOOF, HO_BABC_COULOMB_POWERLAW_TSAI) + case (HO_BABC_COULOMB_FRICTION, HO_BABC_SCHOOF, HO_BABC_TSAI) ! Note: These options compute beta internally, so it does not need to be in the restart file. if (options%use_c_space_factor) then ! 
c_space_factor needs to be in the restart file diff --git a/libglide/glide_types.F90 b/libglide/glide_types.F90 index 0be4b0fe..e52641f1 100644 --- a/libglide/glide_types.F90 +++ b/libglide/glide_types.F90 @@ -249,7 +249,7 @@ module glide_types integer, parameter :: HO_THERMAL_AFTER_TRANSPORT = 1 integer, parameter :: HO_THERMAL_SPLIT_TIMESTEP = 2 - !TODO - Deprecate the last two options? Rarely if ever used. + !TODO - Deprecate some little-used options? integer, parameter :: HO_BABC_BETA_CONSTANT = 0 integer, parameter :: HO_BABC_BETA_BPMP = 1 integer, parameter :: HO_BABC_PSEUDO_PLASTIC = 2 @@ -261,10 +261,10 @@ module glide_types integer, parameter :: HO_BABC_ISHOMC = 8 integer, parameter :: HO_BABC_POWERLAW = 9 integer, parameter :: HO_BABC_COULOMB_FRICTION = 10 - integer, parameter :: HO_BABC_COULOMB_POWERLAW_SCHOOF = 11 - integer, parameter :: HO_BABC_COULOMB_POWERLAW_TSAI = 12 - integer, parameter :: HO_BABC_POWERLAW_EFFECPRESS = 13 - integer, parameter :: HO_BABC_SIMPLE = 14 + integer, parameter :: HO_BABC_SCHOOF = 11 + integer, parameter :: HO_BABC_MODIFIED_SCHOOF = 12 + integer, parameter :: HO_BABC_TSAI = 13 + integer, parameter :: HO_BABC_POWERLAW_EFFECPRESS = 14 integer, parameter :: HO_BABC_YIELD_PICARD = 15 integer, parameter :: HO_BETA_LIMIT_ABSOLUTE = 0 @@ -851,10 +851,10 @@ module glide_types !> \item[8] beta field as prescribed for ISMIP-HOM test C (serial only) !> \item[9] power law !> \item[10] Coulomb friction law using effective pressure, with flwa from lowest ice layer - !> \item[11] Coulomb friction law using effective pressure, with constant basal flwa - !> \item[12] basal stress is the minimum of Coulomb and power-law values, as in Tsai et al. 
(2015) - !> \item[13] power law using effective pressure - !> \item[14] simple hard-coded pattern (useful for debugging) + !> \item[11] Schoof law that blends powerlaw and Coulomb behavior + !> \item[12] modified version of the Schoof law + !> \item[13] basal stress is the minimum of Coulomb and power-law values, as in Tsai et al. (2015) + !> \item[14] power law using effective pressure !> \item[15] treat beta value as a till yield stress (in Pa) using Picard iteration !> \end{description} @@ -1157,6 +1157,10 @@ module glide_types real(dp) :: linear_tolerance = 1.0d-08 !> error tolerance for linear solver + logical :: reproducible_sums = .false. + !> if true, then compute reproducible global sums + !> (independent of the number of tasks) + ! The remaining options are not currently supported !integer :: which_bproc = 0 @@ -2154,7 +2158,11 @@ module glide_types !TODO - Add visc_water and omega_hydro? Currently set in glissade_basal_water module real(dp) :: const_source = 0.0d0 !> constant melt source at the bed (m/yr) !> could be used to represent an englacial or surface source - real(dp) :: btemp_scale = 0.0d0 !> temperature scale (degC) for transition between thawed and frozen bed + real(dp) :: btemp_flow_scale = 0.0d0 !> temperature scale (degC) for transition between thawed and frozen bed; + !> used to route flow away from cells with a frozen bed; + !> btemp_scale = 0 => temperature-independent flow + real(dp) :: btemp_freeze_scale = 0.0d0 !> temperature scale (degC) for transition between thawed and frozen bed; + !> used to refreeze water beneath cells with a frozen bed; !> btemp_scale = 0 => temperature-independent flow ! parameters for macroporous sheet real(dp) :: bwat_threshold = 1.0d-3 !> scale over which N ramps down from overburden to a small value (m) @@ -2196,7 +2204,7 @@ module glide_types !> Note: Defined on velocity grid, whereas temp and bpmp are on ice grid - ! 
Note: c_space_factor supported for which_ho_babc = HO_BABC_COULOMB_FRICTION, *COULOMB_POWERLAW_SCHOOF AND *COULOMB_POWERLAW_TSAI + ! Note: c_space_factor supported for which_ho_babc = HO_BABC_COULOMB_FRICTION, *SCHOOF AND *TSAI real(dp), dimension(:,:), pointer :: c_space_factor => null() !> spatial factor for basal shear stress (no dimension) real(dp), dimension(:,:), pointer :: c_space_factor_stag => null() !> spatial factor for basal shear stress on staggered grid @@ -2238,7 +2246,7 @@ module glide_types coulomb_c_hi => null(), & !> coulomb_c value at high bed elevation, topg >= bed_hi coulomb_c_lo => null() !> coulomb_c value at low bed elevation, topg <= bed_lo - ! parameters for power law, taub_b = C * u_b^(1/m); used for HO_BABC_COULOMB_POWERLAW_TSAI/SCHOOF + ! parameters for power law, taub_b = C * u_b^(1/m); used for HO_BABC_SCHOOF AND *_TSAI ! The default values are from Asay-Davis et al. (2016). ! The value of powerlaw_c suggested by Tsai et al. (2015) is 7.624d6 Pa m^(-1/3) s^(1/3). ! This value can be converted to CISM units by dividing by scyr^(1/3), to obtain 2.413d4 Pa m^(-1/3) yr^(1/3). diff --git a/libglide/isostasy_elastic.F90 b/libglide/isostasy_elastic.F90 index 60f2f55e..d309137d 100644 --- a/libglide/isostasy_elastic.F90 +++ b/libglide/isostasy_elastic.F90 @@ -136,7 +136,7 @@ subroutine calc_elastic(& !> the load for simulations on more than one task. use cism_parallel, only: this_rank, main_task, & - parallel_type, distributed_gather_var, distributed_scatter_var, parallel_halo + parallel_type, gather_var, scatter_var, parallel_halo implicit none @@ -182,8 +182,8 @@ subroutine calc_elastic(& ! Gather the local arrays onto the main task ! 
Note: global arrays are allocated in the subroutine - call distributed_gather_var(load_factors, load_factors_global, parallel) - call distributed_gather_var(load, load_global, parallel) + call gather_var(load_factors, load_factors_global, parallel) + call gather_var(load, load_global, parallel) if (main_task) then do j = 1, global_nsn @@ -207,9 +207,9 @@ subroutine calc_elastic(& ! Scatter the load values back to local arrays ! Note: The global array is deallocated in the subroutine - call distributed_scatter_var(load, load_global, parallel) + call scatter_var(load, load_global, parallel) - ! distributed_scatter_var does not update the halo, so do an update here + ! scatter_var does not update the halo, so do an update here call parallel_halo(load, parallel) ! Deallocate the other global array (which is intent(in) and does not need to be scattered) @@ -251,7 +251,7 @@ subroutine init_rbel(rbel, a) use isostasy_kelvin implicit none type(isos_elastic) :: rbel !> structure holding elastic litho data - real(dp), intent(in) :: a !> radius of disk + real(dp), intent(in) :: a !> radius of disk real(dp) :: dummy_a diff --git a/libglimmer/cism_reprosum_mod.F90 b/libglimmer/cism_reprosum_mod.F90 new file mode 100644 index 00000000..1b4161a8 --- /dev/null +++ b/libglimmer/cism_reprosum_mod.F90 @@ -0,0 +1,2084 @@ +module cism_reprosum_mod +!------------------------------------------------------------------------ +! +! Purpose: +! Compute reproducible global sums of a set of arrays across an MPI +! subcommunicator +! +! Methods: +! Compute using either or both a scalable, reproducible algorithm and a +! scalable, nonreproducible algorithm: +! * Reproducible (scalable): +! Convert each floating point summand to an integer vector +! representation, to enable reproducibility when using +! MPI_Allreduce, then convert the resulting global sum back to a +! floating point representation locally; +! * Alternative usually reproducible (scalable): +! 
Use parallel double-double algorithm due to Helen He and +! Chris Ding, based on David Bailey's/Don Knuth's DDPDD algorithm; +! * Nonreproducible (scalable): +! Floating point and MPI_Allreduce based. +! If computing both reproducible and nonreproducible sums, compare +! these and report relative difference (if absolute difference +! less than sum) or absolute difference back to calling routine. +! +! Author: P. Worley (based on suggestions from J. White for integer +! vector algorithm and on He/Ding paper for DDPDD +! algorithm) +! +! William Lipscomb, Jan. 2026: +! Renamed this module to cism_reprosum_mod. Modified as follows to build +! without using CESM shared code: +! * Replaced some use statements +! * Changed 'shr' to 'cism' to avoid name conflicts with shared code +! * Output directed to iulog (declared in glimmer_paramets) +! * Added some optional debugging statements +! I started from a version that includes some logic fixes and code cleanup +! done by Pat Worley in 2023. Pat's revised version differs from the code +! in the CESM repo as of Jan. 2026. +! See here for information on Pat's mods: +! * https://github.com/E3SM-Project/E3SM/pull/5534 +! * https://github.com/E3SM-Project/E3SM/pull/5549 +! * https://github.com/E3SM-Project/E3SM/pull/5560 +!------------------------------------------------------------------------ + +!------------------------------------------------------------------------ +!- use statements ------------------------------------------------------- +!------------------------------------------------------------------------ + !WHL - Use CISM modules instead of shared modules +!#if ( defined noI8 ) +! ! Workaround for when shr_kind_i8 is not supported. +! use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i4 +!#else +! use shr_kind_mod, only: r8 => shr_kind_r8, i8 => shr_kind_i8 +!#endif +! use shr_log_mod, only: s_loglev => shr_log_Level +! use shr_log_mod, only: s_logunit => shr_log_Unit +! 
use shr_sys_mod, only: shr_sys_abort +! use shr_infnan_mod,only: shr_infnan_inf_type, assignment(=), & +! shr_infnan_posinf, shr_infnan_neginf, & +! shr_infnan_nan, & +! shr_infnan_isnan, shr_infnan_isinf, & +! shr_infnan_isposinf, shr_infnan_isneginf +!#ifndef EAMXX_STANDALONE +! use perf_mod +!#endif + use glimmer_global, only: r8 => dp + use glimmer_global, only: i8 + use glimmer_paramets, only: iulog + use profile, only: t_startf, t_stopf !WHL - replace with perf_mod? + ! end WHL mods + + ! Import MPI fcns/types + use mpi + +!------------------------------------------------------------------------ +!- module boilerplate --------------------------------------------------- +!------------------------------------------------------------------------ + implicit none + private + + save + +!------------------------------------------------------------------------ +! Public interfaces ----------------------------------------------------- +!------------------------------------------------------------------------ + !WHL - cism_reprosum_setopts is not currently called + public :: & + cism_reprosum_setopts, &! set runtime options + cism_reprosum_calc, &! calculate distributed sum + cism_reprosum_tolExceeded ! utility function to check relative + ! differences against the tolerance + +!------------------------------------------------------------------------ +! Public data ----------------------------------------------------------- +!------------------------------------------------------------------------ + logical, public :: cism_reprosum_recompute = .false. + + real(r8), public :: cism_reprosum_reldiffmax = -1.0_r8 + + logical, parameter, public :: verbose_reprosum = .false. + +!------------------------------------------------------------------------ +! Private interfaces ---------------------------------------------------- +!------------------------------------------------------------------------ + private :: & + ddpdd, &! double-double sum routine + split_indices ! 
split indices among OMP threads + +!------------------------------------------------------------------------ +! Private data ---------------------------------------------------------- +!------------------------------------------------------------------------ + + !--------------------------------------------------------------------- + ! cism_reprosum_mod options + !--------------------------------------------------------------------- + logical :: repro_sum_use_ddpdd = .false. + + !WHL - Should this code be declared? + ! Not sure what EAMXX_STANDALONE means +#ifdef EAMXX_STANDALONE + ! Declare the C function interface + interface + subroutine cism_reprosumx86_fix_start(arg) bind(c) + use iso_c_binding + integer, intent(out) :: arg + end subroutine cism_reprosumx86_fix_start + end interface + + interface + subroutine cism_reprosumx86_fix_end(arg) bind(c) + use iso_c_binding + integer, intent(in) :: arg + end subroutine cism_reprosumx86_fix_end + end interface +#endif + + CONTAINS + +! +!======================================================================== +! + subroutine cism_reprosum_setopts(repro_sum_use_ddpdd_in, & + repro_sum_rel_diff_max_in, & + repro_sum_recompute_in, & + repro_sum_master, & + repro_sum_logunit ) + +!------------------------------------------------------------------------ +! Purpose: Set runtime options +! Author: P. Worley +!------------------------------------------------------------------------ +!------------------------------Arguments--------------------------------- + ! Use DDPDD algorithm instead of integer vector algorithm + logical, intent(in), optional :: repro_sum_use_ddpdd_in + real(r8), intent(in), optional :: repro_sum_rel_diff_max_in + ! recompute using different algorithm when difference between + ! reproducible and nonreproducible sums is too great + logical, intent(in), optional :: repro_sum_recompute_in + ! flag indicating whether this MPI task should output + ! 
log messages + logical, intent(in), optional :: repro_sum_master + ! unit number for log messages + integer, intent(in), optional :: repro_sum_logunit +!---------------------------Local Workspace------------------------------ + integer logunit ! unit number for log messages + logical master ! local master? + logical,save :: firstcall = .true. ! first call + integer :: ierr ! MPI error return +!------------------------------------------------------------------------ + + if ( present(repro_sum_master) ) then + master = repro_sum_master + else + master = .false. + endif + + if ( present(repro_sum_logunit) ) then + logunit = repro_sum_logunit + else +!! logunit = s_logunit + logunit = iulog + endif + + if (.not. firstcall) then + !WHL mod +! write(logunit,*) 'shr_reprosum_setopts: ERROR can only be called once' +! call shr_sys_abort('shr_reprosum_setopts ERROR: multiple calls') + write(logunit,*) 'cism_reprosum_setopts: ERROR can only be called once' + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + ! end WHL mod + endif + firstcall = .false. 
+ + if ( present(repro_sum_use_ddpdd_in) ) then + repro_sum_use_ddpdd = repro_sum_use_ddpdd_in + endif + if ( present(repro_sum_rel_diff_max_in) ) then + cism_reprosum_reldiffmax = repro_sum_rel_diff_max_in + endif + if ( present(repro_sum_recompute_in) ) then + cism_reprosum_recompute = repro_sum_recompute_in + endif + if (master) then + if ( repro_sum_use_ddpdd ) then + write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& + 'Using double-double-based (scalable) usually reproducible ', & + 'distributed sum algorithm' + else + write(logunit,*) 'cism_REPROSUM_SETOPTS: ',& + 'Using integer-vector-based (scalable) reproducible ', & + 'distributed sum algorithm' + endif + + if (cism_reprosum_reldiffmax >= 0.0_r8) then + write(logunit,*) ' ',& + 'with a maximum relative error tolerance of ', & + cism_reprosum_reldiffmax + if (cism_reprosum_recompute) then + write(logunit,*) ' ',& + 'If tolerance exceeded, sum is recomputed using ', & + 'a serial algorithm.' + else + write(logunit,*) ' ',& + 'If tolerance exceeded, integer-vector-based sum is used ', & + 'but a warning is output.' + endif + else + write(logunit,*) ' ',& + 'and not comparing with floating point algorithms.' + endif + + endif + end subroutine cism_reprosum_setopts + +! +!======================================================================== +! + + subroutine cism_reprosum_calc(arr, arr_gsum, nsummands, dsummands, & + nflds, ddpdd_sum, & + arr_gbl_max, arr_gbl_max_out, & + arr_max_levels, arr_max_levels_out, & + gbl_max_nsummands, gbl_max_nsummands_out,& + gbl_count, repro_sum_validate, & + repro_sum_stats, rel_diff, commid ) +!------------------------------------------------------------------------ +! +! Purpose: +! Compute the global sum of each field in 'arr' using the indicated +! communicator with a reproducible yet scalable implementation based +! on first converting each floating point summand into an equivalent +! representation using a vector of integers, summing the integer +! 
vectors, then converting the resulting sum back to a floating point +! representation. An alternative is to use an 'almost always +! reproducible' floating point algorithm (DDPDD), as described below. +! +! Description of integer vector algorithm: +!----------------------------------------- +! The basic idea is to represent the mantissa of each floating point +! value as an integer, add these integers, and then convert back to a +! floating point value. For a real*8 value, there are enough digits in +! an integer*8 variable to not lose any information (in the +! mantissa). However, each of these integers would have a different +! implicit exponent if done in a naive way, and so the sum would not +! be accurate. Also, even with the same 'normalization', the sum might +! exceed the maximum value representable by an integer*8, causing +! an overflow. Instead, a vector of integers is generated, where a +! given element (or level using the terminology used in the code) of +! the vector is associated with a particular exponent. The mantissa +! for a given floating point value is then converted to some number of +! integer values, depending on the exponent of the floating point +! value, the normalization of its mantissa, the maximum number of +! summands, the number of participating MPI tasks and of OpenMP +! threads, and the exponents associated with the levels of the integer +! vector, and added into the appropriate levels of the integer +! vector. Each MPI task has its own integer vector representing the +! local sum. This is then summed across all participating MPI tasks +! using an MPI_Allreduce, and, lastly, converted back to a floating +! point value. Note that the same approach works for a vector of +! integer*4 variables, simply requiring more levels, both for the full +! summation vector and for each individual real*8 summand. This is a +! compile time option in the code, in support of systems for which the +! 
compiler or MPI library has issues when using integer*8. As +! implemented, this algorithm should work for any floating point and +! integer type as long as they share the same base. The code is +! written as if for real*8 and integer*8 variables, but the only +! dependence is on the types 'r8' and 'i8', which are defined in the +! code, currently with reference to the corresponding types in +! shr_kind_mod. This is how integer*4 support is implemented, by +! defining i8 to be shr_kind_i4 instead of shr_kind_i8. + ! !WHL - The CISM types are declared in glimmer_global. +! For this to work, each MPI task must have the same number of levels +! and same implicit exponent for each level. These levels must be +! sufficient to represent the smallest and largest nonzero individual +! summands (in absolute value) and the largest possible intermediate +! sum, including the final sum. Most of the complexity in the +! algorithm is in identifying the number of levels, the exponent +! associated with each level, and the appropriate levels to target +! when converting a floating point value into its integer vector +! representation. There are also some subtleties in reconstructing the +! final sum from the integer vector, as described below. For each +! floating point value, the exponent and mantissa are extracted using +! the fortran intrinsics 'exponent' and 'fraction'. The mantissa is +! then 'shifted' to match the exponent for a target level in the +! integer vector using the 'scale' intrinsic. 'int(X,i8)' is used +! for the conversion for the given level, and subtraction between +! this integer and the original 'shifted' value identifies the +! remainder that will be converted to an integer for the next level +! in the vector. The logic continues until the remainder is zero. As +! mentioned above, the only requirement, due to the implementation +! using these fortran intrinsics, is that floating point and integer +! models use the same base, e.g. +! 
radix(1.0_r8) == radix(1_i8) +! for real*8 and integer*8. If not, then the alternative algorithm +! DDPDD mentioned above and described below is used instead. The +! integer representation must also have enough digits for the +! potential growth of the sum for each level, so could conceivably be +! too small for a large number of summands. +! +! Upper bounds on the total number of summands and on all intermediate +! sums are calculated as +! * +! and +! * +! * +! respectively. The maximum number of summands per MPI task and the +! maximum absolute value over all nonzero summands are global +! information that need to be determined with additional MPI +! collectives. The minimum nonzero absolute value summand is also +! global information. Fortunately, all of these can be determined with +! a single MPI_Allreduce call, so only one more than that required for +! the sum itself. (Note that, in actuality, the exponents of max and +! min summands are determined, and these are used to calculate bounds +! on the maximum and minimum, allowing the use of an MPI_INTEGER +! vector in the MPI_Allreduce call.) +! +! The actual code is made a little messier by (a) supporting summation +! of multiple fields without increasing the number of MPI_Allreduce +! calls, (b) supporting OpenMP threading of the local arithmetic, (c) +! allowing the user to specify estimates for the global information +! (to avoid the additional MPI_Allreduce), (d) including a check of +! whether user specified bounds were sufficient and, if not, +! determining the actual bounds and recomputing the sum, and (e) +! allowing the user to specify the maximum number of levels to use, +! potentially losing accuracy but still preserving reproducibility and +! being somewhat cheaper to compute. +! +! The conversion of the local summands to vectors of integers, the +! summation of the local vectors of integers, and the summation of the +! distributed vectors of integers will be exact (if optional parameters +! 
are not used to decrease the accuracy - see below). However, the +! conversion of the vector of integer representation to a floating +! point value may be subject to rounding errors. Before the +! conversion, the vector of integers is adjusted so that all elements +! have the same sign, and so that the value, in absolute value, at a +! given level is strictly less than what can be represented at the +! next lower level (larger exponent) and strictly greater than what +! can represented at the next higher level (smaller exponent). Since +! all elements have the same sign, the sign is set to positive +! temporarily and then restored when the conversion to floating point +! is complete. These are all integer operations, so no accuracy is +! lost. These adjustments eliminate the possibility of catastrophic +! cancellation. Also, when converting the individual elements to +! floating point values and summing them, the summation is now +! equivalent to concatenating the digits in the mantissas for the +! component summands. In consequence, in the final step when each +! element of this modified vector of integers is converted to a +! floating point value and added into the intermediate sum, any +! rounding is limited to the least significant digit representable +! in the final floating point sum. +! +! Any such rounding error will be sensitive to the particular floating +! values generated from the integer vector, and so will be +! sensitive to the number of levels in the vector and the implicit +! exponent associated with each level, which are themselves functions +! of the numbers of MPI tasks and OpenMP threads and the number of +! digits representable in an integer. To avoid this sensitivity, +! (effectively) generate a new integer vector in which each component +! integer has a fixed number of significant digits (e.g., +! digits(1.0_r8)) and generate the floating point values from these +! before summing. (See comments in code for more details.) This +! 
creates a sequence of floating point values to be summed that are +! independent of, for example, the numbers of MPI tasks and OpenMP +! threads or whether using integer*8 or integer*4 internal +! representations in the integer vector, and thus ensure +! reproducibility with respect to these options. +! +! Description of optional parameters for integer vector algorithm: +!----------------------------------------------------------------- +! The accuracy of the integer vector algorithm is controlled by the +! total number of levels of integer expansion. The algorithm +! calculates the number of levels that is required for the sum to be +! essentially exact. (The sum as represented by the integer expansion +! is exact, but roundoff may perturb the least significant digit of +! the returned floating point representation of the sum.) The optional +! parameter arr_max_levels can be used to override the calculated +! value for each field. The optional parameter arr_max_levels_out can +! be used to return the values used. +! +! The algorithm requires an upper bound on the maximum summand +! (in absolute value) for each field, and will calculate this internally +! using an MPI_Allreduce. However, if the optional parameters +! arr_max_levels and arr_gbl_max are both set, then the algorithm will +! use the values in arr_gbl_max for the upper bounds instead. If only +! arr_gbl_max is present, then the maxima are computed internally +! (and the specified values are ignored). The optional parameter +! arr_gbl_max_out can be used to return the values used. +! +! The algorithm also requires an upper bound on the number of +! local summands across all MPI tasks. (By definition, the number of +! local summands is the same for each field on a given MPI task, i.e., +! the input parameter nsummands.) This will be calculated internally, +! using an MPI_Allreduce, but the value in the optional argument +! gbl_max_nsummands will be used instead if (1) it is present, +! 
(2) the value is > 0, and (3) the maximum values and required number +! of levels are also specified. (If the maximum values are calculated, +! then the same MPI_Allreduce is used to determine the maximum numbers +! of local summands.) The accuracy of the user-specified value is not +! checked. However, if set to < 1, the value will instead be calculated. +! If the optional parameter gbl_max_nsummands_out is present, +! then the value used (gbl_max_nsummands if >= 1; calculated otherwise) +! will be returned. +! +! If the user-specified upper bounds on maximum summands are +! inaccurate or if the user-specified upper bounds (maximum summands +! and number of local summands) and numbers of levels causes +! any of the global sums to have fewer than the expected +! number of significant digits, and if the optional parameter +! repro_sum_validate is NOT set to .false., then the algorithm will +! repeat the computations with internally calculated values for +! arr_max_levels, arr_gbl_max, and gbl_max_nsummands. +! +! If requested (by setting cism_reprosum_reldiffmax >= 0.0 and passing in +! the optional rel_diff parameter), results are compared with a +! nonreproducible floating point algorithm. +! +! Note that the cost of the algorithm is not strongly correlated with +! the number of levels, which primarily shows up as a (modest) increase +! in the cost of the MPI_Allreduce as a function of vector length. +! Rather the cost is more a function of (a) the number of integers +! required to represent an individual summand and (b) the number of +! MPI_Allreduce calls. The number of integers required to represent an +! individual summand is 1 or 2 when using 8-byte integers for 8-byte +! real summands when the number of local summands and number of MPI +! tasks are not too large. As the magnitude of either of these increase, +! the number of integers required increases. The number of +! MPI_Allreduce calls is either 2 (specifying nothing or just +! 
arr_max_levels and arr_gbl_max correctly) or 1 (specifying +! gbl_max_nsummands, arr_max_levels, and arr_gbl_max correctly). +! When specifying arr_max_nsummands, arr_max_levels, or arr_gbl_max +! incorrectly, 3 or 4 MPI_Allreduce calls will be required. +! +! Description of alternative (DDPDD) algorithm: +!---------------------------------------------- +! The alternative algorithm is a minor modification of a parallel +! implementation of David Bailey's routine DDPDD by Helen He +! and Chris Ding. See, for example, +! Y. He, and C. Ding, 'Using Accurate Arithmetics to Improve +! Numerical Reproducibility and Stability in Parallel Applications,' +! J. Supercomputing, vol. 18, no. 3, 2001, pp. 259–277 +! and the citations therein. Bailey uses the Knuth trick to implement +! quadruple precision summation of double precision values with 10 +! double precision operations. The advantage of this algorithm is that +! it requires a single MPI_Allreduce and is less expensive per summand +! than is the integer vector algorithm. The disadvantage is that it +! is not guaranteed to be reproducible (though it is reproducible +! much more often than is the standard floating point algorithm). +! This alternative is used when the optional parameter ddpdd_sum is +! set to .true. It is also used if the integer vector algorithm radix +! assumption does not hold. +! +!------------------------------------------------------------------------ +! +! Arguments +! + integer, intent(in) :: nsummands ! number of local summands + integer, intent(in) :: dsummands ! declared first dimension + integer, intent(in) :: nflds ! number of fields + real(r8), intent(in) :: arr(dsummands,nflds) + ! input array + + real(r8), intent(out):: arr_gsum(nflds) + ! global sums + + logical, intent(in), optional :: ddpdd_sum + ! use ddpdd algorithm instead + ! of integer vector algorithm + + real(r8), intent(in), optional :: arr_gbl_max(nflds) + ! 
upper bound on max(abs(arr)) + + real(r8), intent(out), optional :: arr_gbl_max_out(nflds) + ! calculated upper bound on + ! max(abs(arr)) + + integer, intent(in), optional :: arr_max_levels(nflds) + ! maximum number of levels of + ! integer expansion to use + + integer, intent(out), optional :: arr_max_levels_out(nflds) + ! output of number of levels of + ! integer expansion to used + + integer, intent(in), optional :: gbl_max_nsummands + ! maximum of nsummand over all + ! MPI tasks + + integer, intent(out), optional :: gbl_max_nsummands_out + ! calculated maximum nsummands + ! over all MPI tasks + + integer, intent(in), optional :: gbl_count + ! was total number of summands; + ! now is ignored; use + ! gbl_max_nsummands instead + + logical, intent(in), optional :: repro_sum_validate + ! flag enabling/disabling testing that gmax and max_levels are + ! accurate/sufficient. Default is enabled. + + integer, intent(inout), optional :: repro_sum_stats(6) + ! increment running totals for + ! (1) one-reduction repro_sum + ! (2) two-reduction repro_sum + ! (3) both types in one call + ! (4) nonrepro_sum + ! (5) global max nsummands reduction + ! (6) global lor 3*nflds reduction + + real(r8), intent(out), optional :: rel_diff(2,nflds) + ! relative and absolute + ! differences between integer + ! vector and floating point sums + + integer, intent(in), optional :: commid + ! MPI communicator + +! +! Local workspace +! + logical :: use_ddpdd_sum ! flag indicating whether to + ! use cism_reprosum_ddpdd or not + logical :: recompute ! flag indicating need to + ! determine gmax/gmin before + ! computing sum + logical :: validate ! flag indicating need to + ! verify gmax and max_levels + ! are accurate/sufficient + integer :: gbl_lor_red ! global lor reduction? (0/1) + integer :: gbl_max_red ! global max reduction? (0/1) + integer :: repro_sum_fast ! 1 reduction repro_sum? (0/1) + integer :: repro_sum_slow ! 2 reduction repro_sum? (0/1) + integer :: repro_sum_both ! 
both fast and slow? (0/1) + integer :: nonrepro_sum ! nonrepro_sum? (0/1) + + integer :: omp_nthreads ! number of OpenMP threads + integer :: mpi_comm ! MPI subcommunicator + integer :: mypid ! MPI task ID (COMM_WORLD) + integer :: tasks ! number of MPI tasks + integer :: ierr ! MPI error return + integer :: ifld, isum, ithread ! loop variables + integer :: max_nsummands ! max nsummands over all MPI tasks + ! or threads (used in both ways) + + integer, allocatable :: isum_beg(:), isum_end(:) + ! range of summand indices for each + ! OpenMP thread + integer, allocatable :: arr_tlmin_exp(:,:) + ! per thread local exponent minima + integer, allocatable :: arr_tlmax_exp(:,:) + ! per thread local exponent maxima + integer :: arr_exp, arr_exp_tlmin, arr_exp_tlmax + ! summand exponent and working min/max + integer :: arr_lmin_exp(nflds) ! local exponent minima + integer :: arr_lmax_exp(nflds) ! local exponent maxima + integer :: arr_lextremes(0:nflds,2)! local exponent extrema + integer :: arr_gextremes(0:nflds,2)! global exponent extrema + + integer :: arr_gmax_exp(nflds) ! global exponents maxima + integer :: arr_gmin_exp(nflds) ! global exponents minima + integer :: arr_max_shift ! maximum safe exponent for + ! value < 1 (so that sum does + ! not overflow) + integer :: max_levels(nflds) ! maximum number of levels of + ! integer expansion to use + integer :: max_level ! maximum value in max_levels + integer :: extra_levels ! number of extra levels needed + ! to guarantee that sum over threads + ! or tasks does not cause overflow + + real(r8) :: xmax_nsummands ! real(max_nsummands,r8) + real(r8) :: arr_lsum(nflds) ! local sums + real(r8) :: arr_gsum_fast(nflds) ! global sum calculated using + ! fast, nonreproducible, + ! floating point alg. + real(r8) :: abs_diff ! absolute difference between + ! integer vector and floating point + ! sums + !WHL mod + character(len=64) :: binary_str ! 
string to represent 64 bits of i8 integer + integer :: n + !end WHL mod +#ifdef _OPENMP + integer omp_get_max_threads + external omp_get_max_threads +#endif +! +!------------------------------------------------------------------------ +! +! Initialize local statistics variables + gbl_lor_red = 0 + gbl_max_red = 0 + repro_sum_fast = 0 + repro_sum_slow = 0 + repro_sum_both = 0 + nonrepro_sum = 0 + +! Set MPI communicator + if ( present(commid) ) then + mpi_comm = commid + else + mpi_comm = MPI_COMM_WORLD + endif +#ifndef EAMXX_STANDALONE +!WHL - commented out since the profile mod does not include tbarrier_f +! call t_barrierf('sync_repro_sum',mpi_comm) +#endif + +! Check whether should use cism_reprosum_ddpdd algorithm + use_ddpdd_sum = repro_sum_use_ddpdd + if ( present(ddpdd_sum) ) then + use_ddpdd_sum = ddpdd_sum + endif + +! Check whether intrinsic-based algorithm will work on this system +! (requires floating point and integer bases to be the same) +! If not, always use ddpdd. + use_ddpdd_sum = use_ddpdd_sum .or. (radix(1.0_r8) /= radix(1_i8)) + + if ( use_ddpdd_sum ) then + +#ifndef EAMXX_STANDALONE + call t_startf('cism_reprosum_ddpdd') +#endif + + call cism_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & + nflds, mpi_comm) + repro_sum_fast = 1 + +#ifndef EAMXX_STANDALONE + call t_stopf('cism_reprosum_ddpdd') +#endif + + else + +#ifndef EAMXX_STANDALONE + call t_startf('cism_reprosum_int') +#endif + +! Get number of MPI tasks + call mpi_comm_size(mpi_comm, tasks, ierr) + +! Get number of OpenMP threads +#ifdef _OPENMP + omp_nthreads = omp_get_max_threads() +#else + omp_nthreads = 1 +#endif + +! See if have sufficient information to not require max/min allreduce + recompute = .true. + validate = .false. + if ( present(arr_gbl_max) .and. present(arr_max_levels) ) then + recompute = .false. + +! Setting lower bound on max_level*nflds to be 64 to improve OpenMP +! 
performance for loopb in cism_reprosum_int + max_level = (64/nflds) + 1 + do ifld=1,nflds + if ((arr_gbl_max(ifld) >= 0.0_r8) .and. & + (arr_max_levels(ifld) > 0)) then + + arr_gmax_exp(ifld) = exponent(arr_gbl_max(ifld)) + if (max_level < arr_max_levels(ifld)) & + max_level = arr_max_levels(ifld) + + else + recompute = .true. + endif + enddo + + if (.not. recompute) then + +! Determine maximum number of summands in local phases of the +! algorithm +#ifndef EAMXX_STANDALONE + call t_startf("repro_sum_allr_max") +#endif + if ( present(gbl_max_nsummands) ) then + if (gbl_max_nsummands < 1) then + call mpi_allreduce (nsummands, max_nsummands, 1, & + MPI_INTEGER, MPI_MAX, mpi_comm, ierr) + gbl_max_red = 1 + else + max_nsummands = gbl_max_nsummands + endif + else + call mpi_allreduce (nsummands, max_nsummands, 1, & + MPI_INTEGER, MPI_MAX, mpi_comm, ierr) + gbl_max_red = 1 + endif +#ifndef EAMXX_STANDALONE + call t_stopf("repro_sum_allr_max") +#endif + +! Determine maximum shift. Shift needs to be small enough that summation, +! in absolute value, does not exceed maximum value representable by i8. + +! If requested, return max_nsummands before it is redefined + if ( present( gbl_max_nsummands_out) ) then + gbl_max_nsummands_out = max_nsummands + endif + +! Summing within each thread first (adding 1 to max_nsummands +! to ensure that integer division rounds up) + max_nsummands = (max_nsummands/omp_nthreads) + 1 +! then over threads and tasks + max_nsummands = max(max_nsummands, tasks*omp_nthreads) +! A 'max' is used in the above calculation because the partial sum for +! each thread, calculated in cism_reprosum_int, is postprocessed so that +! each integer in the corresponding vector of integers is reduced in +! magnitude to be less than (radix(1_i8)**arr_max_shift). Therefore, +! the maximum shift can be calculated separately for per thread sums +! and sums over threads and tasks, and the smaller value used. This is +! equivalent to using max_nsummands as defined above. 
+ + xmax_nsummands = real(max_nsummands,r8) + arr_max_shift = digits(1_i8) - (exponent(xmax_nsummands) + 1) + if (arr_max_shift < 2) then + !WHL mod +! call shr_sys_abort('repro_sum failed: number of summands too '// & +! 'large for integer vector algorithm' ) + write(iulog,*) 'repro_sum failed: number of summands too '// & + 'large for integer vector algorithm' + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + ! end WHL mod + endif +! Note: by construction, each floating point value will be decomposed +! into a vector of integers each component of which will be strictly +! less than radix(1_i8)**arr_max_shift in absolute value, and the +! summation of max_nsummands of these, again in absolute value, will +! then be less than +! radix(1_i8)**(arr_max_shift + exponent(xmax_nsummands)) +! or radix(1_i8)**(digits(1_i8) - 1). This is more conservative than +! necessary, but it also allows the postprocessing mentioned above +! (and described later) to proceed without danger of introducing +! overflow. + +! Determine additional number of levels needed to support the +! postprocessing that reduces the magnitude of each component +! of the integer vector of the partial sum for each thread +! to be less than (radix(1_i8)**arr_max_shift). + extra_levels = (digits(1_i8) - 1)/arr_max_shift +! Extra levels are indexed by (-(extra_levels-1):0) +! Derivation of this is described in the comments in +! cism_reprosum_int. + +! Calculate sum + if (present(repro_sum_validate)) then + validate = repro_sum_validate + else + validate = .true. + endif + call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & + nflds, arr_max_shift, arr_gmax_exp, & + arr_max_levels, max_level, extra_levels, & + validate, recompute, & + omp_nthreads, mpi_comm) + +! Record statistics, etc. + repro_sum_fast = 1 + if (recompute) then + repro_sum_both = 1 + else +! 
If requested, return specified levels and upper bounds on maxima + if ( present(arr_max_levels_out) ) then + do ifld=1,nflds + arr_max_levels_out(ifld) = arr_max_levels(ifld) + enddo + endif + if ( present(arr_gbl_max_out) ) then + do ifld=1,nflds + arr_gbl_max_out(ifld) = arr_gbl_max(ifld) + enddo + endif + endif + endif + endif + +! Do not have sufficient information; calculate global max/min and +! use to compute required number of levels + if (recompute) then + +! Record statistic + repro_sum_slow = 1 + +! Determine maximum and minimum (non-zero) summand values and +! maximum number of local summands + +! Allocate thread-specific work space + allocate(arr_tlmax_exp(nflds,omp_nthreads)) + allocate(arr_tlmin_exp(nflds,omp_nthreads)) + allocate(isum_beg(omp_nthreads)) + allocate(isum_end(omp_nthreads)) + +! Split summand index range over OpenMP threads + call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) + +!$omp parallel do & +!$omp default(shared) & +!$omp private(ithread, ifld, isum, arr_exp, arr_exp_tlmin, arr_exp_tlmax) + do ithread=1,omp_nthreads +#ifndef EAMXX_STANDALONE + call t_startf('repro_sum_loopa') +#endif + do ifld=1,nflds + arr_exp_tlmin = MAXEXPONENT(1.0_r8) + arr_exp_tlmax = MINEXPONENT(1.0_r8) + do isum=isum_beg(ithread),isum_end(ithread) + if (arr(isum,ifld) /= 0.0_r8) then + arr_exp = exponent(arr(isum,ifld)) + arr_exp_tlmin = min(arr_exp,arr_exp_tlmin) + arr_exp_tlmax = max(arr_exp,arr_exp_tlmax) + endif + end do + arr_tlmin_exp(ifld,ithread) = arr_exp_tlmin + arr_tlmax_exp(ifld,ithread) = arr_exp_tlmax + end do +#ifndef EAMXX_STANDALONE + call t_stopf('repro_sum_loopa') +#endif + end do + + do ifld=1,nflds + arr_lmax_exp(ifld) = maxval(arr_tlmax_exp(ifld,:)) + arr_lmin_exp(ifld) = minval(arr_tlmin_exp(ifld,:)) + end do + deallocate(arr_tlmin_exp,arr_tlmax_exp,isum_beg,isum_end) + + arr_lextremes(0,:) = -nsummands + arr_lextremes(1:nflds,1) = -arr_lmax_exp(:) + arr_lextremes(1:nflds,2) = arr_lmin_exp(:) +#ifndef EAMXX_STANDALONE 
+ call t_startf("repro_sum_allr_minmax") +#endif + call mpi_allreduce (arr_lextremes, arr_gextremes, 2*(nflds+1), & + MPI_INTEGER, MPI_MIN, mpi_comm, ierr) +#ifndef EAMXX_STANDALONE + call t_stopf("repro_sum_allr_minmax") +#endif + max_nsummands = -arr_gextremes(0,1) + arr_gmax_exp(:) = -arr_gextremes(1:nflds,1) + arr_gmin_exp(:) = arr_gextremes(1:nflds,2) + +! If a field is identically zero, arr_gmin_exp still equals MAXEXPONENT +! and arr_gmax_exp still equals MINEXPONENT. +! In this case, set arr_gmin_exp = arr_gmax_exp = MINEXPONENT + do ifld=1,nflds + arr_gmin_exp(ifld) = min(arr_gmax_exp(ifld),arr_gmin_exp(ifld)) + enddo + +! If requested, return upper bounds on observed maxima + if ( present(arr_gbl_max_out) ) then + do ifld=1,nflds + arr_gbl_max_out(ifld) = scale(1.0_r8,arr_gmax_exp(ifld)) + enddo + endif + +! If requested, return max_nsummands before it is redefined + if ( present( gbl_max_nsummands_out) ) then + gbl_max_nsummands_out = max_nsummands + endif + +! Determine maximum shift (same as in previous branch, but with calculated +! max_nsummands). Shift needs to be small enough that summation, in absolute +! value, does not exceed maximum value representable by i8. + +! Summing within each thread first (adding 1 to max_nsummands +! to ensure that integer division rounds up) + max_nsummands = (max_nsummands/omp_nthreads) + 1 +! then over threads and tasks + max_nsummands = max(max_nsummands, tasks*omp_nthreads) +! A 'max' is used in the above calculation because the partial sum for +! each thread, calculated in cism_reprosum_int, is postprocessed so that +! each integer in the corresponding vector of integers is reduced in +! magnitude to be less than (radix(1_i8)**arr_max_shift). Therefore, +! the maximum shift can be calculated separately for per thread sums +! and sums over threads and tasks, and the smaller value used. This is +! equivalent to using max_nsummands as defined above. 
+ + xmax_nsummands = real(max_nsummands,r8) + arr_max_shift = digits(1_i8) - (exponent(xmax_nsummands) + 1) + if (arr_max_shift < 2) then + !WHL mod +! call shr_sys_abort('repro_sum failed: number of summands too '// & +! 'large for integer vector algorithm' + write(iulog,*) 'repro_sum failed: number of summands too '// & + 'large for integer vector algorithm' + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) + ! end WHL mod + endif +! Note: by construction, each floating point value will be decomposed +! into a vector of integers each component of which will be strictly +! less than radix(1_i8)**arr_max_shift in absolute value, and the +! summation of max_nsummands of these, again in absolute value, will +! then be less than +! radix(1_i8)**(arr_max_shift + exponent(xmax_nsummands)) +! or radix(1_i8)**(digits(1_i8) - 1). This is more conservative than +! necessary, but it also allows the postprocessing mentioned above +! (and described later) to proceed without danger of introducing +! overflow. + +! Determine maximum number of levels required for each field. +! Need enough levels to represent both the smallest and largest +! nonzero summands (in absolute value), and any values in between. +! The number of digits from the most significant digit in the +! largest summand to the most significant digit in the smallest +! summand is (arr_gmax_exp(ifld)-arr_gmin_exp(ifld)), and the maximum +! number of digits needed to represent the smallest value is +! digits(1.0_r8). Divide this total number of digits by the number of +! digits per level (arr_max_shift) to get the number of levels +! ((digits(1.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) / arr_max_shift) +! with some tweaks: +! + 1 because first truncation for any given summand probably does +! not involve a maximal shift (but this adds only one to the total) +! + 1 to guarantee that the integer division rounds up (not down) +! (setting lower bound on max_level*nflds to be 64 to improve OpenMP +! 
performance for loopb in cism_reprosum_int) + max_level = (64/nflds) + 1 + do ifld=1,nflds + max_levels(ifld) = 2 + & + ((digits(1.0_r8) + (arr_gmax_exp(ifld)-arr_gmin_exp(ifld))) & + / arr_max_shift) + if ( present(arr_max_levels) .and. (.not. validate) ) then +! If validate true, then computation with arr_max_levels failed +! previously + if ( arr_max_levels(ifld) > 0 ) then + max_levels(ifld) = & + min(arr_max_levels(ifld),max_levels(ifld)) + endif + endif + if (max_level < max_levels(ifld)) & + max_level = max_levels(ifld) + enddo + +! If requested, return calculated levels + if ( present(arr_max_levels_out) ) then + do ifld=1,nflds + arr_max_levels_out(ifld) = max_levels(ifld) + enddo + endif + +! Determine additional number of levels needed to support the +! postprocessing that reduces the magnitude of each component +! of the integer vector of the partial sum for each thread +! to be less than (radix(1_i8)**arr_max_shift). + extra_levels = (digits(1_i8) - 1)/arr_max_shift +! Extra levels are indexed by (-(extra_levels-1):0) +! Derivation of this is described in the comments in +! cism_reprosum_int. + +! Calculate sum + validate = .false. + call cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, & + nflds, arr_max_shift, arr_gmax_exp, & + max_levels, max_level, extra_levels, & + validate, recompute, & + omp_nthreads, mpi_comm) + + endif + +#ifndef EAMXX_STANDALONE + call t_stopf('cism_reprosum_int') +#endif + + endif + +! Compare integer vector and floating point results + if ( present(rel_diff) ) then + if (cism_reprosum_reldiffmax >= 0.0_r8) then +#ifndef EAMXX_STANDALONE + !WHL - commented out since the profile mod does not include tbarrier_f +! call t_barrierf('sync_nonrepro_sum',mpi_comm) +#endif +#ifndef EAMXX_STANDALONE + call t_startf('nonrepro_sum') +#endif +! Record statistic + nonrepro_sum = 1 +! 
Compute nonreproducible sum + arr_lsum(:) = 0.0_r8 +!$omp parallel do & +!$omp default(shared) & +!$omp private(ifld, isum) + do ifld=1,nflds + do isum=1,nsummands + arr_lsum(ifld) = arr(isum,ifld) + arr_lsum(ifld) + end do + end do + +#ifndef EAMXX_STANDALONE + call t_startf("nonrepro_sum_allr_r8") +#endif + call mpi_allreduce (arr_lsum, arr_gsum_fast, nflds, & + MPI_REAL8, MPI_SUM, mpi_comm, ierr) +#ifndef EAMXX_STANDALONE + call t_stopf("nonrepro_sum_allr_r8") +#endif + +#ifndef EAMXX_STANDALONE + call t_stopf('nonrepro_sum') +#endif + +! Determine differences +!$omp parallel do & +!$omp default(shared) & +!$omp private(ifld, abs_diff) + do ifld=1,nflds + abs_diff = abs(arr_gsum_fast(ifld)-arr_gsum(ifld)) + if (abs(arr_gsum(ifld)) > abs_diff) then + rel_diff(1,ifld) = abs_diff/abs(arr_gsum(ifld)) + else + rel_diff(1,ifld) = abs_diff + endif + rel_diff(2,ifld) = abs_diff + enddo + else + rel_diff(:,:) = 0.0_r8 + endif + endif + +! Return statistics + if ( present(repro_sum_stats) ) then + repro_sum_stats(1) = repro_sum_stats(1) + repro_sum_fast + repro_sum_stats(2) = repro_sum_stats(2) + repro_sum_slow + repro_sum_stats(3) = repro_sum_stats(3) + repro_sum_both + repro_sum_stats(4) = repro_sum_stats(4) + nonrepro_sum + repro_sum_stats(5) = repro_sum_stats(5) + gbl_max_red + repro_sum_stats(6) = repro_sum_stats(6) + gbl_lor_red + endif + + if (verbose_reprosum) then + call mpi_comm_rank(MPI_COMM_WORLD, mypid, ierr) + if (mypid == 0) then + write(iulog,*) 'Exit reprosum, nflds, arr_gsum =', nflds, arr_gsum + endif + endif + + end subroutine cism_reprosum_calc + +! +!======================================================================== +! + + subroutine cism_reprosum_int(arr, arr_gsum, nsummands, dsummands, nflds, & + arr_max_shift, arr_gmax_exp, max_levels, & +! max_level, extra_levels, skip_field, & + max_level, extra_levels, & + validate, recompute, omp_nthreads, mpi_comm ) +!------------------------------------------------------------------------ +! +! 
Purpose: +! Compute the global sum of each field in 'arr' using the indicated +! communicator with a reproducible yet scalable implementation based +! on first converting each floating point summand into an equivalent +! representation using a vector of integers, summing the integer +! vectors, then converting the resulting sum back to a floating point +! representation. The accuracy of the integer vector algorithm is +! controlled by the number of 'levels' of integer expansion, the maximum +! value of which is specified by max_level. +! +!------------------------------------------------------------------------ +! +! Arguments +! + integer, intent(in) :: nsummands ! number of local summands + integer, intent(in) :: dsummands ! declared first dimension + integer, intent(in) :: nflds ! number of fields + integer, intent(in) :: arr_max_shift ! maximum safe exponent for + ! value < 1 (so that sum + ! does not overflow) + integer, intent(in) :: arr_gmax_exp(nflds) + ! exponents of global maxima + integer, intent(in) :: max_levels(nflds) + ! maximum number of levels + ! of integer expansion + integer, intent(in) :: max_level ! maximum value in + ! max_levels + integer, intent(in) :: extra_levels ! number of extra levels + ! needed to guarantee that + ! sum over threads or tasks + ! does not cause overflow + integer, intent(in) :: omp_nthreads ! number of OpenMP threads + integer, intent(in) :: mpi_comm ! MPI subcommunicator + + real(r8), intent(in) :: arr(dsummands,nflds) + ! input array + + logical, intent(in) :: validate + ! flag indicating that accuracy of solution generated from + ! arr_gmax_exp and max_levels should be tested + + logical, intent(out):: recompute + ! flag indicating that either the upper bounds are inaccurate, + ! or max_levels and arr_gmax_exp do not generate accurate + ! enough sums + + real(r8), intent(out):: arr_gsum(nflds) ! global sums +! +! Local workspace +! 
+ integer, parameter :: max_svlevel_factor = & + 1 + (digits(1_i8)/digits(1.0_r8)) + + integer(i8) :: i8_arr_tlsum_level(-(extra_levels-1):max_level,nflds,omp_nthreads) + ! integer vector representing local + ! sum (per thread, per field) + integer(i8) :: i8_arr_lsum_level((max_level+extra_levels+2)*nflds) + ! integer vector representing local + ! sum + integer(i8) :: i8_arr_level ! integer part of summand for current + ! expansion level + integer(i8) :: i8_arr_gsum_level((max_level+extra_levels+2)*nflds) + ! integer vector representing global + ! sum + integer(i8) :: i8_gsum_level(-(extra_levels-1):max_level) + ! integer vector representing global + ! sum for one field + integer(i8) :: IX_8 ! integer representation of r8 value + integer(i8) :: i8_sign ! sign global sum + integer(i8) :: i8_radix ! radix for i8 variables (and r8 + ! variables by earlier if-test) + + integer :: max_error(nflds,omp_nthreads) + ! accurate upper bound on data? + integer :: not_exact(nflds,omp_nthreads) + ! max_levels sufficient to + ! capture all digits? + integer :: isum_beg(omp_nthreads), isum_end(omp_nthreads) + ! range of summand indices for each + ! OpenMP thread + integer :: ifld, isum, ithread, jlevel + ! loop variables + integer :: arr_exp ! exponent of summand + integer :: arr_shift ! exponent used to generate integer + ! for current expansion level + integer :: ilevel ! current integer expansion level + integer :: offset(nflds) ! beginning location in + ! i8_arr_{g,l}sum_level for integer + ! expansion of current ifld + integer :: voffset ! modification to offset used to + ! include validation metrics + integer :: min_level ! index of minimum levels (including + ! extra levels) for i8_arr_tlsum_level + integer :: ioffset ! offset(ifld) + integer :: svlevel ! number of summands in summand_vector + integer :: ierr ! MPI error return + integer :: LX ! exponent of X_8 (see below) + integer :: veclth ! total length of i8_arr_lsum_level + integer :: i8_digit_count ! 
number of digits in integer + ! expansion of sum + integer :: i8_begin_level ! level starting from in + ! creating next 'exactly representable' + ! floating point value from modified + ! integer expansion of the sum + integer :: i8_trunc_level ! level at which the number of digits in + ! the modified integer expansion of the + ! sum exceeds the number of representable + ! digits in the floating point sum + integer :: i8_trunc_loc ! location of last digit at i8_trunc_level + ! in the modified integer expansion of the + ! sum that is representable in the floating + ! point sum + integer(i8) :: i8_trunc_level_rem + ! truncated digits at i8_trunc_level + ! in the modified integer expansion + ! of the sum + integer :: curr_exp ! exponent of partial sum during + ! reconstruction from integer vector + integer :: corr_exp ! exponent of current summand in + ! reconstruction from integer vector + + real(r8) :: arr_frac ! fraction of summand + real(r8) :: arr_remainder ! part of summand remaining after + ! current level of integer expansion + real(r8) :: X_8 ! r8 representation of current + ! i8_arr_gsum_level + real(r8) :: RX_8 ! r8 representation of (other) + ! integers used in calculation. + real(r8) :: summand_vector((max_level+extra_levels)*max_svlevel_factor) + ! vector of r8 values generated from + ! integer vector representation to be + ! summed to generate global sum + + logical :: first_stepd_iteration + ! flag used to indicate whether first + ! time through process of converting + ! vector of integers into a floating + ! point value, as it requires + ! special logic +! +!------------------------------------------------------------------------ +! Save radix of i8 variables in an i8 variable + i8_radix = radix(IX_8) + +! If validating upper bounds, reserve space for validation metrics +! In both cases, reserve extra levels for overflows from the top level + if (validate) then + voffset = extra_levels + 2 + else + voffset = extra_levels + endif + +! 
For convenience, define minimum level index for i8_arr_tlsum_level + min_level = -(extra_levels-1) + +! Compute offsets for each field + offset(1) = voffset + do ifld=2,nflds + offset(ifld) = offset(ifld-1) & + + (max_levels(ifld-1) + voffset) + enddo + veclth = offset(nflds) + max_levels(nflds) + +! Split summand index range over OpenMP threads + call split_indices(nsummands, omp_nthreads, isum_beg, isum_end) + +! Convert local summands to vector of integers and sum +! (Using scale instead of set_exponent because arr_remainder may not be +! 'normal' after level 1 calculation) + i8_arr_lsum_level(:) = 0_i8 + +!$omp parallel do & +!$omp default(shared) & +!$omp private(ithread, ifld, ioffset, isum, arr_frac, arr_exp, & +!$omp arr_shift, ilevel, i8_arr_level, arr_remainder, RX_8, IX_8) + do ithread=1,omp_nthreads +#ifndef EAMXX_STANDALONE + call t_startf('repro_sum_loopb') +#endif + do ifld=1,nflds + ioffset = offset(ifld) + + max_error(ifld,ithread) = 0 + not_exact(ifld,ithread) = 0 + i8_arr_tlsum_level(:,ifld,ithread) = 0_i8 + + do isum=isum_beg(ithread),isum_end(ithread) + arr_remainder = 0.0_r8 + + if (arr(isum,ifld) /= 0.0_r8) then + arr_exp = exponent(arr(isum,ifld)) + arr_frac = fraction(arr(isum,ifld)) + +! Test that global maximum upper bound is an upper bound + if (arr_exp > arr_gmax_exp(ifld)) then + max_error(ifld,ithread) = 1 + exit + endif + +! Calculate first shift + arr_shift = arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) + +! Determine first (probably) nonzero level (assuming initial fraction is +! 'normal' - algorithm still works if this is not true) +! NOTE: this is critical; scale will set to zero if min exponent is too small. + if (arr_shift < 1) then + ilevel = (1 + (arr_gmax_exp(ifld)-arr_exp))/arr_max_shift + arr_shift = ilevel*arr_max_shift - (arr_gmax_exp(ifld)-arr_exp) + + do while (arr_shift < 1) + arr_shift = arr_shift + arr_max_shift + ilevel = ilevel + 1 + enddo + else + ilevel = 1 + endif + + if (ilevel <= max_levels(ifld)) then +! 
Apply first shift/truncate, add it to the relevant running +! sum, and calculate the remainder. + arr_remainder = scale(arr_frac,arr_shift) + i8_arr_level = int(arr_remainder,i8) + i8_arr_tlsum_level(ilevel,ifld,ithread) = & + i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level + arr_remainder = arr_remainder - i8_arr_level + +! While the remainder is non-zero, continue to shift, truncate, +! sum, and calculate new remainder + do while ((arr_remainder /= 0.0_r8) & + .and. (ilevel < max_levels(ifld))) + ilevel = ilevel + 1 + arr_remainder = scale(arr_remainder,arr_max_shift) + i8_arr_level = int(arr_remainder,i8) + i8_arr_tlsum_level(ilevel,ifld,ithread) = & + i8_arr_tlsum_level(ilevel,ifld,ithread) + i8_arr_level + arr_remainder = arr_remainder - i8_arr_level + enddo + + endif + endif + + if (arr_remainder /= 0.0_r8) then + not_exact(ifld,ithread) = 1 + endif + + enddo +! Postprocess integer vector to eliminate possibility of overflow +! during subsequent sum over threads and tasks, as per earlier +! comment on logic behind definition of max_nsummands. If value at a +! given level is larger than or equal to +! (radix(1_i8)**arr_max_shift), subtract this 'overlap' from the +! current value and add it (appropriately shifted) to the value at +! the next smaller level in the vector. +! (a) As described earlier, prior to this postprocessing the integer +! components are each strictly less than +! radix(1_i8)**(digits(1_i8) - 1) in absolute value. So, after +! shifting, the absolute value of the amount added to level +! max_levels(ifld)-1 from level max_levels(ifld) is less than +! radix(1_i8)**(digits(1_i8) - 1 - arr_max_shift) with the +! resulting sum, in absolute value, being less than +! (radix(1_i8)**(digits(1_i8) - 1))*(1 + radix(1_i8)**(-arr_max_shift)). +! Any overlap from this component is then added to the level +! max_levels(ifld)-2, etc., with resulting intermediate sums, in +! absolute value, for levels 1 to max_levels(ifld) being bounded +! 
from above by +! (radix(1_i8)**(digits(1_i8) - 1))*sum{i=0,inf}(radix(1_i8)**(-i*arr_max_shift)). +! Since radix(1_i8) >= 2 and arr_max_shift is also required to be +! >= 2 (otherwise the code exits with an error) this is less than +! or equal to +! (radix(1_i8)**(digits(1_i8) - 1))*sum{i=0,inf}(2**(-2i)), +! or +! (radix(1_i8)**(digits(1_i8) - 1))*(4/3). +! In summary, this shows that no absolute value generated during +! this process will exceed the maximum value representable in i8, +! i.e. (radix(1_i8)**(digits(1_i8)) - 1), as long as +! digits(1_i8) >= 2. +! (b) 'ilevel==0,...,-(extra_levels-1)' correspond to extra levels +! used to continue the above process until values at all levels +! are less than radix(1_i8)**arr_max_shift in absolute value +! (except level -(extra_levels-1), as described below). The +! result of shifting the overlap from level 1 to level 0, which +! is initially zero, is bounded in absolute value by +! (radix(1_i8)**(digits(1_i8) - 1 - arr_max_shift))*(4/3). +! After removing any overlap from level 0, the upper bound for +! level -1, which is also initially zero, is +! (radix(1_i8)**(digits(1_i8) - 1 - 2*arr_max_shift))*(4/3). +! Continuing the process, when we get to level -(extra_levels-1), +! the upper bound is +! (radix(1_i8)**(digits(1_i8) - 1 - extra_levels*arr_max_shift))*(4/3). +! If we define +! extra_levels = ceiling[(digits(1_i8) - 1)/arr_max_shift - 1] +! then the upper bound is +! (radix(1_i8)**(arr_max_shift))*(4/3). +! Setting +! extra_levels = (digits(1_i8) - 1)/arr_max_shift +! is then a slightly conservative estimate that achieves the same +! upper bound. While the above upper bound at level +! -(extra_levels-1) is a factor of (4/3) larger than the target +! radix(1_i8)**arr_max_shift, it is still small enough so that +! the sum over threads and tasks, bounded from above in absolute +! value by +! (radix(1_i8)**(digits(1_i8) - 1))*(4/3), +! will not cause an overflow at level -(extra_levels-1) as long as +! 
digits(1_i8) >= 2. + do ilevel=max_levels(ifld),min_level+1,-1 + if (abs(i8_arr_tlsum_level(ilevel,ifld,ithread)) >= & + (i8_radix**arr_max_shift)) then + + IX_8 = i8_arr_tlsum_level(ilevel,ifld,ithread) & + / (i8_radix**arr_max_shift) + i8_arr_tlsum_level(ilevel-1,ifld,ithread) = & + i8_arr_tlsum_level(ilevel-1,ifld,ithread) + IX_8 + + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_arr_tlsum_level(ilevel,ifld,ithread) = & + i8_arr_tlsum_level(ilevel,ifld,ithread) - IX_8 + endif + enddo + enddo +#ifndef EAMXX_STANDALONE + call t_stopf('repro_sum_loopb') +#endif + enddo + +! Sum contributions from different threads + do ifld=1,nflds + ioffset = offset(ifld) + do ithread = 1,omp_nthreads + do ilevel = min_level,max_levels(ifld) + i8_arr_lsum_level(ioffset+ilevel) = & + i8_arr_lsum_level(ioffset+ilevel) & + + i8_arr_tlsum_level(ilevel,ifld,ithread) + enddo + enddo + enddo + +! Record if upper bound was inaccurate or if level expansion stopped +! before full accuracy was achieved + if (validate) then + do ifld=1,nflds + ioffset = offset(ifld) + i8_arr_lsum_level(ioffset-voffset+1) = maxval(max_error(ifld,:)) + i8_arr_lsum_level(ioffset-voffset+2) = maxval(not_exact(ifld,:)) + enddo + endif + +! Sum integer vector element-wise +#if ( defined noI8 ) + ! Workaround for when shr_kind_i8 is not supported. +#ifndef EAMXX_STANDALONE + call t_startf("repro_sum_allr_i4") +#endif + call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & + veclth, MPI_INTEGER, MPI_SUM, mpi_comm, ierr) +#ifndef EAMXX_STANDALONE + call t_stopf("repro_sum_allr_i4") +#endif +#else +#ifndef EAMXX_STANDALONE + call t_startf("repro_sum_allr_i8") +#endif + call mpi_allreduce (i8_arr_lsum_level, i8_arr_gsum_level, & + veclth, MPI_INTEGER8, MPI_SUM, mpi_comm, ierr) +#ifndef EAMXX_STANDALONE + call t_stopf("repro_sum_allr_i8") +#endif +#endif + +#ifndef EAMXX_STANDALONE + call t_startf('repro_sum_finalsum') +#endif +! Construct global sum from integer vector representation: +! 
1) arr_max_shift is the shift applied to fraction(arr_gmax) . +! When shifting back, need to 'add back in' the true arr_gmax exponent. +! This was removed implicitly by working only with the fraction. +! 2) To avoid the possibility of catastrophic cancellation, and +! an unacceptable floating point rounding error, can do some arithmetic +! with the integer vector so that all components have the same sign. +! 3) If convert each integer in the integer vector to a floating +! point value and then add these together, smallest to largest, to +! calculate the final sum, there may be roundoff error in the least +! significant digit. This error will be sensitive to the particular +! floating values generated from the integer vector, and so will be +! sensitive to the number of levels in the vector and the implicit +! exponent associated with each level. So this approach is not +! guaranteed to be reproducible with respect to a change in the +! number of MPI tasks and OpenMP threads (as this changes the +! definition of max_nsummands, and thus also arr_max_shift). It is +! also not guaranteed to be reproducible with respect to changing +! the integer size, e.g. from i8 to i4, as this also changes +! arr_max_shift. However, can eliminate this potential loss of +! reproducibility by taking the following steps. +! a) Manipulate the integer vector so that +! i) the component values do not 'overlap', that is, the value +! represented by a component is strictly less than the value +! represented by the least significant digit in the previous +! component, and +! ii) all components are positive (saving the sign to be restored +! to the final result). +! b) Identify the digit in the resulting integer vector that is the +! last representable in the floating point representation, then +! truncate the vector at this point, i.e., all digits of lesser +! significance in the given component and all components +! representing digits of lesser significance (call this the +! remainder). +! 
c) Convert each integer component in the modified integer vector +! to its corresponding floating point value and sum the +! sequence. (Order is unimportant, as explained below, but here +! add largest to smallest.) +! d) Repeat (b) and (c) for the remainder (recursively, as +! necessary). +! e) Sum all floating point numbers generated by step (c), smallest +! to largest. +! f) Restore the sign. +! With the manipulations in (a) and (b), the summation in (c) is +! equivalent to concatenating the digits in the mantissas for the +! component summands, so rounding is irrelevant (so far). Repeating +! this with the remainder(s) generates a sequence of 'exact' +! floating point numbers. Summing these can still generate a +! rounding error in the least significant digit in the largest +! floating point value (which is the last representable digit in the +! final result), but the floating point values being summed and +! order of summation are independent of the number of levels and +! implicit exponents, so reproducibility is ensured. +! +! Note that assignment of an i8 integer value to an r8 floating point +! variable in step (c) can lead to a loss of accuracy because the +! maximum number of digits in the i8 integer can be greater than the +! maximum number of digits representable in the r8 variable (if the +! xmax_nsummands correction is not very large). With the same sign +! and nonoverlapping properties of the integer components, these lost +! digits will also not be representable in the final sum. The process +! described above of truncating at this last representable digit, and +! then separately generating floating point value(s) for the +! remainder, takes care of this automatically. Similar reasoning +! applies to r4 floating point values with either i8 or i4 integer +! components. + + recompute = .false. + do ifld=1,nflds + arr_gsum(ifld) = 0.0_r8 + ioffset = offset(ifld) + svlevel = 0 + +! If validate is .true., test whether the summand upper bound +! 
was exceeded on any of the MPI tasks + if (validate) then + if (i8_arr_gsum_level(ioffset-voffset+1) /= 0_i8) then + recompute = .true. + endif + endif + + if (.not. recompute) then +! Copy integer vector for current field from i8_arr_gsum_level, so that +! can be modified without changing i8_arr_gsum_level. (Preserving +! i8_arr_gsum_level unchanged is not necessary, but is convenient for debugging +! and makes indexing clearer and less error prone.) + i8_gsum_level(:) = 0_i8 + do ilevel=min_level,max_levels(ifld) + i8_gsum_level(ilevel) = i8_arr_gsum_level(ioffset+ilevel) + enddo + +! Preprocess integer vector (as described in 3(a) above): +! i) If value larger than or equal to (radix(1_i8)**arr_max_shift), +! add this 'overlap' to the value at the next smaller level +! in the vector, resulting in nonoverlapping ranges for each +! component. +! +! As before, no intermediate sums for levels +! max_levels(ifld) to -(extra_levels-2), in absolute value, +! will exceed the maximum value representable in i8, but the +! upper bound on the final sum, in absolute value, at +! level -(extra_levels-1) is now +! (radix(1_i8)**(digits(1_i8) - 1))*(4/3) + +! + sum{i=1,inf}(radix(1_i8)**(-i*arr_max_shift)) +! = (radix(1_i8)**(digits(1_i8) - 1))* +! ((4/3) + sum{i=1,inf}(radix(1_i8)**(-i*arr_max_shift))). +! which is less than or equal to +! (radix(1_i8)**(digits(1_i8) - 1))*((4/3) + (1/3)) +! or +! (radix(1_i8)**(digits(1_i8) - 1))*(5/3) +! which will not cause an overflow at level -(extra_levels-1) +! as long as digits(1_i8) >= 3. +! +! Since the exponents associated with each successive level +! differ by arr_max_shift, monotonically decreasing with +! increasing level, the absolute value at each level after this +! preprocessing is strictly less than what can be represented at +! the next lower level (larger exponent). If nonzero, it is also +! strictly greater than what is represented at the next higher +! level (smaller exponent). Note that the smallest level, +! 
-(extra_levels-1), does not have to be less than +! (radix(1_i8)**arr_max_shift) for this 'nonoverlap' property to +! hold. + do ilevel=max_levels(ifld),min_level+1,-1 + if (abs(i8_gsum_level(ilevel)) >= & + (i8_radix**arr_max_shift)) then + + IX_8 = i8_gsum_level(ilevel) & + / (i8_radix**arr_max_shift) + i8_gsum_level(ilevel-1) = & + i8_gsum_level(ilevel-1) + IX_8 + + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_gsum_level(ilevel) = & + i8_gsum_level(ilevel) - IX_8 + endif + enddo + +! ii) Working consecutively from the first level with a nonzero value +! up to level max_levels(ifld), subtract +/- 1 from level with +! larger exponent (e.g., ilevel) and add +/- +! (i8_radix**arr_max_shift) to level with smaller exponent +! (ilevel+1), when necessary, so that the value at ilevel+1 +! has the same sign as the value at ilevel. Treat a zero value at +! ilevel+1 as always a different sign from the value at ilevel so +! that the process always makes this nonzero. (Otherwise, the +! wrong sign could be reintroduced by subtracting from a zero +! value at the next step.) When finished with the process values +! at all levels are either greater than or equal to zero or all +! are less than or equal to zero. Note that this can decrease +! (but not increase) the absolute value at level +! -(extra_levels-1) by 1. All other levels are now less than or +! equal to (radix(1_i8)**arr_max_shift) in absolute value rather +! than strictly less than. + ilevel = min_level + do while ((i8_gsum_level(ilevel) == 0_i8) & + .and. (ilevel < max_levels(ifld))) + ilevel = ilevel + 1 + enddo +! + if (i8_gsum_level(ilevel) < 0_i8) then + i8_sign = -1_i8 + else + i8_sign = 1_i8 + endif +! + if (ilevel < max_levels(ifld)) then + do jlevel=ilevel,max_levels(ifld)-1 + if ((sign(1_i8,i8_gsum_level(jlevel)) & + /= sign(1_i8,i8_gsum_level(jlevel+1)))& + .or. 
(i8_gsum_level(jlevel+1) == 0_i8)) then + i8_gsum_level(jlevel) = & + i8_gsum_level(jlevel) - i8_sign + i8_gsum_level(jlevel+1) = & + i8_gsum_level(jlevel+1) & + + i8_sign*(i8_radix**arr_max_shift) + endif + enddo + endif + +! iii) If 'same sign' is negative, then change to positive +! temporarily. + if (i8_sign < 0_i8) then + do jlevel=ilevel,max_levels(ifld) + i8_gsum_level(jlevel) = -i8_gsum_level(jlevel) + enddo + endif + +! iv) Nonoverlap property can be lost after imposition of same sign +! over components. Reintroduce this property (retaining same sign +! property). Note that carryover is never more than '1' to the +! next smaller level, so, again, no intermediate or final sums +! will exceed the maximum value representable in i8, including +! level -(extra_levels-1) as long as digits(1_i8) >= 4. + do ilevel=max_levels(ifld),min_level+1,-1 + if (abs(i8_gsum_level(ilevel)) >= & + (i8_radix**arr_max_shift)) then + + IX_8 = i8_gsum_level(ilevel)/ & + (i8_radix**arr_max_shift) + i8_gsum_level(ilevel-1) = & + i8_gsum_level(ilevel-1) + IX_8 + + IX_8 = IX_8*(i8_radix**arr_max_shift) + i8_gsum_level(ilevel) = & + i8_gsum_level(ilevel) - IX_8 + endif + enddo + +! Step 3(d): iterate over steps 3(b) and 3(c), truncating integer +! vector to 'fit' into a floating point value, then repeating with +! remainder + first_stepd_iteration = .true. + arr_shift = arr_gmax_exp(ifld) - (min_level)*arr_max_shift + i8_digit_count = 0 + i8_begin_level = min_level + do while (i8_begin_level <= max_levels(ifld)) + +! Determine at which level the total number of integer digits equals +! or exceeds the number of digits representable in the floating point +! sum. Then determine which digit at this level is the last +! representable in the floating point sum. Note that this location +! (i8_trunc_loc) is zero-based, i.e. smallest digit is at location +! 0. Note that the exponent is a count of the number of digits for the +! first nonzero level. 
All subsequent levels contribute arr_max_shift +! digits. + i8_trunc_loc = 0 + i8_trunc_level = max_levels(ifld) + do ilevel=i8_begin_level,max_levels(ifld) + if (first_stepd_iteration) then +! Special logic for first time through. Subsequent iterations treat +! leading zeroes as significant. + if (i8_digit_count == 0) then + if (i8_gsum_level(ilevel) /= 0_i8) then + X_8 = i8_gsum_level(ilevel) + LX = exponent(X_8) +! Note that even if i8_gsum_level(ilevel) is truncated when assigned +! to X_8, the exponent LX will still capture the original number of +! digits. + else + LX = 0 + endif + else + LX = arr_max_shift + endif + else +! If i8_digit_count /= 0 during the first iteration +! (ilevel == i8_begin_level), then there is a remainder left at the +! previous i8_trunc_level and LX should be set to zero for this +! iteration. + if ((ilevel == i8_begin_level) .and. (i8_digit_count /= 0)) then + LX = 0 + else + LX = arr_max_shift + endif + endif + if (i8_digit_count + LX >= digits(1.0_r8)) then + i8_trunc_level = ilevel + i8_trunc_loc = (i8_digit_count + LX) - digits(1.0_r8) + exit + else + i8_digit_count = i8_digit_count + LX + endif + enddo + first_stepd_iteration = .false. + +! Truncate at i8_trunc_loc as needed and determine what the remainder +! is. + if (i8_trunc_loc == 0) then +! No truncation is necessary, and remainder is just the components +! for the remaining levels + i8_trunc_level_rem = 0 + else +! Shift right to identify the digits to be preserved and truncate +! there + IX_8 = i8_gsum_level(i8_trunc_level)/ & + (i8_radix**i8_trunc_loc) +! Shift left to put digits in the correct location (right fill with +! zeroes) + IX_8 = IX_8*(i8_radix**i8_trunc_loc) +! Calculate local remainder + i8_trunc_level_rem = (i8_gsum_level(i8_trunc_level) - IX_8) +! Update level with the truncated value + i8_gsum_level(i8_trunc_level) = IX_8 + endif + +! Calculate floating point value corresponding to modified integer +! vector. 
Note that, by construction, i8 integer value will fit into +! r8 floating point value, so do not need to test for this. + svlevel = svlevel + 1 + summand_vector(svlevel) = 0.0_r8 + do ilevel=i8_begin_level,i8_trunc_level + if (i8_gsum_level(ilevel) /= 0_i8) then + +! Convert integer to floating point representation + X_8 = i8_gsum_level(ilevel) + LX = exponent(X_8) + +! Add to vector of floating point summands, scaling first if exponent +! is too small to apply directly + curr_exp = LX + arr_shift + if (curr_exp >= MINEXPONENT(1.0_r8)) then + summand_vector(svlevel) = & + summand_vector(svlevel) + set_exponent(X_8,curr_exp) + else + RX_8 = set_exponent(X_8, & + curr_exp-MINEXPONENT(1.0_r8)) + summand_vector(svlevel) = & + summand_vector(svlevel) + scale(RX_8,MINEXPONENT(1.0_r8)) + endif + + endif + +! Note that same arr_shift should be used for next 'step 3(d)' +! iteration if i8_trunc_loc > 0. + if ((ilevel < i8_trunc_level) .or. (i8_trunc_loc == 0)) then + arr_shift = arr_shift - arr_max_shift + endif + + enddo + + if (i8_trunc_loc == 0) then + i8_digit_count = 0 + i8_begin_level = i8_trunc_level + 1 + else + i8_digit_count = i8_trunc_loc + i8_begin_level = i8_trunc_level + i8_gsum_level(i8_trunc_level) = i8_trunc_level_rem + endif + + enddo + +! Step 3(e): sum vector of floating point values, smallest to largest + arr_gsum(ifld) = 0.0_r8 + do jlevel=svlevel,1,-1 + arr_gsum(ifld) = arr_gsum(ifld) + summand_vector(jlevel) + enddo + +! Step 3(f): restore the sign + arr_gsum(ifld) = i8_sign*arr_gsum(ifld) + +! If validate is .true. and some precision lost, test whether 'too +! much' was lost, due to too loose an upper bound, too stringent a +! limit on number of levels of expansion, cancellation, ... +! Calculated by comparing lower bound on number of significant digits +! with number of digits in 1.0_r8 . + if (validate) then + if (i8_arr_gsum_level(ioffset-voffset+2) /= 0_i8) then + +! Find first nonzero level and use exponent for this level, then +! 
assume all subsequent levels contribute arr_max_shift digits. + i8_digit_count = 0 + do ilevel=min_level,max_levels(ifld) + if (i8_digit_count == 0) then + if (i8_arr_gsum_level(ioffset+ilevel) /= 0_i8) then + X_8 = i8_arr_gsum_level(ioffset+ilevel) + LX = exponent(X_8) + i8_digit_count = LX + endif + else + i8_digit_count = i8_digit_count + arr_max_shift + endif + enddo + + if (i8_digit_count < digits(1.0_r8)) then + recompute = .true. + endif + endif + endif + + endif + + enddo +#ifndef EAMXX_STANDALONE + call t_stopf('repro_sum_finalsum') +#endif + + end subroutine cism_reprosum_int + +! +!======================================================================== +! + + logical function cism_reprosum_tolExceeded(name, nflds, master, & + logunit, rel_diff ) +!------------------------------------------------------------------------ +! +! Purpose: +! Test whether distributed sum exceeds tolerance and print out a +! warning message. +! +!------------------------------------------------------------------------ +! +! Arguments +! + character(len=*), intent(in) :: name ! distributed sum identifier + integer, intent(in) :: nflds ! number of fields + logical, intent(in) :: master ! MPI task that will write + ! warning messages? + integer, optional, intent(in) :: logunit! unit warning messages + ! written to + real(r8), intent(in) :: rel_diff(2,nflds) + ! relative and absolute + ! differences between integer + ! vector and floating point sums + +! +! Local workspace +! + integer :: llogunit ! local log unit + integer :: ifld ! field index + integer :: exceeds_limit ! number of fields whose + ! sum exceeds tolerance + real(r8) :: max_rel_diff ! maximum relative difference + integer :: max_rel_diff_idx ! field index for max. rel. diff. + real(r8) :: max_abs_diff ! maximum absolute difference + integer :: max_abs_diff_idx ! field index for max. abs. diff. +! +!------------------------------------------------------------------------ +! + cism_reprosum_tolExceeded = .false. 
+ if (cism_reprosum_reldiffmax < 0.0_r8) return + + if ( present(logunit) ) then + llogunit = logunit + else +!! llogunit = s_logunit + llogunit = iulog + endif + +! Check that 'fast' reproducible sum is accurate enough. + exceeds_limit = 0 + max_rel_diff = 0.0_r8 + max_abs_diff = 0.0_r8 + max_rel_diff_idx = 0 + do ifld=1,nflds + if (rel_diff(1,ifld) > cism_reprosum_reldiffmax) then + exceeds_limit = exceeds_limit + 1 + if (rel_diff(1,ifld) > max_rel_diff) then + max_rel_diff = rel_diff(1,ifld) + max_rel_diff_idx = ifld + endif + if (rel_diff(2,ifld) > max_abs_diff) then + max_abs_diff = rel_diff(2,ifld) + max_abs_diff_idx = ifld + endif + endif + enddo + + if (exceeds_limit > 0) then + if (master) then + write(llogunit,*) trim(name), & + ': difference between integer vector and floating point sums ', & + ' exceeds tolerance in ', exceeds_limit, & + ' fields.' + write(llogunit,*) ' Maximum relative diff: (rel)', & + rel_diff(1,max_rel_diff_idx), ' (abs) ', & + rel_diff(2,max_rel_diff_idx) + write(llogunit,*) ' Maximum absolute diff: (rel)', & + rel_diff(1,max_abs_diff_idx), ' (abs) ', & + rel_diff(2,max_abs_diff_idx) + endif + cism_reprosum_tolExceeded = .true. + endif + + end function cism_reprosum_tolExceeded + +! +!======================================================================== +! + + subroutine cism_reprosum_ddpdd(arr, arr_gsum, nsummands, dsummands, & + nflds, mpi_comm ) +!------------------------------------------------------------------------ +! +! Purpose: +! Compute the global sum of each field in 'arr' using the indicated +! communicator with a reproducible yet scalable implementation based +! on He and Ding's implementation of the double-double algorithm. +! +!------------------------------------------------------------------------ +! +! Arguments +! + integer, intent(in) :: nsummands ! number of local summands + integer, intent(in) :: dsummands ! declared first dimension + integer, intent(in) :: nflds ! 
number of fields + real(r8), intent(in) :: arr(dsummands,nflds) + ! input array + integer, intent(in) :: mpi_comm ! MPI subcommunicator + + real(r8), intent(out):: arr_gsum(nflds) + ! global sums + +! +! Local workspace +! + integer :: old_cw ! for x86 processors, save + ! current arithmetic mode + integer :: ifld, isum ! loop variables + integer :: ierr ! MPI error return + + real(r8) :: e, t1, t2 ! temporaries + complex(r8) :: arr_lsum_dd(nflds) ! local sums (in double-double + ! format) + complex(r8) :: arr_gsum_dd(nflds) ! global sums (in double-double + ! format) + + integer, save :: mpi_sumdd + logical, save :: first_time = .true. + +! +!------------------------------------------------------------------------ +! + !WHL - commented out +! call cism_reprosumx86_fix_start (old_cw) + + if (first_time) then + call mpi_op_create(ddpdd, .true., mpi_sumdd, ierr) + first_time = .false. + endif + + do ifld=1,nflds + arr_lsum_dd(ifld) = (0.0_r8,0.0_r8) + + do isum=1,nsummands + +! Compute arr(isum,ifld) + arr_lsum_dd(ifld) using Knuth''s trick. + t1 = arr(isum,ifld) + real(arr_lsum_dd(ifld)) + e = t1 - arr(isum,ifld) + t2 = ((real(arr_lsum_dd(ifld)) - e) & + + (arr(isum,ifld) - (t1 - e))) & + + aimag(arr_lsum_dd(ifld)) + + ! The result is t1 + t2, after normalization. + arr_lsum_dd(ifld) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) + enddo + + enddo + +#ifndef EAMXX_STANDALONE + call t_startf("repro_sum_allr_c16") +#endif + call mpi_allreduce (arr_lsum_dd, arr_gsum_dd, nflds, & + MPI_COMPLEX16, mpi_sumdd, mpi_comm, ierr) +#ifndef EAMXX_STANDALONE + call t_stopf("repro_sum_allr_c16") +#endif + + do ifld=1,nflds + arr_gsum(ifld) = real(arr_gsum_dd(ifld)) + enddo + + !WHL - commented out +! call cism_reprosumx86_fix_end (old_cw) + + end subroutine cism_reprosum_ddpdd +! +!------------------------------------------------------------------------ +! + subroutine DDPDD (dda, ddb, len, itype) +!------------------------------------------------------------------------ +! +! 
Purpose: +! Modification of original codes written by David H. Bailey +! This subroutine computes ddb(i) = dda(i)+ddb(i) +! +!------------------------------------------------------------------------ +! +! Arguments +! + integer, intent(in) :: len ! array length + complex(r8), intent(in) :: dda(len) ! input + complex(r8), intent(inout) :: ddb(len) ! result + integer, intent(in) :: itype ! unused +! +! Local workspace +! + real(r8) e, t1, t2 + integer i +! +!------------------------------------------------------------------------ +! + do i = 1, len + +! Compute dda + ddb using Knuth's trick. + t1 = real(dda(i)) + real(ddb(i)) + e = t1 - real(dda(i)) + t2 = ((real(ddb(i)) - e) + (real(dda(i)) - (t1 - e))) & + + aimag(dda(i)) + aimag(ddb(i)) + +! The result is t1 + t2, after normalization. + ddb(i) = cmplx ( t1 + t2, t2 - ((t1 + t2) - t1), r8 ) + enddo + + end subroutine DDPDD +! +!------------------------------------------------------------------------ +! + subroutine split_indices(total,num_pieces,ibeg,iend) +!------------------------------------------------------------------------ +! +! Purpose: +! Split range into 'num_pieces' +! +!------------------------------------------------------------------------ +! +! Arguments +! + integer, intent(in) :: total + integer, intent(in) :: num_pieces + integer, intent(out) :: ibeg(num_pieces), iend(num_pieces) +! +! Local workspace +! + integer :: itmp1, itmp2, ioffset, i +! +!------------------------------------------------------------------------ +! + itmp1 = total/num_pieces + itmp2 = mod(total,num_pieces) + ioffset = 0 + do i=1,itmp2 + ibeg(i) = ioffset + 1 + iend(i) = ioffset + (itmp1+1) + ioffset = iend(i) + enddo + do i=itmp2+1,num_pieces + ibeg(i) = ioffset + 1 + if (ibeg(i) > total) then + iend(i) = ibeg(i) - 1 + else + iend(i) = ioffset + itmp1 + ioffset = iend(i) + endif + enddo + + end subroutine split_indices +! +!======================================================================== +! 
+end module cism_reprosum_mod diff --git a/libglimmer/glimmer_global.F90 b/libglimmer/glimmer_global.F90 index e9eb745d..0aefadee 100644 --- a/libglimmer/glimmer_global.F90 +++ b/libglimmer/glimmer_global.F90 @@ -30,12 +30,12 @@ module glimmer_global - !> Module holding global variables for Glimmer. Holds real-type - !> kind values, and other global code parameters. + !> Module holding global variables for Glimmer. + !> Holds real and integer kind values and other global code parameters. implicit none - integer,parameter :: sp = kind(1.0) + integer,parameter :: sp = kind(1.0) !> Single precision --- Fortran single-precision real-type kind !> value. Used internally. @@ -44,7 +44,7 @@ module glimmer_global !> the -r8 flag), then this parameter may need to be set in agreement with !> that. - integer,parameter :: dp = kind(1.0d0) + integer,parameter :: dp = kind(1.0d0) !> Double precision --- Fortran double-precision real-type kind !> value. Used internally. @@ -53,6 +53,12 @@ module glimmer_global !> the -r8 flag), then this parameter may need to be set in agreement !> with that + ! Integer kinds + ! Note: Integers are i4 by default. + ! i8 integers can be used to generate reproducible sums + integer, parameter :: i4 = kind(1) + integer, parameter :: i8 = kind(1_8) + !WHL - Removed rk from the code, so commenting out these declarations !!#ifdef GLIMMER_SP !! integer,parameter :: rk=sp !< Precision of glimmer module --- the general Fortran real-type kind value for the Glimmer module and its interfaces. diff --git a/libglimmer/glimmer_utils.F90 b/libglimmer/glimmer_utils.F90 index c21b05c9..45038da6 100644 --- a/libglimmer/glimmer_utils.F90 +++ b/libglimmer/glimmer_utils.F90 @@ -589,6 +589,91 @@ subroutine point_diag_real8_2d(& end subroutine point_diag_real8_2d +!-------------------------------------------------------------------------- + + subroutine double_to_binary(& + x, binary_str, binary_full, binary_sign, binary_exponent, binary_mantissa) + + ! 
Find the internal binary representation of a double-precision floating point number + ! Based on the IEEE-754 standard + + use glimmer_global, only: dp, i8 + implicit none + + real(dp), intent(in) :: x + character(len=64), intent(out) :: binary_str ! string representation of the binary number + + integer(i8), intent(out), optional :: binary_full ! 64 bits + integer, intent(out), optional :: binary_sign ! 1 bit + integer, intent(out), optional :: binary_exponent ! 11 bits + integer(i8), intent(out), optional :: binary_mantissa ! 52 bits + + integer :: i + character(len=1) :: bin(64) + integer (i8) :: binary_number + integer :: sign_bit + integer :: exponent_bits + integer :: mantissa_bits + + logical :: verbose_binary = .false. + + ! Transfer the double value into a 64-bit integer + binary_number = transfer(x, binary_number) + + ! Get the sign bit (bit 64) + sign_bit = IAND(ishft(binary_number, -63), INT(1, 8)) + + ! Get the exponent bits (bits 63–53) + exponent_bits = IAND(ishft(binary_number, -52), INT(Z'7FF', 8)) + + ! Extract mantissa (fraction) bits (bits 52–1) + mantissa_bits = IAND(binary_number, INT(Z'FFFFFFFFFFFFF', 8)) + + if (present(binary_full)) binary_full = binary_number + if (present(binary_sign)) binary_sign = sign_bit + if (present(binary_exponent)) binary_exponent = exponent_bits + if (present(binary_mantissa)) binary_mantissa = mantissa_bits + + if (verbose_binary) then + write(iulog,*) ' ' + write(iulog,*) 'x =', x + write(iulog,*) 'IEEE-754 double precision representation of x:' + write(iulog,*) 'Sign bit: ', sign_bit + write(iulog,*) 'Exponent (11 bits):', exponent_bits + write(iulog,*) 'Mantissa (52 bits):', mantissa_bits + endif + + ! 
Convert full 64-bit integer to a binary string + do i = 1, 64 + if (btest(binary_number, 64 - i)) then + bin(i) = '1' + else + bin(i) = '0' + end if + end do + + binary_str = concat(bin) + if (verbose_binary) then + write(iulog,*) 'Full 64-bit binary:' + write(iulog,*) ' ', binary_str + endif + + end subroutine double_to_binary + + + pure function concat(arr) result(str) + ! Turn a character array into a string + + character(len=*), intent(in) :: arr(:) + character(len=size(arr)) :: str + integer :: k + + do k = 1, size(arr) + str(k:k) = arr(k) + end do + + end function concat + !**************************************************************************** end module glimmer_utils diff --git a/libglimmer/ncdf_template.F90.in b/libglimmer/ncdf_template.F90.in index 177c5e29..378b18b4 100644 --- a/libglimmer/ncdf_template.F90.in +++ b/libglimmer/ncdf_template.F90.in @@ -437,14 +437,14 @@ contains ic=>model%funits%frc_first do while(associated(ic)) -! if (main_task .and. verbose_read_forcing) write(6,*) 'possible forcing times', ic%times +! if (main_task .and. verbose_read_forcing) write(iulog,*) 'possible forcing times', ic%times if (ic%read_once) then ! read once at initialization; do not re-read at runtime ic%nc%just_processed = .true. ! prevent the file from being read if (main_task .and. verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_read_forcing; will not re-read the read_once file ', trim(ic%nc%filename) + write(iulog,*) ' ' + write(iulog,*) 'In NAME_read_forcing; will not re-read the read_once file ', trim(ic%nc%filename) endif else ! not a read_once file @@ -464,11 +464,11 @@ contains endif if (main_task .and. 
verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_read_forcing, model time + eps =', model%numerics%time + eps - write(6,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset - write(6,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle - write(6,*) 'current forcing time =', current_forcing_time + write(iulog,*) ' ' + write(iulog,*) 'In NAME_read_forcing, model time + eps =', model%numerics%time + eps + write(iulog,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset + write(iulog,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle + write(iulog,*) 'current forcing time =', current_forcing_time endif ! Find the time index associated with the previous model time step @@ -476,7 +476,7 @@ contains do t = ic%nt, 1, -1 ! look through the time array backwards if (ic%times(t) <= current_forcing_time - model%numerics%tinc) then t_prev = t - if (main_task .and. verbose_read_forcing) write(6,*) 'Previous time index =', t_prev + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Previous time index =', t_prev exit end if enddo @@ -486,7 +486,7 @@ contains if ( ic%times(t) <= current_forcing_time) then ! use the largest time that is smaller or equal to the current time (stepwise forcing) if (main_task .and. verbose_read_forcing) & - write(6,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) + write(iulog,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) ! If this time index (t) is larger than the previous index (t_prev), then read a new time slice. ! Otherwise, we already have the current slice, and there is nothing new to read. @@ -494,7 +494,7 @@ contains ! Set the desired time to be read ic%current_time = t ic%nc%just_processed = .false. ! set this to false so file will be read. - if (main_task .and. verbose_read_forcing) write(6,*) 'Read new forcing slice: t, times(t) =', t, ic%times(t) + if (main_task .and. 
verbose_read_forcing) write(iulog,*) 'Read new forcing slice: t, times(t) =', t, ic%times(t) endif ! t > t_prev exit ! once we find the time, exit the loop @@ -548,10 +548,10 @@ contains if (ic%read_once) then if (main_task .and. verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_read_forcing_once' - write(6,*) 'Filename =', trim(ic%nc%filename) - write(6,*) 'Number of slices =', ic%nt + write(iulog,*) ' ' + write(iulog,*) 'In NAME_read_forcing_once' + write(iulog,*) 'Filename =', trim(ic%nc%filename) + write(iulog,*) 'Number of slices =', ic%nt endif write(message,*) 'Reading', ic%nt, 'slices of file ', trim(ic%nc%filename), ' just once at initialization' @@ -568,7 +568,7 @@ contains do t = 1, ic%nt if (main_task .and. verbose_read_forcing) then - write(6,*) 'Read new forcing slice: t index, times(t) =', t, ic%times(t) + write(iulog,*) 'Read new forcing slice: t index, times(t) =', t, ic%times(t) endif ! Set the desired time to be read @@ -588,7 +588,7 @@ contains endif ! read_once if (main_task .and. verbose_read_forcing) then - write(6,*) 'Final ic%nc%vars = ', trim(ic%nc%vars) + write(iulog,*) 'Final ic%nc%vars = ', trim(ic%nc%vars) endif ic=>ic%next @@ -650,13 +650,13 @@ contains endif if (main_task .and. 
verbose_read_forcing) then - write(6,*) ' ' - write(6,*) 'In NAME_retrieve_forcing, model time + eps =', model%numerics%time + eps - write(6,*) 'Filename = ', trim(ic%nc%filename) - write(6,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset - write(6,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle - write(6,*) 'current forcing time =', current_forcing_time - write(6,*) 'variable list:', trim(ic%nc%vars) + write(iulog,*) ' ' + write(iulog,*) 'In NAME_retrieve_forcing, model time + eps =', model%numerics%time + eps + write(iulog,*) 'Filename = ', trim(ic%nc%filename) + write(iulog,*) 'Forcing file nt, time_offset =', ic%nt, ic%time_offset + write(iulog,*) 'time_start_cycle, nyear_cycle:', ic%time_start_cycle, ic%nyear_cycle + write(iulog,*) 'current forcing time =', current_forcing_time + write(iulog,*) 'variable list:', trim(ic%nc%vars) endif ! Optionally, associate the current forcing time with a different date in the forcing file. @@ -669,8 +669,8 @@ contains open(unit=11, file=trim(ic%shuffle_file), status='old') this_year = int(current_forcing_time - model%numerics%tstart) if (main_task .and. verbose_read_forcing) then - write(6,*) 'shuffle_file = ', trim(ic%shuffle_file) - write(6,*) 'tstart, this_year =', model%numerics%tstart, this_year + write(iulog,*) 'shuffle_file = ', trim(ic%shuffle_file) + write(iulog,*) 'tstart, this_year =', model%numerics%tstart, this_year endif forcing_year = 0 do while (forcing_year == 0) @@ -684,11 +684,11 @@ contains decimal_year = current_forcing_time - floor(current_forcing_time) current_forcing_time = real(forcing_year,dp) + decimal_year if (main_task .and. verbose_read_forcing) then - write(6,*) 'forcing_year, decimal =', forcing_year, decimal_year - write(6,*) 'shuffled forcing_time =', current_forcing_time + write(iulog,*) 'forcing_year, decimal =', forcing_year, decimal_year + write(iulog,*) 'shuffled forcing_time =', current_forcing_time endif else - if (main_task .and. 
verbose_read_forcing) write(6,*) 'no shuffle_file' + if (main_task .and. verbose_read_forcing) write(iulog,*) 'no shuffle_file' endif ! shuffle_file exists ! Find the time index associated with the previous model time step @@ -696,7 +696,7 @@ contains do t = ic%nt, 1, -1 ! look through the time array backwards if (ic%times(t) <= current_forcing_time - model%numerics%tinc) then t_prev = t - if (main_task .and. verbose_read_forcing) write(6,*) 'Previous time index =', t_prev + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Previous time index =', t_prev exit end if enddo @@ -706,14 +706,14 @@ contains if ( ic%times(t) <= current_forcing_time) then ! use the largest time that is smaller or equal to the current time (stepwise forcing) if (main_task .and. verbose_read_forcing) & - write(6,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) + write(iulog,*) 'Largest time less than current forcing time: t, times(t):', t, ic%times(t) ! If this time index (t) is larger than the previous index (t_prev), then retrieve a new time slice. ! Otherwise, we already have the current slice, and there is nothing new to read. if (t > t_prev) then ! Set the desired time to be read ic%current_time = t retrieve_new_slice = .true. - if (main_task .and. verbose_read_forcing) write(6,*) 'Retrieve new forcing slice' + if (main_task .and. verbose_read_forcing) write(iulog,*) 'Retrieve new forcing slice' write(message,*) & 'Retrieve slice', t, 'at forcing time', ic%times(t), 'from file ', trim(ic%nc%filename) call write_log(message) diff --git a/libglimmer/parallel_mpi.F90 b/libglimmer/parallel_mpi.F90 index 39a06994..d043cde2 100644 --- a/libglimmer/parallel_mpi.F90 +++ b/libglimmer/parallel_mpi.F90 @@ -27,9 +27,11 @@ module cism_parallel use netcdf - use glimmer_global, only : dp, sp + use glimmer_global, only : dp, sp, i8 use glimmer_paramets, only: iulog + use cism_reprosum_mod, only: cism_reprosum_calc, verbose_reprosum + implicit none ! 
integers associated with the main global communicator @@ -144,6 +146,10 @@ module cism_parallel integer :: main_rank_col ! integer ID for the master task on the column logical :: main_task_col ! true if this_rank_col = main_rank_col + ! option to compute reproducible sums + logical :: reprosum ! if true, compute reproducible global sums + ! (interface parallel_reduce_sum) + end type parallel_type ! Information on the local & global bounds of an array @@ -182,29 +188,29 @@ module cism_parallel module procedure broadcast_real8_1d end interface - interface distributed_gather_var - module procedure distributed_gather_var_integer_2d - module procedure distributed_gather_var_logical_2d - module procedure distributed_gather_var_real4_2d - module procedure distributed_gather_var_real4_3d - module procedure distributed_gather_var_real8_2d - module procedure distributed_gather_var_real8_3d + interface gather_var + module procedure gather_var_integer_2d + module procedure gather_var_logical_2d + module procedure gather_var_real4_2d + module procedure gather_var_real4_3d + module procedure gather_var_real8_2d + module procedure gather_var_real8_3d end interface - interface distributed_gather_var_row - module procedure distributed_gather_var_row_real8_2d + interface gather_var_row + module procedure gather_var_row_real8_2d end interface - interface distributed_gather_all_var_row - module procedure distributed_gather_all_var_row_real8_2d + interface gather_all_var_row + module procedure gather_all_var_row_real8_2d end interface - interface distributed_gather_var_col - module procedure distributed_gather_var_col_real8_2d + interface gather_var_col + module procedure gather_var_col_real8_2d end interface - interface distributed_gather_all_var_col - module procedure distributed_gather_all_var_col_real8_2d + interface gather_all_var_col + module procedure gather_all_var_col_real8_2d end interface interface distributed_get_var @@ -232,21 +238,21 @@ module cism_parallel module 
procedure distributed_put_var_real8_3d end interface - interface distributed_scatter_var - module procedure distributed_scatter_var_integer_2d - module procedure distributed_scatter_var_logical_2d - module procedure distributed_scatter_var_real4_2d - module procedure distributed_scatter_var_real4_3d - module procedure distributed_scatter_var_real8_2d - module procedure distributed_scatter_var_real8_3d + interface scatter_var + module procedure scatter_var_integer_2d + module procedure scatter_var_logical_2d + module procedure scatter_var_real4_2d + module procedure scatter_var_real4_3d + module procedure scatter_var_real8_2d + module procedure scatter_var_real8_3d end interface - interface distributed_scatter_var_row - module procedure distributed_scatter_var_row_real8_2d + interface scatter_var_row + module procedure scatter_var_row_real8_2d end interface - interface distributed_scatter_var_col - module procedure distributed_scatter_var_col_real8_2d + interface scatter_var_col + module procedure scatter_var_col_real8_2d end interface interface parallel_boundary_value @@ -291,17 +297,22 @@ module cism_parallel interface parallel_global_sum module procedure parallel_global_sum_integer_2d module procedure parallel_global_sum_integer_3d - module procedure parallel_global_sum_real4_2d + module procedure parallel_global_sum_integer8_2d module procedure parallel_global_sum_real8_2d module procedure parallel_global_sum_real8_3d end interface - interface parallel_global_sum_staggered - module procedure parallel_global_sum_staggered_3d_real8 - module procedure parallel_global_sum_staggered_3d_real8_nvar - module procedure parallel_global_sum_staggered_2d_real8 - module procedure parallel_global_sum_staggered_2d_real8_nvar - end interface parallel_global_sum_staggered + interface parallel_global_sum_patch + module procedure parallel_global_sum_patch_integer_2d + module procedure parallel_global_sum_patch_real8_2d + end interface parallel_global_sum_patch + + interface 
parallel_global_sum_stagger + module procedure parallel_global_sum_stagger_real8_2d + module procedure parallel_global_sum_stagger_real8_3d + module procedure parallel_global_sum_stagger_real8_2d_nflds + module procedure parallel_global_sum_stagger_real8_3d_nflds + end interface parallel_global_sum_stagger interface parallel_halo module procedure parallel_halo_integer_2d @@ -311,9 +322,11 @@ module cism_parallel module procedure parallel_halo_real8_2d module procedure parallel_halo_real8_3d module procedure parallel_halo_real8_4d + module procedure parallel_halo_integer8_4d end interface interface parallel_halo_extrapolate + module procedure parallel_halo_extrapolate_real8_1d module procedure parallel_halo_extrapolate_integer_2d module procedure parallel_halo_extrapolate_real8_2d end interface @@ -330,6 +343,8 @@ module cism_parallel end interface interface parallel_is_zero + module procedure parallel_is_zero_integer_1d + module procedure parallel_is_zero_real8_1d module procedure parallel_is_zero_integer_2d module procedure parallel_is_zero_real8_2d module procedure parallel_is_zero_real8_3d @@ -388,6 +403,7 @@ module cism_parallel interface parallel_reduce_sum module procedure parallel_reduce_sum_integer + module procedure parallel_reduce_sum_integer8 module procedure parallel_reduce_sum_real4 module procedure parallel_reduce_sum_real8 module procedure parallel_reduce_sum_integer_nvar @@ -613,10 +629,10 @@ end function distributed_execution !======================================================================= - ! subroutines belonging to the distributed_gather_var interface + ! subroutines belonging to the gather_var interface ! WHL, July 2019: - ! There is an issue with allocating the global_values array in the distributed_gather_var_*, + ! There is an issue with allocating the global_values array in the gather_var_*, ! distributed_get_var_*, distributed_print_*, and distributed_put_var_* functions and subroutines ! 
when computing only on active blocks (compute_blocks = 1). ! This array is allocated based on the max and min of ewlb, ewub, nslb, and nsub over the global domain. @@ -628,7 +644,7 @@ end function distributed_execution ! global_minval_nslb, and global_maxval_nsub, which are now computed at initialization ! based on the bounds in all blocks (including inactive blocks), not just active blocks. - subroutine distributed_gather_var_integer_2d(values, global_values, parallel) + subroutine gather_var_integer_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -661,7 +677,7 @@ subroutine distributed_gather_var_integer_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -685,9 +701,7 @@ subroutine distributed_gather_var_integer_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -706,9 +720,7 @@ subroutine distributed_gather_var_integer_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. 
allocate(displs(1)) allocate(recvcounts(1)) @@ -733,10 +745,10 @@ subroutine distributed_gather_var_integer_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_integer_2d + end subroutine gather_var_integer_2d - subroutine distributed_gather_var_logical_2d(values, global_values, parallel) + subroutine gather_var_logical_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -769,7 +781,7 @@ subroutine distributed_gather_var_logical_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -793,9 +805,7 @@ subroutine distributed_gather_var_logical_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -814,9 +824,7 @@ subroutine distributed_gather_var_logical_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -840,10 +848,10 @@ subroutine distributed_gather_var_logical_2d(values, global_values, parallel) end associate ! 
automatic deallocation - end subroutine distributed_gather_var_logical_2d + end subroutine gather_var_logical_2d - subroutine distributed_gather_var_real4_2d(values, global_values, parallel) + subroutine gather_var_real4_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -876,7 +884,7 @@ subroutine distributed_gather_var_real4_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -900,9 +908,7 @@ subroutine distributed_gather_var_real4_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -921,9 +927,7 @@ subroutine distributed_gather_var_real4_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -947,10 +951,10 @@ subroutine distributed_gather_var_real4_2d(values, global_values, parallel) end associate ! 
automatic deallocation - end subroutine distributed_gather_var_real4_2d + end subroutine gather_var_real4_2d - subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, ud1) + subroutine gather_var_real4_3d(values, global_values, parallel, ld1, ud1) ! Gather a distributed variable back to main_task node ! values = local portion of distributed variable @@ -984,7 +988,7 @@ subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -1008,9 +1012,7 @@ subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) if (present(ld1)) then d1l = ld1 else @@ -1044,9 +1046,7 @@ subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -1073,10 +1073,10 @@ subroutine distributed_gather_var_real4_3d(values, global_values, parallel, ld1, end associate ! automatic deallocation - end subroutine distributed_gather_var_real4_3d + end subroutine gather_var_real4_3d - subroutine distributed_gather_var_real8_2d(values, global_values, parallel) + subroutine gather_var_real8_2d(values, global_values, parallel) ! Gather a distributed variable back to main_task node ! 
values = local portion of distributed variable @@ -1109,7 +1109,7 @@ subroutine distributed_gather_var_real8_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -1133,9 +1133,7 @@ subroutine distributed_gather_var_real8_2d(values, global_values, parallel) mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) !WHL - See comments above on allocating the global_values array !! allocate(global_values(& !! minval(d_gs_bounds(1,:)):maxval(d_gs_bounds(2,:)),& @@ -1154,9 +1152,7 @@ subroutine distributed_gather_var_real8_2d(values, global_values, parallel) end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -1181,10 +1177,10 @@ subroutine distributed_gather_var_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_real8_2d + end subroutine gather_var_real8_2d - subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, ud1) + subroutine gather_var_real8_3d(values, global_values, parallel, ld1, ud1) ! Gather a distributed variable back to main_task node ! 
values = local portion of distributed variable @@ -1193,7 +1189,7 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, use mpi_mod implicit none - real(dp),dimension(:,:,:),intent(in) :: values + real(dp),dimension(:,:,:),intent(in) :: values ! i and j are indices 2 and 3 real(dp),dimension(:,:,:),allocatable,intent(inout) :: global_values integer,optional,intent(in) :: ld1, ud1 type(parallel_type) :: parallel @@ -1216,9 +1212,9 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, global_maxval_nsub => parallel%global_maxval_nsub & ) - if (uhalo==0 .and. size(values,1)==local_ewn-1) then + if (uhalo==0 .and. size(values,2)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(*,*) "distributed_gather does not currently work for" + write(*,*) "gather does not currently work for" write(*,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -1242,9 +1238,7 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, mpi_integer,main_rank,comm) if (main_task) then - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) if (present(ld1)) then d1l = ld1 else @@ -1278,9 +1272,7 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, end do allocate(recvbuf(displs(tasks+1))) else - if (allocated(global_values)) then - deallocate(global_values) - endif + if (allocated(global_values)) deallocate(global_values) allocate(global_values(1,1,1)) ! This prevents a problem with NULL pointers later. allocate(displs(1)) allocate(recvcounts(1)) @@ -1307,16 +1299,16 @@ subroutine distributed_gather_var_real8_3d(values, global_values, parallel, ld1, end associate ! 
automatic deallocation - end subroutine distributed_gather_var_real8_3d + end subroutine gather_var_real8_3d !======================================================================= - ! subroutines belonging to the distributed_gather_var_row interface + ! subroutines belonging to the gather_var_row interface - subroutine distributed_gather_var_row_real8_2d(values, global_values, parallel) + subroutine gather_var_row_real8_2d(values, global_values, parallel) ! Gather data along a row of tasks onto the main task for that row. - ! Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_ewn. ! The second index represents the north-south dimension, and is assumed @@ -1351,7 +1343,7 @@ subroutine distributed_gather_var_row_real8_2d(values, global_values, parallel) if (size(values,2) /= own_nsn) then ! Note: Removing this restriction would require some recoding below. - write(*,*) "ERROR: distributed_gather_var_row requires N-S array size of own_nsn" + write(*,*) "ERROR: gather_var_row requires N-S array size of own_nsn" write(*,*) 'rank, own_nsn, size(values,2) =', this_rank, own_nsn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1446,16 +1438,16 @@ subroutine distributed_gather_var_row_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_row_real8_2d + end subroutine gather_var_row_real8_2d !======================================================================= - ! subroutines belonging to the distributed_gather_all_var_row interface + ! subroutines belonging to the gather_all_var_row interface - subroutine distributed_gather_all_var_row_real8_2d(values, global_values, parallel) + subroutine gather_all_var_row_real8_2d(values, global_values, parallel) ! Gather global data along a row of tasks onto each task for that row. - ! 
Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_ewn. ! The second index represents the north-south dimension, and is assumed @@ -1493,7 +1485,7 @@ subroutine distributed_gather_all_var_row_real8_2d(values, global_values, parall ! TODO: Do this recoding. This subroutine currently fails with outflow BC, because ! the southern and western rows of tasks have an extra locally owned vertex, ! giving size(values,2) = own_nsn + 1 - write(*,*) "ERROR: distributed_gather_var_row requires N-S array size of own_nsn" + write(*,*) "ERROR: gather_var_row requires N-S array size of own_nsn" write(*,*) 'rank, own_nsn, size(values,2) =', this_rank, own_nsn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1568,16 +1560,16 @@ subroutine distributed_gather_all_var_row_real8_2d(values, global_values, parall end associate ! automatic deallocation - end subroutine distributed_gather_all_var_row_real8_2d + end subroutine gather_all_var_row_real8_2d !======================================================================= - ! subroutines belonging to the distributed_gather_var_col interface + ! subroutines belonging to the gather_var_col interface - subroutine distributed_gather_var_col_real8_2d(values, global_values, parallel) + subroutine gather_var_col_real8_2d(values, global_values, parallel) ! Gather data along a column of tasks onto the main task for that column. - ! Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_nsn. ! The second index represents the east-west dimension, and is assumed @@ -1612,7 +1604,7 @@ subroutine distributed_gather_var_col_real8_2d(values, global_values, parallel) if (size(values,2) /= own_ewn) then ! 
Note: Removing this restriction would require some recoding below. - write(*,*) "ERROR: distributed_gather_var_row requires E-W array size of own_ewn" + write(*,*) "ERROR: gather_var_row requires E-W array size of own_ewn" write(*,*) 'rank, own_ewn, size(values,2) =', this_rank, own_ewn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1708,16 +1700,16 @@ subroutine distributed_gather_var_col_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_gather_var_col_real8_2d + end subroutine gather_var_col_real8_2d !======================================================================= - ! subroutines belonging to the distributed_gather_all_var_col interface + ! subroutines belonging to the gather_all_var_col interface - subroutine distributed_gather_all_var_col_real8_2d(values, global_values, parallel) + subroutine gather_all_var_col_real8_2d(values, global_values, parallel) ! Gather global data along a column of tasks onto each task for that column. - ! Based on distributed_gather_var_real8_2d. + ! Based on gather_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_nsn. ! The second index represents the east-west dimension, and is assumed @@ -1752,7 +1744,7 @@ subroutine distributed_gather_all_var_col_real8_2d(values, global_values, parall if (size(values,2) /= own_ewn) then ! Note: Removing this restriction would require some recoding below. - write(*,*) "ERROR: distributed_gather_var_row requires E-W array size of own_ewn" + write(*,*) "ERROR: gather_var_row requires E-W array size of own_ewn" write(*,*) 'rank, own_ewn, size(values,2) =', this_rank, own_ewn, size(values,2) call parallel_stop(__FILE__, __LINE__) end if @@ -1827,7 +1819,7 @@ subroutine distributed_gather_all_var_col_real8_2d(values, global_values, parall end associate ! 
automatic deallocation - end subroutine distributed_gather_all_var_col_real8_2d + end subroutine gather_all_var_col_real8_2d !======================================================================= @@ -2210,6 +2202,7 @@ function distributed_get_var_real8_1d(ncid, varid, values, parallel, start) global_values(:) = 0.0d0 distributed_get_var_real8_1d = & nf90_get_var(ncid,varid,global_values(1:myn),start) + allocate(displs(tasks+1)) allocate(sendcounts(tasks)) sendcounts(:) = bounds(2,:)-bounds(1,:)+1 @@ -2427,7 +2420,9 @@ end function distributed_get_var_real8_3d subroutine distributed_grid(ewn, nsn, & parallel, & - nhalo_in, global_bc_in) + nhalo_in, & + global_bc_in, & + reprosum_in) ! Divide the global domain into blocks, with one task per block. ! Set various grid and domain variables for the local task. @@ -2437,6 +2432,7 @@ subroutine distributed_grid(ewn, nsn, & type(parallel_type), intent(inout) :: parallel ! info for parallel communication, computed here integer, intent(in), optional :: nhalo_in ! number of rows of halo cells character(*), intent(in), optional :: global_bc_in ! string indicating the global BC option + logical, intent(in), optional :: reprosum_in ! if true, compute reproducible global sums integer :: best,i,j,metric real(dp) :: rewtasks,rnstasks @@ -2479,7 +2475,8 @@ subroutine distributed_grid(ewn, nsn, & staggered_ilo => parallel%staggered_ilo, & staggered_ihi => parallel%staggered_ihi, & staggered_jlo => parallel%staggered_jlo, & - staggered_jhi => parallel%staggered_jhi & + staggered_jhi => parallel%staggered_jhi, & + reprosum => parallel%reprosum & ) ! set the boundary conditions (periodic by default) @@ -2665,6 +2662,25 @@ subroutine distributed_grid(ewn, nsn, & call parallel_stop(__FILE__, __LINE__) endif + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + + ! If computing reproducible sums, then set some options + !TODO - Are these saved from one call to the next? + ! 
Note: For standalone CISM, reprosum = F by default; can set = T in the config file + ! For CESM coupled runs, reprosum = T by default + if (reprosum) then + +#ifdef CCSM_COUPLED +!! call shr_reprosum_setops() +#else +!! call cism_reprosum_setops() +#endif + endif ! reprosum + ! call parallel_barrier ! write(iulog,*) 'task, west, east, south, north:', this_rank, west, east, south, north @@ -2693,6 +2709,7 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & nx_block, ny_block, & ice_domain_mask, & parallel, & + reprosum_in, & inquire_only) ! Divide the global domain into blocks, setting various grid and domain variables @@ -2731,6 +2748,7 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & integer, intent(in), dimension(:,:) :: & ice_domain_mask ! = 1 where ice is potentially present and active, else = 0 type(parallel_type), intent(inout) :: parallel ! info for parallel communication, computed here + logical, intent(in), optional :: reprosum_in ! if true, compute reproducible global sums logical, intent(in), optional :: inquire_only ! if true, then report the number of active blocks and abort integer :: i, j, nb, nt @@ -2809,7 +2827,8 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & staggered_ilo => parallel%staggered_ilo, & staggered_ihi => parallel%staggered_ihi, & staggered_jlo => parallel%staggered_jlo, & - staggered_jhi => parallel%staggered_jhi & + staggered_jhi => parallel%staggered_jhi, & + reprosum => parallel%reprosum & ) if (present(inquire_only)) then @@ -3283,7 +3302,28 @@ subroutine distributed_grid_active_blocks(ewn, nsn, & ! southwest_corner, southeast_corner, northwest_corner, northeast_corner endif - ! Uncomment to print grid geometry + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + + ! If computing reproducible sums, then set some options + !TODO - Are these saved from one call to the next? + ! Note: For standalone CISM, reprosum = F by default; can set = T in the config file + ! 
For CESM coupled runs, reprosum = T by default + + if (reprosum) then + +#ifdef CCSM_COUPLED +!! call shr_reprosum_setops() +#else +!! call cism_reprosum_setops() +#endif + + endif ! reprosum + +! Uncomment to print grid geometry ! write(iulog,*) " " ! write(iulog,*) "Process ", this_rank, " Total = ", tasks, " ewtasks = ", ewtasks, " nstasks = ", nstasks ! write(iulog,*) "Process ", this_rank, " ewrank = ", ewrank, " nsrank = ", nsrank @@ -4269,9 +4309,9 @@ end function distributed_put_var_real8_3d !======================================================================= - ! subroutines belonging to the distributed_scatter_var interface + ! subroutines belonging to the scatter_var interface - subroutine distributed_scatter_var_integer_2d(values, global_values, parallel) + subroutine scatter_var_integer_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4301,7 +4341,7 @@ subroutine distributed_scatter_var_integer_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4356,10 +4396,10 @@ subroutine distributed_scatter_var_integer_2d(values, global_values, parallel) deallocate(global_values) ! TODO - Is this deallocation necessary, here and below? ! automatic deallocation - end subroutine distributed_scatter_var_integer_2d + end subroutine scatter_var_integer_2d - subroutine distributed_scatter_var_logical_2d(values, global_values, parallel) + subroutine scatter_var_logical_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! 
values = local portion of distributed variable @@ -4389,7 +4429,7 @@ subroutine distributed_scatter_var_logical_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4444,10 +4484,10 @@ subroutine distributed_scatter_var_logical_2d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_logical_2d + end subroutine scatter_var_logical_2d - subroutine distributed_scatter_var_real4_2d(values, global_values, parallel) + subroutine scatter_var_real4_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4477,7 +4517,7 @@ subroutine distributed_scatter_var_real4_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4532,10 +4572,10 @@ subroutine distributed_scatter_var_real4_2d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real4_2d + end subroutine scatter_var_real4_2d - subroutine distributed_scatter_var_real4_3d(values, global_values, parallel) + subroutine scatter_var_real4_3d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! 
values = local portion of distributed variable @@ -4565,7 +4605,7 @@ subroutine distributed_scatter_var_real4_3d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4622,10 +4662,10 @@ subroutine distributed_scatter_var_real4_3d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real4_3d + end subroutine scatter_var_real4_3d - subroutine distributed_scatter_var_real8_2d(values, global_values, parallel) + subroutine scatter_var_real8_2d(values, global_values, parallel) ! Scatter a variable on the main_task node back to the distributed ! values = local portion of distributed variable @@ -4655,7 +4695,7 @@ subroutine distributed_scatter_var_real8_2d(values, global_values, parallel) if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4710,10 +4750,10 @@ subroutine distributed_scatter_var_real8_2d(values, global_values, parallel) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real8_2d + end subroutine scatter_var_real8_2d - subroutine distributed_scatter_var_real8_3d(values, global_values, parallel, deallocflag) + subroutine scatter_var_real8_3d(values, global_values, parallel, deallocflag) ! Scatter a variable on the main_task node back to the distributed ! 
values = local portion of distributed variable @@ -4745,7 +4785,7 @@ subroutine distributed_scatter_var_real8_3d(values, global_values, parallel, dea if (uhalo==0 .and. size(values,1)==local_ewn-1) then ! Fixing this would require some generalization as is done for distributed_put_var - write(iulog,*) "distributed_scatter does not currently work for" + write(iulog,*) "scatter does not currently work for" write(iulog,*) "variables on the staggered grid when uhalo=0" call parallel_stop(__FILE__, __LINE__) end if @@ -4809,16 +4849,16 @@ subroutine distributed_scatter_var_real8_3d(values, global_values, parallel, dea if (deallocmem) deallocate(global_values) ! automatic deallocation - end subroutine distributed_scatter_var_real8_3d + end subroutine scatter_var_real8_3d !======================================================================= - ! subroutines belonging to the distributed_scatter_var_row interface + ! subroutines belonging to the scatter_var_row interface - subroutine distributed_scatter_var_row_real8_2d(values, global_values, parallel) + subroutine scatter_var_row_real8_2d(values, global_values, parallel) ! Scatter data to a row of tasks from the main task for that row. - ! Based on distributed_scatter_var_real8_2d. + ! Based on scatter_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_ewn. ! The second index represents the north-south dimension, and is assumed @@ -4852,7 +4892,7 @@ subroutine distributed_scatter_var_row_real8_2d(values, global_values, parallel) if (size(values,2) /= own_nsn) then ! Note: Removing this restriction would require some recoding below. 
- write(iulog,*) "ERROR: distributed_scatter_var_row requires N-S array size of own_nsn" + write(iulog,*) "ERROR: scatter_var_row requires N-S array size of own_nsn" call parallel_stop(__FILE__, __LINE__) end if @@ -4909,16 +4949,16 @@ subroutine distributed_scatter_var_row_real8_2d(values, global_values, parallel) end associate ! automatic deallocation - end subroutine distributed_scatter_var_row_real8_2d + end subroutine scatter_var_row_real8_2d !======================================================================= - ! subroutines belonging to the distributed_scatter_var_col interface + ! subroutines belonging to the scatter_var_col interface - subroutine distributed_scatter_var_col_real8_2d(values, global_values, parallel) + subroutine scatter_var_col_real8_2d(values, global_values, parallel) ! Scatter data to a column of tasks from the main task for that column - ! Based on distributed_scatter_var_real8_2d. + ! Based on scatter_var_real8_2d. ! Note: The first index represents a data dimension that is the same on each task, ! whose size generally is less than own_nsn. ! The second index represents the east-west dimension, and is assumed @@ -4951,7 +4991,7 @@ subroutine distributed_scatter_var_col_real8_2d(values, global_values, parallel) if (size(values,2) /= own_ewn) then ! Note: Removing this restriction would require some recoding below. - write(iulog,*) "ERROR: distributed_scatter_var_col requires E-W array size of own_nsn" + write(iulog,*) "ERROR: scatter_var_col requires E-W array size of own_nsn" call parallel_stop(__FILE__, __LINE__) end if @@ -5008,7 +5048,7 @@ subroutine distributed_scatter_var_col_real8_2d(values, global_values, parallel) end associate ! 
automatic deallocation - end subroutine distributed_scatter_var_col_real8_2d + end subroutine scatter_var_col_real8_2d !======================================================================= @@ -6047,7 +6087,7 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d) integer, dimension(:,:,:), intent(in), optional :: mask_3d integer :: i, j, k - integer :: kmax + integer :: nz integer, dimension(size(a,1),parallel%local_ewn,parallel%local_nsn) :: mask integer :: local_sum integer :: parallel_global_sum_integer_3d @@ -6056,7 +6096,7 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d) local_ewn => parallel%local_ewn, & local_nsn => parallel%local_nsn) - kmax = size(a,1) + nz = size(a,1) if (present(mask_3d)) then mask = mask_3d @@ -6067,7 +6107,7 @@ function parallel_global_sum_integer_3d(a, parallel, mask_3d) local_sum = 0 do j = nhalo+1, local_nsn-nhalo do i = nhalo+1, local_ewn-nhalo - do k = 1, kmax + do k = 1, nz if (mask(k,i,j) == 1) then local_sum = local_sum + a(k,i,j) endif @@ -6082,30 +6122,30 @@ end function parallel_global_sum_integer_3d !======================================================================= - function parallel_global_sum_real4_2d(a, parallel, mask_2d) + function parallel_global_sum_integer8_2d(a, parallel, mask_2d) - ! Calculates the global sum of a 2D single-precision field + ! 
Calculates the global sum of a 2D integer(i8) field - real(sp),dimension(:,:),intent(in) :: a + integer(i8), dimension(:,:),intent(in) :: a type(parallel_type) :: parallel integer, dimension(:,:), intent(in), optional :: mask_2d integer :: i, j integer, dimension(parallel%local_ewn,parallel%local_nsn) :: mask - real(sp) :: local_sum - real(sp) :: parallel_global_sum_real4_2d + integer(i8) :: local_sum + integer(i8) :: parallel_global_sum_integer8_2d associate( & local_ewn => parallel%local_ewn, & local_nsn => parallel%local_nsn) - if (present(mask_2d)) then + if (present(mask_2d)) then mask = mask_2d else mask = 1 endif - local_sum = 0.0 + local_sum = 0 do j = nhalo+1, local_nsn-nhalo do i = nhalo+1, local_ewn-nhalo if (mask(i,j) == 1) then @@ -6113,11 +6153,11 @@ function parallel_global_sum_real4_2d(a, parallel, mask_2d) endif enddo enddo - parallel_global_sum_real4_2d = parallel_reduce_sum(local_sum) + parallel_global_sum_integer8_2d = parallel_reduce_sum(local_sum) end associate - end function parallel_global_sum_real4_2d + end function parallel_global_sum_integer8_2d !======================================================================= @@ -6134,9 +6174,17 @@ function parallel_global_sum_real8_2d(a, parallel, mask_2d) real(dp) :: local_sum real(dp) :: parallel_global_sum_real8_2d + ! variables for computing reproductible sums + integer :: nsummands, nflds ! 
dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + associate( & local_ewn => parallel%local_ewn, & - local_nsn => parallel%local_nsn) + local_nsn => parallel%local_nsn, & + own_ewn => parallel%own_ewn, & + own_nsn => parallel%own_nsn) if (present(mask_2d)) then mask = mask_2d @@ -6144,15 +6192,59 @@ function parallel_global_sum_real8_2d(a, parallel, mask_2d) mask = 1 endif - local_sum = 0.0d0 - do j = nhalo+1, local_nsn-nhalo - do i = nhalo+1, local_ewn-nhalo - if (mask(i,j) == 1) then - local_sum = local_sum + a(i,j) - endif + if (parallel%reprosum) then ! compute using parallel_reduce_reprosum + + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = own_ewn*own_nsn + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + count = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + count = count + 1 + if (mask(i,j) == 1) then + arr(count,1) = a(i,j) + else + arr(count,1) = 0.0d0 + endif + enddo enddo - enddo - parallel_global_sum_real8_2d = parallel_reduce_sum(local_sum) + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_real8_2d = arr_gsum(1) + + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum = 0.0d0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + local_sum = local_sum + a(i,j) + endif + enddo + enddo + + ! Compute the global sum + parallel_global_sum_real8_2d = parallel_reduce_sum(local_sum) + + endif ! 
reprosum end associate @@ -6160,318 +6252,768 @@ end function parallel_global_sum_real8_2d !======================================================================= - function parallel_global_sum_real8_3d(a, parallel, mask_3d) + function parallel_global_sum_real8_3d(a, parallel, mask_2d) ! Calculates the global sum of a 3D double-precision field ! Note: The vertical dimension should be the first dimension of the input field. real(dp), dimension(:,:,:),intent(in) :: a type(parallel_type) :: parallel - integer, dimension(:,:,:), intent(in), optional :: mask_3d + integer, dimension(:,:), intent(in), optional :: mask_2d integer :: i, j, k - integer :: kmax - integer, dimension(size(a,1),parallel%local_ewn,parallel%local_nsn) :: mask + integer :: nz + integer, dimension(parallel%local_ewn,parallel%local_nsn) :: mask real(dp) :: local_sum real(dp) :: parallel_global_sum_real8_3d + ! variables for computing reproductible sums + integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + associate( & local_ewn => parallel%local_ewn, & - local_nsn => parallel%local_nsn) + local_nsn => parallel%local_nsn, & + own_ewn => parallel%own_ewn, & + own_nsn => parallel%own_nsn) - kmax = size(a,1) + nz = size(a,1) - if (present(mask_3d)) then - mask = mask_3d + ! Note: The mask is 2D, since typically all layers in a column are either masked in or masked out + if (present(mask_2d)) then + mask = mask_2d else mask = 1 endif - local_sum = 0 - do j = nhalo+1, local_nsn-nhalo - do i = nhalo+1, local_ewn-nhalo - do k = 1, kmax - if (mask(k,i,j) == 1) then - local_sum = local_sum + a(k,i,j) + if (parallel%reprosum) then ! compute using cism_reprosum_calc + + ! 
Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = own_ewn*own_nsn*nz + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + count = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + do k = 1, nz + count = count + 1 + arr(count,1) = a(k,i,j) + enddo + else + do k = 1, nz + count = count + 1 + arr(count,1) = 0.0d0 + enddo endif enddo enddo - enddo - parallel_global_sum_real8_3d = parallel_reduce_sum(local_sum) - end associate + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif - end function parallel_global_sum_real8_3d + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) -!======================================================================= + parallel_global_sum_real8_3d = arr_gsum(1) - ! subroutines belonging to the parallel_global_sum_staggered interface - !TODO - Turn these into functions, analogous to the parallel_global_sum functions above. + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif - subroutine parallel_global_sum_staggered_3d_real8(& - nx, ny, & - nz, parallel, & - global_sum, & - work1, work2) + deallocate(arr) + deallocate(arr_gsum) - ! Sum one or two local arrays on the staggered grid, then take the global sum. + else ! compute using parallel_reduce_sum (not reproducible) - integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - nz ! number of vertical layers at which velocity is computed + local_sum = 0 + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + if (mask(i,j) == 1) then + do k = 1, nz + local_sum = local_sum + a(k,i,j) + enddo + endif + enddo + enddo + parallel_global_sum_real8_3d = parallel_reduce_sum(local_sum) - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + endif ! 
reprosum - real(dp), intent(out) :: global_sum ! global sum - real(dp), intent(in), dimension(nz,nx-1,ny-1) :: work1 ! local array - real(dp), intent(in), dimension(nz,nx-1,ny-1), optional :: work2 ! local array + end associate - integer :: i, j, k - real(dp) :: local_sum + end function parallel_global_sum_real8_3d - integer :: & - staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid - staggered_jlo, staggered_jhi +!======================================================================= + ! subroutines belonging to the parallel_global_sum_patch interface - staggered_ilo = parallel%staggered_ilo - staggered_ihi = parallel%staggered_ihi - staggered_jlo = parallel%staggered_jlo - staggered_jhi = parallel%staggered_jhi + function parallel_global_sum_patch_integer_2d(a, npatch, patch_id, parallel) - local_sum = 0.d0 + ! Calculates the global sum of a 2D double-precision field over each + ! user-defined patch of the domain. + ! The number of patches = npatch. + ! Each cell has an integer ID assigning it to at most one patch. + ! If a cell has patch_id = 0, it belongs to no patches. - ! 
sum over locally owned velocity points + integer, dimension(:,:), intent(in) :: a + integer, intent(in) :: npatch + integer, dimension(:,:), intent(in) :: patch_id + type(parallel_type) :: parallel - if (present(work2)) then - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - do k = 1, nz - local_sum = local_sum + work1(k,i,j) + work2(k,i,j) - enddo - enddo - enddo - else - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - do k = 1, nz - local_sum = local_sum + work1(k,i,j) - enddo - enddo + integer :: i, j, np + integer, dimension(npatch) :: local_patch_sum + integer, dimension(npatch) :: parallel_global_sum_patch_integer_2d + + associate( & + local_ewn => parallel%local_ewn, & + local_nsn => parallel%local_nsn) + + local_patch_sum = 0 + + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + np = patch_id(i,j) + if (np > 0) then + local_patch_sum(np) = local_patch_sum(np) + a(i,j) + endif enddo - endif + enddo - ! take the global sum + parallel_global_sum_patch_integer_2d = parallel_reduce_sum(local_patch_sum) - global_sum = parallel_reduce_sum(local_sum) + end associate - end subroutine parallel_global_sum_staggered_3d_real8 + end function parallel_global_sum_patch_integer_2d !======================================================================= - subroutine parallel_global_sum_staggered_3d_real8_nvar(& - nx, ny, & - nz, parallel, & - global_sum, & - work1, work2) + function parallel_global_sum_patch_real8_2d(a, npatch, patch_id, parallel) - ! Sum one or two local arrays on the staggered grid, then take the global sum. + ! Calculates the global sum of a 2D double-precision field over each + ! user-defined patch of the domain. + ! The number of patches = npatch. + ! Each cell has an integer ID assigning it to at most one patch. + ! If a cell has patch_id = 0, it belongs to no patches. + !TODO - Add a reprosum option with npatch = nflds - integer, intent(in) :: & - nx, ny, & ! 
horizontal grid dimensions (for scalars) - nz ! number of vertical layers at which velocity is computed + real(dp), dimension(:,:), intent(in) :: a + integer, intent(in) :: npatch + integer, dimension(:,:), intent(in) :: patch_id + type(parallel_type) :: parallel - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + integer :: i, j, np + real(dp), dimension(npatch) :: local_patch_sum + real(dp), dimension(npatch) :: parallel_global_sum_patch_real8_2d - real(dp), intent(out), dimension(:) :: global_sum ! global sum + ! variables for computing reproductible sums + integer :: nsummands, nflds ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum - real(dp), intent(in), dimension(nz,nx-1,ny-1,size(global_sum)) :: work1 ! local array - real(dp), intent(in), dimension(nz,nx-1,ny-1,size(global_sum)), optional :: work2 ! local array + associate( & + local_ewn => parallel%local_ewn, & + local_nsn => parallel%local_nsn, & + own_ewn => parallel%own_ewn, & + own_nsn => parallel%own_nsn) - integer :: i, j, k, n, nvar - real(dp), dimension(size(global_sum)) :: local_sum + if (parallel%reprosum) then ! compute using cism_reprosum_calc - integer :: & - staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid - staggered_jlo, staggered_jhi + ! 
Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = own_ewn*own_nsn + nflds = npatch + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) - staggered_ilo = parallel%staggered_ilo - staggered_ihi = parallel%staggered_ihi - staggered_jlo = parallel%staggered_jlo - staggered_jhi = parallel%staggered_jhi + count = 0 + arr(:,:) = 0.0d0 - nvar = size(global_sum) + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + count = count + 1 + np = patch_id(i,j) + if (np > 0) then + arr(count,np) = a(i,j) + endif + enddo + enddo - local_sum(:) = 0.d0 + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif - do n = 1, nvar + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) - ! sum over locally owned velocity points + parallel_global_sum_patch_real8_2d(:) = arr_gsum(:) - if (present(work2)) then - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - do k = 1, nz - local_sum(n) = local_sum(n) + work1(k,i,j,n) + work2(k,i,j,n) - enddo - enddo - enddo - else - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - do k = 1, nz - local_sum(n) = local_sum(n) + work1(k,i,j,n) - enddo - enddo - enddo + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum endif - enddo ! nvar + deallocate(arr) + deallocate(arr_gsum) - ! take the global sum + else ! 
compute using parallel_reduce_sum (not reproducible) - global_sum(:) = parallel_reduce_sum(local_sum(:)) + local_patch_sum = 0.0d0 - end subroutine parallel_global_sum_staggered_3d_real8_nvar + do j = nhalo+1, local_nsn-nhalo + do i = nhalo+1, local_ewn-nhalo + np = patch_id(i,j) + if (np > 0) then + local_patch_sum(np) = local_patch_sum(np) + a(i,j) + endif + enddo + enddo -!======================================================================= + parallel_global_sum_patch_real8_2d = parallel_reduce_sum(local_patch_sum) - subroutine parallel_global_sum_staggered_2d_real8(& - nx, ny, & - parallel, & - global_sum, & - work1, work2) + endif ! reprosum - ! Sum one or two local arrays on the staggered grid, then take the global sum. + end associate - integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) + end function parallel_global_sum_patch_real8_2d - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication +!======================================================================= + ! subroutines belonging to the parallel_global_sum_stagger interface - real(dp), intent(out) :: global_sum ! global sum + function parallel_global_sum_stagger_real8_2d(arr1, parallel, arr2) - real(dp), intent(in), dimension(nx-1,ny-1) :: work1 ! local array - real(dp), intent(in), dimension(nx-1,ny-1), optional :: work2 ! local array + ! Calculate the global sum of a 2D double-precision field on the staggered grid + ! Similar to unstagged version, except it uses staggered_ilo/ihi/jlo/jhi + + real(dp), dimension(:,:), intent(in) :: arr1 + type(parallel_type) :: parallel + real(dp), dimension(:,:), intent(in), optional :: arr2 integer :: i, j real(dp) :: local_sum + real(dp) :: parallel_global_sum_stagger_real8_2d integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + ! variables for computing reproductible sums + integer :: nsummands, nflds ! 
dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + + !TODO - associate staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - local_sum = 0.d0 + if (parallel%reprosum) then ! compute using cism_reprosum_calc + + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + if (present(arr2)) then ! compute global sum of arr1 + arr2 - ! sum over locally owned velocity points + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,1) = arr1(i,j) + arr2(i,j) + enddo + enddo - if (present(work2)) then - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - local_sum = local_sum + work1(i,j) + work2(i,j) + else ! compute global sum of arr1 + + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,1) = arr1(i,j) + enddo enddo - enddo - else - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - local_sum = local_sum + work1(i,j) + + endif + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_2d = arr_gsum(1) + + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum = 0.0d0 + + if (present(arr2)) then ! 
compute global sum of arr1 + arr2 + + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum = local_sum + arr1(i,j) + arr2(i,j) + enddo enddo - enddo - endif - ! take the global sum + else ! compute global sum of arr1 - global_sum = parallel_reduce_sum(local_sum) + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum = local_sum + arr1(i,j) + enddo + enddo - end subroutine parallel_global_sum_staggered_2d_real8 + endif -!======================================================================= + parallel_global_sum_stagger_real8_2d = parallel_reduce_sum(local_sum) - subroutine parallel_global_sum_staggered_2d_real8_nvar(& - nx, ny, & - parallel, & - global_sum, & - work1, work2) + endif ! reprosum - ! Sum one or two local arrays on the staggered grid, then take the global sum. + end function parallel_global_sum_stagger_real8_2d - integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) +!======================================================================= - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + function parallel_global_sum_stagger_real8_3d(arr1, parallel, arr2) - real(dp), intent(out), dimension(:) :: & - global_sum ! global sum + ! Calculate the global sum of a 3D double-precision field on the staggered grid + ! Assumes k is the first index, followed by i and j - real(dp), intent(in), dimension(nx-1,ny-1,size(global_sum)) :: work1 ! local array - real(dp), intent(in), dimension(nx-1,ny-1,size(global_sum)), optional :: work2 ! local array + real(dp), dimension(:,:,:), intent(in) :: arr1 + type(parallel_type) :: parallel + real(dp), dimension(:,:,:), intent(in), optional :: arr2 - integer :: i, j, n, nvar + integer :: i, j, k, nz + real(dp) :: local_sum + real(dp) :: parallel_global_sum_stagger_real8_3d - real(dp), dimension(size(global_sum)) :: local_sum + ! variables for computing reproductible sums + integer :: nsummands, nflds ! 
dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + !TODO - Associate these variables (and not the ones above) staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - nvar = size(global_sum) + nz = size(arr1,1) + + if (parallel%reprosum) then ! compute using cism_reprosum_calc - local_sum(:) = 0.d0 + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) * nz + nflds = 1 + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) - do n = 1, nvar + arr(:,:) = 0.0d0 - ! sum over locally owned velocity points + if (present(arr2)) then ! compute global sum of arr1 + arr2 - if (present(work2)) then + count = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - local_sum(n) = local_sum(n) + work1(i,j,n) + work2(i,j,n) + do k = 1, nz + count = count + 1 + arr(count,1) = arr1(k,i,j) + arr2(k,i,j) + enddo enddo enddo - else + + else ! compute global sum of arr1 + + count = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - local_sum(n) = local_sum(n) + work1(i,j,n) + do k = 1, nz + count = count + 1 + arr(count,1) = arr1(k,i,j) + enddo enddo enddo - endif - enddo ! nvar - - ! take the global sum + endif - global_sum(:) = parallel_reduce_sum(local_sum(:)) + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif - end subroutine parallel_global_sum_staggered_2d_real8_nvar + ! 
Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) -!======================================================================= + parallel_global_sum_stagger_real8_3d = arr_gsum(1) - ! functions belonging to the parallel_is_zero interface + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif - function parallel_is_zero_integer_2d(a) + deallocate(arr) + deallocate(arr_gsum) - ! returns .true. if the field has all zero values, else returns .false. + else ! compute using parallel_reduce_sum (not reproducible) - integer, dimension(:,:), intent(in) :: a - logical :: parallel_is_zero_integer_2d + local_sum = 0.0d0 - real(dp) :: maxval_a + if (present(arr2)) then ! compute global sum of arr1 + arr2 - maxval_a = maxval(abs(a)) - maxval_a = parallel_reduce_max(maxval_a) + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + local_sum = local_sum + arr1(k,i,j) + arr2(k,i,j) + enddo + enddo + enddo + + else ! compute global sum of arr1 + + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + local_sum = local_sum + arr1(k,i,j) + enddo + enddo + enddo + + endif + + parallel_global_sum_stagger_real8_3d = parallel_reduce_sum(local_sum) + + endif ! reprosum + + end function parallel_global_sum_stagger_real8_3d + +!======================================================================= + + function parallel_global_sum_stagger_real8_2d_nflds(arr1, nflds, parallel, arr2) + + ! Sum one or two local arrays on the staggered grid, then take the global sum. + ! The final index is equal to the number of independent fields to be summed. + !TODO - Don't have to pass in nflds, since it equals size(a,3)? + + real(dp), dimension(:,:,:), intent(in) :: arr1 + + integer, intent(in) :: nflds + + type(parallel_type), intent(in) :: & + parallel ! 
info for parallel communication + + real(dp), dimension(:,:,:), intent(in), optional :: arr2 + + real(dp), dimension(size(arr1,3)) :: parallel_global_sum_stagger_real8_2d_nflds + + integer :: i, j, n + + real(dp), dimension(size(arr1,3)) :: local_sum + + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + ! variables for computing reproductible sums + integer :: nsummands ! dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + + if (parallel%reprosum) then ! compute using cism_reprosum_calc + + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + do n = 1, nflds + + if (present(arr2)) then ! compute global sum of arr1 + arr2 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,n) = arr1(i,j,n) + arr2(i,j,n) + enddo + enddo + else ! compute global sum of arr1 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + count = count + 1 + arr(count,n) = arr1(i,j,n) + enddo + enddo + endif + + enddo ! nflds + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_2d_nflds = arr_gsum(:) + + if (verbose_reprosum .and. main_task) then +! 
write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum(:) = 0.d0 + + do n = 1, nflds + + ! sum over locally owned velocity points + + if (present(arr2)) then + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum(n) = local_sum(n) + arr1(i,j,n) + arr2(i,j,n) + enddo + enddo + else + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + local_sum(n) = local_sum(n) + arr1(i,j,n) + enddo + enddo + endif + + enddo ! nflds + + parallel_global_sum_stagger_real8_2d_nflds = parallel_reduce_sum(local_sum(:)) + + endif ! reprosum + + end function parallel_global_sum_stagger_real8_2d_nflds + +!======================================================================= + + function parallel_global_sum_stagger_real8_3d_nflds(arr1, nflds, parallel, arr2) + + ! Sum one or two local arrays on the staggered grid, then take the global sum. + ! Assumes k is the first index, followed by i and j. + ! The final index is equal to the number of independent fields to be summed. + + real(dp), dimension(:,:,:,:), intent(in) :: arr1 + + integer, intent(in) :: nflds ! size of final index; number of global sums to be computed + + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + + real(dp), dimension(:,:,:,:), intent(in), optional :: arr2 + + real(dp), dimension(size(arr1,4)) :: parallel_global_sum_stagger_real8_3d_nflds + + integer :: i, j, k, n, nz + + real(dp), dimension(size(arr1,4)) :: local_sum + + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + ! variables for computing reproductible sums + integer :: nsummands ! 
dimensions of array passed to parallel_reduce_reprosum + integer :: count + real(dp), dimension(:,:), allocatable :: arr + real(dp), dimension(:), allocatable :: arr_gsum + + !TODO - Associate these variables + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + + nz = size(arr1,1) + + if (parallel%reprosum) then ! compute using cism_reprosum_calc + + ! Allocate and fill arrays to pass to parallel_reduce_reprosum + nsummands = (staggered_ihi-staggered_ilo+1) * (staggered_jhi-staggered_jlo+1) * nz + allocate(arr(nsummands,nflds)) + allocate(arr_gsum(nflds)) + + arr(:,:) = 0.0d0 + + do n = 1, nflds + + if (present(arr2)) then ! compute global sum of arr1 + arr2 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + count = count + 1 + arr(count,n) = arr1(k,i,j,n) + arr2(k,i,j,n) + enddo + enddo + enddo + else ! compute global sum of arr1 + count = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + count = count + 1 + arr(count,n) = arr1(k,i,j,n) + enddo + enddo + enddo + endif + + enddo ! nflds + + ! bug check + if (count /= nsummands) then + if (main_task) write(iulog,*) 'Error: count, nsummands =', count, nsummands + call parallel_stop(__FILE__,__LINE__) + endif + + ! Call parallel_reduce_reprosum + call parallel_reduce_reprosum(arr, arr_gsum) + + parallel_global_sum_stagger_real8_3d_nflds = arr_gsum(:) + + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gsum =', arr_gsum + endif + + deallocate(arr) + deallocate(arr_gsum) + + else ! compute using parallel_reduce_sum (not reproducible) + + local_sum(:) = 0.d0 + + do n = 1, nflds + + ! 
sum over locally owned velocity points + + if (present(arr2)) then + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + local_sum(n) = local_sum(n) + arr1(k,i,j,n) + arr2(k,i,j,n) + enddo + enddo + enddo + else + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + do k = 1, nz + local_sum(n) = local_sum(n) + arr1(k,i,j,n) + enddo + enddo + enddo + endif + + enddo ! nflds + + parallel_global_sum_stagger_real8_3d_nflds = parallel_reduce_sum(local_sum(:)) + + endif ! reprosum + + end function parallel_global_sum_stagger_real8_3d_nflds + +!======================================================================= + ! functions belonging to the parallel_is_zero interface + + function parallel_is_zero_integer_1d(a) + + ! returns .true. if the field has all zero values, else returns .false. + + integer, dimension(:), intent(in) :: a + logical :: parallel_is_zero_integer_1d + + integer :: maxval_a + + maxval_a = maxval(abs(a)) + maxval_a = parallel_reduce_max(maxval_a) + if (maxval_a > 0) then + parallel_is_zero_integer_1d = .false. + else + parallel_is_zero_integer_1d = .true. + endif + + end function parallel_is_zero_integer_1d + +!======================================================================= + + function parallel_is_zero_real8_1d(a) + + ! returns .true. if the field has all zero values, else returns .false. + + real(dp), dimension(:), intent(in) :: a + logical :: parallel_is_zero_real8_1d + + real(dp) :: maxval_a + + maxval_a = maxval(abs(a)) + maxval_a = parallel_reduce_max(maxval_a) + if (maxval_a > 0.0d0) then + parallel_is_zero_real8_1d = .false. + else + parallel_is_zero_real8_1d = .true. + endif + + end function parallel_is_zero_real8_1d + +!======================================================================= + + function parallel_is_zero_integer_2d(a) + + ! returns .true. if the field has all zero values, else returns .false. 
+ + integer, dimension(:,:), intent(in) :: a + logical :: parallel_is_zero_integer_2d + + integer :: maxval_a + + maxval_a = maxval(abs(a)) + maxval_a = parallel_reduce_max(maxval_a) if (maxval_a > 0) then parallel_is_zero_integer_2d = .false. else @@ -7069,26 +7611,35 @@ subroutine parallel_halo_real4_2d(a, parallel) end subroutine parallel_halo_real4_2d - subroutine parallel_halo_real8_2d(a, parallel, periodic_offset_ew, periodic_offset_ns) + subroutine parallel_halo_real8_2d(a, parallel, & + periodic_offset_ew, periodic_offset_ns, zero_global_boundary_no_ice_bc) - !WHL - added optional arguments for periodic offsets, to support ismip-hom test cases + ! Added optional arguments for periodic offsets, to support ismip-hom test cases + ! Also added an optional argument related to the no_ice BCs use mpi_mod implicit none real(dp),dimension(:,:) :: a type(parallel_type) :: parallel + real(dp), intent(in), optional :: & - periodic_offset_ew, &! offset halo values by this amount - ! if positive, the offset is positive for W halo, negative for E halo - periodic_offset_ns ! offset halo values by this amount - ! if positive, the offset is positive for S halo, negative for N halo + periodic_offset_ew, &! offset halo values by this amount + ! if positive, the offset is positive for W halo, negative for E halo + periodic_offset_ns ! offset halo values by this amount + ! if positive, the offset is positive for S halo, negative for N halo + logical, intent(in), optional :: & + zero_global_boundary_no_ice_bc ! if true, then zero out values in grid cells adjacent + ! to the global boundary when using no_ice BCs + integer :: erequest,ierror,nrequest,srequest,wrequest real(dp),dimension(lhalo, parallel%local_nsn-lhalo-uhalo) :: esend,wrecv real(dp),dimension(uhalo, parallel%local_nsn-lhalo-uhalo) :: erecv,wsend real(dp),dimension(parallel%local_ewn, lhalo) :: nsend,srecv real(dp),dimension(parallel%local_ewn, uhalo) :: nrecv,ssend + logical :: zero_global_boundary_no_ice ! 
local version of zero_global_boundary_no_ice_bc + ! begin associate( & outflow_bc => parallel%outflow_bc, & @@ -7105,6 +7656,12 @@ subroutine parallel_halo_real8_2d(a, parallel, periodic_offset_ew, periodic_offs northwest_corner => parallel%northwest_corner & ) + if (present(zero_global_boundary_no_ice_bc)) then + zero_global_boundary_no_ice = zero_global_boundary_no_ice_bc + else + zero_global_boundary_no_ice = .true. + endif + ! staggered grid if (size(a,1)==local_ewn-1.and.size(a,2)==local_nsn-1) return @@ -7194,31 +7751,53 @@ subroutine parallel_halo_real8_2d(a, parallel, periodic_offset_ew, periodic_offs elseif (no_ice_bc) then - ! Set values to zero in cells adjacent to the global boundary; - ! includes halo cells and one row of locally owned cells + if (zero_global_boundary_no_ice) then - if (this_rank >= east) then ! at east edge of global domain - a(local_ewn-uhalo:,:) = 0.d0 - endif + ! Set values to zero in cells adjacent to the global boundary; + ! includes halo cells and one row of locally owned cells. - if (this_rank <= west) then ! at west edge of global domain - a(:lhalo+1,:) = 0.d0 - endif + if (this_rank >= east) then ! at east edge of global domain + a(local_ewn-uhalo:,:) = 0.d0 + endif - if (this_rank >= north) then ! at north edge of global domain - a(:,local_nsn-uhalo:) = 0.d0 - endif + if (this_rank <= west) then ! at west edge of global domain + a(:lhalo+1,:) = 0.d0 + endif - if (this_rank <= south) then ! at south edge of global domain - a(:,:lhalo+1) = 0.d0 - endif + if (this_rank >= north) then ! at north edge of global domain + a(:,local_nsn-uhalo:) = 0.d0 + endif - ! Some interior blocks have a single cell at a corner of the global boundary. - ! Set values in corner cells to zero, along with adjacent halo cells. 
- if (southwest_corner) a(:lhalo+1,:lhalo+1) = 0.d0 - if (southeast_corner) a(local_ewn-lhalo:,:lhalo+1) = 0.d0 - if (northeast_corner) a(local_ewn-lhalo:,local_nsn-lhalo:) = 0.d0 - if (northwest_corner) a(:lhalo+1,local_nsn-lhalo:) = 0.d0 + if (this_rank <= south) then ! at south edge of global domain + a(:,:lhalo+1) = 0.d0 + endif + + ! Some interior blocks have a single cell at a corner of the global boundary. + ! Set values in corner cells to zero, along with adjacent halo cells. + if (southwest_corner) a(:lhalo+1,:lhalo+1) = 0.d0 + if (southeast_corner) a(local_ewn-lhalo:,:lhalo+1) = 0.d0 + if (northeast_corner) a(local_ewn-lhalo:,local_nsn-lhalo:) = 0.d0 + if (northwest_corner) a(:lhalo+1,local_nsn-lhalo:) = 0.d0 + + else ! set values to zero in halo cells but not in locally owned cells + + if (this_rank >= east) then ! at east edge of global domain + a(local_ewn-uhalo+1:,:) = 0.d0 + endif + + if (this_rank <= west) then ! at west edge of global domain + a(:lhalo,:) = 0.d0 + endif + + if (this_rank >= north) then ! at north edge of global domain + a(:,local_nsn-uhalo+1:) = 0.d0 + endif + + if (this_rank <= south) then ! at south edge of global domain + a(:,:lhalo) = 0.d0 + endif + + endif ! zero_global_boundary_no_ice endif ! outflow or no_ice bc @@ -7474,6 +8053,165 @@ subroutine parallel_halo_real8_4d(a, parallel) end subroutine parallel_halo_real8_4d + + subroutine parallel_halo_integer8_4d(a, parallel) + + use mpi_mod + implicit none + integer(i8),dimension(:,:,:,:) :: a + type(parallel_type) :: parallel + + integer :: erequest,ierror,one,nrequest,srequest,wrequest + integer(i8),dimension(size(a,1), size(a,2), lhalo, parallel%local_nsn-lhalo-uhalo) :: esend,wrecv + integer(i8),dimension(size(a,1), size(a,2), uhalo, parallel%local_nsn-lhalo-uhalo) :: erecv,wsend + integer(i8),dimension(size(a,1), size(a,2), parallel%local_ewn, lhalo) :: nsend,srecv + integer(i8),dimension(size(a,1), size(a,2), parallel%local_ewn, uhalo) :: nrecv,ssend + + ! 
begin + associate( & + outflow_bc => parallel%outflow_bc, & + no_ice_bc => parallel%no_ice_bc, & + local_ewn => parallel%local_ewn, & + local_nsn => parallel%local_nsn, & + east => parallel%east, & + west => parallel%west, & + north => parallel%north, & + south => parallel%south, & + southwest_corner => parallel%southwest_corner, & + southeast_corner => parallel%southeast_corner, & + northeast_corner => parallel%northeast_corner, & + northwest_corner => parallel%northwest_corner & + ) + + ! staggered grid + if (size(a,3)==local_ewn-1.and.size(a,4)==local_nsn-1) return + + ! unknown grid + if (size(a,3)/=local_ewn.or.size(a,4)/=local_nsn) then + write(iulog,*) "Unknown Grid: Size a=(", size(a,1), ",", size(a,2), ",", size(a,3), ",", size(a,4), ") & + &and local_ewn and local_nsn = ", local_ewn, ",", local_nsn + call parallel_stop(__FILE__,__LINE__) + endif + + ! unstaggered grid + call mpi_irecv(wrecv,size(wrecv),mpi_real8,west,west,& + comm,wrequest,ierror) + call mpi_irecv(erecv,size(erecv),mpi_real8,east,east,& + comm,erequest,ierror) + call mpi_irecv(srecv,size(srecv),mpi_real8,south,south,& + comm,srequest,ierror) + call mpi_irecv(nrecv,size(nrecv),mpi_real8,north,north,& + comm,nrequest,ierror) + + esend(:,:,:,:) = & + a(:,:,local_ewn-uhalo-lhalo+1:local_ewn-uhalo,1+lhalo:local_nsn-uhalo) + call mpi_send(esend,size(esend),mpi_real8,east,this_rank,comm,ierror) + wsend(:,:,:,:) = a(:,:,1+lhalo:1+lhalo+uhalo-1,1+lhalo:local_nsn-uhalo) + call mpi_send(wsend,size(wsend),mpi_real8,west,this_rank,comm,ierror) + + call mpi_wait(wrequest,mpi_status_ignore,ierror) + a(:,:,:lhalo,1+lhalo:local_nsn-uhalo) = wrecv(:,:,:,:) + call mpi_wait(erequest,mpi_status_ignore,ierror) + a(:,:,local_ewn-uhalo+1:,1+lhalo:local_nsn-uhalo) = erecv(:,:,:,:) + + nsend(:,:,:,:) = a(:,:,:,local_nsn-uhalo-lhalo+1:local_nsn-uhalo) + call mpi_send(nsend,size(nsend),mpi_real8,north,this_rank,comm,ierror) + ssend(:,:,:,:) = a(:,:,:,1+lhalo:1+lhalo+uhalo-1) + call 
mpi_send(ssend,size(ssend),mpi_real8,south,this_rank,comm,ierror) + + call mpi_wait(srequest,mpi_status_ignore,ierror) + a(:,:,:,:lhalo) = srecv(:,:,:,:) + call mpi_wait(nrequest,mpi_status_ignore,ierror) + a(:,:,:,local_nsn-uhalo+1:) = nrecv(:,:,:,:) + + if (outflow_bc) then ! set values in global halo to zero + ! interior halo cells should not be affected + + if (this_rank >= east) then ! at east edge of global domain + a(:,:,local_ewn-uhalo+1:,:) = 0 + endif + + if (this_rank <= west) then ! at west edge of global domain + a(:,:,:lhalo,:) = 0 + endif + + if (this_rank >= north) then ! at north edge of global domain + a(:,:,:,local_nsn-uhalo+1:) = 0 + endif + + if (this_rank <= south) then ! at south edge of global domain + a(:,:,:,:lhalo) = 0 + endif + + elseif (no_ice_bc) then + + ! Set values to zero in cells adjacent to the global boundary; + ! includes halo cells and one row of locally owned cells + + if (this_rank >= east) then ! at east edge of global domain + a(:,:,local_ewn-uhalo:,:) = 0 + endif + + if (this_rank <= west) then ! at west edge of global domain + a(:,:,:lhalo+1,:) = 0 + endif + + if (this_rank >= north) then ! at north edge of global domain + a(:,:,:,local_nsn-uhalo:) = 0 + endif + + if (this_rank <= south) then ! at south edge of global domain + a(:,:,:,:lhalo+1) = 0 + endif + + ! Some interior blocks have a single cell at a corner of the global boundary. + ! Set values in corner cells to zero, along with adjacent halo cells. + if (southwest_corner) a(:,:,:lhalo+1,:lhalo+1) = 0 + if (southeast_corner) a(:,:,local_ewn-lhalo:,:lhalo+1) = 0 + if (northeast_corner) a(:,:,local_ewn-lhalo:,local_nsn-lhalo:) = 0 + if (northwest_corner) a(:,:,:lhalo+1,local_nsn-lhalo:) = 0 + + endif ! outflow or no_ice bc + + end associate + + end subroutine parallel_halo_integer8_4d + +!======================================================================= + + ! 
subroutines for 1D halo updates + + subroutine parallel_halo_extrapolate_real8_1d(a, parallel, interval_in) + + !Note: Extrapolate a 1D real8 variable into halo cells to the east and west. + ! Currently used only to compute halo values for grid cell coordinates. + + use mpi_mod + implicit none + real(dp),dimension(:) :: a + type(parallel_type) :: parallel + real(dp),intent(in), optional :: & + interval_in ! uniform difference between adjacent values, e.g. grid cell size dew or dns + + integer :: i + real(dp) :: interval ! local version of interval_in + + if (present(interval_in)) then + interval = interval_in + else + interval = 0.0d0 + endif + + do i = 1, lhalo + a(i) = a(lhalo+1) - interval*(lhalo+1-i) + enddo + + do i = size(a)-uhalo+1, size(a) + a(i) = a(size(a)-uhalo) + interval*(uhalo+i-size(a)) + enddo + + end subroutine parallel_halo_extrapolate_real8_1d + !======================================================================= ! subroutines belonging to the parallel_halo_extrapolate interface @@ -8681,6 +9419,23 @@ function parallel_reduce_sum_integer(x) end function parallel_reduce_sum_integer + function parallel_reduce_sum_integer8(x) + + use mpi_mod + implicit none + integer(i8) :: x + + integer :: ierror + integer(i8) :: recvbuf,sendbuf, parallel_reduce_sum_integer8 + + ! begin + sendbuf = x + call mpi_allreduce(sendbuf,recvbuf,1,mpi_integer8,mpi_sum,comm,ierror) + parallel_reduce_sum_integer8 = recvbuf + + end function parallel_reduce_sum_integer8 + + function parallel_reduce_sum_real4(x) use mpi_mod @@ -8701,13 +9456,20 @@ end function parallel_reduce_sum_real4 function parallel_reduce_sum_real8(x) use mpi_mod + implicit none + real(dp) :: x integer :: ierror real(dp) :: recvbuf,sendbuf, parallel_reduce_sum_real8 + ! Input and output arguments for subroutine cism_reprosum_calc + real(dp), dimension(:,:), allocatable :: arr ! array to be summed over processors + real(dp), dimension(:), allocatable :: arr_gsum ! global sum of arr + ! 
begin + sendbuf = x call mpi_allreduce(sendbuf,recvbuf,1,mpi_real8,mpi_sum,comm,ierror) parallel_reduce_sum_real8 = recvbuf @@ -8742,14 +9504,128 @@ function parallel_reduce_sum_real8_nvar(x) integer :: ierror, nvar real(dp), dimension(size(x)) :: recvbuf,sendbuf, parallel_reduce_sum_real8_nvar + ! Input and output arguments for subroutine cism_reprosum_calc + real(dp), dimension(:,:), allocatable :: arr ! array to be summed over processors + real(dp), dimension(:), allocatable :: arr_gsum ! global sum of arr + ! begin nvar = size(x) + sendbuf = x call mpi_allreduce(sendbuf,recvbuf,nvar,mpi_real8,mpi_sum,comm,ierror) parallel_reduce_sum_real8_nvar = recvbuf end function parallel_reduce_sum_real8_nvar +!======================================================================= + + subroutine parallel_reduce_reprosum(arr, arr_gsum) + + ! Compute a reproducible global sum for a floating-point variable or array. + ! Can be called from parallel_global_sum, parallel_global_sum_patch, or + ! parallel_global_sum_stagger. + + implicit none + + real(dp), dimension(:,:), intent(in) :: arr + real(dp), dimension(:), intent(out) :: arr_gsum + + ! Notes on subroutine cism_reprosum_calc: + ! The first five arguments are required: arr(dsummands,nflds), arr_gsum(dsummands), + ! dsummands, dflds and nsummands. Typically, nsummands = dsummands = number of local values. + ! We use the default fixed-precision algorithm (instead of ddpdd, which apparently is less robust). + ! We do not allow Inf or NaN values in the input array. + ! Typically, the algorithm calls mpi_allreduce twice. By passing in both arr_gbl_max and arr_max_levels, + ! it may be possible to call mpi_allreduce just once, improving performance. + ! If we don't pass these arguments, then arr_gbl_max and arr_max_levels are computed internally.. + ! By passing arr_glb_max_out and arr_max_levels_out, we can see the calculated values. + ! By passing rel_diff, we can verify that the computed reproducible sum is close + ! 
to the (nonreproducible) floating-point value. + ! +!! ! commid ! MPI communicator + ! See comments in cism_reprosum_calc for more info + + ! Required arguments + + integer :: dsummands, nflds ! dimensions of arr + integer :: nsummands ! number of processors + + ! Optional arguments + + real(dp), dimension(:), allocatable :: & + arr_gbl_max, & ! upper bound on max(abs(arr)) + arr_gbl_max_out ! calculated upper bound on max(abs(arr)) + + real(dp), dimension(:,:), allocatable :: & + rel_diff ! relative and absolute differences between fixed and floating point sums + + integer, dimension(:), allocatable :: & + arr_max_levels, & ! maximum number of levels of integer expansion to use + arr_max_levels_out ! output of number of levels of integer expansion to use + + integer :: & + gbl_max_nsummands, & ! maximum of nsummand over all processes + gbl_max_nsummands_out ! calculated maximum nsummands over all processes + + integer, dimension(6) :: & + repro_sum_stats(6) ! increment running totals for + ! (1) one-reduction repro_sum + ! (2) two-reduction repro_sum + ! (3) both types in one call + ! (4) nonrepro_sum + ! (5) global max nsummands reduction + ! (6) global lor 3*nflds reduction + + logical :: & + ddpdd_sum, & ! use ddpdd algorithm instead of fixed-precision algorithm + repro_sum_validate ! flag enabling/disabling testing that gmax and max_levels + ! are accurate/sufficient. Default is enabled. + + ! Set parameters and allocate arrays + dsummands = size(arr,1) + nflds = size(arr,2) + nsummands = dsummands + + allocate (arr_gbl_max(nflds)) + allocate (arr_gbl_max_out(nflds)) + allocate (arr_max_levels(nflds)) + allocate (arr_max_levels_out(nflds)) + allocate (rel_diff(2,nflds)) + + ddpdd_sum = .false. + repro_sum_validate = .true. + + ! The following subroutine is adapted from shr_reprosum_calc in CESM shared code. + + call cism_reprosum_calc(& + arr, arr_gsum, & + nsummands, dsummands, nflds, & + ddpdd_sum = ddpdd_sum, & +! 
arr_gbl_max = arr_gbl_max, & + arr_gbl_max_out = arr_gbl_max_out, & +! arr_max_levels = arr_max_levels, & + arr_max_levels_out = arr_max_levels_out, & +! gbl_max_nsummands = gbl_max_nsummands, & + gbl_max_nsummands_out = gbl_max_nsummands_out, & + repro_sum_validate = repro_sum_validate, & + repro_sum_stats = repro_sum_stats, & + rel_diff = rel_diff) + + if (verbose_reprosum .and. main_task) then +! write(iulog,*) 'arr_gbl_max_out =', arr_gbl_max_out +! write(iulog,*) 'arr_max_levels_out =', arr_max_levels_out +! write(iulog,*) 'gbl_max_nsummands_out =', gbl_max_nsummands_out +! write(iulog,*) 'rel diff =', rel_diff(1,:) +! write(iulog,*) 'abs diff =', rel_diff(2,:) +! write(iulog,*) 'stats =', repro_sum_stats(:) + endif + + deallocate(arr_gbl_max, arr_gbl_max_out) + deallocate(arr_max_levels, arr_max_levels_out) + deallocate(rel_diff) + + end subroutine parallel_reduce_reprosum + !======================================================================= ! functions belonging to the parallel_reduce_max interface @@ -9122,7 +9998,7 @@ subroutine parallel_test_comm_row_col(parallel) enddo endif ! this_rank - call distributed_gather_var_row(test_array, global_test_array, parallel) + call gather_var_row(test_array, global_test_array, parallel) !! if (parallel%main_task_row) then if (parallel%main_task_row .and. this_rank == 0) then @@ -9136,7 +10012,7 @@ subroutine parallel_test_comm_row_col(parallel) write(iulog,*) ' ' endif - call distributed_scatter_var_row(test_array, global_test_array, parallel) + call scatter_var_row(test_array, global_test_array, parallel) if (this_rank == 0) then write(iulog,*) ' ' @@ -9176,7 +10052,7 @@ subroutine parallel_test_comm_row_col(parallel) enddo endif ! this_rank - call distributed_gather_var_col(test_array, global_test_array, parallel) + call gather_var_col(test_array, global_test_array, parallel) !! if (parallel%main_task_col) then if (parallel%main_task_col .and. 
this_rank == 0) then @@ -9190,7 +10066,7 @@ subroutine parallel_test_comm_row_col(parallel) write(iulog,*) ' ' endif - call distributed_scatter_var_col(test_array, global_test_array, parallel) + call scatter_var_col(test_array, global_test_array, parallel) if (this_rank == 0) then write(iulog,*) ' ' diff --git a/libglimmer/profile.F90 b/libglimmer/profile.F90 index f88ee51c..92840b14 100644 --- a/libglimmer/profile.F90 +++ b/libglimmer/profile.F90 @@ -35,8 +35,6 @@ module profile #if (defined CCSMCOUPLED || defined CESMTIMERS) use perf_mod - !TODO - Add an 'only' for 'use cism_parallel'? - use cism_parallel #endif use glimmer_global, only: dp @@ -89,16 +87,18 @@ end subroutine profile_init !> register a new series of meassurements function profile_register(prof,msg) - use glimmer_log + use glimmer_paramets, only: iulog + use mpi implicit none type(profile_type) :: prof !< structure storing profile definitions character(len=*), intent(in) :: msg !< the message to be associated integer profile_register + integer :: ierr prof%nump = prof%nump+1 if (prof%nump > max_prof) then - call write_log('Maximum number of profiles reached',type=GM_FATAL, & - file=__FILE__,line=__LINE__) + write(iulog,*) ('Maximum number of profiles reached') + call mpi_abort(MPI_COMM_WORLD, 1001, ierr) end if profile_register = prof%nump prof%pname(prof%nump) = trim(msg) diff --git a/libglint/glint_initialise.F90 b/libglint/glint_initialise.F90 index a94a105e..79cd9038 100644 --- a/libglint/glint_initialise.F90 +++ b/libglint/glint_initialise.F90 @@ -607,7 +607,7 @@ subroutine setup_lgrid_fulldomain(instance, grid, grid_orog) use glint_global_grid , only : global_grid use glimmer_coordinates, only : coordsystem_new use glide_types , only : get_dew, get_dns - use cism_parallel , only : parallel_type, distributed_gather_var + use cism_parallel , only : parallel_type, gather_var implicit none @@ -630,7 +630,7 @@ subroutine setup_lgrid_fulldomain(instance, grid, grid_orog) global_ewn = 
instance%model%parallel%global_ewn global_nsn = instance%model%parallel%global_nsn - call distributed_gather_var(instance%out_mask, out_mask_fulldomain, parallel) + call gather_var(instance%out_mask, out_mask_fulldomain, parallel) if (main_task) then diff --git a/libglint/glint_interp.F90 b/libglint/glint_interp.F90 index 83491b99..5ef84eb0 100644 --- a/libglint/glint_interp.F90 +++ b/libglint/glint_interp.F90 @@ -248,7 +248,7 @@ subroutine interp_to_local (lgrid_fulldomain, global, & use glimmer_utils use glimmer_coordinates use glimmer_log - use cism_parallel, only : main_task, parallel_type, distributed_scatter_var, parallel_halo + use cism_parallel, only : main_task, parallel_type, scatter_var, parallel_halo !TODO - Not sure we need localsp now that the code is fully double precision @@ -299,9 +299,9 @@ subroutine interp_to_local (lgrid_fulldomain, global, & ! Allocate variables to hold result of interpolation ! We allocate size 0 arrays on non-main task (rather than leaving variables - ! unallocated there), because distributed_scatter_var tries to do a deallocate on all tasks + ! unallocated there), because scatter_var tries to do a deallocate on all tasks ! Note that coordsystem_allocate can't be used here because it only works on pointer - ! variables, and the *_fulldomain variables are non-pointers (as is required for distributed_scatter_var) + ! variables, and the *_fulldomain variables are non-pointers (as is required for scatter_var) if (present(localsp)) then if (main_task) then @@ -450,25 +450,25 @@ subroutine interp_to_local (lgrid_fulldomain, global, & end if ! main_task ! Main task scatters interpolated data from the full domain to the task owning each point - ! Note that distributed_scatter_var doesn't set halo values, so we need to do a halo + ! Note that scatter_var doesn't set halo values, so we need to do a halo ! update if it's important to have correct values in the halo cells. ! 
Although it's not strictly necessary to have the halo values, we compute them just in ! case another part of the code (e.g., glissade_temp) assumes they are available. if (present(localsp)) then localsp(:,:) = 0.d0 - call distributed_scatter_var(localsp, localsp_fulldomain, parallel) + call scatter_var(localsp, localsp_fulldomain, parallel) call parallel_halo(localsp, parallel) endif if (present(localdp)) then localdp(:,:) = 0.d0 - call distributed_scatter_var(localdp, localdp_fulldomain, parallel) + call scatter_var(localdp, localdp_fulldomain, parallel) call parallel_halo(localdp, parallel) endif ! We do NOT deallocate the local*_fulldomain variables here, because the - ! distributed_scatter_var routines do this deallocation + ! scatter_var routines do this deallocation end subroutine interp_to_local @@ -489,7 +489,7 @@ subroutine copy_to_local (lgrid_fulldomain, global, & ! on the main task. use glimmer_coordinates - use cism_parallel, only : main_task, parallel_type, distributed_scatter_var, parallel_halo + use cism_parallel, only : main_task, parallel_type, scatter_var, parallel_halo ! Argument declarations @@ -524,16 +524,16 @@ subroutine copy_to_local (lgrid_fulldomain, global, & end if ! Main task scatters interpolated data from the full domain to the task owning each point - ! Note that distributed_scatter_var doesn't set halo values, so we need to do a halo + ! Note that scatter_var doesn't set halo values, so we need to do a halo ! update if it's important to have correct values in the halo cells. ! Although it's not strictly necessary to have the halo values, we compute them just in ! case another part of the code (e.g., glissade_temp) assumes they are available. local(:,:) = 0.d0 - call distributed_scatter_var(local, local_fulldomain, parallel) + call scatter_var(local, local_fulldomain, parallel) call parallel_halo(local, parallel) - ! We do NOT deallocate local_fulldomain here, because the distributed_scatter_var + ! 
We do NOT deallocate local_fulldomain here, because the scatter_var ! routine does this deallocation end subroutine copy_to_local @@ -695,7 +695,7 @@ subroutine local_to_global_avg(ups, parallel, local, global, mask) !> \texttt{interp\_to\_local} routine. !> \end{itemize} - use cism_parallel, only : main_task, parallel_type, distributed_gather_var + use cism_parallel, only : main_task, parallel_type, gather_var ! Arguments @@ -729,8 +729,8 @@ subroutine local_to_global_avg(ups, parallel, local, global, mask) ! Gather 'local' and 'tempmask' onto main task, which is the only one that does the regridding - call distributed_gather_var(local, local_fulldomain, parallel) - call distributed_gather_var(tempmask, tempmask_fulldomain, parallel) + call gather_var(local, local_fulldomain, parallel) + call gather_var(tempmask, tempmask_fulldomain, parallel) ! Main task does regridding @@ -785,7 +785,7 @@ subroutine local_to_global_sum(ups, parallel, local, global, mask) !> \item \texttt{gboxn} is the same size as \texttt{global} !> \end{itemize} - use cism_parallel, only : main_task, parallel_type, distributed_gather_var + use cism_parallel, only : main_task, parallel_type, gather_var ! Arguments @@ -816,8 +816,8 @@ subroutine local_to_global_sum(ups, parallel, local, global, mask) ! Gather 'local' and 'tempmask' onto main task, which is the only one that does the regridding - call distributed_gather_var(local, local_fulldomain, parallel) - call distributed_gather_var(tempmask, tempmask_fulldomain, parallel) + call gather_var(local, local_fulldomain, parallel) + call gather_var(tempmask, tempmask_fulldomain, parallel) ! Main task does regridding if (main_task) then @@ -854,7 +854,7 @@ subroutine local_to_global_min(ups, parallel, local, global, mask) !> \item \texttt{gboxn} is the same size as \texttt{global} !> \end{itemize} - use cism_parallel, only : main_task, parallel_type, distributed_gather_var + use cism_parallel, only : main_task, parallel_type, gather_var ! 
Arguments @@ -885,8 +885,8 @@ subroutine local_to_global_min(ups, parallel, local, global, mask) ! Gather 'local' and 'tempmask' onto main task, which is the only one that does the regridding - call distributed_gather_var(local, local_fulldomain, parallel) - call distributed_gather_var(tempmask, tempmask_fulldomain, parallel) + call gather_var(local, local_fulldomain, parallel) + call gather_var(tempmask, tempmask_fulldomain, parallel) ! Main task does regridding if (main_task) then diff --git a/libglissade/glissade.F90 b/libglissade/glissade.F90 index 83eaec79..2f3acb4f 100644 --- a/libglissade/glissade.F90 +++ b/libglissade/glissade.F90 @@ -85,20 +85,19 @@ module glissade !======================================================================= ! Note: There is no glissade_config subroutine; glide_config works for all dycores. - +! glide_config is called from cism_init_dycore before glissade_initialise. !======================================================================= subroutine glissade_initialise(model, evolve_ice) ! initialise Glissade model instance - use cism_parallel, only: parallel_type, distributed_gather_var, & - distributed_scatter_var, parallel_finalise, & + use cism_parallel, only: parallel_type, parallel_finalise, & distributed_grid, distributed_grid_active_blocks, parallel_global_edge_mask, & - parallel_halo, parallel_halo_extrapolate, parallel_reduce_max, & + parallel_halo, parallel_halo_extrapolate, & staggered_parallel_halo_extrapolate, staggered_no_penetration_mask, & parallel_create_comm_row, parallel_create_comm_col, & - parallel_is_zero, not_parallel + parallel_reduce_max, parallel_is_zero, not_parallel use glide_setup use glimmer_ncio, only: openall_in, openall_out, glimmer_nc_get_var, glimmer_nc_get_dimlength @@ -218,11 +217,12 @@ subroutine glissade_initialise(model, evolve_ice) ! The subroutine will report how many tasks are needed to compute on all active blocks, and then abort. ! 
The user can then resubmit (on an optimal number of processors) with model%options%compute_blocks = ACTIVE_BLOCKS. - call distributed_grid_active_blocks(model%general%ewn, model%general%nsn, & - model%general%nx_block, model%general%ny_block, & - model%general%ice_domain_mask, & - model%parallel, & - inquire_only = .true.) + call distributed_grid_active_blocks(& + model%general%ewn, model%general%nsn, & + model%general%nx_block, model%general%ny_block, & + model%general%ice_domain_mask, & + model%parallel, & + inquire_only = .true.) else ! compute_blocks = ACTIVE_BLOCKS_ONLY @@ -235,10 +235,12 @@ subroutine glissade_initialise(model, evolve_ice) model%general%global_bc = GLOBAL_BC_NO_ICE endif - call distributed_grid_active_blocks(model%general%ewn, model%general%nsn, & - model%general%nx_block, model%general%ny_block, & - model%general%ice_domain_mask, & - model%parallel) + call distributed_grid_active_blocks(& + model%general%ewn, model%general%nsn, & + model%general%nx_block, model%general%ny_block, & + model%general%ice_domain_mask, & + model%parallel, & + reprosum_in = model%options%reproducible_sums) endif ! 
compute_blocks @@ -247,13 +249,19 @@ subroutine glissade_initialise(model, evolve_ice) elseif (model%general%global_bc == GLOBAL_BC_OUTFLOW) then - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'outflow') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'outflow') elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'no_ice') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'no_ice') elseif (model%general%global_bc == GLOBAL_BC_NO_PENETRATION) then @@ -261,15 +269,19 @@ subroutine glissade_initialise(model, evolve_ice) ! The difference is that we also use no-penetration masks for (uvel,vvel) at the global boundary ! (computed by calling staggered_no_penetration_mask below). - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'no_penetration') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'no_penetration') else ! global_bc = GLOBAL_BC_PERIODIC -! call distributed_grid(model%general%ewn, model%general%nsn, global_bc_in = 'periodic') - - call distributed_grid(model%general%ewn, model%general%nsn, & - model%parallel, global_bc_in = 'periodic') + call distributed_grid(& + model%general%ewn, model%general%nsn, & + model%parallel, & + reprosum_in = model%options%reproducible_sums, & + global_bc_in = 'periodic') endif @@ -354,8 +366,47 @@ subroutine glissade_initialise(model, evolve_ice) itest = model%numerics%idiag_local jtest = model%numerics%jdiag_local - ! 
Check that lat and lon fields were read in, if desired - !TODO - Use the parallel_is_nonzero function instead, here and below + ! Make sure the grid coordinates (x1,y1) and (x0,y0) have been read in. + ! If (x1,y1) have not been read in, then abort the run. + ! If (x0,y0) have not been read in, then compute them from (x1,y1). + ! Extrapolate these coordinates to halo cells as needed. + ! Note: The extrapolation works only on a regular grid. + !TODO - Put the following code in a subroutine. + + if (parallel_is_zero(model%general%x1)) then + call write_log('model%general%x1 = 0.0 everywhere', GM_FATAL) + else ! extrapolate x1 to halo cells + call parallel_halo_extrapolate(model%general%x1, parallel, model%numerics%dew) + endif + + if (parallel_is_zero(model%general%y1)) then + call write_log('model%general%y1 = 0.0 everywhere', GM_FATAL) + else ! extrapolate y1 to halo cells + call parallel_halo_extrapolate(model%general%y1, parallel, model%numerics%dns) + endif + + ! Check whether x0 and y0 were read in. If not, then compute them from x1 and y1. + if (parallel_is_zero(model%general%x0)) then + if (main_task) write(iulog,*) 'x0 not read in; initialize from x1' + do i = 1, model%general%ewn-1 + model%general%x0(i) = 0.5d0 * (model%general%x1(i) + model%general%x1(i+1)) + enddo + else + ! extrapolate x0 to halo cells + call parallel_halo_extrapolate(model%general%x0, parallel, model%numerics%dew) + endif + + if (parallel_is_zero(model%general%y0)) then + if (main_task) write(iulog,*) 'y0 not read in; initialize from y1' + do j = 1, model%general%nsn-1 + model%general%y0(j) = 0.5d0 * (model%general%y1(j) + model%general%y1(j+1)) + enddo + else + ! extrapolate y0 to halo cells + call parallel_halo_extrapolate(model%general%y0, parallel, model%numerics%dns) + endif + + ! 
Check that lat and lon fields were read in if (model%options%read_lat_lon) then if (parallel_is_zero(model%general%lat)) then call write_log('Failed to read latitude (lat) field from input file', GM_FATAL) @@ -574,50 +625,9 @@ subroutine glissade_initialise(model, evolve_ice) endif ! geothermal heat flux - ! If running with glaciers, then process the input glacier data - ! On start-up, this subroutine counts the glaciers. It should be called before glide_io_createall, - ! which needs to know nglacier to set up glacier output files with the right dimensions. - ! On restart, most of the required glacier arrays are in the restart file, and this subroutine - ! computes a few remaining variable. - - if (model%options%enable_glaciers) then - - ! Glaciers are run with a no-ice BC to allow removal of inactive regions. - ! This can be problematic when running in a sub-region that has glaciers along the global boundary. - ! A halo update here for 'thck' will remove ice from cells along the global boundary. - ! It is best to do this before initializing glaciers, so that ice that initially exists - ! in these cells is removed before computing the area and thickness targets. - !TODO - These calls are repeated a few lines below. Try moving them up, before the call - ! to glissade_glacier_init. I don't think it's possible to move the glissade_glacier_init call - ! down, because we need to compute nglacier before setting up output files. - - call parallel_halo(model%geometry%thck, parallel) - ! calculate the lower and upper ice surface - call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus, model%geometry%lsrf) - model%geometry%usrf = max(0.d0, model%geometry%thck + model%geometry%lsrf) - - ! Initialize glaciers - ! Note: This subroutine can return modified values of model%numerics%dew, model%numerics%dns, - ! and model%geometry%cell_area. - ! This is a fix to deal with the fact that actual grid cell dimensions can be different - ! 
from the nominal dimensions on a projected grid. - ! See comments near the top of glissade_glacier_init. - - call glissade_glacier_init(model, model%glacier) - - endif - - ! open all output files - call openall_out(model) - - ! create glide I/O variables - call glide_io_createall(model, model) - - ! initialize glissade components - ! Set some variables in halo cells ! Note: We need thck and artm in halo cells so that temperature will be initialized correctly - ! (if not read from input file). + ! (if not read from the input file). ! We do an update here for temp in case temp is read from an input file. ! If temp is computed below in glissade_init_therm (based on the value of options%temp_init), ! then the halos will receive the correct values. @@ -626,37 +636,36 @@ subroutine glissade_initialise(model, evolve_ice) call parallel_halo(model%climate%artm, parallel) call parallel_halo(model%temper%temp, parallel) call parallel_halo(model%temper%tempunstag, parallel) - - ! calculate the lower and upper ice surface - call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus, model%geometry%lsrf) - model%geometry%usrf = max(0.d0, model%geometry%thck + model%geometry%lsrf) + if (model%options%whichtemp == TEMP_ENTHALPY) & + call parallel_halo(model%temper%waterfrac, parallel) ! Note: For outflow BCs, most fields (thck, usrf, temp, etc.) are set to zero in the global halo, ! to create ice-free conditions. However, we might not want to set topg = 0 in the global halo, ! because then the global halo will be interpreted as ice-free land, whereas we may prefer to ! treat it as ice-free ocean. For this reason, topg is extrapolated from adjacent cells. - ! Similarly, for no_ice BCs, we want to zero out ice state variables adjacent to the global boundary, + ! For no_ice BCs, we want to zero out ice state variables adjacent to the global boundary, ! but we do not want to zero out the topography. - ! 
Note: For periodic BCs, there is an optional argument periodic_offset_ew for topg. - ! This is for ismip-hom experiments. A positive EW offset means that + ! For periodic BCs, there are optional periodic_offset arguments for topg. + ! These are for ismip-hom experiments or similar geometries. A positive EW offset means that ! the topography in west halo cells will be raised, and the topography ! in east halo cells will be lowered. This ensures that the topography ! and upper surface elevation are continuous between halo cells ! and locally owned cells at the edge of the global domain. - ! In other cases (anything but ismip-hom), periodic_offset_ew = periodic_offset_ns = 0, - ! and this argument will have no effect. + ! After this call, topg does not need another halo update unless isostasy is active. - if (model%general%global_bc == GLOBAL_BC_OUTFLOW .or. & - model%general%global_bc == GLOBAL_BC_NO_ICE) then + if (model%general%global_bc == GLOBAL_BC_OUTFLOW) then call parallel_halo_extrapolate(model%geometry%topg, parallel) + elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then + call parallel_halo(model%geometry%topg, parallel, zero_global_boundary_no_ice_bc = .false.) else ! other global BCs, including periodic call parallel_halo(model%geometry%topg, parallel, & periodic_offset_ew = model%numerics%periodic_offset_ew, & periodic_offset_ns = model%numerics%periodic_offset_ns) endif - if (model%options%whichtemp == TEMP_ENTHALPY) & - call parallel_halo(model%temper%waterfrac, parallel) + ! calculate the lower and upper ice surface (will be correct in halos following the halo updates above) + call glide_calclsrf(model%geometry%thck, model%geometry%topg, model%climate%eus, model%geometry%lsrf) + model%geometry%usrf = max(0.d0, model%geometry%thck + model%geometry%lsrf) ! halo update for kinbcmask (= 1 where uvel and vvel are prescribed, elsewhere = 0) ! 
Note: Instead of assuming that kinbcmask is periodic, we extrapolate it into the global halo @@ -664,6 +673,34 @@ ! on the global staggered grid). call staggered_parallel_halo_extrapolate (model%velocity%kinbcmask, parallel) ! = 1 for Dirichlet BCs + if (model%options%enable_glaciers) then + + ! If running with glaciers, then process the input glacier data and initialize glacier arrays + + ! Note: On start-up, this subroutine counts the glaciers. It should be called before glide_io_createall, + ! which needs to know nglacier to set up glacier output files with the right dimensions. + ! On restart, most of the required glacier arrays are in the restart file, and this subroutine + ! computes a few remaining variables. + ! Note: Glaciers are usually run with a no-ice BC to allow removal of inactive regions. + ! This means that any grid cells adjacent to the global boundary are not handled correctly. + ! The preceding halo update for 'thck' removes ice from these grid cells. + ! Note: If glacier%length_scale_factor /= 1, this subroutine modifies the values of model%numerics%dew, + ! model%numerics%dns, model%geometry%cell_area, and the grid coordinates (x0,y0) and (x1,y1). + ! This is done if the true grid cell dimensions differ from the nominal dimensions on a projected grid. + ! See comments near the top of glissade_glacier_init. + + call glissade_glacier_init(model, model%glacier) + + endif + + ! open all output files + call openall_out(model) + + ! create glide I/O variables + call glide_io_createall(model, model) + + ! initialize glissade components + !TODO - Remove call to init_velo in glissade_initialise? !
Most of what's done in init_velo is needed for SIA only, but still need velowk for call to wvelintg call init_velo(model) @@ -815,8 +852,7 @@ subroutine glissade_initialise(model, evolve_ice) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) ! compute halo for relaxed topography ! Note: See comments above with regard to the halo update for topg. @@ -824,10 +860,11 @@ subroutine glissade_initialise(model, evolve_ice) ! adjacent to or beyond the global boundary. This is an appropriate treatment for ! ice state variables, but not for bed topography and related fields (like relx). !TODO - Is this halo update necessary? - if (model%general%global_bc == GLOBAL_BC_OUTFLOW .or. & - model%general%global_bc == GLOBAL_BC_NO_ICE) then + if (model%general%global_bc == GLOBAL_BC_OUTFLOW) then call parallel_halo_extrapolate(model%isostasy%relx, parallel) - else + elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then + call parallel_halo(model%isostasy%relx, parallel, zero_global_boundary_no_ice_bc = .false.) + else ! other global BCs, including periodic call parallel_halo(model%isostasy%relx, parallel) endif @@ -879,8 +916,7 @@ subroutine glissade_initialise(model, evolve_ice) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) + model%climate%eus, model%geometry%thkmask) endif ! initial calving @@ -907,7 +943,7 @@ subroutine glissade_initialise(model, evolve_ice) !TODO: Have a single option that is applied with or without glaciers enabled? if (model%options%which_ho_powerlaw_c == HO_POWERLAW_C_CONSTANT .or. 
& - model%options%is_restart == NO_RESTART) then + parallel_is_zero(model%basal_physics%powerlaw_c)) then if (model%options%enable_glaciers .and. & model%glacier%set_powerlaw_c /= GLACIER_POWERLAW_C_CONSTANT) then ! do nothing; see note above @@ -917,15 +953,29 @@ subroutine glissade_initialise(model, evolve_ice) endif ! Initialize coulomb_c - ! If inverting for coulomb_c, we read in the saved coulomb_c field on restart. - if (model%options%which_ho_coulomb_c == HO_COULOMB_C_CONSTANT .or. & - model%options%is_restart == NO_RESTART) then + ! Note: If inverting for coulomb_c, then coulomb_c is initialized here. + ! On restart, however, the saved coulomb_c (or alternatively, + ! coulomb_c_hi and coulomb_c_lo, for the elevation-based option) + ! should have been read from the restart file and is not reset here. + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_CONSTANT) then + + model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const + + else ! either inverting for coulomb_c or reading values from an input file + + if (model%options%elevation_based_coulomb_c) then ! need coulomb_c_hi and coulomb_c_lo - !TODO - Make sure the initialization is correct when reading from an external file in a restart. - if (model%options%elevation_based_coulomb_c) then + if (parallel_is_zero(model%basal_physics%coulomb_c_hi) .or. & + parallel_is_zero(model%basal_physics%coulomb_c_lo)) then - model%basal_physics%coulomb_c_hi = model%basal_physics%coulomb_c_const_hi - model%basal_physics%coulomb_c_lo = model%basal_physics%coulomb_c_const_lo + ! initialize to constants + model%basal_physics%coulomb_c_hi = model%basal_physics%coulomb_c_const_hi + model%basal_physics%coulomb_c_lo = model%basal_physics%coulomb_c_const_lo + + endif + + ! 
Given coulomb_c_hi and coulomb_c_lo, compute coulomb_c based on elevation call glissade_elevation_based_coulomb_c(& model%general%ewn, model%general%nsn, & @@ -937,17 +987,24 @@ subroutine glissade_initialise(model, evolve_ice) model%basal_physics%coulomb_c_bed_hi, & model%basal_physics%coulomb_c) - call parallel_halo(model%basal_physics%coulomb_c, parallel) + else ! coulomb_c not elevation-based + + if (parallel_is_zero(model%basal_physics%coulomb_c)) then + + ! initialize to constant + model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const - if (verbose_inversion) then - call point_diag(model%basal_physics%coulomb_c, 'Initial coulomb_c', itest, jtest, rtest, 7, 7) endif - else - model%basal_physics%coulomb_c = model%basal_physics%coulomb_c_const + endif ! elevation-based + + call parallel_halo(model%basal_physics%coulomb_c, parallel) + + if (verbose_inversion) then + call point_diag(model%basal_physics%coulomb_c, 'Initial coulomb_c', itest, jtest, rtest, 7, 7) endif - endif + endif ! coulomb_c options ! Optionally, do initial calculations for inversion ! At the start of the run (but not on restart), this might lead to further thickness adjustments, @@ -1104,6 +1161,7 @@ subroutine glissade_initialise(model, evolve_ice) call glissade_basin_average(& model%general%ewn, model%general%nsn, & + parallel, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & floating_mask * 1.0d0, & ! 
real mask @@ -1155,7 +1213,7 @@ subroutine glissade_tstep(model, time) use glimmer_paramets, only: eps11 use glimmer_physcon, only: scyr - use glide_mask, only: glide_set_mask, calc_iareaf_iareag + use glide_mask, only: glide_set_mask use glissade_mass_balance, only: glissade_prepare_climate_forcing implicit none @@ -1360,15 +1418,7 @@ subroutine glissade_tstep(model, time) call glide_set_mask(model%numerics, & model%geometry%thck, model%geometry%topg, & model%general%ewn, model%general%nsn, & - model%climate%eus, model%geometry%thkmask, & - model%geometry%iarea, model%geometry%ivol) - - ! --- Calculate global area of ice that is floating and grounded. - !TODO May want to calculate iareaf and iareag in glide_write_diag and remove those calculations here. - - call calc_iareaf_iareag(model%numerics%dew, model%numerics%dns, & - model%geometry%thkmask, & - model%geometry%iareaf, model%geometry%iareag) + model%climate%eus, model%geometry%thkmask) ! ------------------------------------------------------------------------ ! Do the vertical thermal solve if it is time to do so. @@ -1742,8 +1792,6 @@ subroutine glissade_thermal_solve(model, dt) use glissade_therm, only: glissade_therm_driver use glissade_basal_water, only: glissade_calcbwat, glissade_bwat_flux_routing use glissade_masks, only: glissade_get_masks - !WHL - debug - use cism_parallel, only: parallel_reduce_max implicit none @@ -1882,12 +1930,6 @@ subroutine glissade_thermal_solve(model, dt) endwhere endif - !WHL - debug - Set mask = 0 where thck = 0 for dome test - ! An alternative would be to identify cells that have a path through land to the domain edge - where (model%geometry%thck == 0) - bwat_mask = 0 - endwhere - call parallel_halo(bwat_mask, parallel) ! Set the meltwater source for the basal hydrology scheme. 
@@ -1902,15 +1944,6 @@ subroutine glissade_thermal_solve(model, dt) model%basal_hydro%bmlt_hydro = 0.0d0 endwhere - call glissade_calcbwat(& - model%options%which_ho_bwat, & - model%basal_hydro, & - dt, & ! s - model%geometry%thck, & ! m - model%numerics%thklim_temp, & ! m - model%basal_melt%bmlt_ground, & ! m/s - model%basal_hydro%bwat) ! m - ! Compute the steady-state basal water flux based on a flux-routing scheme call glissade_bwat_flux_routing(& @@ -1926,12 +1959,14 @@ subroutine glissade_thermal_solve(model, dt) floating_mask, & model%basal_hydro%bmlt_hydro, & ! m/s model%temper%bpmp - model%temper%btemp_ground, & ! degC - model%basal_hydro%btemp_scale, & ! degC + model%basal_hydro%btemp_flow_scale, & ! degC + model%basal_hydro%btemp_freeze_scale, & ! degC model%basal_hydro%bwatflx, & ! m/s model%basal_hydro%bwat_diag, & ! m model%temper%bhydroflx, & ! W/m2 model%basal_hydro%head, & ! m - model%basal_hydro%grad_head) ! m/m + model%basal_hydro%grad_head, & ! m/m + reprosum_in = model%options%reproducible_sums) ! halo updates (not sure if all are needed) call parallel_halo(model%basal_hydro%bwatflx, parallel) @@ -2203,7 +2238,6 @@ subroutine glissade_thickness_tracer_solve(model) ! pre-transport halo updates for thickness and tracers call parallel_halo(model%geometry%thck, parallel) - call parallel_halo(model%geometry%topg, parallel) call parallel_halo_tracers(model%geometry%tracers, parallel) call parallel_halo_tracers(model%geometry%tracers_usrf, parallel) call parallel_halo_tracers(model%geometry%tracers_lsrf, parallel) @@ -2430,6 +2464,7 @@ subroutine glissade_calving_solve(model, init_calving) use glissade_masks, only: glissade_get_masks, glissade_ocean_connection_mask, & glissade_calving_front_mask use glissade_grounding_line, only: glissade_grounded_fraction + implicit none type(glide_global_type), intent(inout) :: model ! model instance @@ -3041,12 +3076,14 @@ subroutine glissade_isostasy_solve(model) ! 
but the argument is included to be on the safe side. ! TODO: Do we need similar logic for halo updates of relx? - if (model%general%global_bc == GLOBAL_BC_OUTFLOW .or. & - model%general%global_bc == GLOBAL_BC_NO_ICE) then + if (model%general%global_bc == GLOBAL_BC_OUTFLOW) then call parallel_halo_extrapolate(model%geometry%topg, parallel) + elseif (model%general%global_bc == GLOBAL_BC_NO_ICE) then + call parallel_halo(model%geometry%topg, parallel, zero_global_boundary_no_ice_bc = .false.) else ! other global BCs, including periodic call parallel_halo(model%geometry%topg, parallel, & - periodic_offset_ew = model%numerics%periodic_offset_ew) + periodic_offset_ew = model%numerics%periodic_offset_ew, & + periodic_offset_ns = model%numerics%periodic_offset_ns) endif ! update the marine connection mask, which depends on topg diff --git a/libglissade/glissade_basal_traction.F90 b/libglissade/glissade_basal_traction.F90 index a5142dab..91ec33a1 100644 --- a/libglissade/glissade_basal_traction.F90 +++ b/libglissade/glissade_basal_traction.F90 @@ -52,7 +52,7 @@ module glissade_basal_traction use glimmer_utils, only: point_diag use glide_types use cism_parallel, only : this_rank, main_task, parallel_type, & - parallel_halo, staggered_parallel_halo, parallel_globalindex, distributed_scatter_var + parallel_halo, staggered_parallel_halo, parallel_globalindex, scatter_var implicit none @@ -157,9 +157,10 @@ subroutine glissade_calcbeta (& big_lambda, & ! bedrock characteristics flwa_basal_stag ! basal flwa interpolated to the staggered grid (Pa^{-n} yr^{-1}) - ! variables for Tsai et al. parameterization - real(dp) :: taub_powerlaw ! basal shear stress given by a power law as in Tsai et al. (2015) - real(dp) :: taub_coulomb ! basal shear stress given by Coulomb friction as in Tsai et al. (2015) + ! variables for mixed power/Coulomb laws + real(dp) :: taub ! basal shear stress + real(dp) :: taub_powerlaw ! basal shear stress given by a power law + real(dp) :: taub_coulomb ! 
basal shear stress given by Coulomb friction ! variables for pseudo-plastic law real(dp) :: q ! exponent for pseudo-plastic law (unitless) @@ -185,6 +186,7 @@ subroutine glissade_calcbeta (& ! Enforce a minimum speed to prevent beta from become very large when velocity is small. speed(:,:) = dsqrt(thisvel(:,:)**2 + othervel(:,:)**2 + smallnum**2) + !TODO - Should this be done only for powerlaw sliding, and not for ZI or PP? ! If beta_powerlaw_umax is set to a nonzero value, then limit the speed to this value. ! Note: The actual ice speed can be greater than umax. This is just a way of shutting off the feedback ! between beta and ice speed (beta down as speed up) when the ice speed is large. @@ -347,7 +349,7 @@ subroutine glissade_calcbeta (& ! The following code sets beta on the full grid as prescribed by Pattyn et al. (2008). ! Allocate a global array on the main task only. - ! On other tasks, allocate a size 0 array, since distributed_scatter_var wants to deallocate on all tasks. + ! On other tasks, allocate a size 0 array, since scatter_var wants to deallocate on all tasks. if (main_task) then allocate(beta_global(parallel%global_ewn, parallel%global_nsn)) else @@ -377,9 +379,9 @@ subroutine glissade_calcbeta (& ! Note: beta_extend has dimensions (ewn,nsn), so it can receive scattered data from beta_global. allocate(beta_extend(ewn, nsn)) beta_extend(:,:) = 0.d0 - call distributed_scatter_var(beta_extend, beta_global, parallel) + call scatter_var(beta_extend, beta_global, parallel) - ! distributed_scatter_var does not update the halo, so do an update here + ! scatter_var does not update the halo, so do an update here call parallel_halo(beta_extend, parallel) ! Copy beta_extend to beta on the local processor. @@ -391,7 +393,7 @@ subroutine glissade_calcbeta (& enddo enddo - ! beta_extend is no longer needed (beta_global is deallocated in distributed_scatter_var) + ! 
beta_extend is no longer needed (beta_global is deallocated in scatter_var) deallocate(beta_extend) case(HO_BABC_BETA_EXTERNAL) ! use beta value from external file @@ -504,9 +506,9 @@ subroutine glissade_calcbeta (& beta = 1.0d8 end where - case(HO_BABC_COULOMB_POWERLAW_SCHOOF) + case(HO_BABC_SCHOOF) - ! Use the basal friction formulation of Schoof (2005), modified following Asay-Davis et al. (2016). + ! Use the basal friction formulation of Schoof (2005), formulated following Asay-Davis et al. (2016). ! This formulation uses a constant value of basal flwa, which allows several Coulomb parameters ! (lambda_max, m_max and flwa_basal) to be combined into a single parameter powerlaw_c, ! as in the Tsai power law below. @@ -576,16 +578,48 @@ subroutine glissade_calcbeta (& ! write(iulog,*) ew, ns, speed(ew,ns), basal_physics%effecpress_stag(ew,ns), beta(ew,ns), beta(ew,ns)*speed(ew,ns) ! enddo - case(HO_BABC_COULOMB_POWERLAW_TSAI) + case(HO_BABC_MODIFIED_SCHOOF) - ! Basal stress representation based on Tsai et al. (2015) - ! The basal stress is the minimum of two values: - ! (1) power law: tau_b = powerlaw_c * |u_b|^(1/powerlaw_m) - ! (2) Coulomb friction: tau_b = coulomb_c * N - ! N = effective pressure = rhoi*g*(H - H_f) - ! H_f = flotation thickness = (rhow/rhoi)*(eus-topg) - ! This value of N is obtained by setting p_ocean_penetration = 1.0 in the config file. - ! The other parameters (powerlaw_c, powerlaw_m and coulomb_c) can also be set in the config file. + ! Modified version of the Schoof law, with a simpler algebraic form + ! The basal stress is given by + ! 1/tau_b = 1/tau_p + 1/tau_c + ! where tau_p = powerlaw_c * |u_b|^(1/powerlaw_m) + ! tau_c = coulomb_c * N + ! N = effective pressure + !
Note: taub = 1/2 of the harmonic mean of tau_p and tau_c + + do ns = 1, nsn-1 + do ew = 1, ewn-1 + + taub_powerlaw = basal_physics%powerlaw_c(ew,ns) * speed(ew,ns)**(1.d0/basal_physics%powerlaw_m) + taub_coulomb = basal_physics%coulomb_c(ew,ns) * basal_physics%effecpress_stag(ew,ns) + + if (taub_coulomb > 0.0d0 .and. taub_powerlaw > 0.0d0) then + taub = 1.0d0 / (1.0d0/taub_powerlaw + 1.0d0/taub_coulomb) + elseif (taub_powerlaw > 0.0d0) then + taub = taub_powerlaw + elseif (taub_coulomb > 0.0d0) then + taub = taub_coulomb + endif + beta(ew,ns) = taub / speed(ew,ns) + + if (verbose_beta .and. present(rtest) .and. present(itest) .and. present(jtest)) then + if (this_rank == rtest .and. ew == itest .and. ns == jtest) then + write(iulog,*) ' ' + write(iulog,'(a38,3i4,4f12.3)') 'rank, i, j, tau_p, tau_c, tau_b, beta:', & + this_rank, ew, ns, taub_powerlaw, taub_coulomb, taub, beta(ew,ns) + endif + endif + enddo ! ew + enddo ! ns + + case(HO_BABC_TSAI) + + ! Basal stress representation based on Tsai et al. (2015) + ! The basal stress is the minimum of two values: + ! (1) power law: tau_p = powerlaw_c * |u_b|^(1/powerlaw_m) + ! (2) Coulomb friction: tau_c = coulomb_c * N + ! N = effective pressure do ns = 1, nsn-1 do ew = 1, ewn-1 @@ -607,18 +641,6 @@ subroutine glissade_calcbeta (& beta(:,:) = beta(:,:) * basal_physics%c_space_factor_stag(:,:) endif - case(HO_BABC_SIMPLE) ! simple pattern; also useful for debugging and test cases - ! (here, a strip of weak bed surrounded by stronger bed to simulate an ice stream) - - beta(:,:) = 1.d4 ! Pa yr/m - - !TODO - Change this loop to work in parallel (set beta on the global grid and scatter to local) - do ns = 5, nsn-5 - do ew = 1, ewn-1 - beta(ew,ns) = 100.d0 ! Pa yr/m - end do - end do - case default ! do nothing @@ -686,11 +708,10 @@ subroutine glissade_calcbeta (& !TODO - Move this halo update to a higher level? call staggered_parallel_halo(beta, parallel) - !WHL - debug if (verbose_beta .and. present(rtest) .and. 
present(itest) .and. present(jtest)) then if (this_rank == rtest) then ew = itest; ns = jtest - write(iulog,*) 'End of calcbeta, r, i, j, speed, f_ground, beta:', & + write(iulog,'(a48,3i4,3f12.5)') 'End of calcbeta, r, i, j, speed, f_ground, beta:', & rtest, ew, ns, speed(ew,ns), f_ground(ew,ns), beta(ew,ns) endif endif diff --git a/libglissade/glissade_basal_water.F90 b/libglissade/glissade_basal_water.F90 index 3b642157..68ef2d3f 100644 --- a/libglissade/glissade_basal_water.F90 +++ b/libglissade/glissade_basal_water.F90 @@ -26,20 +26,33 @@ module glissade_basal_water - use glimmer_global, only: dp + use glimmer_global, only: dp, i8 use glimmer_paramets, only: iulog, eps11, eps08 use glimmer_physcon, only: rhoi, rhow, lhci, grav, scyr use glimmer_log use glimmer_utils, only: point_diag use glide_types - use cism_parallel, only: main_task, this_rank, nhalo, parallel_type, parallel_halo + use cism_parallel, only: main_task, this_rank, nhalo, parallel_type, & + parallel_halo, parallel_global_sum + + !WHL - debug + use glimmer_utils, only: double_to_binary implicit none private public :: glissade_basal_water_init, glissade_calcbwat, glissade_bwat_flux_routing - logical, parameter :: verbose_bwat = .false. +!! logical, parameter :: verbose_bwat = .false. + logical, parameter :: verbose_bwat = .true. + + character(len=64) :: binary_str + + ! two versions of this subroutine; the second supports reproducible sums + interface route_flux_to_margin_or_halo + module procedure route_flux_to_margin_or_halo_real8 + module procedure route_flux_to_margin_or_halo_integer8 + end interface contains @@ -172,10 +185,13 @@ subroutine glissade_bwat_flux_routing(& thklim, & bwat_mask, floating_mask, & bmlt_hydro, & - delta_Tb, btemp_scale, & + delta_Tb, & + btemp_flow_scale, & + btemp_freeze_scale, & bwatflx, bwat_diag, & bhydroflx, & - head, grad_head) + head, grad_head, & + reprosum_in) ! Compute the subglacial water flux and water depth using a steady-state flux routing scheme. ! 
Water is routed down the hydropotential. For routing purposes, assume p_w = p_i (i.e., N = 0). @@ -208,13 +224,20 @@ subroutine glissade_bwat_flux_routing(& delta_Tb ! difference T_pmp - T_bed (degC) real(dp), intent(in) :: & - thklim, & ! minimum ice thickness for basal melt and hydropotential calculations (m) + thklim ! minimum ice thickness for basal melt and hydropotential calculations (m) ! Note: This is typically model%geometry%thklim_temp - btemp_scale ! temperature scale for transition from frozen to thawed bed (degC) + + ! Note: These scales ensure a smooth transition in behavior between frozen and thawed beds. + ! Both scales are computed in a similar way, but they apply to different parts of the algorithm. + ! TODO: Decide whether to keep both scales. Only the flow scale works for reprosums, so we might want + ! to remove the freeze scale. + real(dp), intent(in) :: & + btemp_flow_scale, & ! temperature scale for routing water flow around cells with a frozen bed (deg C) + btemp_freeze_scale ! temperature scale for refreezing water beneath cells with a frozen bed (degC) integer, dimension(nx,ny), intent(in) :: & bwat_mask, & ! mask to identify cells through which basal water is routed; - ! = 0 for floating and ocean cells; cells at global domain edge; + ! = 0 for floating and ocean cells, cells at global domain edge, ! and cells with thck = 0 and forced negative SMB floating_mask ! = 1 if ice is present (thck > thklim) and floating, else = 0 @@ -233,6 +256,11 @@ subroutine glissade_bwat_flux_routing(& head, & ! hydraulic head (m) grad_head ! gradient of hydraulic head (m/m), averaged to cell centers + ! Note: The reprosum option requires (1) D8 routing (each cell routes its flux to one downstream neighbor only) + ! and (2) no temperature-weighted refreezing. + logical, intent(in), optional :: & + reprosum_in ! if true, then do a computation independent of the number of tasks + ! 
Local variables integer :: i, j, p @@ -273,6 +301,8 @@ subroutine glissade_bwat_flux_routing(& real(dp) :: c_flux_to_depth ! proportionality coefficient in Sommers et al., Eq. 6 real(dp) :: Reynolds ! Reynolds number (unitless), = 0 for pure laminar flow + logical :: reprosum ! local version of reprosum_in + integer :: nx_test, ny_test real(dp), dimension(:,:), allocatable :: phi_test integer, dimension(:,:), allocatable :: mask_test @@ -281,12 +311,16 @@ subroutine glissade_bwat_flux_routing(& write(iulog,*) 'In glissade_bwat_flux_routing: rtest, itest, jtest =', rtest, itest, jtest endif + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + ! Uncomment if the following fields are not already up to date in halo cells ! call parallel_halo(thk, parallel) ! call parallel_halo(topg, parallel) call parallel_halo(bmlt_hydro, parallel) - !TODO - Add bfrz? - ! Compute the hydraulic head ! For purposes of flux routing, assume N = 0. @@ -309,7 +343,6 @@ subroutine glissade_bwat_flux_routing(& endif ! Route basal water down the gradient of hydraulic head, giving a water flux - ! TODO - Pass in bfrz_pot, return bfrz? call route_basal_water(& nx, ny, & @@ -320,11 +353,13 @@ subroutine glissade_bwat_flux_routing(& head, & bmlt_hydro, & delta_Tb, & - btemp_scale, & + btemp_flow_scale, & + btemp_freeze_scale, & bwat_mask, & bwatflx, & bwatflx_refreeze, & - lakes) + lakes, & + reprosum) call parallel_halo(bwatflx, parallel) @@ -362,7 +397,11 @@ subroutine glissade_bwat_flux_routing(& bwatflx(:,:) = bwatflx(:,:) / (dx*dy) ! Given bwatflx_refreeze in m^3/s, compute bhydroflx in W/m2. - ! This is the heat flux needed to refreeze the meltwater held in each cell + ! This is the heat flux needed to refreeze the meltwater held in each cell. + ! This heat flux is supplied at the bed during the next thermal solve. + ! If there is more than enough heat to thaw the bed, some meltwater will be returned later + ! instead of refrozen. 
+ bhydroflx(:,:) = bwatflx_refreeze(:,:) * rhoi * lhci / (dx*dy) if (verbose_bwat) then @@ -439,11 +478,13 @@ subroutine route_basal_water(& head, & bmlt_hydro, & delta_Tb, & - btemp_scale, & + btemp_flow_scale, & + btemp_freeze_scale, & bwat_mask, & bwatflx, & bwatflx_refreeze, & - lakes) + lakes, & + reprosum_in) ! Route water from the basal melt field to its destination, recording the water flux along the way. ! Water flow direction is determined according to the gradient of the hydraulic head. @@ -452,11 +493,7 @@ subroutine route_basal_water(& ! This results in the lakes field, which is the difference between the filled head and the original head. ! The method used is by Quinn et. al. (1991). ! - ! Based on code by Jesse Johnson (2005), adapted from the glimmer_routing file by Ian Rutt. - - ! TODO - Pass in bfrz_pot, return bfrz. - - use cism_parallel, only: parallel_global_sum + ! Originally based on code by Jesse Johnson and Ian Rutt in the Glimmer model !WHL - debug use cism_parallel, only: parallel_globalindex, parallel_reduce_max @@ -481,8 +518,9 @@ subroutine route_basal_water(& delta_Tb ! difference T_pmp - T_bed (degC) real(dp), intent(in) :: & - btemp_scale ! temperature scale for transition from frozen to thawed bed (degC) - ! If btemp_scale = 0, assume no temperature dependence + btemp_flow_scale, & ! temperature scale for routing water flow around cells with a frozen bed (deg C) + btemp_freeze_scale ! temperature scale for refreezing water beneath cells with a frozen bed (degC) + ! If scale = 0, assume no temperature dependence real(dp), dimension(nx,ny), intent(inout) :: & head ! hydraulic head (m) @@ -490,19 +528,22 @@ subroutine route_basal_water(& integer, dimension(nx,ny), intent(in) :: & bwat_mask ! mask to identify cells through which basal water is routed; - ! = 1 where ice is present and not floating + ! excludes floating and ocean cells real(dp), dimension(nx,ny), intent(out) :: & bwatflx, & ! 
water flux through a grid cell (m^3/s) bwatflx_refreeze, & ! water flux held for refreezing (m^3/s) lakes ! lakes field, difference between filled and original head + logical, intent(in), optional :: & + reprosum_in ! if true, then do a computation independent of the number of tasks + ! Local variables integer :: nlocal ! number of locally owned cells integer :: count, count_max ! iteration counters - integer :: i, j, k, ii, jj, ip, jp, p - integer :: i1, j1, i2, j2, itmp, jtmp, iglobal, jglobal + integer :: i, j, k, iglobal, jglobal + integer :: ii, jj, imax, jmax logical :: finished ! true when an iterative loop has finished @@ -510,15 +551,14 @@ subroutine route_basal_water(& sorted_ij ! i and j indices of all cells, sorted from low to high values of head real(dp), dimension(-1:1,-1:1,nx,ny) :: & - flux_fraction, & ! fraction of flux from each cell that flows downhill to each of 8 neighbors - bwatflx_halo ! water flux (m^3/s) routed to a neighboring halo cell; routed further in next iteration + flux_fraction ! fraction of flux from each cell that flows downhill to each of 8 neighbors real(dp), dimension(nx,ny) :: & head_filled, & ! head after depressions are filled (m) + btemp_weight_flow, & ! temp-dependent weighting factor, forcing flow around cells with frozen beds + btemp_weight_freeze, & ! temp-dependent weighting factor, favoring refreezing in cells with frozen beds bwatflx_accum, & ! water flux through the cell (m^3/s) accumulated over multiple iterations - bwatflx_refreeze_accum,& ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations - sum_bwatflx_halo, & ! bwatflx summed over the first 2 dimensions in each grid cell - btemp_weight ! temperature-dependent weighting factor, favoring flow where the bed is thawed + bwatflx_refreeze_accum ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations integer, dimension(nx,ny) :: & local_mask, & ! 
= 1 for cells owned by the local processor, else = 0 @@ -533,8 +573,33 @@ subroutine route_basal_water(& err, & ! water conservation error global_flux_sum ! flux sum over all cells in global domain + ! The following i8 variables are for computing reproducible sums + integer(i8), dimension(nx,ny) :: & + bwatflx_int, & ! water flux through a grid cell (m^3/s) + btemp_weight_freeze_int, & ! temp-dependent weighting factor, favoring refreezing in cells with frozen beds + bwatflx_accum_int, & ! water flux through the cell (m^3/s) accumulated over multiple iterations + bwatflx_refreeze_accum_int ! water flux (m^3/s) refreezing in place, accumulated over multiple iterations + + integer(i8), dimension(-1:1,-1:1,nx,ny) :: & + flux_fraction_int ! fraction of flux from each cell that flows downhill to each of 8 neighbors + + real(dp), parameter :: & + factor_bwatflx = 1.d16 ! factor for converting between bwatflx and bwatflx_int; + ! large value desired for water mass conservation + + logical :: reprosum ! local version of reprosum_in + character(len=100) :: message + !WHL - debug + character(len=64) :: binary_str + + if (present(reprosum_in)) then + reprosum = reprosum_in + else + reprosum = .false. + endif + ! Allocate the sorted_ij array nlocal = parallel%own_ewn * parallel%own_nsn @@ -608,28 +673,44 @@ subroutine route_basal_water(& enddo enddo - ! Compute a temperature-dependent weighting factor for flux routing. - ! This is used in two parts of the code: - ! (1) In subroutine get_flux_fraction, btemp_weight is used to weight potential downstream paths. - ! A small value of btemp_weight means that a cell is less likely to receive water from upstream. - ! (2) When water enters a frozen cell (delta_Tb > 0), btemp_weight is used to determine - ! how much of the flux is refrozen in place rather than passing through. - ! A low value of btemp_weight means that less water passes through. - - btemp_weight = 1.0d0 + ! 
Compute temperature-dependent weighting factors for flux routing. + ! There are two scales with related but distinct functions: + ! (1) In subroutine get_flux_fraction, btemp_flow_scale is used to weigh potential downstream paths. + ! A low value of btemp_weight_flow means that water is less likely to pass through. + ! (2) When water enters a frozen cell (delta_Tb > 0), btemp_freeze_scale determines + ! how much of the flux refreezes in place rather than passing through. + ! A small value of btemp_weight_freeze means that more water refreezes, and less passes through. + ! Note: For reproducible sums, refreezing is not supported; must have btemp_weight_freeze = 1 everywhere. + ! TODO: Possibly remove btemp_freeze_scale and just keep btemp_flow_scale. + + btemp_weight_flow = 1.0d0 + if (btemp_flow_scale > 0.0d0) then + if (.not. reprosum) then + where (bwat_mask == 1) + where (delta_Tb > 0.0d0) + btemp_weight_flow = exp(-delta_Tb/btemp_flow_scale) + endwhere + endwhere + endif + endif - if (btemp_scale > 0.0d0) then - where (bwat_mask == 1) - where (delta_Tb > 0.0d0) - btemp_weight = exp(-delta_Tb/btemp_scale) + btemp_weight_freeze = 1.0d0 + if (btemp_freeze_scale > 0.0d0) then + if (.not. reprosum) then + where (bwat_mask == 1) + where (delta_Tb > 0.0d0) + btemp_weight_freeze = exp(-delta_Tb/btemp_freeze_scale) + endwhere endwhere - endwhere - if (verbose_bwat) then - call point_diag(delta_Tb, 'Tpmp - Tb', itest, jtest, rtest, 7, 7) - call point_diag(btemp_weight, 'btemp_weight', itest, jtest, rtest, 7, 7) endif endif + if (verbose_bwat) then + call point_diag(delta_Tb, 'Tpmp - Tb', itest, jtest, rtest, 7, 7) + call point_diag(btemp_weight_flow, 'btemp_weight_flow', itest, jtest, rtest, 7, 7) + call point_diag(btemp_weight_freeze, 'btemp_weight_freeze', itest, jtest, rtest, 7, 7) + endif + ! Compute the fraction of the incoming flux sent to each downstream neighbor. 
call get_flux_fraction(& @@ -639,18 +720,12 @@ subroutine route_basal_water(& flux_routing_scheme, & sorted_ij, & head, & - btemp_weight, & + btemp_weight_flow, & bwat_mask, & flux_fraction) - ! Initialize bwatflx in locally owned cells with the basal melt, which will be routed downslope. - ! Multiply by area, so units are m^3/s. - ! The halo water flux, bwatflx_halo, holds water routed to halo cells; - ! it will be routed downhill during the next iteration. - ! The accumulated flux, bwatflx_accum, holds the total flux over multiple iterations. - ! Some or all of the water entering a frozen cell can be refrozen in place. - ! The heat flux associated with refreezing is passed to the next thermal solve. - ! If this heat flux is enough to thaw the cell, some of the meltwater is returned later. + ! Initialize bwatflx in locally owned cells. + ! Set to the local melt rate, multiplied by area (so the units are m^3/s). bwatflx = 0.0d0 do j = nhalo+1, ny-nhalo @@ -659,174 +734,132 @@ subroutine route_basal_water(& enddo enddo - ! Initialize other fluxes - bwatflx_halo = 0.0d0 - bwatflx_refreeze = 0.0d0 - bwatflx_accum = 0.0d0 - bwatflx_refreeze_accum = 0.0d0 - ! Compute total input of meltwater (m^3/s) total_flux_in = parallel_global_sum(bwatflx, parallel) - if (verbose_bwat .and. this_rank == rtest) then - write(iulog,*) ' ' write(iulog,*) 'Total input basal melt flux (m^3/s):', total_flux_in +!! call double_to_binary(total_flux_in, binary_str) +!! write(iulog,*) ' Binary string', binary_str endif - ! Loop over locally owned cells, from highest to lowest. - ! During each iteration, there are two possible outcomes for routing: - ! (1) Routed to the ice sheet margin, to a cell with bwat_mask = 0. - ! In this case, the routing of that flux is done. - ! (2) Routed to a halo cell, i.e. a downslope cell on a neighboring processor. - ! In this case, the flux will be routed further downhill on the next iteration. - ! When all the water has been routed to the margin, we are done. 
+ ! Route the water downstream, keeping track of the steady-state flux through each cell. + ! The loop goes from highest to lowest values of head on the local processor. + ! At the end of the loop, all the incoming flux has either been + ! (1) routed to the ice sheet margin, + ! (2) set aside for later refreezing, or + ! (3) routed to a halo cell, from which it will continue downstream on the next iteration. + ! When all the water has been routed to the margin or set aside for refreezing, we are done. - count = 0 ! Note: It is hard to predict how many iterations will be sufficient. - ! With Dinf or FD8, we can have flow back and forth across processor boundaries, + ! With Dinf or FD8 we can have flow back and forth across processor boundaries, ! requiring many iterations to reach the margin. ! For Greenland 4 km, Dinf requires ~20 iterations on 4 cores, and FD8 can require > 40. ! For Antarctica 8 km, FD8 can require > 50. - count_max = 100 - finished = .false. - do while (.not.finished) - - count = count + 1 + ! Initialize the cumulative fluxes + bwatflx_accum = 0.0d0 + bwatflx_refreeze_accum = 0.0d0 - if (verbose_bwat .and. this_rank == rtest) then - write(iulog,*) 'flux routing, count =', count - endif + if (reprosum) then - do k = nlocal, 1, -1 + ! Convert bwatflx to a scaled i8 array + bwatflx_int(:,:) = nint(bwatflx(:,:)*factor_bwatflx, i8) - ! Get i and j indices of current cell - i = sorted_ij(k,1) - j = sorted_ij(k,2) + ! Convert flux_fraction to i8 + ! Note: This will work only for the D8 scheme, where all the flux goes downstream + ! to a single cell. + flux_fraction_int(:,:,:,:) = nint(flux_fraction(:,:,:,:), i8) - ! Route the flux - if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then + ! Convert btemp_weight_freeze to i8 + btemp_weight_freeze_int(:,:) = 1 + ! Note: Can round up to 1 and down to 0 by uncommenting the following line. + ! However, a mix of 1's and 0's leads to oscillations in basal temperature, + ! 
so it is safer to turn off refreezing by setting btemp_weight_freeze = 1 everywhere. +! btemp_weight_freeze_int(:,:) = nint(btemp_weight_freeze(:,:), i8) - ! Distribute the flux to downslope neighbors. - ! Where the bed is frozen, all or part of the flux is refrozen in place instead of being routed downstream. - do jj = -1,1 - do ii = -1,1 - ip = i + ii - jp = j + jj - if (flux_fraction(ii,jj,i,j) > 0.0d0) then - if (halo_mask(ip,jp) == 1) then - bwatflx_halo(ii,jj,i,j) = bwatflx(i,j)*flux_fraction(ii,jj,i,j)*btemp_weight(i,j) - bwatflx_refreeze(i,j) = bwatflx_refreeze(i,j) & - + bwatflx(i,j)*flux_fraction(ii,jj,i,j)*(1.0d0 - btemp_weight(i,j)) - if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then - write(iulog,*) 'Flux to halo, i, j, ii, jj, flux:', & - i, j, ii, jj, bwatflx(i,j)*flux_fraction(ii,jj,i,j) - endif - elseif (local_mask(ip,jp) == 1) then - bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx(i,j)*flux_fraction(ii,jj,i,j)*btemp_weight(i,j) - bwatflx_refreeze(i,j) = bwatflx_refreeze(i,j) & - + bwatflx(i,j)*flux_fraction(ii,jj,i,j)*(1.0d0 - btemp_weight(i,j)) - endif - endif ! flux_fraction > 0 - enddo - enddo - endif ! bwat_mask = 1, bwatflx > 0 - enddo ! loop from high to low + ! Initialize other arrays + bwatflx_accum_int = 0 + bwatflx_refreeze_accum_int = 0 - ! Accumulate bwatflx from the latest iteration, then reset to zero for the next iteration. - bwatflx_accum = bwatflx_accum + bwatflx - bwatflx_refreeze_accum = bwatflx_refreeze_accum + bwatflx_refreeze - bwatflx = 0.0d0 - bwatflx_refreeze = 0.0d0 + endif ! reprosum - if (verbose_bwat .and. this_rank == rtest .and. count <= 2) then - i = itest - j = jtest - write(iulog,*) 'i, j, bwatflx_accum:', i, j, bwatflx_accum(i,j) - endif + count = 0 + count_max = 100 + finished = .false. - ! If bwatflx_halo = 0 everywhere, then we are done. - ! (If the remaining flux is very small (< eps11), discard it to avoid - ! unnecessary extra iterations.) - ! 
If bwatflx_halo remains, then communicate it to neighboring tasks and - ! continue routing on the next iteration. - - do j = 1, ny - do i = 1, nx - sum_bwatflx_halo(i,j) = sum(bwatflx_halo(:,:,i,j)) -! if (verbose_bwat .and. sum_bwatflx_halo(i,j) > eps11 .and. count > 50) then -! write(iulog,*) 'Nonzero bwatflx_halo, count, rank, i, j, sum_bwatflx_halo:', & -! count, this_rank, i, j, sum_bwatflx_halo(i,j) -! call parallel_globalindex(i, j, iglobal, jglobal, parallel) -! write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -! endif - enddo - enddo - global_flux_sum = parallel_global_sum(sum_bwatflx_halo, parallel) + do while (.not.finished) - if (verbose_bwat .and. count <= 2) then - if (this_rank == rtest) then - write(iulog,*) 'Before halo update, sum of bwatflx_halo:', global_flux_sum - endif - call point_diag(sum_bwatflx_halo, 'sum_bwatflx_halo', itest, jtest, rtest, 7, 7) + count = count + 1 + if (verbose_bwat .and. this_rank == rtest) write(iulog,*) 'flux routing, count =', count + if (count > count_max) then + call write_log('Hydrology error: too many iterations in route_basal_water', GM_FATAL) endif - if (global_flux_sum > eps11) then - - finished = .false. + if (reprosum) then + + ! route downstream + ! Note: The fluxes are scaled by factor_bwatflx + + call route_flux_to_margin_or_halo(& + nx, ny, nlocal, & + itest, jtest, rtest, count, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction_int, & + btemp_weight_freeze_int, & + bwatflx_int, & + bwatflx_accum_int, & + bwatflx_refreeze_accum_int, & + finished) - ! Communicate bmltflx_halo to the halo cells of neighboring processors - call parallel_halo(bwatflx_halo(:,:,:,:), parallel) - - ! bmltflx_halo is now available in the halo cells of the local processor. - ! Route downslope to the adjacent locally owned cells. - ! These fluxes will be routed further downslope during the next iteration. - - do j = 2, ny-1 - do i = 2, nx-1 - if (halo_mask(i,j) == 1 .and. 
sum(bwatflx_halo(:,:,i,j)) > 0.0d0) then - do jj = -1,1 - do ii = -1,1 - if (bwatflx_halo(ii,jj,i,j) > 0.0d0) then - ip = i + ii - jp = j + jj - if (local_mask(ip,jp) == 1) then - bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx_halo(ii,jj,i,j) - if (verbose_bwat .and. ip==itest .and. jp==jtest .and. this_rank==rtest & - .and. count <= 2) then - write(iulog,*) 'Nonzero bwatflx from halo, rank, i, j:', & - this_rank, ip, jp, bwatflx_halo(ii,jj,i,j) - endif - endif - endif ! bwatflx_halo > 0 to a local cell - enddo ! ii - enddo ! jj - endif ! bwatflx_halo > 0 from this halo cell - enddo ! i - enddo ! j + if (verbose_bwat .and. this_rank == rtest .and. count <= 2) then + i = itest; j = jtest + write(iulog,*) 'count, rank i, j, bwatflx_accum (m/yr), bwatflx_refreeze_accum:', & + count, rtest, i, j, real(bwatflx_accum_int(i,j),dp)/factor_bwatflx, & + real(bwatflx_refreeze_accum_int(i,j),dp)/factor_bwatflx + endif - ! Reset bwatflx_halo for the next iteration - bwatflx_halo = 0.0d0 + else ! non-reproducible sums + + call route_flux_to_margin_or_halo(& + nx, ny, nlocal, & + itest, jtest, rtest, count, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction, & + btemp_weight_freeze, & + bwatflx, & + bwatflx_accum, & + bwatflx_refreeze_accum, & + finished) - global_flux_sum = parallel_global_sum(bwatflx, parallel) if (verbose_bwat .and. this_rank == rtest .and. count <= 2) then - ! Should be equal to the global sum of bwatflx_halo computed above - write(iulog,*) 'After halo update, sum(bwatflx from halo) =', global_flux_sum - write(iulog,*) ' ' + i = itest; j = jtest + write(iulog,*) 'count, rank i, j, bwatflx_accum(m/yr), bwatflx_refreeze_accum:', & + count, rtest, i, j, bwatflx_accum(i,j) * scyr/(dx*dy), & + bwatflx_refreeze_accum(i,j) * scyr/(dx*dy) endif - else ! bwatflx_halo = 0 everywhere; no fluxes to route to adjacent processors - if (verbose_bwat .and. this_rank == rtest) write(iulog,*) 'Done routing fluxes' - finished = .true. 
- bwatflx = bwatflx_accum - bwatflx_refreeze = bwatflx_refreeze_accum - endif + endif ! reprosum - if (count > count_max) then - call write_log('Hydrology error: too many iterations in route_basal_water', GM_FATAL) - endif + enddo ! finished + + if (reprosum) then + ! Convert fluxes back to real(dp) + bwatflx_accum = real(bwatflx_accum_int, dp) / factor_bwatflx + bwatflx_refreeze_accum = real(bwatflx_refreeze_accum_int, dp) / factor_bwatflx + endif - enddo ! finished routing + ! Copy the accumulated values to the output arrays bwatflx and bwatflx_refreeze + bwatflx = bwatflx_accum + bwatflx_refreeze = bwatflx_refreeze_accum + if (verbose_bwat .and. this_rank == rtest) write(iulog,*) 'Done routing fluxes' ! Identify cells just beyond the ice sheet margin, which can receive from upstream but not send downstream where (bwat_mask == 0 .and. bwatflx > 0.0d0) @@ -847,7 +880,6 @@ subroutine route_basal_water(& write(iulog,*) 'Difference between output and input =', total_flux_out - total_flux_in endif - ! Not sure if a threshold of eps11 is large enough. Increase if needed. if (total_flux_in > 0.0d0) then err = abs(total_flux_in - total_flux_out) @@ -1031,8 +1063,7 @@ subroutine fill_depressions(& ! Continue until no further lowering of phi is possible. At that point, phi = phi_out. ! Note: Setting eps = 0 would result in flat surfaces that would need to be fixed later. - use cism_parallel, only: parallel_reduce_sum - use cism_parallel, only: parallel_globalindex + use cism_parallel, only: parallel_reduce_sum, parallel_globalindex implicit none @@ -1054,7 +1085,7 @@ subroutine fill_depressions(& real(dp), dimension(nx,ny), intent(out) :: & phi ! output field with depressions filled - ! Local variables -------------------------------------- + ! Local variables logical, dimension(nx,ny) :: & known ! = true for cells where the final phi(i,j) is known @@ -1264,7 +1295,7 @@ subroutine sort_heights(& integer, intent(in) :: & nx, ny, & ! 
number of grid cells in each direction nlocal, & ! number of locally owned cells - itest, jtest, rtest ! coordinates of diagnostic point + itest, jtest, rtest ! coordinates of diagnostic point !! not currently used real(dp), dimension(nx,ny), intent(in) :: & phi ! input field, to be sorted from low to high @@ -1307,17 +1338,6 @@ subroutine sort_heights(& call indexx(vect, ind) - if (verbose_bwat .and. this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'Sort from low to high, nlocal =', nlocal - write(iulog,*) 'k, local i and j, ind(k), phi:' - do k = nlocal, nlocal-10, -1 - i = floor(real(ind(k)-1)/real(ny_local)) + 1 + nhalo - j = mod(ind(k)-1,ny_local) + 1 + nhalo - write(iulog,*) k, i, j, ind(k), phi(i,j) - enddo - endif - ! Fill the sorted_ij array with the i and j values of each cell. ! Note: These are the i and j values we would have if there were no halo cells. do k = 1, nlocal @@ -1339,7 +1359,7 @@ subroutine get_flux_fraction(& flux_routing_scheme, & sorted_ij, & head, & - btemp_weight, & + btemp_weight_flow, & bwat_mask, & flux_fraction) @@ -1369,7 +1389,7 @@ subroutine get_flux_fraction(& real(dp), dimension(nx,ny), intent(in) :: & head, & ! hydraulic head (m) - btemp_weight ! temperature-dependent weighting factor, favoring flow where the bed is thawed + btemp_weight_flow ! temperature-dependent weighting factor, forcing flow around cells with frozen beds integer, dimension(nx,ny), intent(in) :: & bwat_mask ! = 1 for cells in the region where basal water fluxes can be nonzero @@ -1428,7 +1448,7 @@ subroutine get_flux_fraction(& jp = j + jj if (ip >= 1 .and. ip <= nx .and. jp > 1 .and. 
jp <= ny) then if (head(ip,jp) < head(i,j)) then - slope(ii,jj) = btemp_weight(ip,jp) * (head(i,j) - head(ip,jp)) / dists(ii,jj) + slope(ii,jj) = btemp_weight_flow(ip,jp) * (head(i,j) - head(ip,jp)) / dists(ii,jj) endif endif endif @@ -1588,6 +1608,390 @@ subroutine get_flux_fraction(& end subroutine get_flux_fraction +!============================================================== + + subroutine route_flux_to_margin_or_halo_real8(& + nx, ny, nlocal, & + itest, jtest, rtest, count, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction, & + btemp_weight_freeze, & + bwatflx, & + bwatflx_accum, & + bwatflx_refreeze_accum, & + finished) + + ! Given the input bwatflx, route the water downstream, keeping track of fluxes along the way. + ! The loop goes from highest to lowest values of 'head' on the local processor. + ! At the end of the loop, all the incoming flux has either been + ! (1) routed to the ice sheet margin; + ! (2) set aside for later refreezing; or + ! (3) routed to a halo cell, from which it continues downstream the next time the subroutine is called. + ! The subroutine is called iteratively until all no water remains in halo cells. + + implicit none + + ! Input/output variables + + integer, intent(in) :: & + nx, ny, & ! number of cells in each direction + nlocal, & ! number of locally owned grid cells on the processor + itest, jtest, rtest, & ! coordinates of diagnostic point + count ! iteration count (diagnostic only) + + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + + integer, dimension(nlocal,2), intent(in) :: & + sorted_ij ! i and j indices of each local cell, sorted low to high + + integer, dimension(nx,ny), intent(in) :: & + local_mask, & ! = 1 for cells owned by the local processor, else = 0 + halo_mask, & ! = 1 for the layer of halo cells adjacent to locally owned cells, else = 0 + bwat_mask ! 
= 1 for cells through which basal water is routed; excludes floating and ocean cells + + real(dp), dimension(-1:1,-1:1,nx,ny), intent(in) :: & + flux_fraction ! fraction of flux from a cell that flows downhill to each of 8 neighbors + ! last two indices identify the source cell; + ! 1st two indices give relative location of receiving cell + + real(dp), dimension(nx,ny), intent(in) :: & + btemp_weight_freeze ! temperature-dependent weighting factor, favoring refreezing at frozen beds + + real(dp), dimension(nx,ny), intent(inout) :: & + bwatflx, & ! on input: water flux (m^3/s) to be routed to the margin or halo + ! on output: flux routed to halo, to be routed further next time + bwatflx_accum, & ! cumulative bwatflx (m/3/s) over multiple iterations + bwatflx_refreeze_accum ! cumulative bwatflx_refreeze (m^s/s) over multiple iterations + + logical, intent(inout) :: & + finished ! initially F; set to T when all water has been routed as far as it can go + + ! Local variables + + integer :: i, j, k + integer :: ii, jj, ip, jp + + real(dp), dimension(-1:1,-1:1,nx,ny):: & + bwatflx_halo ! flux routed to halo cells + ! last two indices identify the source cell; + ! 1st two indices give relative location of receiving cell + + real(dp), dimension(nx,ny) :: & + bwatflx_refreeze, & ! flux (m^3/s) saved for later refreezing; not routed further downstream + sum_bwatflx_halo ! bwatflx_halo summed over the first 2 indices + + real(dp) :: & + flx_thru, & ! flux (m^3/s) that continues downstream + global_halo_sum ! global sum of water flux in halo cells + + ! Initialize fluxes + bwatflx_refreeze = 0.0d0 + bwatflx_halo = 0.0d0 + + ! loop from high to low values on the local processor + do k = nlocal, 1, -1 + + ! Get i and j indices of current cell + i = sorted_ij(k,1) + j = sorted_ij(k,2) + + if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then + + ! Distribute the flux to downstream neighbors. + ! 
Based on the temperature-dependent weighting factor btemp_weight_freeze, all or part of the flux + ! is refrozen in place instead of being routed downstream. + flx_thru = bwatflx(i,j) * btemp_weight_freeze(i,j) + bwatflx_refreeze(i,j) = bwatflx(i,j) * (1.0d0 - btemp_weight_freeze(i,j)) + do jj = -1,1 + do ii = -1,1 + ip = i + ii + jp = j + jj + if (flux_fraction(ii,jj,i,j) > 0.0d0) then + if (halo_mask(ip,jp) == 1) then + bwatflx_halo(ii,jj,i,j) = flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to halo, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + elseif (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to neighbor, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + endif + endif ! flux_fraction > 0 + enddo ! ii + enddo ! jj + endif ! bwat_mask = 1, bwatflx > 0 + enddo ! loop from high to low + + ! Accumulate the fluxes in the output arrays + bwatflx_accum = bwatflx_accum + bwatflx + bwatflx_refreeze_accum = bwatflx_refreeze_accum + bwatflx_refreeze + + ! Compute the total bwatflx in halo cells + do j = 1, ny + do i = 1, nx + sum_bwatflx_halo(i,j) = sum(bwatflx_halo(:,:,i,j)) + enddo + enddo + global_halo_sum = parallel_global_sum(sum_bwatflx_halo, parallel) + + ! If bwatflx_halo = 0 everywhere, then we are done. + ! Where bwatflx_halo is nonzero, communicate it to the neighboring task. + ! It will be routed further downstream the next time this subroutine is called. + + if (global_halo_sum > 0.0d0) then + + if (verbose_bwat .and. 
count <= 2) then + if (this_rank == rtest) write(iulog,*) 'Before halo update, global_halo_sum:', global_halo_sum + call point_diag(sum_bwatflx_halo, 'sum_bwatflx_halo', itest, jtest, rtest, 7, 7) + endif + + ! Reset bwatflx to zero for the halo transfer + bwatflx = 0.0d0 + + ! Communicate bmltflx_halo to the halo cells of neighboring processors + call parallel_halo(bwatflx_halo(:,:,:,:), parallel) + + ! bmltflx_halo is now available in the halo cells of the local processor. + ! Route downslope to the adjacent locally owned cells. + ! These fluxes will be routed further downstream during the next iteration. + + do j = 2, ny-1 + do i = 2, nx-1 + if (halo_mask(i,j) == 1 .and. sum(bwatflx_halo(:,:,i,j)) > 0.0d0) then + do jj = -1,1 + do ii = -1,1 + if (bwatflx_halo(ii,jj,i,j) > 0.0d0) then + ip = i + ii + jp = j + jj + if (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx_halo(ii,jj,i,j) + if (verbose_bwat .and. ip==itest .and. jp==jtest .and. this_rank==rtest .and. count <= 2) then + write(iulog,*) 'Nonzero bwatflx from halo, rank, i, j:', & + this_rank, ip, jp, bwatflx_halo(ii,jj,i,j) + endif + endif + endif ! bwatflx_halo > 0 to a local cell + enddo ! ii + enddo ! jj + endif ! bwatflx_halo > 0 from this halo cell + enddo ! i + enddo ! j + + else + + finished = .true. ! no water in halo cells to route further + + endif ! global_halo_sum > 0 + + end subroutine route_flux_to_margin_or_halo_real8 + +!============================================================== + + subroutine route_flux_to_margin_or_halo_integer8(& + nx, ny, nlocal, & + itest, jtest, rtest, count, & + parallel, & + sorted_ij, & + local_mask, & + halo_mask, & + bwat_mask, & + flux_fraction, & + btemp_weight_freeze, & + bwatflx, & + bwatflx_accum, & + bwatflx_refreeze_accum, & + finished) + + ! Given the input bwatflx, route the water downstream, keeping track of fluxes along the way. + ! The loop goes from highest to lowest values of 'head' on the local processor. + ! 
At the end of the loop, all the incoming flux has either been + ! (1) routed to the ice sheet margin; + ! (2) set aside for later refreezing; or + ! (3) routed to a halo cell, from which it continues downstream the next time the subroutine is called. + ! The subroutine is called iteratively until all no water remains in halo cells. + + implicit none + + ! Input/output variables + + integer, intent(in) :: & + nx, ny, & ! number of cells in each direction + nlocal, & ! number of locally owned grid cells on the processor + itest, jtest, rtest, & ! coordinates of diagnostic point + count ! iteration count (diagnostic only) + + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + + integer, dimension(nlocal,2), intent(in) :: & + sorted_ij ! i and j indices of each local cell, sorted low to high + + integer, dimension(nx,ny), intent(in) :: & + local_mask, & ! = 1 for cells owned by the local processor, else = 0 + halo_mask, & ! = 1 for the layer of halo cells adjacent to locally owned cells, else = 0 + bwat_mask ! = 1 for cells through which basal water is routed; excludes floating and ocean cells + + ! Note: Both flux_fraction and btemp_weight_freeze are constrained to be 0 or 1. + ! This means that the routing is limited to D8 (all the flux goes to one downstream cell), + ! and partial refreezing is not allowed (i.e., btemp_weight_freeze = 1 everywhere). + ! Thus, btemp_weight_freeze is not needed, but I kept it to keep the code similar to the subroutine above. + ! We could make refreezing all-or-nothing (i.e., weights of either 0 or 1), but this leads to + ! oscillations in bed temperature. + ! I thought of rescaling flux_fraction and btemp_weight_freeze to largish i8 integers (e.g., 1000) + ! to keep everything BFB, and then scaling back at the end. The problem is that this subroutine + ! may need to be called repeatedly, and each scaling would lead to larger and larger integers + ! 
that eventually exceed the i8 limit on integer size, ~10^(19). + + integer(i8), dimension(-1:1,-1:1,nx,ny), intent(in) :: & + flux_fraction ! fraction of flux from a cell that flows downhill to each of 8 neighbors + ! last two indices identify the source cell; + ! 1st two indices give relative location of receiving cell + + integer(i8), dimension(nx,ny), intent(in) :: & + btemp_weight_freeze ! temperature-dependent weighting factor, favoring refreezing at frozen beds + + integer(i8), dimension(nx,ny), intent(inout) :: & + bwatflx, & ! on input: water flux (m^3/s * factor_bwatflx) to be routed to the margin or halo + ! on output: flux routed to halo, to be routed further next time + bwatflx_accum, & ! cumulative bwatflx (m^3/s * factor_bwatflx) over multiple iterations + bwatflx_refreeze_accum ! cumulative bwatflx_refreeze (m^3/s * factor_bwatflx) over multiple iterations + + logical, intent(inout) :: & + finished ! initially F; set to T when all water has been routed as far as it can go + + ! Local variables + + integer :: i, j, k + integer :: ii, jj, ip, jp + + ! Note: Some of the local variables are scaled by products of all three scale factors above. + integer(i8), dimension(-1:1,-1:1,nx,ny):: & + bwatflx_halo ! flux routed to halo cells + ! last two indices identify the source cell; + ! 1st two indices give relative location of receiving cell + + integer(i8), dimension(nx,ny) :: & + bwatflx_refreeze, & ! flux saved for later refreezing; not routed further downstream + sum_bwatflx_halo ! bwatflx_halo summed over the first 2 indices + + integer(i8) :: & + flx_thru, & ! flux (m^3/s) that continues downstream + global_halo_sum ! global sum of water flux in halo cells + + real(dp), dimension(nx,ny):: & + bwatflx_dp, bwatflx_halo_dp, bwatflx_refreeze_dp ! temporary dp versions of i8 arrays + + ! Initialize fluxes + bwatflx_halo = 0 + bwatflx_refreeze = 0 + + ! loop from high to low values on the local processor + do k = nlocal, 1, -1 + + ! 
Get i and j indices of current cell + i = sorted_ij(k,1) + j = sorted_ij(k,2) + + if (bwat_mask(i,j) == 1 .and. bwatflx(i,j) > 0.0d0) then + + ! Distribute the flux to downstream neighbors. + ! Note: If btemp_weight_freeze = 1 everwhere, there is no refreezing. + flx_thru = bwatflx(i,j) * btemp_weight_freeze(i,j) + bwatflx_refreeze(i,j) = bwatflx(i,j) * (1 - btemp_weight_freeze(i,j)) + do jj = -1,1 + do ii = -1,1 + ip = i + ii + jp = j + jj + if (flux_fraction(ii,jj,i,j) > 0) then + if (halo_mask(ip,jp) == 1) then + bwatflx_halo(ii,jj,i,j) = flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to halo, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + elseif (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + flx_thru*flux_fraction(ii,jj,i,j) + if (verbose_bwat .and. this_rank==rtest .and. i==itest .and. j==jtest .and. count <= 2) then + write(iulog,*) 'Flux to neighbor, i, j, ii, jj, flux:', & + i, j, ii, jj, flx_thru*flux_fraction(ii,jj,i,j) + endif + endif + endif ! flux_fraction > 0 + enddo ! ii + enddo ! jj + endif ! bwat_mask = 1, bwatflx > 0 + enddo ! loop from high to low + + ! Accumulate the fluxes in the output arrays + bwatflx_accum = bwatflx_accum + bwatflx + bwatflx_refreeze_accum = bwatflx_refreeze_accum + bwatflx_refreeze + + ! Compute the total bwatflx in halo cells + do j = 1, ny + do i = 1, nx + sum_bwatflx_halo(i,j) = sum(bwatflx_halo(:,:,i,j)) + enddo + enddo + global_halo_sum = parallel_global_sum(sum_bwatflx_halo, parallel) + + ! If bwatflx_halo = 0 everywhere, then we are done. + ! Where bwatflx_halo is nonzero, communicate it to the neighboring task. + ! It will be routed further downstream the next time this subroutine is called. + + if (global_halo_sum > 0) then + + if (verbose_bwat .and. 
count <= 2) then + if (this_rank == rtest) write(iulog,*) 'Before halo update, global_halo_sum (m^3/s):', global_halo_sum + endif + + ! Reset bwatflx to zero for the halo transfer + bwatflx = 0 + + ! Communicate bmltflx_halo to the halo cells of neighboring processors + call parallel_halo(bwatflx_halo(:,:,:,:), parallel) + + ! bmltflx_halo is now available in the halo cells of the local processor. + ! Route downslope to the adjacent locally owned cells. + ! These fluxes will be routed further downstream during the next iteration. + do j = 2, ny-1 + do i = 2, nx-1 + if (halo_mask(i,j) == 1 .and. sum(bwatflx_halo(:,:,i,j)) > 0) then + do jj = -1,1 + do ii = -1,1 + if (bwatflx_halo(ii,jj,i,j) > 0) then + ip = i + ii + jp = j + jj + if (local_mask(ip,jp) == 1) then + bwatflx(ip,jp) = bwatflx(ip,jp) + bwatflx_halo(ii,jj,i,j) + if (verbose_bwat .and. ip==itest .and. jp==jtest .and. this_rank==rtest .and. count <= 2) then + write(iulog,*) 'Nonzero bwatflx from halo, rank, i, j:', & + this_rank, ip, jp, bwatflx_halo(ii,jj,i,j) + endif + endif + endif ! bwatflx_halo > 0 to a local cell + enddo ! ii + enddo ! jj + endif ! bwatflx_halo > 0 from this halo cell + enddo ! i + enddo ! j + + else + + finished = .true. ! no water in halo cells to route further + + endif ! global_halo_sum > 0 + + end subroutine route_flux_to_margin_or_halo_integer8 + !============================================================== !+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ @@ -1596,8 +2000,8 @@ end subroutine get_flux_fraction ! They are a GPL-licenced replacement for the Numerical Recipes routine indexx. ! They are not derived from any NR code, but are based on a quicksort routine by ! Michael Lamont (http://linux.wku.edu/~lamonml/kb.html), originally written - ! in C, and issued under the GNU General Public License. The conversion to - ! Fortran 90, and modification to do an index sort was done by Ian Rutt. + ! in C, and issued under the GNU General Public License. 
Ian Rutt did the conversion + ! to Fortran 90 and modified the algorithm to do an index sort. ! !+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ diff --git a/libglissade/glissade_bmlt_float.F90 b/libglissade/glissade_bmlt_float.F90 index 7a0da2d5..227ed6e4 100644 --- a/libglissade/glissade_bmlt_float.F90 +++ b/libglissade/glissade_bmlt_float.F90 @@ -1014,6 +1014,7 @@ subroutine glissade_bmlt_float_thermal_forcing(& call glissade_basin_average(& nx, ny, & + parallel, & ocean_data%nbasin, & ocean_data%basin_number, & thermal_forcing_mask * f_float, & @@ -1024,6 +1025,7 @@ subroutine glissade_bmlt_float_thermal_forcing(& call glissade_basin_average(& nx, ny, & + parallel, & ocean_data%nbasin, & ocean_data%basin_number, & thermal_forcing_mask * f_float, & @@ -1126,6 +1128,7 @@ subroutine glissade_bmlt_float_thermal_forcing(& call glissade_basin_average(& nx, ny, & + parallel, & ocean_data%nbasin, & ocean_data%basin_number, & thermal_forcing_mask * f_float, & diff --git a/libglissade/glissade_calving.F90 b/libglissade/glissade_calving.F90 index f2ad1cb1..04903763 100644 --- a/libglissade/glissade_calving.F90 +++ b/libglissade/glissade_calving.F90 @@ -37,10 +37,9 @@ module glissade_calving use glimmer_utils, only: point_diag use cism_parallel, only: this_rank, main_task, nhalo, & - parallel_halo, parallel_globalindex, & + parallel_halo, parallel_globalindex, parallel_global_sum, & parallel_reduce_sum, parallel_reduce_max, parallel_reduce_log_or - implicit none private @@ -50,7 +49,6 @@ module glissade_calving public :: verbose_calving logical, parameter :: verbose_calving = .false. -!! logical, parameter :: verbose_calving = .true. contains @@ -582,15 +580,7 @@ subroutine glissade_calve_ice(nx, ny, & if (verbose_calving) then call point_diag(cf_length, 'cf_length (m)', itest, jtest, rtest, 7, 7) ! 
Diagnose the total CF length - total_cf_length = 0.0d0 - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - if (calving_front_mask(i,j) == 1) then - total_cf_length = total_cf_length + cf_length(i,j) - endif - enddo - enddo - total_cf_length = parallel_reduce_sum(total_cf_length) + total_cf_length = parallel_global_sum(cf_length, parallel, calving_front_mask) if (this_rank == rtest) then write(iulog,*) 'Total CF length (km)', total_cf_length/1000.d0 endif @@ -871,13 +861,7 @@ subroutine glissade_calve_ice(nx, ny, & endif ! Compute the total ice area and the area of each quadrant - total_ice_area = 0.0d0 - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - total_ice_area = total_ice_area + dx*dy*calving%effective_areafrac(i,j) - enddo - enddo - total_ice_area = parallel_reduce_sum(total_ice_area) + total_ice_area = parallel_global_sum(dx*dy*calving%effective_areafrac, parallel) if (verbose_calving) then if (this_rank == rtest) then diff --git a/libglissade/glissade_glacier.F90 b/libglissade/glissade_glacier.F90 index cce8bdd9..ee9e1657 100644 --- a/libglissade/glissade_glacier.F90 +++ b/libglissade/glissade_glacier.F90 @@ -74,8 +74,8 @@ subroutine glissade_glacier_init(model, glacier) ! Another array, cism_to_rgi_glacier_id, identifies the RGI ID associated with each CISM ID. ! The CISM input file contains the RGI IDs. 
- use cism_parallel, only: distributed_gather_var, distributed_scatter_var, & - parallel_reduce_sum, parallel_reduce_max, parallel_reduce_min, parallel_is_zero, & + use cism_parallel, only: gather_var, scatter_var, & + parallel_global_sum, parallel_reduce_max, parallel_reduce_min, parallel_is_zero, & broadcast, parallel_halo, staggered_parallel_halo, parallel_globalindex type(glide_global_type),intent(inout) :: model @@ -91,7 +91,7 @@ subroutine glissade_glacier_init(model, glacier) integer :: i, j, nc, ng, count integer :: iglobal, jglobal - integer :: ng_west, ng_east, ng_south, ng_north + integer :: ng_ne, ng_nw, ng_se, ng_sw integer :: min_id, max_id real(dp) :: max_glcval real(dp) :: theta_rad ! latitude in radians @@ -113,6 +113,9 @@ subroutine glissade_glacier_init(model, glacier) current_id, & ! current glacier_id from list gid_minval, gid_maxval ! min and max values of glacier_id + integer, dimension(model%general%ewn,model%general%nsn) :: & + glacier_mask ! = 1 for cells with glaciers (glacier_id > 0), else = 0 + type(parallel_type) :: parallel ! info for parallel communication !WHL - debug, for quicksort test @@ -152,9 +155,15 @@ subroutine glissade_glacier_init(model, glacier) if (glacier%scale_area) then - ! Optionally, rescale the grid cell dimensions dew and dns + ! Optionally, rescale the grid cell dimensions and coordinates ! This is answer-changing throughout the code. + ! Note: The global arrays model%general%x1_global, etc., which are written to output files, are not rescaled. + ! These arrays are computed from the input file, which typically ignores the scale factor. 
if (glacier%length_scale_factor /= 1.0d0) then + model%general%x0 = model%general%x0 * glacier%length_scale_factor + model%general%y0 = model%general%y0 * glacier%length_scale_factor + model%general%x1 = model%general%x1 * glacier%length_scale_factor + model%general%y1 = model%general%y1 * glacier%length_scale_factor model%numerics%dew = model%numerics%dew * glacier%length_scale_factor model%numerics%dns = model%numerics%dns * glacier%length_scale_factor dew = model%numerics%dew @@ -194,8 +203,7 @@ subroutine glissade_glacier_init(model, glacier) ! and these arrays should already have the correct dimensions. if (associated(glacier%glacierid)) deallocate(glacier%glacierid) - if (associated(glacier%cism_to_rgi_glacier_id)) & - deallocate(glacier%cism_to_rgi_glacier_id) + if (associated(glacier%cism_to_rgi_glacier_id)) deallocate(glacier%cism_to_rgi_glacier_id) if (associated(glacier%area)) deallocate(glacier%area) if (associated(glacier%volume)) deallocate(glacier%volume) if (associated(glacier%area_init)) deallocate(glacier%area_init) @@ -220,28 +228,16 @@ subroutine glissade_glacier_init(model, glacier) ! Count the number of cells with glaciers ! Loop over locally owned cells - count = 0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - if (glacier%rgi_glacier_id(i,j) > 0) then - count = count + 1 - elseif (glacier%rgi_glacier_id(i,j) < 0) then ! should not happen - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(message,*) 'RGI glacier_id < 0: i, j, value =', & - iglobal, jglobal, glacier%rgi_glacier_id(i,j) - call write_log(message, GM_FATAL) - endif - enddo - enddo - - ncells_glacier = parallel_reduce_sum(count) + glacier_mask = 0 + where (glacier%rgi_glacier_id > 0) glacier_mask = 1 + ncells_glacier = parallel_global_sum(glacier_mask, parallel) ! 
Gather the RGI glacier IDs to the main task if (main_task) allocate(rgi_glacier_id_global(global_ewn, global_nsn)) - call distributed_gather_var(glacier%rgi_glacier_id, rgi_glacier_id_global, parallel) + call gather_var(glacier%rgi_glacier_id, rgi_glacier_id_global, parallel) ! Allocate a global array for the CISM glacier IDs on the main task. - ! Allocate a size 0 array on other tasks; distributed_scatter_var wants arrays allocated on all tasks. + ! Allocate a size 0 array on other tasks; scatter_var wants arrays allocated on all tasks. if (main_task) then allocate(cism_glacier_id_global(global_ewn,global_nsn)) else @@ -387,8 +383,8 @@ subroutine glissade_glacier_init(model, glacier) endif ! main_task ! Scatter cism_glacier_id_global to all processors - ! Note: This global array is deallocated in the distributed_scatter_var subroutine - call distributed_scatter_var(glacier%cism_glacier_id, cism_glacier_id_global, parallel) + ! Note: This global array is deallocated in the scatter_var subroutine + call scatter_var(glacier%cism_glacier_id, cism_glacier_id_global, parallel) call parallel_halo(glacier%cism_glacier_id, parallel) @@ -434,6 +430,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & model%geometry%cell_area, & ! m^2 @@ -543,6 +540,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_2d_to_1d(& ewn, nsn, & + parallel, & nglacier, glacier%cism_glacier_id_init, & model%climate%smb_obs, glacier%smb_obs) @@ -614,6 +612,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & model%geometry%cell_area, & ! m^2 @@ -626,6 +625,7 @@ subroutine glissade_glacier_init(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & model%geometry%cell_area, & ! 
m^2 @@ -667,35 +667,25 @@ subroutine glissade_glacier_init(model, glacier) endif call broadcast(glacier%ngdiag, rtest) - ! Define a mask whose value is 1 at vertices along the boundary between two glaciers. - ! At runtime, Cp is set to a large value at masked vertices to reduce flow between glaciers. + ! Define a mask whose value is 1 at vertices that border two different glaciers. + ! At runtime, Cp is set to a large value at these vertices to reduce mass exchange between glaciers. + !TODO: Consider removing the mask. This would allow CISM to reduce basal friction to thin the ice if needed. glacier%boundary_mask(:,:) = 0 - ! Loop over locally owned cells - do j = nhalo, nsn-nhalo - do i = nhalo, ewn-nhalo - ng = glacier%cism_glacier_id_init(i,j) - if (ng > 0) then - ng_west = glacier%cism_glacier_id_init(i-1,j) - ng_east = glacier%cism_glacier_id_init(i+1,j) - ng_south = glacier%cism_glacier_id_init(i,j-1) - ng_north = glacier%cism_glacier_id_init(i,j+1) - if (ng_west > 0 .and. ng_west /= ng) then - glacier%boundary_mask(i-1,j-1) = 1 - glacier%boundary_mask(i-1,j) = 1 - endif - if (ng_east > 0 .and. ng_east /= ng) then - glacier%boundary_mask(i,j-1) = 1 - glacier%boundary_mask(i,j) = 1 - endif - if (ng_south > 0 .and. ng_south /= ng) then - glacier%boundary_mask(i-1,j-1) = 1 - glacier%boundary_mask(i,j-1) = 1 - endif - if (ng_north > 0 .and. ng_north /= ng) then - glacier%boundary_mask(i-1,j) = 1 - glacier%boundary_mask(i,j) = 1 - endif + ! Loop over locally owned vertices + do j = nhalo+1, nsn-nhalo + do i = nhalo+1, ewn-nhalo + ng_ne = glacier%cism_glacier_id_init(i+1,j+1) + ng_nw = glacier%cism_glacier_id_init(i,j+1) + ng_se = glacier%cism_glacier_id_init(i+1,j) + ng_sw = glacier%cism_glacier_id_init(i,j) + if ( (ng_ne > 0 .and. ng_nw > 0 .and. ng_ne /= ng_nw) .or. & + (ng_ne > 0 .and. ng_se > 0 .and. ng_ne /= ng_se) .or. & + (ng_ne > 0 .and. ng_sw > 0 .and. ng_ne /= ng_sw) .or. & + (ng_nw > 0 .and. ng_se > 0 .and. ng_nw /= ng_se) .or. & + (ng_nw > 0 .and. 
ng_sw > 0 .and. ng_nw /= ng_sw) .or. & + (ng_se > 0 .and. ng_sw > 0 .and. ng_se /= ng_sw) ) then + glacier%boundary_mask(i,j) = 1 endif enddo enddo @@ -703,6 +693,7 @@ subroutine glissade_glacier_init(model, glacier) call staggered_parallel_halo(glacier%boundary_mask, parallel) if (verbose_glacier) then + call point_diag(glacier%cism_glacier_id_init, 'cism_glacier_id_init', itest, jtest, rtest, 7, 7) call point_diag(glacier%boundary_mask, 'Glacier boundary mask', itest, jtest, rtest, 7, 7) endif @@ -730,7 +721,7 @@ subroutine glissade_glacier_update(model, glacier) use glissade_grid_operators, only: glissade_stagger use glissade_utils, only: glissade_usrf_to_thck - use cism_parallel, only: parallel_reduce_sum, parallel_global_sum, & + use cism_parallel, only: parallel_global_sum, & parallel_halo, staggered_parallel_halo ! Do glacier inversion (if applicable), update glacier masks, and compute glacier diagnostics. @@ -767,12 +758,13 @@ subroutine glissade_glacier_update(model, glacier) integer :: i, j, ng integer, dimension(model%general%ewn, model%general%nsn) :: & - ice_mask ! = 1 where ice is present (thck > thklim), else = 0 + ice_mask, & ! = 1 where ice is present (thck > thklim), else = 0 + glacier_mask ! temporary mask real(dp), dimension(model%general%ewn, model%general%nsn) :: & thck, & ! ice thickness (m) dthck_dt, & ! rate of change of thickness (m/yr) - cell_area, & ! grid cell area (m^2) + cell_area_uniform, & ! grid cell area defined as dew*dns(m^2) thck_old, & ! saved value of ice thickness (m) artm, & ! artm, baseline or current date snow, & ! snowfall, baseline or current date @@ -872,14 +864,12 @@ subroutine glissade_glacier_update(model, glacier) nglacier = glacier%nglacier ngdiag = glacier%ngdiag + cell_area_uniform = dew*dns ! some unit conversions - !TODO - Use model%geometry%thck without a copy. ! Skip these conversion and use SI units (s instead of yr) in the code. - dt = model%numerics%dt /scyr ! 
s to yr - thck = model%geometry%thck + dt = model%numerics%dt /scyr ! s to yr dthck_dt = model%geometry%dthck_dt * scyr ! m/s to m/yr - cell_area = model%geometry%cell_area ! model units to m^2 ! Accumulate the 2D fields used for mu_star and alpha_snow inversion: snow and Tpos. ! Also accumulate dthck_dt, which is used for powerlaw_c inversion. @@ -1189,23 +1179,23 @@ subroutine glissade_glacier_update(model, glacier) if (glacier%redistribute_advanced_ice) then - thck_old = thck + thck_old = model%geometry%thck call glacier_redistribute_advanced_ice(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & real(glacier_update_interval,dp), & ! yr - dew*dns, & ! m^2 + cell_area_uniform, & ! m^2 glacier%thinning_rate_advanced_ice, & ! m/yr glacier%cism_glacier_id_init, & glacier%smb_glacier_id, & model%climate%smb, & ! m/yr - thck, & ! m - parallel) + model%geometry%thck) ! m glacier%dthck_dt_annmean = glacier%dthck_dt_annmean + & - (thck - thck_old) / real(glacier_update_interval,dp) + (model%geometry%thck - thck_old) / real(glacier_update_interval,dp) endif ! 
redistribute advanced ice @@ -1230,6 +1220,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & glacier%smb_glacier_id_init, & smb_weight_init, & @@ -1249,6 +1240,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & glacier%smb_glacier_id, & smb_weight_current, & @@ -1276,6 +1268,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_invert_mu_star_alpha_snow(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & glacier%smb_glacier_id_init, & @@ -1300,6 +1293,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_invert_mu_star(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & glacier%smb_glacier_id_init, & @@ -1318,13 +1312,13 @@ subroutine glissade_glacier_update(model, glacier) endif ! set_mu_star ! advance/retreat diagnostics - ! Note: This subroutine assumes cell_area = dew*dns for all cells call glacier_area_advance_retreat(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & glacier%cism_glacier_id, & - dew*dns, & + cell_area_uniform, & area_initial, & area_current, & area_advance, & @@ -1383,7 +1377,8 @@ subroutine glissade_glacier_update(model, glacier) ! Interpolate thck to the staggered grid call glissade_stagger(& ewn, nsn, & - thck, stag_thck) + model%geometry%thck, & + stag_thck) ! Interpolate dthck_dt to the staggered grid call glissade_stagger(& @@ -1433,8 +1428,7 @@ subroutine glissade_glacier_update(model, glacier) !------------------------------------------------------------------------- if (verbose_glacier) then - call point_diag(model%geometry%topg, 'topg', itest, jtest, rtest, 7, 7) - call point_diag(thck, 'Before advance_retreat, thck', itest, jtest, rtest, 7, 7) + call point_diag(model%geometry%thck, 'Before advance_retreat, thck', itest, jtest, rtest, 7, 7) endif ! 
Assign nonzero IDs in grid cells where ice has reached the minimum glacier thickness. @@ -1443,17 +1437,17 @@ subroutine glissade_glacier_update(model, glacier) call glacier_advance_retreat(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, & glacier%minthck, & ! m - thck, & ! m + model%geometry%thck, & ! m glacier%snow_annmean, & ! mm/yr w.e. glacier%Tpos_annmean, & ! deg C glacier%mu_star, & ! mm/yr/deg glacier%alpha_snow, & ! unitless glacier%cism_glacier_id_init, & - glacier%cism_glacier_id, & - parallel) + glacier%cism_glacier_id) ! Compute smb_glacier_id, which determines where the SMB is computed. It is the union of ! (1) cism_glacier_id > 0 @@ -1559,7 +1553,7 @@ subroutine glissade_glacier_update(model, glacier) endif ! set_mu_star if (verbose_glacier) then - call point_diag(thck, 'After advance_retreat, thck', itest, jtest, rtest, 7, 7) + call point_diag(model%geometry%thck, 'After advance_retreat, thck', itest, jtest, rtest, 7, 7) call point_diag(glacier%cism_glacier_id_init, 'cism_glacier_id_init', itest, jtest, rtest, 7, 7) call point_diag(glacier%smb_glacier_id_init, 'smb_glacier_id_init', itest, jtest, rtest, 7, 7) call point_diag(glacier%cism_glacier_id, 'New cism_glacier_id', itest, jtest, rtest, 7, 7) @@ -1591,16 +1585,20 @@ subroutine glissade_glacier_update(model, glacier) ! (1) Include only cells that are part of the initial glacier extent call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & + cell_area_uniform, & model%climate%smb, & aar_init) ! (2) Include all cells in the glacier call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & + cell_area_uniform, & model%climate%smb, & aar) @@ -1630,16 +1628,20 @@ subroutine glissade_glacier_update(model, glacier) ! 
(1) Include only cells that are part of the initial glacier extent call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & + cell_area_uniform, & glacier%smb_recent, & aar_init_recent) ! (2) Include all cells in the glacier call glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & + cell_area_uniform, & glacier%smb_recent, & aar_recent) @@ -1661,10 +1663,11 @@ subroutine glissade_glacier_update(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id, & - cell_area, & ! m^2 - thck, & ! m + model%geometry%cell_area, & ! m^2 + model%geometry%thck, & ! m glacier%diagnostic_minthck, & ! m glacier%area, & ! m^2 glacier%volume) ! m^3 @@ -1674,10 +1677,11 @@ subroutine glissade_glacier_update(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & - cell_area, & ! m^2 - thck, & ! m + model%geometry%cell_area, & ! m^2 + model%geometry%thck, & ! m glacier%diagnostic_minthck, & ! m glacier%area_init_extent, & ! m^2 glacier%volume_init_extent) ! m^3 @@ -1701,6 +1705,7 @@ subroutine glissade_glacier_update(model, glacier) call glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, & glacier%cism_glacier_id_init, & model%geometry%cell_area, & ! m^2 @@ -1718,28 +1723,21 @@ subroutine glissade_glacier_update(model, glacier) if (verbose_glacier) then - ! 
debug - count cells in masks - count_cgii = 0 - count_cgi = 0 - count_sgii = 0 - count_sgi = 0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = glacier%cism_glacier_id_init(i,j) - if (ng == ngdiag) count_cgii = count_cgii + 1 - ng = glacier%cism_glacier_id(i,j) - if (ng == ngdiag) count_cgi = count_cgi + 1 - ng = glacier%smb_glacier_id_init(i,j) - if (ng == ngdiag) count_sgii = count_sgii + 1 - ng = glacier%smb_glacier_id(i,j) - if (ng == ngdiag) count_sgi = count_sgi + 1 - enddo - enddo + glacier_mask = 0 + where (glacier%cism_glacier_id_init == ngdiag) glacier_mask = 1 + count_cgii = parallel_global_sum(glacier_mask, parallel) - count_cgii = parallel_reduce_sum(count_cgii) - count_cgi = parallel_reduce_sum(count_cgi) - count_sgii = parallel_reduce_sum(count_sgii) - count_sgi = parallel_reduce_sum(count_sgi) + glacier_mask = 0 + where (glacier%cism_glacier_id == ngdiag) glacier_mask = 1 + count_cgi = parallel_global_sum(glacier_mask, parallel) + + glacier_mask = 0 + where (glacier%smb_glacier_id_init == ngdiag) glacier_mask = 1 + count_sgii = parallel_global_sum(glacier_mask, parallel) + + glacier_mask = 0 + where (glacier%smb_glacier_id == ngdiag) glacier_mask = 1 + count_sgi = parallel_global_sum(glacier_mask, parallel) if (this_rank == rtest) then write(iulog,*) ' ' @@ -1752,15 +1750,13 @@ subroutine glissade_glacier_update(model, glacier) endif ! glacier_update_inverval - ! Copy fields back to model derived type - model%geometry%thck = thck - end subroutine glissade_glacier_update !**************************************************** subroutine glacier_invert_mu_star(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & smb_glacier_id_init, & @@ -1785,6 +1781,9 @@ subroutine glacier_invert_mu_star(& nglacier, & ! total number of glaciers in the domain ngdiag ! CISM ID of diagnostic glacier + type(parallel_type), intent(in) :: & + parallel ! 
info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & smb_glacier_id_init ! smb_glacier_id based on the initial glacier extent @@ -1862,6 +1861,7 @@ subroutine glacier_invert_mu_star(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -1869,6 +1869,7 @@ subroutine glacier_invert_mu_star(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -1993,6 +1994,7 @@ end subroutine glacier_invert_mu_star subroutine glacier_invert_mu_star_alpha_snow(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & smb_glacier_id_init, & @@ -2023,6 +2025,9 @@ subroutine glacier_invert_mu_star_alpha_snow(& nglacier, & ! total number of glaciers in the domain ngdiag ! CISM ID of diagnostic glacier + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & smb_glacier_id_init ! smb_glacier_id based on the initial glacier extent @@ -2114,6 +2119,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2121,6 +2127,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2128,6 +2135,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2135,6 +2143,7 @@ subroutine glacier_invert_mu_star_alpha_snow(& call glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & smb_glacier_id_init, & smb_weight, & @@ -2482,6 +2491,7 @@ subroutine glacier_invert_powerlaw_c(& if (verbose_glacier) then call point_diag(stag_thck, 'stag_thck (m)', itest, jtest, rtest, 7, 7) + call point_diag(stag_thck_target, 'stag_thck_target (m)', itest, jtest, rtest, 7, 
7) call point_diag(stag_dthck, 'stag_thck - stag_thck_target (m)', itest, jtest, rtest, 7, 7) call point_diag(stag_dthck_dt, 'stag_dthck_dt (m/yr)', itest, jtest, rtest, 7, 7) call point_diag(powerlaw_c, 'new powerlaw_c', itest, jtest, rtest, 7, 7) @@ -2533,6 +2543,7 @@ end subroutine glacier_calc_snow subroutine glacier_redistribute_advanced_ice(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, ngdiag, & glacier_update_interval, & ! yr @@ -2541,15 +2552,14 @@ subroutine glacier_redistribute_advanced_ice(& cism_glacier_id_init, & smb_glacier_id, & smb, & ! m/yr - thck, & ! m - parallel) + thck) ! m ! Limit glacier advance in the accumulation zone. ! This applies to grid cells that are initially ice-free, into which ice is advected. ! The fix here is to thin the ice in these cells at a prescribed rate and ! redistribute the mass conservatively across the glacier. - use cism_parallel, only: parallel_reduce_sum, parallel_halo + use cism_parallel, only: parallel_halo, parallel_global_sum_patch ! input/output arguments @@ -2559,11 +2569,16 @@ subroutine glacier_redistribute_advanced_ice(& nglacier, & ! number of glaciers ngdiag ! CISM ID of diagnostic glacier + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + real(dp), intent(in) :: & glacier_update_interval, & ! time interval (yr) of the glacier update, typically 1 yr - cell_area, & ! grid cell area (m^2), assumed to be the same for each cell thinning_rate_advanced_ice ! thinning rate (m/yr) where glaciers advance in the accumulation zone + real(dp), dimension(ewn,nsn), intent(in) :: & + cell_area ! grid cell area (m^2) + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id_init, & ! integer glacier ID at the start of the run smb_glacier_id ! integer ID for current glacier cells and adjacent glacier-free cells @@ -2574,14 +2589,10 @@ subroutine glacier_redistribute_advanced_ice(& real(dp), dimension(ewn,nsn), intent(inout) :: & thck ! 
ice thickness (m) - type(parallel_type), intent(in) :: parallel ! info for parallel communication - ! local variables integer :: i, j, ng - real(dp) :: dthck ! thickness change (m) - real(dp), dimension(nglacier) :: & glacier_area_init, & ! glacier area based on cism_glacier_id_init glacier_vol_removed, & ! total volume (m^3) removed from each advanced cells in each glacier @@ -2589,53 +2600,37 @@ subroutine glacier_redistribute_advanced_ice(& glacier_vol_1, & ! volume (m^3) of each glacier before thinning and restribution glacier_vol_2 ! volume (m^3) of each glacier after thinning and restribution + real(dp), dimension(ewn,nsn) :: & + dthck ! thickness removed (m) + + integer, dimension(ewn,nsn) :: & + glacier_id ! temporary glacier ID + + glacier_id = max(cism_glacier_id_init, smb_glacier_id) + ! Compute the total volume of each glacier before limiting advance. ! Note: This includes adjacent glacier-free cells that might have a small nonzero thickness ! (i.e., cism_glacier_id = 0 but smb_glacier_id > 0). - !TODO: Write a sum-over-glaciers subroutine - glacier_vol_1(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = smb_glacier_id(i,j) - if (ng > 0) then - glacier_vol_1(ng) = glacier_vol_1(ng) + cell_area*thck(i,j) - endif - enddo - enddo - glacier_vol_1 = parallel_reduce_sum(glacier_vol_1) + glacier_vol_1 = parallel_global_sum_patch(cell_area*thck, nglacier, glacier_id, parallel) ! compute the area of each glacier over its initial extent - glacier_area_init(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id_init(i,j) - if (ng > 0) then - glacier_area_init(ng) = glacier_area_init(ng) + cell_area - endif - enddo - enddo - glacier_area_init = parallel_reduce_sum(glacier_area_init) + + glacier_area_init = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id_init, parallel) ! Compute thinning in advanced grid cells ! This includes potential advanced cells adjacent to current glacier cells. ! 
Note: Currently, SMB is set to 0 in advanced cells where SMB would be > 0 otherwise. ! The logic below (smb >= 0) ensures that ice in these cells is thinned. - glacier_vol_removed(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - if (cism_glacier_id_init(i,j) == 0 .and. smb_glacier_id(i,j) > 0) then ! advanced cell - if (smb(i,j) >= 0.d0) then ! accumulation zone - ng = smb_glacier_id(i,j) - dthck = min(thinning_rate_advanced_ice*glacier_update_interval, thck(i,j)) - thck(i,j) = thck(i,j) - dthck - glacier_vol_removed(ng) = glacier_vol_removed(ng) + cell_area*dthck - endif - endif - enddo - enddo - glacier_vol_removed = parallel_reduce_sum(glacier_vol_removed) + dthck = 0.0d0 + where (cism_glacier_id_init == 0 .and. smb_glacier_id > 0) ! advanced cell + where (smb >= 0.0d0) ! accumulation zone + dthck = min(thinning_rate_advanced_ice*glacier_update_interval, thck) + thck = thck - dthck + endwhere + endwhere + glacier_vol_removed = parallel_global_sum_patch(cell_area*dthck, nglacier, smb_glacier_id, parallel) ! Assuming conservation of volume, compute the thickness to be added to each glacier. ! Only cells within the initial glacier extent can thicken. @@ -2659,16 +2654,8 @@ subroutine glacier_redistribute_advanced_ice(& call parallel_halo(thck, parallel) ! Compute the volume of each glacier after limiting advance - glacier_vol_2(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = max(cism_glacier_id_init(i,j), smb_glacier_id(i,j)) - if (ng > 0) then - glacier_vol_2(ng) = glacier_vol_2(ng) + cell_area*thck(i,j) - endif - enddo - enddo - glacier_vol_2 = parallel_reduce_sum(glacier_vol_2) + + glacier_vol_2 = parallel_global_sum_patch(cell_area*thck, nglacier, glacier_id, parallel) ! 
conservation check do ng = 1, nglacier @@ -2685,6 +2672,7 @@ end subroutine glacier_redistribute_advanced_ice subroutine glacier_advance_retreat(& ewn, nsn, & + parallel, & itest, jtest, rtest, & nglacier, & glacier_minthck, & @@ -2694,8 +2682,7 @@ subroutine glacier_advance_retreat(& mu_star, & alpha_snow, & cism_glacier_id_init, & - cism_glacier_id, & - parallel) + cism_glacier_id) ! Allow glaciers to advance and retreat. ! @@ -2727,6 +2714,9 @@ subroutine glacier_advance_retreat(& itest, jtest, rtest, & ! coordinates of diagnostic cell nglacier ! number of glaciers + type(parallel_type), intent(in) :: & + parallel ! info for diagnostic only + real(dp), intent(in) :: & glacier_minthck ! min ice thickness (m) counted as part of a glacier @@ -2747,8 +2737,6 @@ subroutine glacier_advance_retreat(& integer, dimension(ewn,nsn), intent(inout) :: & cism_glacier_id ! current cism glacier_id, > 0 for glaciated cells - type(parallel_type), intent(in) :: parallel ! diagnostic only - ! local variables integer, dimension(ewn,nsn) :: & @@ -2859,7 +2847,7 @@ subroutine glacier_advance_retreat(& call parallel_halo(cism_glacier_id, parallel) ! Check advanced cells (beyond the initial extent) for problematic glacier IDs. - ! This code protects against glacier 'pirating', which ccan occur when an advanced cell + ! This code protects against glacier 'pirating', which can occur when an advanced cell ! is adjacent to two different glaciers, call them A and B. ! Suppose the cell is fed primarily by glacier A but has the same ID as glacier B, ! and has a more positive SMB as a result of belonging to B rather than A. @@ -3038,13 +3026,14 @@ end subroutine update_smb_glacier_id subroutine glacier_2d_to_1d(& ewn, nsn, & + parallel, & nglacier, cism_glacier_id, & field_2d, glacier_field) ! Given a 2D field, compute the average of the field over each glacier !TODO - Pass in cellarea to compute an area average. 
- use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3052,6 +3041,9 @@ subroutine glacier_2d_to_1d(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id ! integer glacier ID in the range (1, nglacier) @@ -3067,22 +3059,12 @@ subroutine glacier_2d_to_1d(& integer, dimension(nglacier) :: ncells_glacier - ncells_glacier(:) = 0 - glacier_field(:) = 0.0d0 + integer, dimension(ewn,nsn) :: ones ! matrix = 1 everywhere - ! Loop over locally owned cells - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - ncells_glacier(ng) = ncells_glacier(ng) + 1 - glacier_field(ng) = glacier_field(ng) + field_2d(i,j) - endif - enddo - enddo + ones(:,:) = 1 - ncells_glacier = parallel_reduce_sum(ncells_glacier) - glacier_field = parallel_reduce_sum(glacier_field) + ncells_glacier = parallel_global_sum_patch(ones, nglacier, cism_glacier_id, parallel) + glacier_field = parallel_global_sum_patch(field_2d, nglacier, cism_glacier_id, parallel) where (ncells_glacier > 0) glacier_field = glacier_field/ncells_glacier @@ -3094,6 +3076,7 @@ end subroutine glacier_2d_to_1d subroutine glacier_2d_to_1d_weighted(& ewn, nsn, & + parallel, & nglacier, & glacier_id, weight, & field_2d, glacier_field) @@ -3101,7 +3084,7 @@ subroutine glacier_2d_to_1d_weighted(& ! Given a 2D field, compute the average of the field over each glacier ! Certain grid cells (e.g., at the glacier periphery) can be given weights between 0 and 1. - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3109,6 +3092,9 @@ subroutine glacier_2d_to_1d_weighted(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! 
total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & glacier_id ! integer glacier ID @@ -3123,26 +3109,11 @@ subroutine glacier_2d_to_1d_weighted(& ! local variables - integer :: i, j, ng - real(dp), dimension(nglacier) :: sum_weights - sum_weights(:) = 0.0d0 - glacier_field(:) = 0.0d0 - - ! Loop over locally owned cells - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = glacier_id(i,j) - if (ng > 0) then - sum_weights(ng) = sum_weights(ng) + weight(i,j) - glacier_field(ng) = glacier_field(ng) + weight(i,j) * field_2d(i,j) - endif - enddo - enddo + sum_weights = parallel_global_sum_patch(weight, nglacier, glacier_id, parallel) + glacier_field = parallel_global_sum_patch(weight*field_2d, nglacier, glacier_id, parallel) - sum_weights = parallel_reduce_sum(sum_weights) - glacier_field = parallel_reduce_sum(glacier_field) where (sum_weights > 0.0d0) glacier_field = glacier_field/sum_weights endwhere @@ -3196,12 +3167,13 @@ end subroutine glacier_1d_to_2d subroutine glacier_area_volume(& ewn, nsn, & + parallel, & nglacier, cism_glacier_id, & cell_area, thck, & diagnostic_minthck, & area, volume) - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3209,6 +3181,9 @@ subroutine glacier_area_volume(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id ! integer glacier ID in the range (1, nglacier) @@ -3226,36 +3201,22 @@ subroutine glacier_area_volume(& ! local variables - real(dp), dimension(nglacier) :: & - local_area, local_volume ! area and volume on each processor, before global sum - - integer :: i, j, ng - - ! 
Initialize the output arrays - area(:) = 0.0d0 - volume(:) = 0.0d0 - - ! Initialize local arrays - local_area(:) = 0.0d0 - local_volume(:) = 0.0d0 + real(dp), dimension(ewn,nsn) :: & + diag_area, diag_volume ! area and volume where thck >= diagnostic_minthck ! Compute the area and volume of each glacier. - ! We need parallel sums, since a glacier can lie on two or more processors. + ! Need parallel sums, since a glacier can lie on two or more processors. - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - if (thck(i,j) >= diagnostic_minthck) then - local_area(ng) = local_area(ng) + cell_area(i,j) - local_volume(ng) = local_volume(ng) + cell_area(i,j) * thck(i,j) - endif - endif - enddo - enddo + where(thck >= diagnostic_minthck) + diag_area = cell_area + diag_volume = cell_area*thck + elsewhere + diag_area = 0.0d0 + diag_volume = 0.0d0 + endwhere - area = parallel_reduce_sum(local_area) - volume = parallel_reduce_sum(local_volume) + area = parallel_global_sum_patch(diag_area, nglacier, cism_glacier_id, parallel) + volume = parallel_global_sum_patch(diag_volume, nglacier, cism_glacier_id, parallel) end subroutine glacier_area_volume @@ -3263,6 +3224,7 @@ end subroutine glacier_area_volume subroutine glacier_area_advance_retreat(& ewn, nsn, & + parallel, & nglacier, & cism_glacier_id_init, & cism_glacier_id, & @@ -3272,7 +3234,7 @@ subroutine glacier_area_advance_retreat(& area_advance, & area_retreat) - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! For each glacier, compare the current glacier area (as given by cism_glacier_id) ! to the initial area (given by cism_glacier_id_init). @@ -3287,12 +3249,15 @@ subroutine glacier_area_advance_retreat(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! 
info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id_init, & ! integer glacier ID in the range (1, nglacier), initial value cism_glacier_id ! integer glacier ID in the range (1, nglacier), current value - real(dp), intent(in) :: & - cell_area ! grid cell area = dew*dns (m^2); same for all cells + real(dp), dimension(ewn,nsn), intent(in) :: & + cell_area ! grid cell area (m^2) real(dp), dimension(nglacier), intent(out) :: & area_initial, & ! initial glacier area @@ -3302,69 +3267,38 @@ subroutine glacier_area_advance_retreat(& ! local variables - real(dp), dimension(nglacier) :: & - local_area ! area on each processor, before global sum + integer, dimension(ewn,nsn) :: glacier_id ! temporary glacier ID - integer :: i, j, ng, ngi - - ! Initialize the output arrays - area_initial(:) = 0.0d0 - area_current(:) = 0.0d0 - area_advance(:) = 0.0d0 - area_retreat(:) = 0.0d0 + integer :: ng ! Compute the area of each glacier over the initial and current masks. ! We need parallel sums, since a glacier can lie on two or more processors. - ! init area - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ngi = cism_glacier_id_init(i,j) - if (ngi > 0) then - local_area(ngi) = local_area(ngi) + cell_area - endif - enddo - enddo - area_initial = parallel_reduce_sum(local_area) + area_initial = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id_init, parallel) ! current area - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - local_area(ng) = local_area(ng) + cell_area - endif - enddo - enddo - area_current = parallel_reduce_sum(local_area) + + area_current = parallel_global_sum_patch(cell_area, nglacier, cism_glacier_id, parallel) ! 
area where the glacier has advanced - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ngi = cism_glacier_id_init(i,j) - ng = cism_glacier_id(i,j) - if (ngi == 0 .and. ng > 0) then - local_area(ng) = local_area(ng) + cell_area - endif - enddo - enddo - area_advance = parallel_reduce_sum(local_area) + + where (cism_glacier_id_init == 0 .and. cism_glacier_id > 0) + glacier_id = cism_glacier_id + elsewhere + glacier_id = 0 + endwhere + + area_advance = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) ! area where the glacier has retreated - local_area(:) = 0.0d0 - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ngi = cism_glacier_id_init(i,j) - ng = cism_glacier_id(i,j) - if (ngi > 0 .and. ng == 0) then - local_area(ngi) = local_area(ngi) + cell_area - endif - enddo - enddo - area_retreat = parallel_reduce_sum(local_area) + + where (cism_glacier_id_init > 0 .and. cism_glacier_id == 0) + glacier_id = cism_glacier_id_init + elsewhere + glacier_id = 0 + endwhere + + area_retreat = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) ! bug check do ng = 1, nglacier @@ -3382,15 +3316,16 @@ end subroutine glacier_area_advance_retreat subroutine glacier_accumulation_area_ratio(& ewn, nsn, & + parallel, & nglacier, & cism_glacier_id, & + cell_area, & smb, & aar) ! Compute the accumulation area ratio (AAR) for each glacier. - ! Note: In this subroutine the grid cell area is assumed equal for all cells. - use cism_parallel, only: parallel_reduce_sum + use cism_parallel, only: parallel_global_sum_patch ! input/output arguments @@ -3398,9 +3333,15 @@ subroutine glacier_accumulation_area_ratio(& ewn, nsn, & ! number of cells in each horizontal direction nglacier ! total number of glaciers in the domain + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, dimension(ewn,nsn), intent(in) :: & cism_glacier_id ! 
integer glacier ID in the range (1, nglacier) + real(dp), dimension(ewn,nsn), intent(in) :: & + cell_area ! grid cell area (m^2) + real(dp), dimension(ewn,nsn), intent(in) :: & smb ! surface mass balance (mm/yr w.e.) @@ -3409,34 +3350,32 @@ subroutine glacier_accumulation_area_ratio(& ! local variables - integer :: i, j, ng +! integer :: i, j, ng real(dp), dimension(nglacier) :: & ablat_area, & ! area of accumulation zone (SMB < 0) accum_area ! area of accumulation zone (SMB > 0) - ! initialize - ablat_area(:) = 0.0d0 - accum_area(:) = 0.0d0 + integer, dimension(ewn,nsn) :: glacier_id ! temporary glacier ID ! Compute the accumulation and ablation area for each glacier ! Note: Grid cells with SMB = 0 are not counted in either zone. - do j = nhalo+1, nsn-nhalo - do i = nhalo+1, ewn-nhalo - ng = cism_glacier_id(i,j) - if (ng > 0) then - if (smb(i,j) > 0.0d0) then - accum_area(ng) = accum_area(ng) + 1.0d0 - elseif (smb(i,j) < 0.0d0) then - ablat_area(ng) = ablat_area(ng) + 1.0d0 - endif - endif - enddo ! i - enddo ! j + where (cism_glacier_id > 0 .and. smb > 0.0d0) + glacier_id = cism_glacier_id + elsewhere + glacier_id = 0 + endwhere + + accum_area = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) + + where (cism_glacier_id > 0 .and. smb < 0.0d0) + glacier_id = cism_glacier_id + elsewhere + glacier_id = 0 + endwhere - accum_area = parallel_reduce_sum(accum_area) - ablat_area = parallel_reduce_sum(ablat_area) + ablat_area = parallel_global_sum_patch(cell_area, nglacier, glacier_id, parallel) ! Compute the AAR for each glacier diff --git a/libglissade/glissade_inversion.F90 b/libglissade/glissade_inversion.F90 index 71a40c7a..54c688b3 100644 --- a/libglissade/glissade_inversion.F90 +++ b/libglissade/glissade_inversion.F90 @@ -47,7 +47,7 @@ module glissade_inversion ! a target ice thickness field. 
!----------------------------------------------------------------------------- - logical, parameter :: verbose_inversion = .false. + logical, parameter :: verbose_inversion = .false. !*********************************************************************** @@ -89,9 +89,6 @@ subroutine glissade_inversion_init(model) f_flotation, & ! flotation function (m) thck_obs ! observed ice thickness, derived from usrf_obs and topg - real(dp), dimension(model%general%ewn, model%general%nsn) :: & - coulomb_c_icegrid ! initial coulomb_c at cell centers based on masks - real(dp) :: h_obs, h_flotation, h_buff ! thck_obs, flotation thickness, and thck_flotation_buffer scaled to m real(dp) :: dh ! h_obs - h_flotation real(dp) :: dh_decimal ! decimal part remaining after subtracting the truncation of dh @@ -226,7 +223,6 @@ subroutine glissade_inversion_init(model) endif endif ! inversion for Cp, Cc or deltaT_ocn - !---------------------------------------------------------------------- ! If inverting for E, then make sure there is a target surface speed, velo_sfc_obs. !---------------------------------------------------------------------- @@ -254,6 +250,9 @@ subroutine glissade_inversion_init(model) !---------------------------------------------------------------------- ! computations specific to powerlaw_c (Cp) and coulomb_c (Cc) inversion + ! Note: Most sliding laws have inversion for Cp or Cc, but not both. + ! The modified Schoof law, however, supports inversion for both. + ! (This could be extended to the Schoof and Tsai laws.) !---------------------------------------------------------------------- if (model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION .or. & @@ -268,7 +267,11 @@ subroutine glissade_inversion_init(model) call point_diag(model%basal_physics%powerlaw_c, 'init_inversion for powerlaw_c', itest, jtest, rtest, 7, 7) endif - elseif (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then + endif ! 
invert for powerlaw_c + + !TODO - Add distinct logic for powerlaw_c_inversion_basin? + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then if (parallel_is_zero(model%basal_physics%coulomb_c)) then ! initialize coulomb_c (for which we will invert) @@ -280,7 +283,9 @@ subroutine glissade_inversion_init(model) 'init_inversion for coulomb_c', itest, jtest, rtest, 7, 7) endif - elseif (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION_BASIN) then + endif ! invert for coulomb_c + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION_BASIN) then !TODO - Should this calculation be done in glissade_initialise? if (parallel_is_zero(model%basal_physics%coulomb_c_lo)) then @@ -311,10 +316,11 @@ subroutine glissade_inversion_init(model) 'init_inversion for basin-scale coulomb_c', itest, jtest, rtest, 7, 7) endif - endif + endif ! invert for coulomb_c_basin !---------------------------------------------------------------------- ! computations specific to flow_enhancement_factor inversion + ! TODO: Remove this inversion option? !---------------------------------------------------------------------- if (model%options%which_ho_flow_enhancement_factor == HO_FLOW_ENHANCEMENT_FACTOR_INVERSION) then @@ -557,7 +563,7 @@ subroutine glissade_inversion_solve(model) call staggered_parallel_halo(stag_thck, parallel) call staggered_parallel_halo(stag_dthck_dt, parallel) - ! Invert for powerlaw_c or coulomb_c + ! Invert for powerlaw_c and/or coulomb_c ! The logic is the same for each; only the max and min values and the in/out field are different. if ( model%options%which_ho_powerlaw_c == HO_POWERLAW_C_INVERSION) then @@ -593,7 +599,9 @@ subroutine glissade_inversion_solve(model) call point_diag(model%basal_physics%powerlaw_c, 'New powerlaw_c', itest, jtest, rtest, 7, 7) endif - elseif ( model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then + endif ! 
invert for powerlaw_c + + if (model%options%which_ho_coulomb_c == HO_COULOMB_C_INVERSION) then if (verbose_inversion .and. this_rank == rtest) then write(iulog,*) ' ' @@ -624,23 +632,18 @@ subroutine glissade_inversion_solve(model) if (verbose_inversion) then call point_diag(model%basal_physics%effecpress_stag, 'effecpress_stag', itest, jtest, rtest, 7, 7, '(f10.1)') - call point_diag(rhoi*grav*stag_thck, 'overburden', itest, jtest, rtest, 7, 7, '(f10.1)') - call point_diag((model%geometry%thck - thck_obs), 'thck - thck_obs (m)', itest, jtest, rtest, 7, 7) call point_diag(model%basal_physics%coulomb_c, 'New coulomb_c', itest, jtest, rtest, 7, 7, '(f10.5)') endif ! verbose_inversion - endif ! invert for powerlaw_c or coulomb_c - - else ! do not invert for powerlaw_c or coulomb_c; just print optional diagnostics + endif ! invert for coulomb_c - if (verbose_inversion) then - call point_diag(model%geometry%f_ground, 'f_ground at vertices', itest, jtest, rtest, 7, 7, '(f10.4)') - call point_diag(model%basal_physics%powerlaw_c, 'powerlaw_c', itest, jtest, rtest, 7, 7, '(f10.2)') - call point_diag(model%basal_physics%coulomb_c, 'coulomb_c', itest, jtest, rtest, 7, 7, '(f10.4)') - endif + elseif (verbose_inversion) then ! not inverting, but print some diagnostic values - endif ! invert for powerlaw_c or coulomb_c + call point_diag(model%geometry%f_ground, 'f_ground at vertices', itest, jtest, rtest, 7, 7, '(f10.4)') + call point_diag(model%basal_physics%powerlaw_c, 'powerlaw_c', itest, jtest, rtest, 7, 7, '(f10.2)') + call point_diag(model%basal_physics%coulomb_c, 'coulomb_c', itest, jtest, rtest, 7, 7, '(f10.4)') + endif ! If inverting for powerlaw_c or coulomb_c at the basin scale, then update it here @@ -699,6 +702,7 @@ subroutine glissade_inversion_solve(model) ewn, nsn, & model%numerics%dew, & ! m model%numerics%dns, & ! 
m + parallel, & itest, jtest, rtest, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & @@ -780,6 +784,7 @@ subroutine glissade_inversion_solve(model) ewn, nsn, & model%numerics%dew, & ! m model%numerics%dns, & ! m + parallel, & itest, jtest, rtest, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & @@ -853,6 +858,7 @@ subroutine glissade_inversion_solve(model) ewn, nsn, & model%numerics%dew, & ! m model%numerics%dns, & ! m + parallel, & itest, jtest, rtest, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & @@ -906,6 +912,7 @@ subroutine glissade_inversion_solve(model) ewn, nsn, & model%numerics%dew, & ! m model%numerics%dns, & ! m + parallel, & itest, jtest, rtest, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & @@ -944,6 +951,7 @@ subroutine glissade_inversion_solve(model) if (model%ocean_data%nbasin > 1) then call glissade_basin_average(& model%general%ewn, model%general%nsn, & + model%parallel, & model%ocean_data%nbasin, & model%ocean_data%basin_number, & floating_mask * 1.0d0, & ! real mask @@ -1256,6 +1264,8 @@ subroutine invert_basal_friction(& if (f_ground(i,j) > 0.0d0) then ! ice is at least partly grounded ! Compute tendency terms based on the thickness target + !TODO: Try putting max(babc_thck_scale, stag_dthck_obs) in the denominator + ! Alex Robinson says this might improve convergence term_thck = -stag_dthck(i,j) / (babc_thck_scale*babc_timescale) term_dHdt = -stag_dthck_dt(i,j) * 2.0d0 / babc_thck_scale @@ -1325,8 +1335,8 @@ subroutine invert_basal_friction(& call point_diag(stag_dthck, 'stag_thck - stag_thck_obs', itest, jtest, rtest, 7, 7) call point_diag(stag_dthck_dt*scyr, 'stag_dthck_dt (m/yr)', itest, jtest, rtest, 7, 7) call point_diag(f_ground, 'f_ground', itest, jtest, rtest, 7, 7) - call point_diag(del2_logc, 'del2(logC)', itest, jtest, rtest, 7, 7, '(e12.3)') - call point_diag(logC, 'logC', itest, jtest, rtest, 7, 7) +!! 
call point_diag(del2_logc, 'del2(logC)', itest, jtest, rtest, 7, 7, '(e12.3)') +!! call point_diag(logC, 'logC', itest, jtest, rtest, 7, 7) call point_diag(dlogc, 'dlogC', itest, jtest, rtest, 7, 7, '(e12.3)') endif @@ -1338,6 +1348,7 @@ subroutine invert_basal_friction_basin(& dt, & nx, ny, & dx, dy, & + parallel, & itest, jtest, rtest, & nbasin, & basin_number, & @@ -1372,6 +1383,9 @@ subroutine invert_basal_friction_basin(& real(dp), intent(in) :: & dx, dy ! grid cell size in each direction (m) + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point @@ -1418,6 +1432,7 @@ subroutine invert_basal_friction_basin(& call get_basin_targets(& nx, ny, & dx, dy, & + parallel, & nbasin, basin_number, & itest, jtest, rtest, & stag_thck, stag_dthck_dt, & @@ -1430,6 +1445,7 @@ subroutine invert_basal_friction_basin(& call glissade_basin_average(& nx, ny, & + parallel, & nbasin, basin_number, & stag_rmask, & friction_c, friction_c_basin) @@ -1495,6 +1511,7 @@ subroutine invert_deltaT_ocn_basin(& dt, & nx, ny, & dx, dy, & + parallel, & itest, jtest, rtest, & nbasin, & basin_number, & @@ -1534,6 +1551,9 @@ subroutine invert_deltaT_ocn_basin(& real(dp), intent(in) :: & dx, dy ! grid cell size in each direction (m) + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, intent(in) :: & itest, jtest, rtest ! 
coordinates of diagnostic point @@ -1592,6 +1612,7 @@ subroutine invert_deltaT_ocn_basin(& call get_basin_targets(& nx, ny, & dx, dy, & + parallel, & nbasin, basin_number, & itest, jtest, rtest, & thck, dthck_dt, & @@ -1610,6 +1631,7 @@ subroutine invert_deltaT_ocn_basin(& call glissade_basin_average(& nx, ny, & + parallel, & nbasin, basin_number, & mask, & deltaT_ocn, deltaT_basin) @@ -2142,6 +2164,7 @@ end subroutine invert_flow_enhancement_factor subroutine get_basin_targets(& nx, ny, & dx, dy, & + parallel, & nbasin, basin_number, & itest, jtest, rtest, & thck, dthck_dt, & @@ -2167,6 +2190,9 @@ subroutine get_basin_targets(& real(dp), intent(in) :: & dx, dy ! grid cell size in each direction (m) + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + integer, intent(in) :: & nbasin ! number of basins @@ -2221,6 +2247,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & cell_area, & @@ -2232,6 +2259,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & thck_target*dx*dy, & @@ -2241,6 +2269,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & thck*dx*dy, & @@ -2250,6 +2279,7 @@ subroutine get_basin_targets(& call glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & target_rmask, & dthck_dt*dx*dy, & diff --git a/libglissade/glissade_mass_balance.F90 b/libglissade/glissade_mass_balance.F90 index 09c1b3ce..fb03dc79 100644 --- a/libglissade/glissade_mass_balance.F90 +++ b/libglissade/glissade_mass_balance.F90 @@ -43,7 +43,7 @@ module glissade_mass_balance use glimmer_utils, only: point_diag use glide_types use cism_parallel, only: this_rank, main_task, nhalo, lhalo, uhalo, & - parallel_halo, parallel_reduce_max, parallel_reduce_sum, parallel_globalindex + parallel_halo, parallel_reduce_max, 
parallel_global_sum, parallel_globalindex implicit none save @@ -68,8 +68,8 @@ subroutine glissade_mass_balance_init(model) ! Initialize some fields related to the surface mass balance - use glimmer_paramets, only: eps11 use glimmer_physcon, only: rhow, rhoi, scyr + use cism_parallel, only: parallel_is_zero ! input/output arguments @@ -77,9 +77,33 @@ subroutine glissade_mass_balance_init(model) ! local variables - real(dp) :: local_maxval, global_maxval character(len=100) :: message - + + ! Initialize artm for the case that we are reading in artm_ref or artm_3d. + ! For some temp_init options, this is needed for correct interior temperatures. + ! Note: Do not call if glaciers are enabled. When running with glaciers, artm is + ! accumulated over year 1 from a forcing file which hasn't been read in yet. + ! TODO: Think about how to initialize glacier temperatures. Currently assume artm = 0, + ! which isn't realistic. + if (model%options%artm_input_function /= ARTM_INPUT_FUNCTION_XY) then + if (.not.model%options%enable_glaciers) then + call downscale_artm(model) + endif + endif + + ! Initialize smb for the case that we are reading in smb_ref or smb_3d. + ! This is not strictly needed, since the SMB will be recomputed before it is used, + ! but can be a helpful diagnostic. + ! Note: Do not call if glaciers are enabled. When running with glaciers, smb is + ! accumulated over year 1 from a forcing file which hasn't been read in yet. + ! TODO - call downscale_smb for the PDD option? + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ .or. & + model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + if (.not.model%options%enable_glaciers) then + call downscale_smb(model) + endif + endif + ! Initialize acab, if SMB (with different units) was read in if (model%options%smb_input == SMB_INPUT_MMYR_WE) then ! Convert units from mm/yr w.e. to m/s ice @@ -87,6 +111,7 @@ subroutine glissade_mass_balance_init(model) endif ! Initialize artm_corrected. 
This is equal to artm, plus any prescribed temperature anomaly. + !TODO - Not sure this is needed model%climate%artm_corrected(:,:) = model%climate%artm(:,:) if (model%options%enable_artm_anomaly) then @@ -96,9 +121,7 @@ subroutine glissade_mass_balance_init(model) ! Note: The artm_anomaly field does not change during the run, ! but it is possible to ramp up the anomaly using artm_anomaly_timescale. - local_maxval = maxval(abs(model%climate%artm_anomaly)) - global_maxval = parallel_reduce_max(local_maxval) - if (global_maxval < eps11) then + if (parallel_is_zero(model%climate%artm_anomaly)) then model%climate%artm_anomaly = model%climate%artm_anomaly_const write(message,*) & 'Setting artm_anomaly = constant value (degC):', model%climate%artm_anomaly_const @@ -109,7 +132,6 @@ subroutine glissade_mass_balance_init(model) endif endif endif - !TODO - Write a short utility function to compute global_maxval of any field. !TODO - Repeat for snow and precip anomalies ! If acab is to be overwritten for some cells, then set overwrite_acab_mask = 1 for these cells. @@ -206,52 +228,9 @@ subroutine glissade_prepare_climate_forcing(model) ! Downscaling of artm_ref to artm (at the ice surface) happens below, followed by the SMB calculation. !------------------------------------------------------- - ! Downscale artm to the current surface elevation if needed. - ! The downscaling options are: - ! (0) artm(x,y); no dependence on surface elevation - ! (1) artm(x,y) + d(artm)/dz(x,y) * dz; artm depends on input field at reference elevation, plus vertical correction - ! (2) artm(x,y,z); artm obtained by linear interpolation between values prescribed at adjacent vertical levels - ! (3) artm(x,y) adjusted with a uniform lapse rate - ! For options (1) - (3), the elevation-dependent artm is computed here. - - if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then - - ! 
compute artm by a lapse-rate correction to the reference value - model%climate%artm(:,:) = model%climate%artm_ref(:,:) + & - (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%artm_gradz(:,:) - - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then - - if (parallel_is_zero(model%climate%artm_3d)) then - write(message,*) 'Error: artm_3d = 0 everywhere with artm_input_function =', model%options%artm_input_function - call write_log(trim(message), GM_FATAL) - endif - - ! Note: With linear_extrapolate_in = T, the values outside the range are obtained by linear extrapolation - ! from the top two or bottom two values. - ! For temperature, which varies roughly linearly with elevation, this is more accurate - ! than simply extending the top and bottom values. - ! This call includes a halo update. - - call glissade_vertical_interpolate(& - ewn, nsn, & - nzatm, model%climate%zatm, & - model%geometry%usrf, & - model%climate%artm_3d, & - model%climate%artm, & - linear_extrapolate_in = .true.) - - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then - - ! compute artm by a lapse-rate correction to artm_ref - ! T_lapse is defined as positive for T decreasing with height - - model%climate%artm(:,:) = model%climate%artm_ref(:,:) - & - (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%t_lapse - - endif ! artm_input_function - - call parallel_halo(model%climate%artm, parallel) + if (model%options%artm_input_function /= ARTM_INPUT_FUNCTION_XY) then + call downscale_artm(model) + endif ! Optionally, add an anomaly to the surface air temperature ! Typically, artm_corrected = artm, but sometimes (e.g., for ISMIP6 forcing experiments), @@ -348,155 +327,10 @@ subroutine glissade_prepare_climate_forcing(model) ! which is passed to the main mass balance driver. !------------------------------------------------------------------------- - ! 
------------------------------------------------------------------------ - ! Depending on the SMB input options, compute model%climate%acab at the ice surface. - ! The options are: - ! (0) SMB(x,y); no dependence on surface elevation - ! (1) SMB(x,y) + dSMB/dz(x,y) * dz; SMB depends on input field at reference elevation, plus vertical correction - ! (2) SMB(x,y,z); SMB obtained by linear interpolation between values prescribed at adjacent vertical levels - ! (3) SMB obtained from precip and artm using a positive-degree scheme - ! - ! Options (1) and (2) require input fields with SMB units of mm/yr w.e. (SMB_INPUT_MMYR_WE) - ! For these options, the elevation-dependent SMB is computed here. - ! ------------------------------------------------------------------------ - - if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then - - ! downscale SMB to the local surface elevation - model%climate%smb(:,:) = model%climate%smb_ref(:,:) + & - (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%smb_gradz(:,:) - - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then - - ! downscale SMB to the local surface elevation - ! Note: With linear_extrapolate_in = F, the values at top and bottom levels are simply extended upward and downward. - ! For SMB, this is safer than linear extrapolation (especially when extrapolating upward). - - if (parallel_is_zero(model%climate%smb_3d)) then - write(message,*) 'Error: smb_3d = 0 everywhere with smb_input_function =', model%options%smb_input_function - call write_log(trim(message), GM_FATAL) - endif - - call glissade_vertical_interpolate(& - ewn, nsn, & - nzatm, model%climate%zatm, & - model%geometry%usrf, & - model%climate%smb_3d, & - model%climate%smb, & - linear_extrapolate_in = .false.) - - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then - - ! Compute SMB using a simple PDD scheme: - ! (1) Partition precip as rain or snow based on the downscaled artm - ! 
(2) Compute ablation based on artm and a degree factor - ! Assume that artm has already been downscaled, if needed, based on artm_input_function. - - ! Note: This is similar to the SMB calculation for glaciers, but that calculation is done in the glacier module. - ! TODO: Put the glacier values of snow_threshold_min and snow_threshold_max in the climate derived type. - - ! compute snow accumulation (mm/yr w.e.) - where (model%climate%artm > model%climate%snow_threshold_max) - model%climate%snow = 0.0d0 ! all precip falls as rain - elsewhere (model%climate%artm < model%climate%snow_threshold_min) - model%climate%snow = model%climate%precip ! all precip falls as snow - elsewhere (model%climate%artm > model%climate%snow_threshold_min) - model%climate%snow = model%climate%precip * (model%climate%snow_threshold_max - model%climate%artm) & - / (model%climate%snow_threshold_max - model%climate%snow_threshold_min) - endwhere - - ! compute ablation (mm/yr w.e.) - ! Note: degree_factor has units of mm/yr w.e./degC to be consistent with other mass-balance variables. - ! It is like mu_star for glaciers. - model%climate%ablation = model%climate%degree_factor * max(model%climate%artm - model%climate%tmlt, 0.0d0) - - ! compute smb (mm/yr w.e.) - model%climate%smb = model%climate%snow - model%climate%ablation - - ! set smb = 0 for open ocean - where (model%geometry%thck == 0.0d0 .and. (model%geometry%topg - model%climate%eus) < 0.0d0) - model%climate%smb = 0.0d0 - endwhere - - endif ! smb_input_function - - ! For the non-default smb_input_function options, make sure that model%climate%smb is nonzero somewhere; else abort. - ! For the default option, do not abort, since idealized tests often have a zero SMB. - - call parallel_halo(model%climate%smb, parallel) - - if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ .or. & - model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ .or. 
& - model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then - if (parallel_is_zero(model%climate%smb)) then - write(message,*) 'Error: smb = 0 everywhere with smb_input_function =', model%options%smb_input_function - call write_log(trim(message), GM_FATAL) - endif + if (model%options%smb_input_function /= SMB_INPUT_FUNCTION_XY) then + call downscale_smb(model) endif - ! optional diagnostics - if (verbose_smb) then - - if (this_rank == rtest) then - write(iulog,*) 'Computing runtime smb with smb_input_function =', model%options%smb_input_function - endif - call point_diag(model%geometry%usrf, 'usrf (m)', itest, jtest, rtest, 7, 7) - - if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY) then - call point_diag(model%climate%smb, 'smb (mm/yr)', itest, jtest, rtest, 7, 7) - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then - call point_diag(model%geometry%usrf - model%climate%usrf_ref, 'usrf - usrf_ref (m)', & - itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb_ref, 'reference smb (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb_gradz, 'smb_gradz (mm/yr per m)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'smb_3d at each level:' - endif - do k = 1, nzatm - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'k =', k - endif - call point_diag(model%climate%smb_3d(k,:,:), 'smb_3d (mm/yr)', itest, jtest, rtest, 7, 7) - enddo - call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) - elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then - call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%precip, 'precip (mm/yr)', itest, jtest, rtest, 7, 7) - call 
point_diag(model%climate%snow, 'snow (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%ablation, 'ablation (mm/yr)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%smb,'smb (mm/yr)', itest, jtest, rtest, 7, 7) - endif ! smb_input_function - - if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY) then - call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then - call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%artm_gradz*1000.d0, 'artm_gradz (deg C per km)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'artm_3d at each level:' - endif - do k = 1, nzatm - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'k =', k - endif - call point_diag(model%climate%artm_3d(k,:,:), 'artm_3d (deg C)', itest, jtest, rtest, 7, 7) - enddo - call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) - elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then - call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) - call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) - endif ! artm_input_function - - endif ! verbose_smb - ! Compute a corrected smb field that includes any anomalies or correction factors. ! 
initialize @@ -614,13 +448,354 @@ subroutine glissade_prepare_climate_forcing(model) end subroutine glissade_prepare_climate_forcing +!======================================================================= + + subroutine downscale_artm(model) + + use glissade_grid_operators, only: glissade_vertical_interpolate + use cism_parallel, only: parallel_is_zero + + ! input/output arguments + + type(glide_global_type), intent(inout) :: model ! model instance + + ! local variables + + integer :: itest, jtest, rtest ! coordinates of diagnostic cell + integer :: i, j, k + integer :: ewn, nsn + integer :: nzatm ! number of atmosphere levels at which smb_3d and artm_3d are provided + + type(parallel_type) :: parallel ! info for parallel communication + + character(len=100) :: message + + ! initialize + + rtest = -999 + itest = 1 + jtest = 1 + if (this_rank == model%numerics%rdiag_local) then + rtest = model%numerics%rdiag_local + itest = model%numerics%idiag_local + jtest = model%numerics%jdiag_local + endif + + ewn = model%general%ewn + nsn = model%general%nsn + nzatm = model%climate%nzatm + parallel = model%parallel + + ! Downscale artm to the current surface elevation if needed. + ! The downscaling options are: + ! (0) artm(x,y); no dependence on surface elevation + ! (1) artm(x,y) + d(artm)/dz(x,y) * dz; artm depends on input field at reference elevation, plus vertical correction + ! (2) artm(x,y,z); artm obtained by linear interpolation between values prescribed at adjacent vertical levels + ! (3) artm(x,y) adjusted with a uniform lapse rate + ! For options (1) - (3), the elevation-dependent artm is computed here. + + ! Make sure the required input fields are present with nonzero values + + if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then + + if (parallel_is_zero(model%climate%artm_ref) .or. & + parallel_is_zero(model%climate%usrf_ref) .or. 
& + parallel_is_zero(model%climate%artm_gradz)) then + write(message,*) & + 'Error: Must have nonzero artm_ref, artm_gradz and usrf_ref with artm_input_function =', & + model%options%artm_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then + + if (parallel_is_zero(model%climate%artm_3d)) then + write(message,*) & + 'Error: Must have nonzero artm_3d with artm_input_function =', & + model%options%artm_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then + + if (parallel_is_zero(model%climate%artm_ref) .or. & + parallel_is_zero(model%climate%usrf_ref) .or. & + model%climate%t_lapse <= 0.0d0) then + write(message,*) & + 'Error: Must have t_lapse > 0 and nonzero artm_ref, usrf_ref with artm_input_function =', & + model%options%artm_input_function + call write_log(trim(message), GM_FATAL) + endif + + endif + + ! Do the downscaling + + if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then + + ! compute artm by a lapse-rate correction to the reference value + model%climate%artm(:,:) = model%climate%artm_ref(:,:) + & + (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%artm_gradz(:,:) + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then + + ! Note: With linear_extrapolate_in = T, the values outside the range are obtained + ! by linear extrapolation from the top two or bottom two values. + ! For temperature, which varies roughly linearly with elevation, this is more accurate + ! than simply extending the top and bottom values. + + call glissade_vertical_interpolate(& + ewn, nsn, & + nzatm, model%climate%zatm, & + model%geometry%usrf, & + model%climate%artm_3d, & + model%climate%artm, & + linear_extrapolate_in = .true.) + + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then + + ! 
compute artm by a lapse-rate correction to artm_ref + ! T_lapse is defined as positive for T decreasing with height + + model%climate%artm(:,:) = model%climate%artm_ref(:,:) - & + (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%t_lapse + + endif ! artm_input_function + + call parallel_halo(model%climate%artm, parallel) + + ! optional diagnostics + + if (verbose_smb) then + if (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY) then + call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_GRADZ) then + call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%artm_gradz*1000.d0, 'artm_gradz (deg C per km)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XYZ) then + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'artm_3d at each level:' + endif + do k = 1, nzatm + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'k =', k + endif + call point_diag(model%climate%artm_3d(k,:,:), 'artm_3d (deg C)', itest, jtest, rtest, 7, 7) + enddo + call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) + elseif (model%options%artm_input_function == ARTM_INPUT_FUNCTION_XY_LAPSE) then + call point_diag(model%climate%artm_ref, 'reference artm (deg C)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%artm, 'downscaled artm (deg C)', itest, jtest, rtest, 7, 7) + endif ! artm_input_function + endif + + end subroutine downscale_artm + +!======================================================================= + + subroutine downscale_smb(model) + + use glissade_grid_operators, only: glissade_vertical_interpolate + use cism_parallel, only: parallel_is_zero + + ! 
input/output arguments + + type(glide_global_type), intent(inout) :: model ! model instance + + ! local variables + + integer :: itest, jtest, rtest ! coordinates of diagnostic cell + integer :: i, j, k + integer :: ewn, nsn + integer :: nzatm ! number of atmosphere levels at which smb_3d and artm_3d are provided + + type(parallel_type) :: parallel ! info for parallel communication + + character(len=100) :: message + + ! initialize + + rtest = -999 + itest = 1 + jtest = 1 + if (this_rank == model%numerics%rdiag_local) then + rtest = model%numerics%rdiag_local + itest = model%numerics%idiag_local + jtest = model%numerics%jdiag_local + endif + + ewn = model%general%ewn + nsn = model%general%nsn + nzatm = model%climate%nzatm + parallel = model%parallel + + ! ------------------------------------------------------------------------ + ! Depending on the SMB input options, compute model%climate%acab at the ice surface. + ! The options are: + ! (0) SMB(x,y); no dependence on surface elevation + ! (1) SMB(x,y) + dSMB/dz(x,y) * dz; SMB depends on input field at reference elevation, plus vertical correction + ! (2) SMB(x,y,z); SMB obtained by linear interpolation between values prescribed at adjacent vertical levels + ! (3) SMB obtained from precip and artm using a positive-degree scheme + ! + ! Options (1) and (2) require input fields with SMB units of mm/yr w.e. (SMB_INPUT_MMYR_WE) + ! For these options, the elevation-dependent SMB is computed here. + ! ------------------------------------------------------------------------ + + ! Make sure the required input fields are present with nonzero values + + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then + + if (parallel_is_zero(model%climate%smb_ref) .or. & + parallel_is_zero(model%climate%usrf_ref) .or. 
& + parallel_is_zero(model%climate%smb_gradz)) then + write(message,*) & + 'Error: Must have nonzero smb_ref, smb_gradz and usrf_ref with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + + if (parallel_is_zero(model%climate%smb_3d)) then + write(message,*) & + 'Error: Must have nonzero smb_3d with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then + + if (parallel_is_zero(model%climate%artm) .or. & + parallel_is_zero(model%climate%precip)) then + write(message,*) & + 'Error: Must have nonzero artm and precip with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + + endif + + ! Do the downscaling + + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then + + ! downscale SMB to the local surface elevation + model%climate%smb(:,:) = model%climate%smb_ref(:,:) + & + (model%geometry%usrf(:,:) - model%climate%usrf_ref(:,:)) * model%climate%smb_gradz(:,:) + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + + ! downscale SMB to the local surface elevation + ! Note: With linear_extrapolate_in = F, the values at top and bottom levels are simply extended upward and downward. + ! For SMB, this is safer than linear extrapolation (especially when extrapolating upward). + + call glissade_vertical_interpolate(& + ewn, nsn, & + nzatm, model%climate%zatm, & + model%geometry%usrf, & + model%climate%smb_3d, & + model%climate%smb, & + linear_extrapolate_in = .false.) + + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then + + ! Compute SMB using a simple PDD scheme: + ! (1) Partition precip as rain or snow based on the downscaled artm + ! 
(2) Compute ablation based on artm and a degree factor + ! Assume that artm has already been downscaled, if needed, based on artm_input_function. + + ! Note: This is similar to the SMB calculation for glaciers, but that calculation is done in the glacier module. + ! TODO: Put the glacier values of snow_threshold_min and snow_threshold_max in the climate derived type. + + ! compute snow accumulation (mm/yr w.e.) + where (model%climate%artm > model%climate%snow_threshold_max) + model%climate%snow = 0.0d0 ! all precip falls as rain + elsewhere (model%climate%artm < model%climate%snow_threshold_min) + model%climate%snow = model%climate%precip ! all precip falls as snow + elsewhere (model%climate%artm > model%climate%snow_threshold_min) + model%climate%snow = model%climate%precip * (model%climate%snow_threshold_max - model%climate%artm) & + / (model%climate%snow_threshold_max - model%climate%snow_threshold_min) + endwhere + + ! compute ablation (mm/yr w.e.) + ! Note: degree_factor has units of mm/yr w.e./degC to be consistent with other mass-balance variables. + ! It is like mu_star for glaciers. + model%climate%ablation = model%climate%degree_factor * max(model%climate%artm - model%climate%tmlt, 0.0d0) + + ! compute smb (mm/yr w.e.) + model%climate%smb = model%climate%snow - model%climate%ablation + + ! set smb = 0 for open ocean + where (model%geometry%thck == 0.0d0 .and. (model%geometry%topg - model%climate%eus) < 0.0d0) + model%climate%smb = 0.0d0 + endwhere + + endif ! smb_input_function + + call parallel_halo(model%climate%smb, parallel) + + ! For the non-default smb_input_function options, make sure the SMB is nonzero somewhere. + ! For the default option, do not abort, since idealized tests often have a zero SMB. 
+ + if (model%options%smb_input_function /= SMB_INPUT_FUNCTION_XY) then + if (parallel_is_zero(model%climate%smb)) then + write(message,*) 'Error: smb = 0 everywhere with smb_input_function =', & + model%options%smb_input_function + call write_log(trim(message), GM_FATAL) + endif + endif + + ! optional diagnostics + + if (verbose_smb) then + + if (this_rank == rtest) then + write(iulog,*) 'Computing runtime smb with smb_input_function =', model%options%smb_input_function + endif + call point_diag(model%geometry%usrf, 'usrf (m)', itest, jtest, rtest, 7, 7) + + if (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY) then + call point_diag(model%climate%smb, 'smb (mm/yr)', itest, jtest, rtest, 7, 7) + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XY_GRADZ) then + call point_diag(model%geometry%usrf - model%climate%usrf_ref, 'usrf - usrf_ref (m)', & + itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb_ref, 'reference smb (mm/yr)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb_gradz, 'smb_gradz (mm/yr per m)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_XYZ) then + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'smb_3d at each level:' + endif + do k = 1, nzatm + if (this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'k =', k + endif + call point_diag(model%climate%smb_3d(k,:,:), 'smb_3d (mm/yr)', itest, jtest, rtest, 7, 7) + enddo + call point_diag(model%climate%smb, 'downscaled smb (mm/yr)', itest, jtest, rtest, 7, 7) + elseif (model%options%smb_input_function == SMB_INPUT_FUNCTION_PDD) then + call point_diag(model%climate%artm, 'artm (deg C)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%precip, 'precip (mm/yr)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%snow, 'snow (mm/yr)', itest, jtest, rtest, 7, 7) + call 
point_diag(model%climate%ablation, 'ablation (mm/yr)', itest, jtest, rtest, 7, 7) + call point_diag(model%climate%smb,'smb (mm/yr)', itest, jtest, rtest, 7, 7) + endif ! smb_input_function + + endif ! verbose_smb + + end subroutine downscale_smb + !======================================================================= subroutine glissade_apply_smb(model) ! Apply the SMB at the upper and lower surfaces, and recompute tracer values. - use glimmer_paramets, only: eps11 use glimmer_physcon, only: rhow, rhoi, scyr use glissade_masks, only: glissade_get_masks use glissade_calving, only: verbose_calving @@ -928,6 +1103,7 @@ subroutine mass_balance_driver(& call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_init, & tracers(:,:,:,:), mtsum_init(:)) @@ -1006,9 +1182,9 @@ subroutine mass_balance_driver(& enddo enddo - sum_acab = parallel_reduce_sum(sum_acab) - sum_bmlt = parallel_reduce_sum(sum_bmlt) - sum_melt_potential = parallel_reduce_sum(sum_melt_potential) + sum_acab = parallel_global_sum(acab*effective_areafrac, parallel) + sum_bmlt = parallel_global_sum(bmlt*effective_areafrac, parallel) + sum_melt_potential = parallel_global_sum(melt_potential, parallel) msum_init = msum_init + (sum_acab - sum_bmlt)*dt @@ -1017,6 +1193,7 @@ subroutine mass_balance_driver(& call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_final, & tracers(:,:,:,:), mtsum_final(:)) diff --git a/libglissade/glissade_transport.F90 b/libglissade/glissade_transport.F90 index b42d6866..9ba618f4 100644 --- a/libglissade/glissade_transport.F90 +++ b/libglissade/glissade_transport.F90 @@ -45,7 +45,7 @@ module glissade_transport use glimmer_log use glissade_remap, only: glissade_horizontal_remap, make_remap_mask, puny use cism_parallel, only: this_rank, main_task, nhalo, lhalo, uhalo, staggered_lhalo, staggered_uhalo, & - parallel_type, parallel_reduce_max, parallel_reduce_sum, parallel_reduce_minloc, & + 
parallel_type, parallel_global_sum, parallel_reduce_max, parallel_reduce_minloc, & parallel_globalindex, broadcast implicit none @@ -445,6 +445,7 @@ subroutine glissade_transport_driver(dt, & call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_init, & tracers(:,:,:,:), mtsum_init(:)) endif @@ -603,6 +604,7 @@ subroutine glissade_transport_driver(dt, & call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_final, & tracers(:,:,:,:), mtsum_final(:)) @@ -661,6 +663,7 @@ subroutine glissade_transport_driver(dt, & call glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracers, & + parallel, & thck_layer(:,:,:), msum_final, & tracers(:,:,:,:), mtsum_final(:)) @@ -950,6 +953,7 @@ end subroutine glissade_check_cfl subroutine glissade_sum_mass_and_tracers(& nx, ny, & nlyr, ntracer, & + parallel, & thck_layer, msum, & tracer, mtsum) @@ -963,6 +967,9 @@ subroutine glissade_sum_mass_and_tracers(& nlyr, &! number of vertical layers ntracer ! number of tracers + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + real(dp), dimension (nx,ny,nlyr), intent(in) :: & thck_layer ! ice layer thickness @@ -977,30 +984,21 @@ subroutine glissade_sum_mass_and_tracers(& ! Local arguments - integer :: i, j, nt - - msum = 0.d0 - if (present(mtsum)) mtsum(:) = 0.d0 + integer :: nt, k - do j = 1+nhalo, ny-nhalo - do i = 1+nhalo, nx-nhalo - - ! accumulate ice mass and mass*tracers - ! (actually, accumulate thickness, assuming rhoi*dx*dy is the same for each cell) - - msum = msum + sum(thck_layer(i,j,:)) - - if (present(mtsum)) then - do nt = 1, ntracer - mtsum(nt) = mtsum(nt) + sum(tracer(i,j,nt,:)*thck_layer(i,j,:)) - enddo - endif - - enddo ! i - enddo ! 
j + msum = 0.0d0 + do k = 1, nlyr + msum = msum + parallel_global_sum(thck_layer(:,:,k), parallel) + enddo - msum = parallel_reduce_sum(msum) - if (present(mtsum)) mtsum = parallel_reduce_sum(mtsum) + if (present(mtsum)) then + mtsum(:) = 0.0d0 + do k = 1, nlyr + do nt = 1, ntracer + mtsum(nt) = mtsum(nt) + parallel_global_sum(tracer(:,:,nt,k)*thck_layer(:,:,k), parallel) + enddo + enddo + endif end subroutine glissade_sum_mass_and_tracers diff --git a/libglissade/glissade_utils.F90 b/libglissade/glissade_utils.F90 index e7e3431f..4f1de07f 100644 --- a/libglissade/glissade_utils.F90 +++ b/libglissade/glissade_utils.F90 @@ -44,7 +44,12 @@ module glissade_utils glissade_basin_sum, glissade_basin_average, & glissade_usrf_to_thck, glissade_thck_to_usrf, & glissade_edge_fluxes, glissade_input_fluxes, & - glissade_rms_error + glissade_rms_error, write_array_to_file + + interface write_array_to_file + module procedure write_array_to_file_real8_2d + module procedure write_array_to_file_real8_3d + end interface contains @@ -516,8 +521,10 @@ end subroutine glissade_adjust_topography !**************************************************** + !TODO - Calls to this subroutine could be replaced by inline calls to parallel_global_sum_patch subroutine glissade_basin_sum(& nx, ny, & + parallel, & nbasin, basin_number, & rmask, & field_2d, & @@ -527,11 +534,14 @@ subroutine glissade_basin_sum(& ! The sum is taken over grid cells with mask = 1. ! All cells are weighted equally. - use cism_parallel, only: parallel_reduce_sum, nhalo + use cism_parallel, only: parallel_global_sum_patch integer, intent(in) :: & nx, ny !> number of grid cells in each dimension + type(parallel_type), intent(in) :: & + parallel !> info for parallel communication + integer, intent(in) :: & nbasin !> number of basins @@ -546,29 +556,10 @@ subroutine glissade_basin_sum(& real(dp), dimension(nbasin), intent(out) :: & field_basin_sum !> basin-sum output field - ! 
local variables - - integer :: i, j, nb - !TODO - Replace sumcell with sumarea, and pass in cell area. ! Current algorithm assumes all cells with mask = 1 have equal weight. - real(dp), dimension(nbasin) :: & - sumfield_local ! sum of field on local task - - sumfield_local(:) = 0.0d0 - - ! loop over locally owned cells - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - nb = basin_number(i,j) - if (nb >= 1) then - sumfield_local(nb) = sumfield_local(nb) + rmask(i,j)*field_2d(i,j) - endif - enddo - enddo - - field_basin_sum(:) = parallel_reduce_sum(sumfield_local(:)) + field_basin_sum = parallel_global_sum_patch(rmask*field_2d, nbasin, basin_number, parallel) end subroutine glissade_basin_sum @@ -576,6 +567,7 @@ end subroutine glissade_basin_sum subroutine glissade_basin_average(& nx, ny, & + parallel, & nbasin, basin_number, & rmask, & field_2d, & @@ -586,11 +578,14 @@ subroutine glissade_basin_average(& ! All cells are weighted equally. ! Note: This subroutine assumes an input field located at cell centers - use cism_parallel, only: parallel_reduce_sum, nhalo + use cism_parallel, only: parallel_global_sum_patch integer, intent(in) :: & nx, ny !> number of grid cells in each dimension + type(parallel_type), intent(in) :: & + parallel !> info for parallel communication + integer, intent(in) :: & nbasin !> number of basins @@ -607,33 +602,17 @@ subroutine glissade_basin_average(& ! local variables - integer :: i, j, nb + integer :: nb !TODO - Replace sumcell with sumarea, and pass in cell area. ! Current algorithm assumes all cells with mask = 1 have equal weight. real(dp), dimension(nbasin) :: & - summask_local, & ! sum of mask in each basin on local task summask_global, & ! sum of mask in each basin on full domain - sumfield_local, & ! sum of field on local task sumfield_global ! sum of field over full domain - summask_local(:) = 0.0d0 - sumfield_local(:) = 0.0d0 - - ! 
loop over locally owned cells only - do j = nhalo+1, ny-nhalo - do i = nhalo+1, nx-nhalo - nb = basin_number(i,j) - if (nb >= 1) then - summask_local(nb) = summask_local(nb) + rmask(i,j) - sumfield_local(nb) = sumfield_local(nb) + rmask(i,j)*field_2d(i,j) - endif - enddo - enddo - - summask_global(:) = parallel_reduce_sum(summask_local(:)) - sumfield_global(:) = parallel_reduce_sum(sumfield_local(:)) + summask_global = parallel_global_sum_patch(rmask, nbasin, basin_number, parallel) + sumfield_global = parallel_global_sum_patch(rmask*field_2d, nbasin, basin_number, parallel) do nb = 1, nbasin if (summask_global(nb) > tiny(0.0d0)) then @@ -708,7 +687,7 @@ subroutine glissade_usrf_to_thck(usrf, topg, eus, thck) ! That is, if topg - eus < 0 (marine-based ice), and if the upper surface is too close ! to sea level to ground the ice, then the ice thickness is chosen to satisfy ! rhoi*H = -rhoo*(topg-eus). - ! Note: usrf, topg, eus and thck must all have the same units (often but not necessarily meters). + ! Note: usrf, topg, eus and thck must all have the same units (usually but not necessarily meters). use glimmer_physcon, only : rhoo, rhoi @@ -971,10 +950,154 @@ subroutine glissade_input_fluxes(& end subroutine glissade_input_fluxes + + ! subroutines belonging to the write_array_to_file interface + subroutine write_array_to_file_real8_2d(arr, fileunit, filename, parallel, write_binary) + + ! Copy the input array into a global array and write all values to an output file. + ! This can be useful for debugging, if we want to find differences between two fields + ! (e.g., in two different runs). + ! This version writes out 64-bit character strings corresponding to the binary representation + ! of each floating-point variable. This can be useful for BFB comparisons. + ! Sometimes, two floating-point variables appear to have the same values in base 10, + ! when the last few bits actually vary. 
+ !TODO - Allow either float or binary output + + use glimmer_utils, only: double_to_binary + use cism_parallel, only: gather_var + + real(dp), dimension(:,:), intent(in) :: arr + integer, intent(in) :: fileunit + character(len=*), intent(in) :: filename + type(parallel_type), intent(in) :: parallel + logical, intent(in), optional :: write_binary + + integer :: i, j + character(len=64) :: binary_str + real(dp), dimension(:,:), allocatable :: arr_global + logical :: binary_output + + if (present(write_binary)) then + binary_output = write_binary + else + binary_output = .false. + endif + + call gather_var(arr, arr_global, parallel) + if (main_task) then + open(unit=fileunit, file=trim(filename), status='replace', position='append') + + if (binary_output) then + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + call double_to_binary(arr_global(i,j), binary_str) + write (fileunit, '(2i6,a4,a64)') i, j, ' ', binary_str + enddo + enddo + else + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + write (fileunit, '(2i6,a4,f24.16)') i, j, ' ', arr_global(i,j) + enddo + enddo + endif + + close(unit=fileunit) + deallocate(arr_global) + endif + + end subroutine write_array_to_file_real8_2d + + + subroutine write_array_to_file_real8_3d(arr, fileunit, filename, parallel, write_binary, cycle_indices) + + ! Copy the input array into a global array and write all values to an output file. + ! This can be useful for debugging, if we want to find differences between two fields + ! (e.g., in two different runs). + ! This version writes out 64-bit character strings corresponding to the binary representation + ! of each floating-point variable. This can be useful for BFB comparisons. + ! Sometimes, two floating-point variables appear to have the same values in base 10, + ! when the last few bits actually vary. 
+ !TODO - Allow either float or binary output + + use glimmer_utils, only: double_to_binary + use cism_parallel, only: gather_var + + real(dp), dimension(:,:,:), intent(in) :: arr ! first two indices are i and j + integer, intent(in) :: fileunit + character(len=*), intent(in) :: filename + type(parallel_type), intent(in) :: parallel + logical, intent(in), optional :: write_binary + logical, intent(in), optional :: cycle_indices ! if true, then index 3->1, 1->2, 2->3 + + integer :: i, j, k, kmax + character(len=64) :: binary_str + real(dp), dimension(:,:,:), allocatable :: arr_global + real(dp), dimension(:,:,:), allocatable :: arr_cycle + logical :: binary_output + logical :: cycle_ind + + if (present(write_binary)) then + binary_output = write_binary + else + binary_output = .false. + endif + + if (present(cycle_indices)) then + cycle_ind = cycle_indices + else + cycle_ind = .false. + endif + + if (cycle_ind) then + allocate(arr_cycle(size(arr,3), size(arr,1), size(arr,2))) + kmax = size(arr,3) + do j = 1, size(arr,2) + do i = 1, size(arr,1) + do k = 1, kmax + arr_cycle(k,i,j) = arr(i,j,k) + enddo + enddo + enddo + call gather_var(arr_cycle, arr_global, parallel) + deallocate(arr_cycle) + else + kmax = size(arr,1) + call gather_var(arr, arr_global, parallel) + endif + + if (main_task) then + open(unit=fileunit, file=trim(filename), status='unknown') + + if (binary_output) then + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + do k = 1, kmax + call double_to_binary(arr_global(k,i,j), binary_str) + write (fileunit, '(3i6,a4,a64)') i, j, k, ' ', binary_str + enddo + enddo + enddo + else + do j = 1, parallel%global_nsn + do i = 1, parallel%global_ewn + do k = 1, kmax + write (fileunit, '(3i6,a4,f24.16)') i, j, k, ' ', arr_global(k,i,j) + enddo + enddo + enddo + endif + + close(unit=fileunit) + deallocate(arr_global) + endif + + end subroutine write_array_to_file_real8_3d + !**************************************************************************** 
!TODO - Other utility subroutines to add here? -! E.g., tridiag; calclsrf; subroutines to zero out tracers +! E.g., calclsrf; subroutines to zero out tracers !**************************************************************************** diff --git a/libglissade/glissade_velo_higher.F90 b/libglissade/glissade_velo_higher.F90 index 3b146e44..1ea5b2eb 100644 --- a/libglissade/glissade_velo_higher.F90 +++ b/libglissade/glissade_velo_higher.F90 @@ -56,11 +56,11 @@ module glissade_velo_higher - use glimmer_global, only: dp + use glimmer_global, only: dp, i8 use glimmer_physcon, only: n_glen, rhoi, rhoo, grav, scyr, pi - use glimmer_paramets, only: iulog, eps08, eps10, eps11 + use glimmer_paramets, only: iulog, eps11, eps10 use glimmer_paramets, only: velo_scale, len_scale ! used for whichefvs = HO_EFVS_FLOWFACT - use glimmer_utils, only: point_diag + use glimmer_utils, only: point_diag, double_to_binary use glimmer_log use glimmer_sparse_type use glimmer_sparse @@ -92,7 +92,9 @@ module glissade_velo_higher use cism_parallel, only: this_rank, main_task, nhalo, tasks, & parallel_type, parallel_halo, staggered_parallel_halo, parallel_globalindex, & parallel_reduce_max, parallel_reduce_sum, not_parallel - + !WHL - debug + use cism_parallel, only: parallel_global_sum_stagger + use cism_reprosum_mod, only: verbose_reprosum implicit none private @@ -225,8 +227,8 @@ module glissade_velo_higher ! logical :: verbose_bfric = .true. logical :: verbose_trilinos = .false. ! logical :: verbose_trilinos = .true. - logical :: verbose_beta = .false. -! logical :: verbose_beta = .true. +! logical :: verbose_beta = .false. + logical :: verbose_beta = .true. logical :: verbose_efvs = .false. ! logical :: verbose_efvs = .true. logical :: verbose_tau = .false. @@ -239,6 +241,8 @@ module glissade_velo_higher ! logical :: verbose_L1L2 = .true. logical :: verbose_diva = .false. ! logical :: verbose_diva = .true. + logical :: verbose_bp = .false. +! logical :: verbose_bp = .true. 
logical :: verbose_glp = .false. ! logical :: verbose_glp = .true. logical :: verbose_picard = .false. @@ -259,6 +263,10 @@ module glissade_velo_higher dphi_dyr_3d_vav, &! vertical avg of dphi_dyr_3d dphi_dzr_3d_vav ! vertical avg of dphi_dzr_3d + !WHL - debug for reprosum + character(len=64) :: binary_str ! string representation of binary number (chain of 0's and 1's) + character(len=64) :: binary_str1, binary_str2 + contains !**************************************************************************** @@ -669,6 +677,9 @@ subroutine glissade_velo_higher_solve(model, & use glide_thck, only: glide_calclsrf use profile, only: t_startf, t_stopf + !WHL - debug + use glissade_utils, only: write_array_to_file + !---------------------------------------------------------------- ! Input-output arguments !---------------------------------------------------------------- @@ -1047,11 +1058,13 @@ subroutine glissade_velo_higher_solve(model, & uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration duvel_2d, dvvel_2d, & ! difference between current and previous velocity solutions uvel_2d_sav, vvel_2d_sav, & ! current best value for velocity solution (smallest residual) - beta_internal_sav ! beta_internal associated with saved velocity + beta_internal_sav, & ! beta_internal associated with saved velocity + beta_eff_x_sav, beta_eff_y_sav real(dp), dimension(:,:,:), allocatable :: & Auu_2d_sav, Auv_2d_sav, & ! assembled matrices associated with (uvel_2d_sav, vvel_2d_sav) - Avu_2d_sav, Avv_2d_sav + Avu_2d_sav, Avv_2d_sav, & + omega_k_sav ! for an accelerated 3D solve: real(dp), dimension(:,:,:), allocatable :: & @@ -1066,6 +1079,15 @@ subroutine glissade_velo_higher_solve(model, & integer :: itest, jtest ! coordinates of diagnostic point integer :: rtest ! 
task number for processor containing diagnostic point + !WHL - debug + real(dp), dimension(nNodeNeighbors_2d) :: & + sum_Auu, sum_Auv, sum_Avu, sum_Avv + real(dp) :: sum_uvel, sum_vvel + real(dp) :: sum_bu, sum_bv + real(dp) :: sum_flwa, sum_flwafact, sum_btrx, sum_btry, sum_stagusrf, sum_stagthck + real(dp) :: sum_betax, sum_betay, sum_omega, sum_stag_omega + real(dp), dimension(:,:), allocatable :: arr_global ! temporary global array + call t_startf('glissade_vhs_init') rtest = -999 itest = 1 @@ -1268,7 +1290,15 @@ subroutine glissade_velo_higher_solve(model, & allocate(vsav_2d(nx-1,ny-1)) allocate(resid_u_2d(nx-1,ny-1)) allocate(resid_v_2d(nx-1,ny-1)) - if (accel_picard) then + else ! 3d + allocate(Auu(nNodeNeighbors_3d,nz,nx-1,ny-1)) + allocate(Auv(nNodeNeighbors_3d,nz,nx-1,ny-1)) + allocate(Avu(nNodeNeighbors_3d,nz,nx-1,ny-1)) + allocate(Avv(nNodeNeighbors_3d,nz,nx-1,ny-1)) + endif + + if (accel_picard) then + if (solve_2d) then allocate(uvel_2d_old(nx-1,ny-1)) allocate(vvel_2d_old(nx-1,ny-1)) allocate(duvel_2d(nx-1,ny-1)) @@ -1279,14 +1309,7 @@ subroutine glissade_velo_higher_solve(model, & allocate(Auv_2d_sav(nx-1,ny-1,nNodeNeighbors_2d)) allocate(Avu_2d_sav(nx-1,ny-1,nNodeNeighbors_2d)) allocate(Avv_2d_sav(nx-1,ny-1,nNodeNeighbors_2d)) - allocate(beta_internal_sav(nx-1,ny-1)) - endif - else - allocate(Auu(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(Auv(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(Avu(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(Avv(nNodeNeighbors_3d,nz,nx-1,ny-1)) - if (accel_picard) then + else ! 
3d allocate(uvel_old(nz,nx-1,ny-1)) allocate(vvel_old(nz,nx-1,ny-1)) allocate(duvel(nz,nx-1,ny-1)) @@ -1297,9 +1320,13 @@ subroutine glissade_velo_higher_solve(model, & allocate(Auv_sav(nNodeNeighbors_3d,nz,nx-1,ny-1)) allocate(Avu_sav(nNodeNeighbors_3d,nz,nx-1,ny-1)) allocate(Avv_sav(nNodeNeighbors_3d,nz,nx-1,ny-1)) - allocate(beta_internal_sav(nx-1,ny-1)) endif - endif + allocate(beta_internal_sav(nx-1,ny-1)) + !Note: The next three are used only for DIVA, but it's simpler to allocate them regardless + allocate(beta_eff_x_sav(nx-1,ny-1)) + allocate(beta_eff_y_sav(nx-1,ny-1)) + allocate(omega_k_sav(nz,nx,ny)) + endif ! accel_picard if (whichapprox == HO_APPROX_DIVA) then !! call parallel_halo(efvs, parallel) ! efvs halo update is in glissade_diagnostic_variable_solve @@ -1585,19 +1612,21 @@ subroutine glissade_velo_higher_solve(model, & ! unique local ID to each such node. !------------------------------------------------------------------------------ -!pw call t_startf('glissade_get_vertex_geom') - call get_vertex_geometry(nx, ny, & - nz, nhalo, & - parallel, & - dx, dy, & - itest, jtest, rtest, & - ice_mask, & - xVertex, yVertex, & - active_cell, active_vertex, & - nNodesSolve, nVerticesSolve, & - nodeID, vertexID, & - iNodeIndex, jNodeIndex, kNodeIndex, & - iVertexIndex, jVertexIndex) + !pw call t_startf('glissade_get_vertex_geom') + call get_vertex_geometry(& + nx, ny, & + nz, nhalo, & + parallel, & + model%general%x0, model%general%y0, & + dx, dy, & + itest, jtest, rtest, & + ice_mask, & + xVertex, yVertex, & + active_cell, active_vertex, & + nNodesSolve, nVerticesSolve, & + nodeID, vertexID, & + iNodeIndex, jNodeIndex, kNodeIndex, & + iVertexIndex, jVertexIndex) !pw call t_stopf('glissade_get_vertex_geom') ! 
Zero out the velocity for inactive vertices @@ -2181,8 +2210,7 @@ subroutine glissade_velo_higher_solve(model, & itest, jtest, rtest, & active_vertex, diva_level_index, & ice_plus_land_mask, & - stag_omega, omega_k, & - beta_internal, & + omega_k, beta_internal, & beta_eff_x, beta_eff_y, & stag_theta_slope_x, stag_theta_slope_y, & stag_diva_slope_factor_x, & @@ -2263,7 +2291,6 @@ subroutine glissade_velo_higher_solve(model, & vbas(:,:) = vvel(nz,:,:) endif -!! if (verbose_beta) then if (verbose_beta .and. counter==1) then if (this_rank == rtest) write(iulog,*) 'Before calcbeta, counter =', counter call point_diag(usrf, 'usrf (m)', itest, jtest, rtest, 7, 7) @@ -2327,6 +2354,36 @@ subroutine glissade_velo_higher_solve(model, & usav_2d(:,:) = uvel_2d(:,:) vsav_2d(:,:) = vvel_2d(:,:) + !WHL - debug - BFB check + sum_uvel = parallel_global_sum_stagger(uvel_2d, parallel) + sum_vvel = parallel_global_sum_stagger(vvel_2d, parallel) + sum_flwa = parallel_global_sum_stagger(flwa, parallel) + sum_flwafact = parallel_global_sum_stagger(flwafact, parallel) + sum_btrx = parallel_global_sum_stagger(btractx, parallel) + sum_btry = parallel_global_sum_stagger(btracty, parallel) + sum_stagusrf = parallel_global_sum_stagger(stagusrf, parallel) + sum_stagthck = parallel_global_sum_stagger(stagthck, parallel) +!! 
if (this_rank == rtest) then + if (0 == 1) then + write(iulog,*) ' ' + call double_to_binary(sum_uvel, binary_str) + write(iulog,*) 'Before assembly: sum_uvel, binary_str:', sum_uvel, binary_str + call double_to_binary(sum_vvel, binary_str) + write(iulog,*) 'Before assembly: sum_vvel, binary_str:', sum_vvel, binary_str + call double_to_binary(sum_flwa, binary_str) + write(iulog,*) 'Before assembly: sum_flwa, binary_str:', sum_flwa, binary_str + call double_to_binary(sum_flwafact, binary_str) + write(iulog,*) 'Before assembly: sum_flwafact, binary_str:', sum_flwafact, binary_str + call double_to_binary(sum_btrx, binary_str) + write(iulog,*) 'Before assembly: sum_btrx, binary_str:', sum_btrx, binary_str + call double_to_binary(sum_btry, binary_str) + write(iulog,*) 'Before assembly: sum_btry, binary_str:', sum_btry, binary_str + call double_to_binary(sum_stagusrf, binary_str) + write(iulog,*) 'Before assembly: sum_stagusrf, binary_str:', sum_stagusrf, binary_str + call double_to_binary(sum_stagthck, binary_str) + write(iulog,*) 'Before assembly: sum_stagthck, binary_str:', sum_stagthck, binary_str + endif + ! Assemble the matrix call assemble_stiffness_matrix_2d(nx, ny, & @@ -2351,6 +2408,45 @@ subroutine glissade_velo_higher_solve(model, & omega_k, omega, & efvs_qp_3d) + !WHL - debug - BFB check + if (verbose_reprosum .and. counter == 1) then +!! if (main_task) write(iulog,*) 'Write out matrices after assemble_stiffness_matrix' +!! call write_array_to_file(Auu_2d, 21, 'global_Auu1', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Auv_2d, 22, 'global_Auv1', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avu_2d, 23, 'global_Avu1', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avv_2d, 24, 'global_Avv1', parallel, write_binary = .true., cycle_indices = .true.) 
+ sum_Auu(:) = parallel_global_sum_stagger(Auu_2d, nNodeNeighbors_2d, parallel) + sum_Auv(:) = parallel_global_sum_stagger(Auv_2d, nNodeNeighbors_2d, parallel) + sum_Avu(:) = parallel_global_sum_stagger(Avu_2d, nNodeNeighbors_2d, parallel) + sum_Avv(:) = parallel_global_sum_stagger(Avv_2d, nNodeNeighbors_2d, parallel) + if (main_task) then + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auu(n), binary_str) + write(iulog,*) n, sum_Auu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auv(n), binary_str) + write(iulog,*) n, sum_Auv(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avu(n), binary_str) + write(iulog,*) n, sum_Avu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avv(n), binary_str) + write(iulog,*) n, sum_Avv(n), binary_str + enddo + endif + endif ! verbose_reprosum + if (whichapprox == HO_APPROX_DIVA) then ! Halo update for omega @@ -2409,8 +2505,8 @@ subroutine glissade_velo_higher_solve(model, & beta_eff_x(:,:) = 0.d0 beta_eff_y(:,:) = 0.d0 - !Note: The 'if' is not strictly needed, since the corrected beta_eff is equal - ! to the uncorrected beta_eff whe slope_factor = 1.0 and theta_slope = 0.0 + !Note: The 'if diva_slope_correction' is not strictly needed, since the more complicated + ! equations reduce to the simpler ones when slope_factor = 1.0 and theta_slope = 0.0. if (diva_slope_correction) then ! compute a larger beta_eff at each vertex based on the slope if (whichbabc == HO_BABC_NO_SLIP) then @@ -2433,8 +2529,8 @@ subroutine glissade_velo_higher_solve(model, & beta_eff_y = 1.d0 / stag_omega endwhere else ! 
slip allowed at bed - beta_eff_x = beta_internal(:,:) / (1.d0 + beta_internal*stag_omega) - beta_eff_y = beta_internal(:,:) / (1.d0 + beta_internal*stag_omega) + beta_eff_x = beta_internal / (1.d0 + beta_internal*stag_omega) + beta_eff_y = beta_internal / (1.d0 + beta_internal*stag_omega) endif endif @@ -2566,6 +2662,18 @@ subroutine glissade_velo_higher_solve(model, & call staggered_parallel_halo(bv_2d(:,:), parallel) call t_stopf('glissade_halo_bxxs') + !WHL - debug - Write all the matrix elements and rhs elements (in binary form) to files + if (verbose_reprosum .and. counter == 1) then +!! if (main_task) write(iulog,*) 'Write out matrices after adding BC' +!! call write_array_to_file(Auu_2d(:,:,5), 30, 'global_Auu2', parallel) ! diagonal terms only +!! call write_array_to_file(Auu_2d, 31, 'global_Auu2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Auv_2d, 32, 'global_Auv2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avu_2d, 33, 'global_Avu2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(Avv_2d, 34, 'global_Avv2', parallel, write_binary = .true., cycle_indices = .true.) +!! call write_array_to_file(bu_2d, 35, 'global_bu2', parallel, write_binary = .true.) +!! call write_array_to_file(bv_2d, 36, 'global_bv2', parallel, write_binary = .true.) + endif + !--------------------------------------------------------------------------- ! Check symmetry of assembled matrix ! 
@@ -2595,6 +2703,41 @@ subroutine glissade_velo_higher_solve(model, & active_vertex, & nNonzeros) + if (verbose_reprosum) then + sum_Auu(:) = parallel_global_sum_stagger(Auu_2d, nNodeNeighbors_2d, parallel) + sum_Auv(:) = parallel_global_sum_stagger(Auv_2d, nNodeNeighbors_2d, parallel) + sum_Avu(:) = parallel_global_sum_stagger(Avu_2d, nNodeNeighbors_2d, parallel) + sum_Avv(:) = parallel_global_sum_stagger(Avv_2d, nNodeNeighbors_2d, parallel) + sum_bu = parallel_global_sum_stagger(bu_2d, parallel) + sum_bv = parallel_global_sum_stagger(bv_2d, parallel) + if (main_task) then + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auu(n), binary_str) + write(iulog,*) n, sum_Auu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Auv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Auv(n), binary_str) + write(iulog,*) n, sum_Auv(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avu(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avu(n), binary_str) + write(iulog,*) n, sum_Avu(n), binary_str + enddo + write(iulog,*) ' ' + write(iulog,*) 'After assembly: n, sum_Avv(n), binary_str:' + do n = 1, nNodeNeighbors_2d + call double_to_binary(sum_Avv(n), binary_str) + write(iulog,*) n, sum_Avv(n), binary_str + enddo + endif + endif ! verbose_reprosum + if (write_matrix) then if (counter == 1) then ! first outer iteration only call t_startf('glissade_wrt_mat') @@ -2608,7 +2751,7 @@ subroutine glissade_velo_higher_solve(model, & endif endif ! write_matrix - if (verbose_matrix .and. this_rank==rtest) then + if (verbose_matrix .and. 
main_task) then i = itest j = jtest write(iulog,*) ' ' @@ -2879,6 +3022,7 @@ subroutine glissade_velo_higher_solve(model, & if (solve_2d) then + call t_startf('glissade_resid_vec') call compute_residual_vector_2d(nx, ny, & parallel, & @@ -2895,41 +3039,40 @@ subroutine glissade_velo_higher_solve(model, & call t_startf('glissade_accel_picard') if (accel_picard) then - if (verbose_picard) then - if (this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'Saved L2 norm, new L2 norm:', L2_norm_alpha_sav, L2_norm - endif - call point_diag(resid_u_2d, 'resid_u_2d', itest, jtest, rtest, 7, 7, '(e10.3)') - call point_diag(uvel_2d, 'uvel_2d', itest, jtest, rtest, 7, 7, '(f10.3)') + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) ' ' + write(iulog,*) 'Saved L2 norm, new L2 norm:', L2_norm_alpha_sav, L2_norm endif if (counter >= 2) then - call evaluate_accelerated_picard_2d(nx, ny, & - L2_norm, L2_norm_large, & - L2_norm_alpha_sav, & - alpha_accel, alpha_accel_max, & - gamma_accel, resid_reduction_threshold, & - uvel_2d, vvel_2d, & - Auu_2d, Auv_2d, & - Avu_2d, Avv_2d, & - uvel_2d_old, vvel_2d_old, & - duvel_2d, dvvel_2d, & - uvel_2d_sav, vvel_2d_sav, & - Auu_2d_sav, Auv_2d_sav, & - Avu_2d_sav, Avv_2d_sav, & - beta_internal, beta_internal_sav, & - assembly_is_done) + call evaluate_accelerated_picard_2d(& + whichapprox, rtest, & + L2_norm, L2_norm_large, & + L2_norm_alpha_sav, & + alpha_accel, alpha_accel_max, & + gamma_accel, resid_reduction_threshold, & + uvel_2d, vvel_2d, & + Auu_2d, Auv_2d, & + Avu_2d, Avv_2d, & + uvel_2d_old, vvel_2d_old, & + duvel_2d, dvvel_2d, & + uvel_2d_sav, vvel_2d_sav, & + Auu_2d_sav, Auv_2d_sav, & + Avu_2d_sav, Avv_2d_sav, & + beta_internal, beta_internal_sav, & + beta_eff_x, beta_eff_x_sav, & + beta_eff_y, beta_eff_y_sav, & + omega_k, omega_k_sav, & + assembly_is_done) else ! counter = 1 ! proceed to the matrix solution assembly_is_done = .true. - if (verbose_picard .and. 
main_task) then - write(iulog,*) 'nonlinear counter = 1; continue to matrix solver' - endif + if (verbose_picard .and. this_rank == rtest) & + write(iulog,*) 'nonlinear counter = 1; continue to matrix solver' endif ! counter >= 2 @@ -2970,20 +3113,22 @@ subroutine glissade_velo_higher_solve(model, & if (counter >= 2) then - call evaluate_accelerated_picard_3d(L2_norm, L2_norm_large, & - L2_norm_alpha_sav, & - alpha_accel, alpha_accel_max, & - gamma_accel, resid_reduction_threshold, & - uvel, vvel, & - Auu, Auv, & - Avu, Avv, & - uvel_old, vvel_old, & - duvel, dvvel, & - uvel_sav, vvel_sav, & - Auu_sav, Auv_sav, & - Avu_sav, Avv_sav, & - beta_internal, beta_internal_sav, & - assembly_is_done) + call evaluate_accelerated_picard_3d(& + rtest, & + L2_norm, L2_norm_large, & + L2_norm_alpha_sav, & + alpha_accel, alpha_accel_max, & + gamma_accel, resid_reduction_threshold, & + uvel, vvel, & + Auu, Auv, & + Avu, Avv, & + uvel_old, vvel_old, & + duvel, dvvel, & + uvel_sav, vvel_sav, & + Auu_sav, Auv_sav, & + Avu_sav, Avv_sav, & + beta_internal, beta_internal_sav, & + assembly_is_done) else ! counter = 1 @@ -3012,9 +3157,9 @@ subroutine glissade_velo_higher_solve(model, & ! Optional diagnostics if (verbose_beta .and. counter > 1 .and. mod(counter-1,12)==0) then -!! 
if (verbose_beta) then - call point_diag(log10(max(beta_internal,1.d-99)), 'log_beta', itest, jtest, rtest, 7, 7, '(f10.5)') + if (this_rank == rtest) write(iulog,*) 'Counter =', counter + call point_diag(log10(max(beta_internal,1.d-99)), 'log_beta', itest, jtest, rtest, 7, 7) if (solve_2d) then call point_diag(uvel_2d, 'Mean uvel (m/yr)', itest, jtest, rtest, 7, 7) call point_diag(vvel_2d, 'Mean vvel (m/yr)', itest, jtest, rtest, 7, 7) @@ -3137,7 +3282,8 @@ subroutine glissade_velo_higher_solve(model, & call pcg_solver_standard_3d(nx, ny, & nz, parallel, & - indxA_3d, active_vertex, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -3407,8 +3553,7 @@ subroutine glissade_velo_higher_solve(model, & itest, jtest, rtest, & active_vertex, diva_level_index, & ice_plus_land_mask, & - stag_omega, omega_k, & - beta_internal, & + omega_k, beta_internal, & beta_eff_x, beta_eff_y, & stag_theta_slope_x, stag_theta_slope_y, & stag_diva_slope_factor_x, & @@ -3591,7 +3736,7 @@ subroutine glissade_velo_higher_solve(model, & !WHL - debug !TODO - One diagnostic to write out column velocities for any approximation - if (whichapprox == HO_APPROX_BP .and. this_rank==rtest) then + if (verbose_bp .and. whichapprox == HO_APPROX_BP .and. this_rank==rtest) then write(iulog,*) ' ' i = itest j = jtest @@ -3702,6 +3847,8 @@ subroutine glissade_velo_higher_solve(model, & deallocate(uvel_2d_sav, vvel_2d_sav) deallocate(Auu_2d_sav, Auv_2d_sav, Avu_2d_sav, Avv_2d_sav) deallocate(beta_internal_sav) + deallocate(beta_eff_x_sav, beta_eff_y_sav) + deallocate(omega_k_sav) endif else deallocate(Auu, Auv, Avu, Avv) @@ -3884,6 +4031,7 @@ end subroutine glissade_velo_higher_scale_output subroutine get_vertex_geometry(nx, ny, & nz, nhalo, & parallel, & + x0, y0, & dx, dy, & itest, jtest, rtest, & ice_mask, & @@ -3914,11 +4062,17 @@ subroutine get_vertex_geometry(nx, ny, & nhalo ! number of halo layers type(parallel_type), intent(in) :: & - parallel ! 
info for parallel communication + parallel ! info for parallel communication + + real(dp), dimension(nx-1), intent(in) :: & + x0 ! x coordinates of vertices + + real(dp), dimension(ny-1), intent(in) :: & + y0 ! y coordinates of vertices real(dp), intent(in) :: & - dx, dy ! grid cell length and width (m) - ! assumed to have the same value for each grid cell + dx, dy ! grid cell length and width (m) + ! assumed to have the same value for each grid cell integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point @@ -3970,17 +4124,15 @@ subroutine get_vertex_geometry(nx, ny, & staggered_jhi = parallel%staggered_jhi !---------------------------------------------------------------- - ! Compute the x and y coordinates of each vertex. + ! Copy the x and y coordinates of each vertex from x0 and y0. ! By convention, vertex (i,j) lies at the NE corner of cell(i,j). !---------------------------------------------------------------- - xVertex(:,:) = 0.d0 - yVertex(:,:) = 0.d0 do j = 1, ny-1 - do i = 1, nx-1 - xVertex(i,j) = dx * i - yVertex(i,j) = dy * j - enddo + do i = 1, nx-1 + xVertex(i,j) = x0(i) + yVertex(i,j) = y0(j) + enddo enddo ! Identify the active cells. @@ -4252,7 +4404,8 @@ subroutine load_vector_gravity(nx, ny, & loadv(kNode,iNode,jNode) = loadv(kNode,iNode,jNode) - & rhoi*grav * wqp_3d(p) * detJ/vol0 * dsdy_qp * phi_3d(n,p) - if (verbose_load .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest .and. p==ptest) then + if (verbose_load .and. this_rank==rtest .and. & + i==itest .and. j==jtest .and. k==ktest .and. 
p==ptest) then write(iulog,*) ' ' write(iulog,*) 'n, phi_3d(n), delta(loadu), delta(loadv):', n, phi_3d(n,p), & rhoi*grav*wqp_3d(p)*detJ/vol0 * dsdx_qp * phi_3d(n,p), & @@ -4754,26 +4907,38 @@ end subroutine lateral_shelf_bc !**************************************************************************** - subroutine assemble_stiffness_matrix_3d(nx, ny, & - nz, sigma, & + subroutine assemble_stiffness_matrix_2d(nx, ny, & + nz, & + sigma, stagsigma, & nhalo, & itest, jtest, rtest, & active_cell, & xVertex, yVertex, & - uvel, vvel, & + uvel_2d, vvel_2d, & stagusrf, stagthck, & - flwafact, whichapprox, & + flwa, flwafact, & + whichapprox, & + diva_slope_factor_x, diva_slope_factor_y, & whichefvs, efvs, & efvs_constant, effstrain_min, & Auu, Auv, & - Avu, Avv) + Avu, Avv, & + dusrf_dx, dusrf_dy, & + thck, & + btractx, btracty, & + omega_k, omega, & + efvs_qp_3d) !---------------------------------------------------------------- ! Assemble the stiffness matrix A in the linear system Ax = b. ! This subroutine is called for each nonlinear iteration if ! we are iterating on the effective viscosity. + ! The matrix A can be based on the shallow-shelf approximation or + ! the depth-integrated L1L2 approximation (Schoof and Hindmarsh, 2010). !---------------------------------------------------------------- + use glissade_grid_operators, only: glissade_vertical_average + !---------------------------------------------------------------- ! Input-output arguments !---------------------------------------------------------------- @@ -4781,37 +4946,45 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions nz, & ! number of vertical levels at which velocity is computed - ! Note: the number of elements per column is nz-1 + ! (used for flwafact) nhalo ! number of halo layers real(dp), dimension(nz), intent(in) :: & - sigma ! sigma vertical coordinate + sigma ! 
sigma vertical coordinate + + real(dp), dimension(nz-1), intent(in) :: & + stagsigma ! staggered sigma vertical coordinate integer, intent(in) :: & itest, jtest, rtest ! coordinates of diagnostic point logical, dimension(nx,ny), intent(in) :: & - active_cell ! true if cell contains ice and borders a locally owned vertex + active_cell ! true if cell contains ice and borders a locally owned vertex real(dp), dimension(nx-1,ny-1), intent(in) :: & - xVertex, yVertex ! x and y coordinates of vertices + xVertex, yVertex ! x and y coordinates of vertices - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & - uvel, vvel ! velocity components (m/yr) + real(dp), dimension(nx-1,ny-1), intent(in) :: & + uvel_2d, vvel_2d ! 2D velocity components (m/yr) real(dp), dimension(nx-1,ny-1), intent(in) :: & stagusrf, & ! upper surface elevation on staggered grid (m) stagthck ! ice thickness on staggered grid (m) + !TODO - Pass in flwa only, and compute flwafact here? real(dp), dimension(nz-1,nx,ny), intent(in) :: & - flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), + flwa, &! temperature-based flow factor A, Pa^{-n} yr^{-1} + flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), Pa yr^(1/n) ! used to compute the effective viscosity - ! units: Pa yr^(1/n) integer, intent(in) :: & - whichapprox, & ! option for Stokes approximation (BP, SSA, SIA) + whichapprox, & ! option for Stokes approximation (BP, L1L2, SSA, SIA) whichefvs ! option for effective viscosity calculation + real(dp), dimension(nx,ny), intent(in) :: & + diva_slope_factor_x, & ! correction factor for DIVA in x direction, based on theta_slope_x + diva_slope_factor_y ! correction factor for DIVA in y direction, based on theta_slope_y + real(dp), dimension(nz-1,nx,ny), intent(out) :: & efvs ! effective viscosity (Pa yr) @@ -4819,36 +4992,61 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & efvs_constant, & ! constant value of effective viscosity (Pa yr) effstrain_min ! 
minimum value of effective strain rate (yr^-1) for computing viscosity - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(out) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(out) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts Avu, Avv + ! The following optional arguments are used for the L1L2 approximation only + + real(dp), dimension(nx-1,ny-1), intent(in), optional :: & + dusrf_dx, & ! upper surface elevation gradient on staggered grid (m/m) + dusrf_dy ! needed for L1L2 assembly only + + ! The following optional arguments are used for DIVA only + + real(dp), dimension(nx,ny), intent(in), optional :: & + thck ! ice thickness (m) + + real(dp), dimension(nx-1,ny-1), intent(in), optional :: & + btractx, btracty ! components of basal traction (Pa) + + real(dp), dimension(nz,nx,ny), intent(out), optional :: & + omega_k ! single integral, defined by Goldberg (2011) eq. 32 + + real(dp), dimension(nx,ny), intent(out), optional :: & + omega ! double integral, defined by Goldberg (2011) eq. 35 + ! Note: omega here = Goldberg's omega/H + + real(dp), dimension(nz-1,nQuadPoints_2d,nx,ny), intent(inout), optional :: & + efvs_qp_3d ! effective viscosity (Pa yr) + !--------------------------------------------------------- ! Local variables !--------------------------------------------------------- - real(dp), dimension(nQuadPoints_3d) :: & + real(dp), dimension(nQuadPoints_2d) :: & detJ ! determinant of J - real(dp), dimension(nNodesPerElement_3d) :: & - dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis function, evaluated at quad pt + real(dp), dimension(nNodesPerElement_2d) :: & + dphi_dx_2d, dphi_dy_2d, dphi_dz_2d ! derivatives of basis function, evaluated at quad pts + ! set dphi_dz = 0 for 2D problem !---------------------------------------------------------------- - ! Note: Kuu, Kuv, Kvu, and Kvv are 8x8 components of the stiffness matrix - ! for the local element. (The combined stiffness matrix is 16x16.) + ! 
Note: Kuu, Kuv, Kvu, and Kvv are 4x4 components of the stiffness matrix + ! for the local element. (The combined stiffness matrix is 8x8.) ! - ! Once these matrices are formed, their coefficients are summed into the assembled - ! matrices Auu, Auv, Avu, Avv. The A matrices each have as many rows as there are - ! active nodes, but only 27 columns, corresponding to the 27 vertices that belong to - ! the 8 elements sharing a given node. + ! Once these matrices are formed, their coefficients are summed into the global + ! matrices Auu_2d, Auv_2d, Avu_2d, Avv_2d. The global matrices each have as + ! many rows as there are active vertices, but only 9 columns, corresponding to + ! the 9 vertices of the 4 elements sharing a given node. ! ! The native structured PCG solver works with the dense A matrices in the form ! computed here. For the SLAP solver, the terms of the A matrices are put - ! in a sparse matrix during preprocessing. For the Trilinos solver, the terms - ! of the A matrices are passed to Trilinos one row at a time. + ! in a sparse matrix format during preprocessing. For the Trilinos solver, + ! the terms of the A matrices are passed to Trilinos one row at a time. !---------------------------------------------------------------- - real(dp), dimension(nNodesPerElement_3d, nNodesPerElement_3d) :: & ! + real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) :: & ! Kuu, & ! element stiffness matrix, divided into 4 parts as shown below Kuv, & ! Kvu, & ! @@ -4859,25 +5057,36 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & ! ! Kvu may not be needed if matrix is symmetric, but is included for now - real(dp), dimension(nNodesPerElement_3d) :: & - x, y, z, & ! Cartesian coordinates of nodes - u, v, & ! u and v at nodes - s ! upper surface elevation at nodes + real(dp), dimension(nNodesPerElement_2d) :: & + x, y, & ! Cartesian coordinates of vertices + u, v, & ! depth-integrated mean velocity at vertices (m/yr) + h, & ! thickness at vertices (m) + s, & ! 
upper surface elevation at vertices (m) + bx, by, & ! basal traction at vertices (Pa) (DIVA only) + dsdx, dsdy ! upper surface elevation gradient at vertices (m/m) (L1L2 only) - real(dp), dimension(nQuadPoints_3d) :: & - efvs_qp ! effective viscosity at a quad pt + real(dp), dimension(nQuadPoints_2d) :: & + efvs_qp_vertavg ! vertically averaged effective viscosity at a quad pt + + real(dp) :: & + h_qp ! thickness at a quad pt + + real(dp), dimension(nz-1,nQuadPoints_2d) :: & + efvs_qp ! effective viscosity at each layer in a cell column + ! corresponding to a quad pt logical, parameter :: & check_symmetry_element = .true. ! if true, then check symmetry of element matrix - !Note: Can speed up assembly a bit by setting to false for production + + real(dp), dimension(nx,ny) :: & + flwafact_2d ! vertically averaged flow factor integer :: i, j, k, n, p - integer :: iNode, jNode, kNode + integer :: iVertex, jVertex if (verbose_matrix .and. main_task) then write(iulog,*) ' ' - write(iulog,*) 'In assemble_stiffness_matrix_3d' - write(iulog,*) 'itest, jtest, ktest, rtest =', itest, jtest, ktest, rtest + write(iulog,*) 'In assemble_stiffness_matrix_2d' endif ! Initialize effective viscosity @@ -4885,10 +5094,17 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & ! Initialize global stiffness matrix - Auu(:,:,:,:) = 0.d0 - Auv(:,:,:,:) = 0.d0 - Avu(:,:,:,:) = 0.d0 - Avv(:,:,:,:) = 0.d0 + Auu(:,:,:) = 0.d0 + Auv(:,:,:) = 0.d0 + Avu(:,:,:) = 0.d0 + Avv(:,:,:) = 0.d0 + + ! Compute vertical average of flow factor (SSA only) + if (whichapprox == HO_APPROX_SSA) then + call glissade_vertical_average(nx, ny, & + nz, sigma, & + flwafact, flwafact_2d) + endif ! Sum over elements in active cells ! Loop over all cells that border locally owned vertices. @@ -4898,112 +5114,200 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & if (active_cell(i,j)) then - do k = 1, nz-1 ! loop over elements in this column - ! assume k increases from upper surface to bed - - ! 
Initialize element stiffness matrix - Kuu(:,:) = 0.d0 - Kuv(:,:) = 0.d0 - Kvu(:,:) = 0.d0 - Kvv(:,:) = 0.d0 + ! Initialize element stiffness matrix + Kuu(:,:) = 0.d0 + Kuv(:,:) = 0.d0 + Kvu(:,:) = 0.d0 + Kvv(:,:) = 0.d0 - ! compute spatial coordinates, velocity, and upper surface elevation for each node - - do n = 1, nNodesPerElement_3d + ! Compute spatial coordinates, velocity, thickness and surface elevation for each vertex + ! Also compute surface elevation gradient (for L1L2) and basal traction (for DIVA) + do n = 1, nNodesPerElement_2d - ! Determine (k,i,j) for this node - ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). - ! Indices for other nodes are computed relative to this node. - iNode = i + ishift(7,n) - jNode = j + jshift(7,n) - kNode = k + kshift(7,n) + ! Determine (i,j) for this vertex + ! The reason for the '3' is that node 3, in the NE corner of the grid cell, has index (i,j). + ! Indices for other nodes are computed relative to this vertex. + iVertex = i + ishift(3,n) + jVertex = j + jshift(3,n) - x(n) = xVertex(iNode,jNode) - y(n) = yVertex(iNode,jNode) - z(n) = stagusrf(iNode,jNode) - sigma(kNode)*stagthck(iNode,jNode) - u(n) = uvel(kNode,iNode,jNode) - v(n) = vvel(kNode,iNode,jNode) - s(n) = stagusrf(iNode,jNode) + x(n) = xVertex(iVertex,jVertex) + y(n) = yVertex(iVertex,jVertex) + u(n) = uvel_2d(iVertex,jVertex) + v(n) = vvel_2d(iVertex,jVertex) + s(n) = stagusrf(iVertex,jVertex) + h(n) = stagthck(iVertex,jVertex) + if (present(dusrf_dx) .and. present(dusrf_dy)) then ! L1L2 + dsdx(n) = dusrf_dx(iVertex,jVertex) + dsdy(n) = dusrf_dy(iVertex,jVertex) + endif + if (present(btractx) .and. present(btracty)) then ! DIVA + bx(n) = btractx(iVertex,jVertex) + by(n) = btracty(iVertex,jVertex) + endif - if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest .and. 
k==ktest) then - write(iulog,*) ' ' - write(iulog,*) 'i, j, k, n, x, y, z:', i, j, k, n, x(n), y(n), z(n) - write(iulog,*) 's, u, v:', s(n), u(n), v(n) - endif + if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'i, j, n, x, y:', i, j, n, x(n), y(n) + write(iulog,*) 's, h, u, v:', s(n), h(n), u(n), v(n) + if (present(btractx) .and. present(btracty)) write(iulog,*) 'bx, by:', bx(n), by(n) + endif - enddo ! nodes per element + enddo ! vertices per element - ! Loop over quadrature points for this element + ! Loop over quadrature points for this element - do p = 1, nQuadPoints_3d + do p = 1, nQuadPoints_2d - ! Evaluate the derivatives of the element basis functions at this quadrature point. - !WHL - Pass in i, j, k, and p to the following subroutines for debugging. + ! Evaluate the derivatives of the element basis functions at this quadrature point. - call get_basis_function_derivatives_3d(x(:), y(:), z(:), & - dphi_dxr_3d(:,p), dphi_dyr_3d(:,p), dphi_dzr_3d(:,p), & - dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & - detJ(p), & - itest, jtest, rtest, & - i, j, k, p) + call get_basis_function_derivatives_2d(x(:), y(:), & + dphi_dxr_2d(:,p), dphi_dyr_2d(:,p), & + dphi_dx_2d(:), dphi_dy_2d(:), & + detJ(p), & + itest, jtest, rtest, & + i, j, p) -! call t_startf('glissade_effective_viscosity') + dphi_dz_2d(:) = 0.d0 + + if (whichapprox == HO_APPROX_L1L2) then + + ! Compute effective viscosity for each layer at this quadrature point + !TODO - sigma -> stagsigma for L1L2 viscosity? + call compute_effective_viscosity_L1L2(whichefvs, & + efvs_constant, effstrain_min, & + nz, sigma, & + nNodesPerElement_2d, phi_2d(:,p), & + dphi_dx_2d(:), dphi_dy_2d(:), & + u(:), v(:), & + h(:), & + dsdx(:), dsdy(:), & + flwa(:,i,j), flwafact(:,i,j), & + efvs_qp(:,p), & + itest, jtest, rtest, & + i, j, p) + + ! 
Compute vertical average of effective viscosity + efvs_qp_vertavg(p) = 0.d0 + do k = 1, nz-1 + efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p) * (sigma(k+1) - sigma(k)) + enddo + + elseif (whichapprox == HO_APPROX_DIVA) then + + ! Copy efvs_qp from global array to local column array + efvs_qp(:,:) = efvs_qp_3d(:,:,i,j) + + ! Compute effective viscosity for each layer at this quadrature point + ! Note: efvs_qp_3d is intent(inout); old value is used to compute new value + call compute_effective_viscosity_diva(whichefvs, & + efvs_constant, effstrain_min, & + nz, stagsigma, & + nNodesPerElement_2d, phi_2d(:,p), & + dphi_dx_2d(:), dphi_dy_2d(:), & + u(:), v(:), & + bx(:), by(:), & + diva_slope_factor_x(i,j), & + diva_slope_factor_y(i,j), & + h(:), & + flwa(:,i,j), flwafact(:,i,j), & + efvs_qp(:,p), & + itest, jtest, rtest, & + i, j, p) + + if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then + write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(:,p) + endif + + !WHL - Copy local efvs_qp to the global array + efvs_qp_3d(:,:,i,j) = efvs_qp(:,:) + + ! Compute vertical average of effective viscosity + efvs_qp_vertavg(p) = 0.d0 + do k = 1, nz-1 + efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p)*(sigma(k+1) - sigma(k)) + enddo + + else ! SSA + + ! Compute vertically averaged effective viscosity at this quadrature point + !TODO - Why do we pass in dphi_dz_2d here and not elsewhere? call compute_effective_viscosity(whichefvs, whichapprox, & efvs_constant, effstrain_min, & - nNodesPerElement_3d, & - dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & - u(:), v(:), & - flwafact(k,i,j), efvs_qp(p), & + nNodesPerElement_2d, & + dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & + u(:), v(:), & + flwafact_2d(i,j), efvs_qp_vertavg(p), & itest, jtest, rtest, & - i, j, k, p ) -! call t_stopf('glissade_effective_viscosity') + i, j, 1, p) - if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. 
p==ptest) then - write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(p) - endif + ! Copy vertically averaged value to all levels + efvs_qp(:,p) = efvs_qp_vertavg(p) - ! Increment the element stiffness matrix with the contribution from each quadrature point. + endif ! whichapprox -! call t_startf('glissade_compute_element_matrix') - call compute_element_matrix(whichapprox, nNodesPerElement_3d, & - wqp_3d(p), detJ(p), efvs_qp(p), & - dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & - Kuu(:,:), Kuv(:,:), & - Kvu(:,:), Kvv(:,:), & - itest, jtest, rtest, & - i, j, k, p ) -! call t_stopf('glissade_compute_element_matrix') + ! Compute ice thickness at this quadrature point - enddo ! nQuadPoints_3d + h_qp = 0.d0 + do n = 1, nNodesPerElement_2d + h_qp = h_qp + phi_2d(n,p) * h(n) + enddo - ! Compute average of effective viscosity over quad pts - efvs(k,i,j) = 0.d0 + ! Increment the element stiffness matrix with the contribution from each quadrature point. + ! Note: The effective viscosity is multiplied by thickness since the equation to be solved + ! is vertically integrated. - do p = 1, nQuadPoints_3d - efvs(k,i,j) = efvs(k,i,j) + efvs_qp(p) + call compute_element_matrix(whichapprox, nNodesPerElement_2d, & + wqp_2d(p), detJ(p), & + h_qp*efvs_qp_vertavg(p), & + dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & + Kuu(:,:), Kuv(:,:), & + Kvu(:,:), Kvv(:,:), & + itest, jtest, rtest, & + i, j, 1, p ) + + enddo ! nQuadPoints_2d + + if (whichapprox == HO_APPROX_DIVA) then + + ! Compute vertical integrals needed for the 2D solve and 3D velocity reconstruction + call compute_integrals_diva(nz, sigma, & + itest, jtest, rtest, & + thck(i,j), efvs_qp(:,:), & + omega_k(:,i,j), omega(i,j), & + i, j) + + endif + + ! Compute average of effective viscosity over quad points. + ! For L1L2 and DIVA there is a different efvs in each layer. + ! For SSA, simply write the vertical average value to each layer. 
+ + efvs(:,i,j) = 0.d0 + do p = 1, nQuadPoints_2d + do k = 1, nz-1 + efvs(k,i,j) = efvs(k,i,j) + efvs_qp(k,p) enddo - efvs(k,i,j) = efvs(k,i,j) / nQuadPoints_3d - - if (check_symmetry_element) then - call check_symmetry_element_matrix(nNodesPerElement_3d, & - Kuu, Kuv, Kvu, Kvv) - endif + enddo + efvs(:,i,j) = efvs(:,i,j) / nQuadPoints_2d - ! Sum terms of element matrix K into dense assembled matrix A + if (check_symmetry_element) then + call check_symmetry_element_matrix(nNodesPerElement_2d, & + Kuu, Kuv, Kvu, Kvv) + endif - call element_to_global_matrix_3d(nx, ny, nz, & - i, j, k, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & - Avu, Avv) + ! Sum the terms of element matrix K into the dense assembled matrix A - enddo ! nz (loop over elements in this column) + call element_to_global_matrix_2d(nx, ny, & + i, j, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & + Avu, Avv) if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest) then write(iulog,*) ' ' - write(iulog,*) 'Assembled 3D matrix, i, j =', i, j + write(iulog,*) 'Assembled 2D matrix, i, j =', i, j write(iulog,*) 'k, flwafact, efvs:' do k = 1, nz-1 write(iulog,*) k, flwafact(k,i,j), efvs(k,i,j) @@ -5015,42 +5319,30 @@ subroutine assemble_stiffness_matrix_3d(nx, ny, & enddo ! i enddo ! 
j - end subroutine assemble_stiffness_matrix_3d + end subroutine assemble_stiffness_matrix_2d !**************************************************************************** - subroutine assemble_stiffness_matrix_2d(nx, ny, & - nz, & - sigma, stagsigma, & + subroutine assemble_stiffness_matrix_3d(nx, ny, & + nz, sigma, & nhalo, & itest, jtest, rtest, & active_cell, & xVertex, yVertex, & - uvel_2d, vvel_2d, & + uvel, vvel, & stagusrf, stagthck, & - flwa, flwafact, & - whichapprox, & - diva_slope_factor_x, diva_slope_factor_y, & + flwafact, whichapprox, & whichefvs, efvs, & efvs_constant, effstrain_min, & Auu, Auv, & - Avu, Avv, & - dusrf_dx, dusrf_dy, & - thck, & - btractx, btracty, & - omega_k, omega, & - efvs_qp_3d) - + Avu, Avv) + !---------------------------------------------------------------- ! Assemble the stiffness matrix A in the linear system Ax = b. ! This subroutine is called for each nonlinear iteration if ! we are iterating on the effective viscosity. - ! The matrix A can be based on the shallow-shelf approximation or - ! the depth-integrated L1L2 approximation (Schoof and Hindmarsh, 2010). !---------------------------------------------------------------- - use glissade_grid_operators, only: glissade_vertical_average - !---------------------------------------------------------------- ! Input-output arguments !---------------------------------------------------------------- @@ -5058,45 +5350,37 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions nz, & ! number of vertical levels at which velocity is computed - ! (used for flwafact) + ! Note: the number of elements per column is nz-1 nhalo ! number of halo layers real(dp), dimension(nz), intent(in) :: & - sigma ! sigma vertical coordinate - - real(dp), dimension(nz-1), intent(in) :: & - stagsigma ! staggered sigma vertical coordinate + sigma ! sigma vertical coordinate integer, intent(in) :: & itest, jtest, rtest ! 
coordinates of diagnostic point logical, dimension(nx,ny), intent(in) :: & - active_cell ! true if cell contains ice and borders a locally owned vertex + active_cell ! true if cell contains ice and borders a locally owned vertex real(dp), dimension(nx-1,ny-1), intent(in) :: & - xVertex, yVertex ! x and y coordinates of vertices + xVertex, yVertex ! x and y coordinates of vertices - real(dp), dimension(nx-1,ny-1), intent(in) :: & - uvel_2d, vvel_2d ! 2D velocity components (m/yr) + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + uvel, vvel ! velocity components (m/yr) real(dp), dimension(nx-1,ny-1), intent(in) :: & stagusrf, & ! upper surface elevation on staggered grid (m) stagthck ! ice thickness on staggered grid (m) - !TODO - Pass in flwa only, and compute flwafact here? real(dp), dimension(nz-1,nx,ny), intent(in) :: & - flwa, &! temperature-based flow factor A, Pa^{-n} yr^{-1} - flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), Pa yr^(1/n) + flwafact ! temperature-based flow factor, 0.5 * A^(-1/n), ! used to compute the effective viscosity + ! units: Pa yr^(1/n) integer, intent(in) :: & - whichapprox, & ! option for Stokes approximation (BP, L1L2, SSA, SIA) + whichapprox, & ! option for Stokes approximation (BP, SSA, SIA) whichefvs ! option for effective viscosity calculation - real(dp), dimension(nx,ny), intent(in) :: & - diva_slope_factor_x, & ! correction factor for DIVA in x direction, based on theta_slope_x - diva_slope_factor_y ! correction factor for DIVA in y direction, based on theta_slope_y - real(dp), dimension(nz-1,nx,ny), intent(out) :: & efvs ! effective viscosity (Pa yr) @@ -5104,61 +5388,36 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & efvs_constant, & ! constant value of effective viscosity (Pa yr) effstrain_min ! 
minimum value of effective strain rate (yr^-1) for computing viscosity - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(out) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(out) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv - - ! The following optional arguments are used for the L1L2 approximation only - - real(dp), dimension(nx-1,ny-1), intent(in), optional :: & - dusrf_dx, & ! upper surface elevation gradient on staggered grid (m/m) - dusrf_dy ! needed for L1L2 assembly only - - ! The following optional arguments are used for DIVA only - - real(dp), dimension(nx,ny), intent(in), optional :: & - thck ! ice thickness (m) - - real(dp), dimension(nx-1,ny-1), intent(in), optional :: & - btractx, btracty ! components of basal traction (Pa) - - real(dp), dimension(nz,nx,ny), intent(out), optional :: & - omega_k ! single integral, defined by Goldberg (2011) eq. 32 - - real(dp), dimension(nx,ny), intent(out), optional :: & - omega ! double integral, defined by Goldberg (2011) eq. 35 - ! Note: omega here = Goldberg's omega/H - - real(dp), dimension(nz-1,nQuadPoints_2d,nx,ny), intent(inout), optional :: & - efvs_qp_3d ! effective viscosity (Pa yr) + Avu, Avv !--------------------------------------------------------- ! Local variables !--------------------------------------------------------- - real(dp), dimension(nQuadPoints_2d) :: & + real(dp), dimension(nQuadPoints_3d) :: & detJ ! determinant of J - real(dp), dimension(nNodesPerElement_2d) :: & - dphi_dx_2d, dphi_dy_2d, dphi_dz_2d ! derivatives of basis function, evaluated at quad pts - ! set dphi_dz = 0 for 2D problem + real(dp), dimension(nNodesPerElement_3d) :: & + dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis function, evaluated at quad pt !---------------------------------------------------------------- - ! Note: Kuu, Kuv, Kvu, and Kvv are 4x4 components of the stiffness matrix - ! for the local element. (The combined stiffness matrix is 8x8.) 
+ ! Note: Kuu, Kuv, Kvu, and Kvv are 8x8 components of the stiffness matrix + ! for the local element. (The combined stiffness matrix is 16x16.) ! - ! Once these matrices are formed, their coefficients are summed into the global - ! matrices Auu_2d, Auv_2d, Avu_2d, Avv_2d. The global matrices each have as - ! many rows as there are active vertices, but only 9 columns, corresponding to - ! the 9 vertices of the 4 elements sharing a given node. + ! Once these matrices are formed, their coefficients are summed into the assembled + ! matrices Auu, Auv, Avu, Avv. The A matrices each have as many rows as there are + ! active nodes, but only 27 columns, corresponding to the 27 vertices that belong to + ! the 8 elements sharing a given node. ! ! The native structured PCG solver works with the dense A matrices in the form ! computed here. For the SLAP solver, the terms of the A matrices are put - ! in a sparse matrix format during preprocessing. For the Trilinos solver, - ! the terms of the A matrices are passed to Trilinos one row at a time. + ! in a sparse matrix during preprocessing. For the Trilinos solver, the terms + ! of the A matrices are passed to Trilinos one row at a time. !---------------------------------------------------------------- - real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) :: & ! + real(dp), dimension(nNodesPerElement_3d, nNodesPerElement_3d) :: & ! Kuu, & ! element stiffness matrix, divided into 4 parts as shown below Kuv, & ! Kvu, & ! @@ -5169,36 +5428,25 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & ! ! Kvu may not be needed if matrix is symmetric, but is included for now - real(dp), dimension(nNodesPerElement_2d) :: & - x, y, & ! Cartesian coordinates of vertices - u, v, & ! depth-integrated mean velocity at vertices (m/yr) - h, & ! thickness at vertices (m) - s, & ! upper surface elevation at vertices (m) - bx, by, & ! basal traction at vertices (Pa) (DIVA only) - dsdx, dsdy ! 
upper surface elevation gradient at vertices (m/m) (L1L2 only) - - real(dp), dimension(nQuadPoints_2d) :: & - efvs_qp_vertavg ! vertically averaged effective viscosity at a quad pt - - real(dp) :: & - h_qp ! thickness at a quad pt + real(dp), dimension(nNodesPerElement_3d) :: & + x, y, z, & ! Cartesian coordinates of nodes + u, v, & ! u and v at nodes + s ! upper surface elevation at nodes - real(dp), dimension(nz-1,nQuadPoints_2d) :: & - efvs_qp ! effective viscosity at each layer in a cell column - ! corresponding to a quad pt + real(dp), dimension(nQuadPoints_3d) :: & + efvs_qp ! effective viscosity at a quad pt logical, parameter :: & check_symmetry_element = .true. ! if true, then check symmetry of element matrix - - real(dp), dimension(nx,ny) :: & - flwafact_2d ! vertically averaged flow factor + !Note: Can speed up assembly a bit by setting to false for production integer :: i, j, k, n, p - integer :: iVertex, jVertex + integer :: iNode, jNode, kNode if (verbose_matrix .and. main_task) then write(iulog,*) ' ' - write(iulog,*) 'In assemble_stiffness_matrix_2d' + write(iulog,*) 'In assemble_stiffness_matrix_3d' + write(iulog,*) 'itest, jtest, ktest, rtest =', itest, jtest, ktest, rtest endif ! Initialize effective viscosity @@ -5206,220 +5454,125 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & ! Initialize global stiffness matrix - Auu(:,:,:) = 0.d0 - Auv(:,:,:) = 0.d0 - Avu(:,:,:) = 0.d0 - Avv(:,:,:) = 0.d0 - - ! Compute vertical average of flow factor (SSA only) - if (whichapprox == HO_APPROX_SSA) then - call glissade_vertical_average(nx, ny, & - nz, sigma, & - flwafact, flwafact_2d) - endif + Auu(:,:,:,:) = 0.d0 + Auv(:,:,:,:) = 0.d0 + Avu(:,:,:,:) = 0.d0 + Avv(:,:,:,:) = 0.d0 ! Sum over elements in active cells ! Loop over all cells that border locally owned vertices. do j = nhalo+1, ny-nhalo+1 do i = nhalo+1, nx-nhalo+1 - - if (active_cell(i,j)) then - - ! 
Initialize element stiffness matrix - Kuu(:,:) = 0.d0 - Kuv(:,:) = 0.d0 - Kvu(:,:) = 0.d0 - Kvv(:,:) = 0.d0 - - ! Compute spatial coordinates, velocity, thickness and surface elevation for each vertex - ! Also compute surface elevation gradient (for L1L2) and basal traction (for DIVA) - do n = 1, nNodesPerElement_2d - ! Determine (i,j) for this vertex - ! The reason for the '3' is that node 3, in the NE corner of the grid cell, has index (i,j). - ! Indices for other nodes are computed relative to this vertex. - iVertex = i + ishift(3,n) - jVertex = j + jshift(3,n) - - x(n) = xVertex(iVertex,jVertex) - y(n) = yVertex(iVertex,jVertex) - u(n) = uvel_2d(iVertex,jVertex) - v(n) = vvel_2d(iVertex,jVertex) - s(n) = stagusrf(iVertex,jVertex) - h(n) = stagthck(iVertex,jVertex) - if (present(dusrf_dx) .and. present(dusrf_dy)) then ! L1L2 - dsdx(n) = dusrf_dx(iVertex,jVertex) - dsdy(n) = dusrf_dy(iVertex,jVertex) - endif - if (present(btractx) .and. present(btracty)) then ! DIVA - bx(n) = btractx(iVertex,jVertex) - by(n) = btracty(iVertex,jVertex) - endif - - if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'i, j, n, x, y:', i, j, n, x(n), y(n) - write(iulog,*) 's, h, u, v:', s(n), h(n), u(n), v(n) - if (present(btractx) .and. present(btracty)) write(iulog,*) 'bx, by:', bx(n), by(n) - endif - - enddo ! vertices per element - - ! Loop over quadrature points for this element - - do p = 1, nQuadPoints_2d - - ! Evaluate the derivatives of the element basis functions at this quadrature point. - - call get_basis_function_derivatives_2d(x(:), y(:), & - dphi_dxr_2d(:,p), dphi_dyr_2d(:,p), & - dphi_dx_2d(:), dphi_dy_2d(:), & - detJ(p), & - itest, jtest, rtest, & - i, j, p) - - dphi_dz_2d(:) = 0.d0 + if (active_cell(i,j)) then - if (whichapprox == HO_APPROX_L1L2) then + do k = 1, nz-1 ! loop over elements in this column + ! assume k increases from upper surface to bed - ! 
Compute effective viscosity for each layer at this quadrature point - !TODO - sigma -> stagsigma for L1L2 viscosity? - call compute_effective_viscosity_L1L2(whichefvs, & - efvs_constant, effstrain_min, & - nz, sigma, & - nNodesPerElement_2d, phi_2d(:,p), & - dphi_dx_2d(:), dphi_dy_2d(:), & - u(:), v(:), & - h(:), & - dsdx(:), dsdy(:), & - flwa(:,i,j), flwafact(:,i,j), & - efvs_qp(:,p), & - itest, jtest, rtest, & - i, j, p) + ! Initialize element stiffness matrix + Kuu(:,:) = 0.d0 + Kuv(:,:) = 0.d0 + Kvu(:,:) = 0.d0 + Kvv(:,:) = 0.d0 - ! Compute vertical average of effective viscosity - efvs_qp_vertavg(p) = 0.d0 - do k = 1, nz-1 - efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p) * (sigma(k+1) - sigma(k)) - enddo + ! compute spatial coordinates, velocity, and upper surface elevation for each node - elseif (whichapprox == HO_APPROX_DIVA) then + do n = 1, nNodesPerElement_3d - ! Copy efvs_qp from global array to local column array - efvs_qp(:,:) = efvs_qp_3d(:,:,i,j) + ! Determine (k,i,j) for this node + ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). + ! Indices for other nodes are computed relative to this node. + iNode = i + ishift(7,n) + jNode = j + jshift(7,n) + kNode = k + kshift(7,n) - ! Compute effective viscosity for each layer at this quadrature point - ! 
Note: efvs_qp_3d is intent(inout); old value is used to compute new value - call compute_effective_viscosity_diva(whichefvs, & - efvs_constant, effstrain_min, & - nz, stagsigma, & - nNodesPerElement_2d, phi_2d(:,p), & - dphi_dx_2d(:), dphi_dy_2d(:), & - u(:), v(:), & - bx(:), by(:), & - diva_slope_factor_x(i,j), & - diva_slope_factor_y(i,j), & - h(:), & - flwa(:,i,j), flwafact(:,i,j), & - efvs_qp(:,p), & - itest, jtest, rtest, & - i, j, p) + x(n) = xVertex(iNode,jNode) + y(n) = yVertex(iNode,jNode) + z(n) = stagusrf(iNode,jNode) - sigma(kNode)*stagthck(iNode,jNode) + u(n) = uvel(kNode,iNode,jNode) + v(n) = vvel(kNode,iNode,jNode) + s(n) = stagusrf(iNode,jNode) - if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then - write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(:,p) + if (verbose_matrix .and. this_rank==rtest .and. i==itest .and. j==jtest .and. k==ktest) then + write(iulog,*) ' ' + write(iulog,*) 'i, j, k, n, x, y, z:', i, j, k, n, x(n), y(n), z(n) + write(iulog,*) 's, u, v:', s(n), u(n), v(n) endif - !WHL - Copy local efvs_qp to the global array - efvs_qp_3d(:,:,i,j) = efvs_qp(:,:) - - ! Compute vertical average of effective viscosity - efvs_qp_vertavg(p) = 0.d0 - do k = 1, nz-1 - efvs_qp_vertavg(p) = efvs_qp_vertavg(p) + efvs_qp(k,p)*(sigma(k+1) - sigma(k)) - enddo - - else ! SSA - - ! Compute vertically averaged effective viscosity at this quadrature point - !TODO - Why do we pass in dphi_dz_2d here and not elsewhere? - call compute_effective_viscosity(whichefvs, whichapprox, & - efvs_constant, effstrain_min, & - nNodesPerElement_2d, & - dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & - u(:), v(:), & - flwafact_2d(i,j), efvs_qp_vertavg(p), & - itest, jtest, rtest, & - i, j, 1, p) - - ! Copy vertically averaged value to all levels - efvs_qp(:,p) = efvs_qp_vertavg(p) - - endif ! whichapprox + enddo ! nodes per element - ! Compute ice thickness at this quadrature point + ! 
Loop over quadrature points for this element - h_qp = 0.d0 - do n = 1, nNodesPerElement_2d - h_qp = h_qp + phi_2d(n,p) * h(n) - enddo + do p = 1, nQuadPoints_3d - ! Increment the element stiffness matrix with the contribution from each quadrature point. - ! Note: The effective viscosity is multiplied by thickness since the equation to be solved - ! is vertically integrated. + ! Evaluate the derivatives of the element basis functions at this quadrature point. + !WHL - Pass in i, j, k, and p to the following subroutines for debugging. - call compute_element_matrix(whichapprox, nNodesPerElement_2d, & - wqp_2d(p), detJ(p), & - h_qp*efvs_qp_vertavg(p), & - dphi_dx_2d(:), dphi_dy_2d(:), dphi_dz_2d(:), & - Kuu(:,:), Kuv(:,:), & - Kvu(:,:), Kvv(:,:), & - itest, jtest, rtest, & - i, j, 1, p ) + call get_basis_function_derivatives_3d(x(:), y(:), z(:), & + dphi_dxr_3d(:,p), dphi_dyr_3d(:,p), dphi_dzr_3d(:,p), & + dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & + detJ(p), & + itest, jtest, rtest, & + i, j, k, p) - enddo ! nQuadPoints_2d +! call t_startf('glissade_effective_viscosity') + call compute_effective_viscosity(whichefvs, whichapprox, & + efvs_constant, effstrain_min, & + nNodesPerElement_3d, & + dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & + u(:), v(:), & + flwafact(k,i,j), efvs_qp(p), & + itest, jtest, rtest, & + i, j, k, p ) +! call t_stopf('glissade_effective_viscosity') - if (whichapprox == HO_APPROX_DIVA) then + if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. j==jtest .and. p==ptest) then + write(iulog,*) 'i, j, k, p, efvs (Pa yr):', i, j, k, p, efvs_qp(p) + endif - ! Compute vertical integrals needed for the 2D solve and 3D velocity reconstruction - call compute_integrals_diva(nz, sigma, & - itest, jtest, rtest, & - thck(i,j), efvs_qp(:,:), & - omega_k(:,i,j), omega(i,j), & - i, j) + ! Increment the element stiffness matrix with the contribution from each quadrature point. - endif +! 
call t_startf('glissade_compute_element_matrix') + call compute_element_matrix(whichapprox, nNodesPerElement_3d, & + wqp_3d(p), detJ(p), efvs_qp(p), & + dphi_dx_3d(:), dphi_dy_3d(:), dphi_dz_3d(:), & + Kuu(:,:), Kuv(:,:), & + Kvu(:,:), Kvv(:,:), & + itest, jtest, rtest, & + i, j, k, p ) +! call t_stopf('glissade_compute_element_matrix') - ! Compute average of effective viscosity over quad points. - ! For L1L2 and DIVA there is a different efvs in each layer. - ! For SSA, simply write the vertical average value to each layer. + enddo ! nQuadPoints_3d - efvs(:,i,j) = 0.d0 - do p = 1, nQuadPoints_2d - do k = 1, nz-1 - efvs(k,i,j) = efvs(k,i,j) + efvs_qp(k,p) + ! Compute average of effective viscosity over quad pts + efvs(k,i,j) = 0.d0 + + do p = 1, nQuadPoints_3d + efvs(k,i,j) = efvs(k,i,j) + efvs_qp(p) enddo - enddo - efvs(:,i,j) = efvs(:,i,j) / nQuadPoints_2d + efvs(k,i,j) = efvs(k,i,j) / nQuadPoints_3d - if (check_symmetry_element) then - call check_symmetry_element_matrix(nNodesPerElement_2d, & - Kuu, Kuv, Kvu, Kvv) - endif + if (check_symmetry_element) then + call check_symmetry_element_matrix(nNodesPerElement_3d, & + Kuu, Kuv, Kvu, Kvv) + endif - ! Sum the terms of element matrix K into the dense assembled matrix A + ! Sum terms of element matrix K into dense assembled matrix A - call element_to_global_matrix_2d(nx, ny, & - i, j, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & - Avu, Avv) + call element_to_global_matrix_3d(nx, ny, nz, & + i, j, k, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & + Avu, Avv) + + enddo ! nz (loop over elements in this column) if (verbose_efvs .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then write(iulog,*) ' ' - write(iulog,*) 'Assembled 2D matrix, i, j =', i, j + write(iulog,*) 'Assembled 3D matrix, i, j =', i, j write(iulog,*) 'k, flwafact, efvs:' do k = 1, nz-1 write(iulog,*) k, flwafact(k,i,j), efvs(k,i,j) @@ -5431,7 +5584,7 @@ subroutine assemble_stiffness_matrix_2d(nx, ny, & enddo ! i enddo ! j - end subroutine assemble_stiffness_matrix_2d + end subroutine assemble_stiffness_matrix_3d !**************************************************************************** @@ -5573,8 +5726,7 @@ subroutine compute_3d_velocity_diva(& itest, jtest, rtest, & active_vertex, diva_level_index, & ice_plus_land_mask, & - stag_omega, omega_k, & - beta, & + omega_k, beta, & beta_eff_x, beta_eff_y, & stag_theta_slope_x, stag_theta_slope_y, & stag_diva_slope_factor_x, & @@ -5624,9 +5776,6 @@ subroutine compute_3d_velocity_diva(& stag_theta_slope_y, & ! slope angle (radians) in y direction at vertices stag_diva_slope_factor_x, & ! slope correction factor in x direction stag_diva_slope_factor_y, & ! slope correction factor in y direction - stag_omega, & ! double integral, defined by Goldberg eq. 35 (m^2/(Pa yr)) - ! already interpolated to staggered grid - ! Note: omega here = Goldberg's omega/H uvel_2d, vvel_2d ! depth-integrated mean velocity; solution of 2D velocity solve (m/yr) real(dp), dimension(nx-1,ny-1), intent(out) :: & @@ -5643,12 +5792,6 @@ subroutine compute_3d_velocity_diva(& stag_omega_k ! single integral, defined by Goldberg eq. 32 (m^2/(Pa yr)) ! interpolated to staggered grid - real(dp), dimension(nx-1,ny-1) :: & - stag_integral ! integral that relates bed velocity to uvel_2d and vvel_2d - ! = stag_omega for diva_level_index = 0 - ! = stag_omega_k(k,:,:) for other values of diva_level_index - - real(dp) :: & slope_correction_x, & ! slope-based correction for vertical shear in x direction slope_correction_y ! 
slope-based correction for vertical shear in y direction @@ -5668,15 +5811,6 @@ subroutine compute_3d_velocity_diva(& stagger_margin_in = 1) enddo - ! Identify the appropriate integral for relating uvel_2d/vvel_2d to the bed velocity - - if (diva_level_index == 0) then ! solved for mean velocity - stag_integral(:,:) = stag_omega(:,:) - else - k = diva_level_index - stag_integral(:,:) = stag_omega_k(k,:,:) - endif - !---------------------------------------------------------------- ! Compute the 3D velocity field !---------------------------------------------------------------- @@ -6092,37 +6226,220 @@ subroutine compute_3d_velocity_L1L2(nx, ny, & endif endif - if (vmask_dirichlet(i,j) == 1) then - vvel(k,i,j) = vvel(nz,i,j) - else - if (include_membrane_stress_in_tau) then - vvel(k,i,j) = vvel(k+1,i,j) + fact * tau_yz(k,i,j) - else ! SIA stress term only - vvel(k,i,j) = vvel(k+1,i,j) + fact * tau_yz_sia(k,i,j) - endif - endif + if (vmask_dirichlet(i,j) == 1) then + vvel(k,i,j) = vvel(nz,i,j) + else + if (include_membrane_stress_in_tau) then + vvel(k,i,j) = vvel(k+1,i,j) + fact * tau_yz(k,i,j) + else ! SIA stress term only + vvel(k,i,j) = vvel(k+1,i,j) + fact * tau_yz_sia(k,i,j) + endif + endif + + if (verbose_L1L2 .and. this_rank==rtest .and. i==itest .and. j==jtest) then + depth = stagsigma(k) * stagthck(i,j) + write(iulog,*) ' ' + write(iulog,*) 'k, depth, fact:', & + k, depth, fact + write(iulog,*) 'tau_xz(i,j): SIA term, membrane term, total:', & + tau_xz_sia(k,i,j), tau_xz(k,i,j) - tau_xz_sia(k,i,j), tau_xz(k,i,j) + write(iulog,*) 'tau_yz(i,j): SIA term, membrane term, total:', & + tau_yz_sia(k,i,j), tau_yz(k,i,j) - tau_yz_sia(k,i,j), tau_yz(k,i,j) + write(iulog,*) 'uvel(k), vvel(k):', uvel(k,i,j), vvel(k,i,j) + endif + + endif + + enddo ! i + enddo ! j + + enddo ! 
k + + end subroutine compute_3d_velocity_L1L2 + +!**************************************************************************** + + subroutine get_basis_function_derivatives_2d(xNode, yNode, & + dphi_dxr_2d, dphi_dyr_2d, & + dphi_dx_2d, dphi_dy_2d, & + detJ, & + itest, jtest, rtest, & + i, j, p) + + !------------------------------------------------------------------ + ! Evaluate the x and y derivatives of 2D element basis functions + ! at a particular quadrature point. + ! + ! Also determine the Jacobian of the transformation between the + ! reference element and the true element. + ! + ! This subroutine should work for any 2D element with any number of nodes. + !------------------------------------------------------------------ + + real(dp), dimension(nNodesPerElement_2d), intent(in) :: & + xNode, yNode, &! nodal coordinates + dphi_dxr_2d, dphi_dyr_2d ! derivatives of basis functions at quad pt + ! wrt x and y in reference element + + real(dp), dimension(nNodesPerElement_2d), intent(out) :: & + dphi_dx_2d, dphi_dy_2d ! derivatives of basis functions at quad pt + ! wrt x and y in true Cartesian coordinates + + real(dp), intent(out) :: & + detJ ! determinant of Jacobian matrix + + real(dp), dimension(2,2) :: & + Jac, &! Jacobian matrix + Jinv ! inverse Jacobian matrix + + integer, intent(in) :: & + itest, jtest, rtest ! coordinates of diagnostic point + + integer, intent(in) :: i, j, p + + integer :: n, row, col + + logical, parameter :: Jac_bug_check = .false. ! set to true for debugging + real(dp), dimension(2,2) :: prod ! Jac * Jinv (should be identity matrix) + + !------------------------------------------------------------------ + ! Compute the Jacobian for the transformation from the reference + ! coordinates to the true coordinates: + ! + ! | | + ! | sum_n{dphi_n/dxr * xn} sum_n{dphi_n/dxr * yn} | + ! J(xr,yr) = | | + ! | sum_n{dphi_n/dyr * xn} sum_n{dphi_n/dyr * yn} | + ! | | + ! + ! where (xn,yn) are the true Cartesian nodal coordinates, + ! 
(xr,yr) are the coordinates of the quad point in the reference element, + ! and sum_n denotes a sum over nodes. + !------------------------------------------------------------------ + + Jac(:,:) = 0.d0 + + if ((verbose_Jac .or. verbose_diva) .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'In get_basis_function_derivatives_2d: i, j, p =', i, j, p + endif + + do n = 1, nNodesPerElement_2d + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'n, x, y:', n, xNode(n), yNode(n) + write(iulog,*) 'dphi_dxr_2d, dphi_dyr_2d:', dphi_dxr_2d(n), dphi_dyr_2d(n) + endif + Jac(1,1) = Jac(1,1) + dphi_dxr_2d(n) * xNode(n) + Jac(1,2) = Jac(1,2) + dphi_dxr_2d(n) * yNode(n) + Jac(2,1) = Jac(2,1) + dphi_dyr_2d(n) * xNode(n) + Jac(2,2) = Jac(2,2) + dphi_dyr_2d(n) * yNode(n) + enddo + + !------------------------------------------------------------------ + ! Compute the determinant and inverse of J + !------------------------------------------------------------------ + + detJ = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1) + + if (abs(detJ) > 0.d0) then + Jinv(1,1) = Jac(2,2)/detJ + Jinv(1,2) = -Jac(1,2)/detJ + Jinv(2,1) = -Jac(2,1)/detJ + Jinv(2,2) = Jac(1,1)/detJ + else + write(iulog,*) 'stopping, det J = 0' + write(iulog,*) 'i, j, p:', i, j, p + write(iulog,*) 'Jacobian matrix:' + write(iulog,*) Jac(1,:) + write(iulog,*) Jac(2,:) + call write_log('Jacobian matrix is singular', GM_FATAL) + endif + + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'Jacobian calc, p =', p + write(iulog,*) 'det J =', detJ + write(iulog,*) ' ' + write(iulog,*) 'Jacobian matrix:' + write(iulog,*) Jac(1,:) + write(iulog,*) Jac(2,:) + write(iulog,*) ' ' + write(iulog,*) 'Inverse matrix:' + write(iulog,*) Jinv(1,:) + write(iulog,*) Jinv(2,:) + write(iulog,*) ' ' + prod = matmul(Jac, Jinv) + write(iulog,*) 'Jac*Jinv:' + write(iulog,*) prod(1,:) + write(iulog,*) prod(2,:) + endif + + ! Optional bug check - Verify that J * Jinv = I + + if (Jac_bug_check) then + prod = matmul(Jac,Jinv) + do col = 1, 2 + do row = 1, 2 + if (abs(prod(row,col) - identity3(row,col)) > 1.d-12) then + write(iulog,*) '2d Jacobian, stopping, Jac * Jinv /= identity' + write(iulog,*) 'rank, i, j, p:', this_rank, i, j, p + write(iulog,*) 'Jac*Jinv:' + write(iulog,*) prod(1,:) + write(iulog,*) prod(2,:) + call write_log('Jacobian matrix was not correctly inverted', GM_FATAL) + endif + enddo + enddo + endif + + !------------------------------------------------------------------ + ! Compute the contribution of this quadrature point to dphi/dx and dphi/dy + ! for each basis function. + ! + ! | dphi_n/dx | | dphi_n/dxr | + ! | | = Jinv * | | + ! | dphi_n/dy | | dphi_n/dyr | + ! + !------------------------------------------------------------------ + + dphi_dx_2d(:) = 0.d0 + dphi_dy_2d(:) = 0.d0 - if (verbose_L1L2 .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then - depth = stagsigma(k) * stagthck(i,j) - write(iulog,*) ' ' - write(iulog,*) 'k, depth, fact:', & - k, depth, fact - write(iulog,*) 'tau_xz(i,j): SIA term, membrane term, total:', & - tau_xz_sia(k,i,j), tau_xz(k,i,j) - tau_xz_sia(k,i,j), tau_xz(k,i,j) - write(iulog,*) 'tau_yz(i,j): SIA term, membrane term, total:', & - tau_yz_sia(k,i,j), tau_yz(k,i,j) - tau_yz_sia(k,i,j), tau_yz(k,i,j) - write(iulog,*) 'uvel(k), vvel(k):', uvel(k,i,j), vvel(k,i,j) - endif + do n = 1, nNodesPerElement_2d + dphi_dx_2d(n) = dphi_dx_2d(n) + Jinv(1,1)*dphi_dxr_2d(n) & + + Jinv(1,2)*dphi_dyr_2d(n) + dphi_dy_2d(n) = dphi_dy_2d(n) + Jinv(2,1)*dphi_dxr_2d(n) & + + Jinv(2,2)*dphi_dyr_2d(n) + enddo - endif + if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then + write(iulog,*) ' ' + write(iulog,*) 'dphi_dx_2d:', dphi_dx_2d(:) + write(iulog,*) 'dphi_dy_2d:', dphi_dy_2d(:) + endif - enddo ! i - enddo ! j + if (Jac_bug_check) then - enddo ! k + ! Check that the sum of dphi_dx, etc. 
is close to zero + if (abs( sum(dphi_dx_2d)/maxval(dphi_dx_2d) ) > 1.d-11) then + write(iulog,*) 'stopping, sum over basis functions of dphi_dx > 0' + write(iulog,*) 'dphi_dx_2d =', dphi_dx_2d(:) + write(iulog,*) 'i, j, p =', i, j, p + call write_log('Sum over basis functions of dphi_dx /= 0', GM_FATAL) + endif - end subroutine compute_3d_velocity_L1L2 + if (abs( sum(dphi_dy_2d)/maxval(dphi_dy_2d) ) > 1.d-11) then + write(iulog,*) 'stopping, sum over basis functions of dphi_dy > 0' + write(iulog,*) 'dphi_dy =', dphi_dy_2d(:) + write(iulog,*) 'i, j, p =', i, j, p + call write_log('Sum over basis functions of dphi_dy /= 0', GM_FATAL) + endif + + endif + end subroutine get_basis_function_derivatives_2d + !**************************************************************************** subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, & @@ -6149,7 +6466,7 @@ subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, real(dp), dimension(nNodesPerElement_3d), intent(out) :: & dphi_dx_3d, dphi_dy_3d, dphi_dz_3d ! derivatives of basis functions at quad pt - ! wrt x, y and z in true Cartesian coordinates + ! wrt x, y and z in true Cartesian coordinates real(dp), intent(out) :: & detJ ! determinant of Jacobian matrix @@ -6276,263 +6593,80 @@ subroutine get_basis_function_derivatives_3d(xNode, yNode, zNode, if (Jac_bug_check) then prod = matmul(Jac,Jinv) - do col = 1, 3 - do row = 1, 3 - if (abs(prod(row,col) - identity3(row,col)) > 1.d-11) then - write(iulog,*) 'stopping, Jac * Jinv /= identity' - write(iulog,*) 'i, j, k, p:', i, j, k, p - write(iulog,*) 'Jac*Jinv:' - write(iulog,*) prod(1,:) - write(iulog,*) prod(2,:) - write(iulog,*) prod(3,:) - call write_log('Jacobian matrix was not correctly inverted', GM_FATAL) - endif - enddo - enddo - endif ! Jac_bug_check - - !------------------------------------------------------------------ - ! Compute the contribution of this quadrature point to dphi/dx and dphi/dy - ! for each basis function. - ! - ! 
| dphi_n/dx | | dphi_n/dxr | - ! | | | | - ! | dphi_n/dy | = Jinv * | dphi_n/dyr | - ! | | | | - ! | dphi_n/dz | | dphi_n/dzr | - ! - !------------------------------------------------------------------ - - dphi_dx_3d(:) = 0.d0 - dphi_dy_3d(:) = 0.d0 - dphi_dz_3d(:) = 0.d0 - - do n = 1, nNodesPerElement_3d - dphi_dx_3d(n) = Jinv(1,1)*dphi_dxr_3d(n) & - + Jinv(1,2)*dphi_dyr_3d(n) & - + Jinv(1,3)*dphi_dzr_3d(n) - dphi_dy_3d(n) = Jinv(2,1)*dphi_dxr_3d(n) & - + Jinv(2,2)*dphi_dyr_3d(n) & - + Jinv(2,3)*dphi_dzr_3d(n) - dphi_dz_3d(n) = Jinv(3,1)*dphi_dxr_3d(n) & - + Jinv(3,2)*dphi_dyr_3d(n) & - + Jinv(3,3)*dphi_dzr_3d(n) - enddo - - if (Jac_bug_check) then - - ! Check that the sum of dphi_dx, etc. is close to zero - - if (abs( sum(dphi_dx_3d)/maxval(dphi_dx_3d) ) > 1.d-11) then - write(iulog,*) 'stopping, sum over basis functions of dphi_dx > 0' - write(iulog,*) 'dphi_dx_3d =', dphi_dx_3d(:) - write(iulog,*) 'sum =', sum(dphi_dx_3d) - write(iulog,*) 'i, j, k, p =', i, j, k, p - call write_log('Sum over basis functions of dphi_dx /= 0', GM_FATAL) - endif - - if (abs( sum(dphi_dy_3d)/maxval(dphi_dy_3d) ) > 1.d-11) then - write(iulog,*) 'stopping, sum over basis functions of dphi_dy > 0' - write(iulog,*) 'dphi_dy_3d =', dphi_dy_3d(:) - write(iulog,*) 'sum =', sum(dphi_dy_3d) - write(iulog,*) 'i, j, k, p =', i, j, k, p - call write_log('Sum over basis functions of dphi_dy /= 0', GM_FATAL) - endif - - if (abs( sum(dphi_dz_3d)/maxval(dphi_dz_3d) ) > 1.d-11) then - write(iulog,*) 'stopping, sum over basis functions of dphi_dz > 0' - write(iulog,*) 'dphi_dz_3d =', dphi_dz_3d(:) - write(iulog,*) 'sum =', sum(dphi_dz_3d) - write(iulog,*) 'i, j, k, p =', i, j, k, p - call write_log('Sum over basis functions of dphi_dz /= 0', GM_FATAL) - endif - - endif ! 
Jac_bug_check - - end subroutine get_basis_function_derivatives_3d - -!**************************************************************************** - - subroutine get_basis_function_derivatives_2d(xNode, yNode, & - dphi_dxr_2d, dphi_dyr_2d, & - dphi_dx_2d, dphi_dy_2d, & - detJ, & - itest, jtest, rtest, & - i, j, p) - - !------------------------------------------------------------------ - ! Evaluate the x and y derivatives of 2D element basis functions - ! at a particular quadrature point. - ! - ! Also determine the Jacobian of the transformation between the - ! reference element and the true element. - ! - ! This subroutine should work for any 2D element with any number of nodes. - !------------------------------------------------------------------ - - real(dp), dimension(nNodesPerElement_2d), intent(in) :: & - xNode, yNode, &! nodal coordinates - dphi_dxr_2d, dphi_dyr_2d ! derivatives of basis functions at quad pt - ! wrt x and y in reference element - - real(dp), dimension(nNodesPerElement_2d), intent(out) :: & - dphi_dx_2d, dphi_dy_2d ! derivatives of basis functions at quad pt - ! wrt x and y in true Cartesian coordinates - - real(dp), intent(out) :: & - detJ ! determinant of Jacobian matrix - - real(dp), dimension(2,2) :: & - Jac, &! Jacobian matrix - Jinv ! inverse Jacobian matrix - - integer, intent(in) :: & - itest, jtest, rtest ! coordinates of diagnostic point - - integer, intent(in) :: i, j, p - - integer :: n, row, col - - logical, parameter :: Jac_bug_check = .false. ! set to true for debugging - real(dp), dimension(2,2) :: prod ! Jac * Jinv (should be identity matrix) - - !------------------------------------------------------------------ - ! Compute the Jacobian for the transformation from the reference - ! coordinates to the true coordinates: - ! - ! | | - ! | sum_n{dphi_n/dxr * xn} sum_n{dphi_n/dxr * yn} | - ! J(xr,yr) = | | - ! | sum_n{dphi_n/dyr * xn} sum_n{dphi_n/dyr * yn} | - ! | | - ! - ! 
where (xn,yn) are the true Cartesian nodal coordinates, - ! (xr,yr) are the coordinates of the quad point in the reference element, - ! and sum_n denotes a sum over nodes. - !------------------------------------------------------------------ - - Jac(:,:) = 0.d0 - - if ((verbose_Jac .or. verbose_diva) .and. this_rank==rtest .and. i==itest .and. j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'In get_basis_function_derivatives_2d: i, j, p =', i, j, p - endif - - do n = 1, nNodesPerElement_2d - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'n, x, y:', n, xNode(n), yNode(n) - write(iulog,*) 'dphi_dxr_2d, dphi_dyr_2d:', dphi_dxr_2d(n), dphi_dyr_2d(n) - endif - Jac(1,1) = Jac(1,1) + dphi_dxr_2d(n) * xNode(n) - Jac(1,2) = Jac(1,2) + dphi_dxr_2d(n) * yNode(n) - Jac(2,1) = Jac(2,1) + dphi_dyr_2d(n) * xNode(n) - Jac(2,2) = Jac(2,2) + dphi_dyr_2d(n) * yNode(n) - enddo - - !------------------------------------------------------------------ - ! Compute the determinant and inverse of J - !------------------------------------------------------------------ - - detJ = Jac(1,1)*Jac(2,2) - Jac(1,2)*Jac(2,1) - - if (abs(detJ) > 0.d0) then - Jinv(1,1) = Jac(2,2)/detJ - Jinv(1,2) = -Jac(1,2)/detJ - Jinv(2,1) = -Jac(2,1)/detJ - Jinv(2,2) = Jac(1,1)/detJ - else - write(iulog,*) 'stopping, det J = 0' - write(iulog,*) 'i, j, p:', i, j, p - write(iulog,*) 'Jacobian matrix:' - write(iulog,*) Jac(1,:) - write(iulog,*) Jac(2,:) - call write_log('Jacobian matrix is singular', GM_FATAL) - endif - - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. 
j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'Jacobian calc, p =', p - write(iulog,*) 'det J =', detJ - write(iulog,*) ' ' - write(iulog,*) 'Jacobian matrix:' - write(iulog,*) Jac(1,:) - write(iulog,*) Jac(2,:) - write(iulog,*) ' ' - write(iulog,*) 'Inverse matrix:' - write(iulog,*) Jinv(1,:) - write(iulog,*) Jinv(2,:) - write(iulog,*) ' ' - prod = matmul(Jac, Jinv) - write(iulog,*) 'Jac*Jinv:' - write(iulog,*) prod(1,:) - write(iulog,*) prod(2,:) - endif - - ! Optional bug check - Verify that J * Jinv = I - - if (Jac_bug_check) then - prod = matmul(Jac,Jinv) - do col = 1, 2 - do row = 1, 2 - if (abs(prod(row,col) - identity3(row,col)) > 1.d-12) then - write(iulog,*) 'stopping, Jac * Jinv /= identity' - write(iulog,*) 'i, j, p:', i, j, p + do col = 1, 3 + do row = 1, 3 + if (abs(prod(row,col) - identity3(row,col)) > eps10) then + write(iulog,*) '3d Jacobian, stopping, Jac * Jinv /= identity' + write(iulog,*) 'rank, i, j, k, p:', this_rank, i, j, k, p write(iulog,*) 'Jac*Jinv:' write(iulog,*) prod(1,:) write(iulog,*) prod(2,:) + write(iulog,*) prod(3,:) call write_log('Jacobian matrix was not correctly inverted', GM_FATAL) endif enddo enddo - endif + endif ! Jac_bug_check !------------------------------------------------------------------ ! Compute the contribution of this quadrature point to dphi/dx and dphi/dy ! for each basis function. ! ! | dphi_n/dx | | dphi_n/dxr | - ! | | = Jinv * | | - ! | dphi_n/dy | | dphi_n/dyr | + ! | | | | + ! | dphi_n/dy | = Jinv * | dphi_n/dyr | + ! | | | | + ! | dphi_n/dz | | dphi_n/dzr | ! 
!------------------------------------------------------------------ - dphi_dx_2d(:) = 0.d0 - dphi_dy_2d(:) = 0.d0 + dphi_dx_3d(:) = 0.d0 + dphi_dy_3d(:) = 0.d0 + dphi_dz_3d(:) = 0.d0 - do n = 1, nNodesPerElement_2d - dphi_dx_2d(n) = dphi_dx_2d(n) + Jinv(1,1)*dphi_dxr_2d(n) & - + Jinv(1,2)*dphi_dyr_2d(n) - dphi_dy_2d(n) = dphi_dy_2d(n) + Jinv(2,1)*dphi_dxr_2d(n) & - + Jinv(2,2)*dphi_dyr_2d(n) + do n = 1, nNodesPerElement_3d + dphi_dx_3d(n) = Jinv(1,1)*dphi_dxr_3d(n) & + + Jinv(1,2)*dphi_dyr_3d(n) & + + Jinv(1,3)*dphi_dzr_3d(n) + dphi_dy_3d(n) = Jinv(2,1)*dphi_dxr_3d(n) & + + Jinv(2,2)*dphi_dyr_3d(n) & + + Jinv(2,3)*dphi_dzr_3d(n) + dphi_dz_3d(n) = Jinv(3,1)*dphi_dxr_3d(n) & + + Jinv(3,2)*dphi_dyr_3d(n) & + + Jinv(3,3)*dphi_dzr_3d(n) enddo - if (verbose_Jac .and. this_rank==rtest .and. i==itest .and. j==jtest) then - write(iulog,*) ' ' - write(iulog,*) 'dphi_dx_2d:', dphi_dx_2d(:) - write(iulog,*) 'dphi_dy_2d:', dphi_dy_2d(:) - endif - if (Jac_bug_check) then ! Check that the sum of dphi_dx, etc. 
is close to zero - if (abs( sum(dphi_dx_2d)/maxval(dphi_dx_2d) ) > 1.d-11) then + + if (abs( sum(dphi_dx_3d)/maxval(dphi_dx_3d) ) > 1.d-11) then write(iulog,*) 'stopping, sum over basis functions of dphi_dx > 0' - write(iulog,*) 'dphi_dx_2d =', dphi_dx_2d(:) - write(iulog,*) 'i, j, p =', i, j, p + write(iulog,*) 'dphi_dx_3d =', dphi_dx_3d(:) + write(iulog,*) 'sum =', sum(dphi_dx_3d) + write(iulog,*) 'i, j, k, p =', i, j, k, p call write_log('Sum over basis functions of dphi_dx /= 0', GM_FATAL) endif - if (abs( sum(dphi_dy_2d)/maxval(dphi_dy_2d) ) > 1.d-11) then + if (abs( sum(dphi_dy_3d)/maxval(dphi_dy_3d) ) > 1.d-11) then write(iulog,*) 'stopping, sum over basis functions of dphi_dy > 0' - write(iulog,*) 'dphi_dy =', dphi_dy_2d(:) - write(iulog,*) 'i, j, p =', i, j, p + write(iulog,*) 'dphi_dy_3d =', dphi_dy_3d(:) + write(iulog,*) 'sum =', sum(dphi_dy_3d) + write(iulog,*) 'i, j, k, p =', i, j, k, p call write_log('Sum over basis functions of dphi_dy /= 0', GM_FATAL) endif - endif + if (abs( sum(dphi_dz_3d)/maxval(dphi_dz_3d) ) > 1.d-11) then + write(iulog,*) 'stopping, sum over basis functions of dphi_dz > 0' + write(iulog,*) 'dphi_dz_3d =', dphi_dz_3d(:) + write(iulog,*) 'sum =', sum(dphi_dz_3d) + write(iulog,*) 'i, j, k, p =', i, j, k, p + call write_log('Sum over basis functions of dphi_dz /= 0', GM_FATAL) + endif - end subroutine get_basis_function_derivatives_2d + endif ! Jac_bug_check + + end subroutine get_basis_function_derivatives_3d !**************************************************************************** @@ -6934,7 +7068,7 @@ subroutine compute_internal_stress (nx, ny, & integer :: i, j, k, n, p integer :: iNode, jNode, kNode - + ! initialize stresses tau_xz (:,:,:) = 0.d0 tau_yz (:,:,:) = 0.d0 @@ -7543,7 +7677,7 @@ subroutine compute_effective_viscosity_diva(whichefvs, efvs, & itest, jtest, rtest, & i, j, p ) - + ! Compute the effective viscosity at each layer of an ice column corresponding ! 
to a particular quadrature point, based on the depth-integrated formulation. ! See Goldberg(2011) for details. @@ -7823,7 +7957,7 @@ subroutine compute_element_matrix(whichapprox, nNodesPerElement, & !---------------------------------------------------------------- integer, intent(in) :: & - whichapprox ! which Stokes approximation to use (BP, SIA, SSA) + whichapprox ! which Stokes approximation to use (SIA, SSA, DIVA, L1L2, BP) integer, intent(in) :: nNodesPerElement ! number of nodes per element @@ -7915,7 +8049,7 @@ subroutine compute_element_matrix(whichapprox, nNodesPerElement, & (2.d0*dphi_dy(nr)*dphi_dx(nc) + dphi_dx(nr)*dphi_dy(nc)) Kvv(nr,nc) = Kvv(nr,nc) + efvs_factor * & - ( 4.d0*dphi_dy(nr)*dphi_dy(nc) + dphi_dx(nr)*dphi_dx(nc) & + ( 4.d0*dphi_dy(nr)*dphi_dy(nc) + dphi_dx(nr)*dphi_dx(nc) & + dphi_dz(nr)*dphi_dz(nc) ) enddo ! nr (rows) @@ -7927,146 +8061,148 @@ end subroutine compute_element_matrix !**************************************************************************** - subroutine element_to_global_matrix_3d(nx, ny, nz, & - iElement, jElement, kElement, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & + subroutine element_to_global_matrix_2d(nx, ny, & + iElement, jElement, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & Avu, Avv) - + ! Sum terms of element matrix K into dense assembled matrix A ! K is partitioned into Kuu, Kuv, Kvu, and Kvv, and similarly for A. integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions integer, intent(in) :: & - iElement, jElement, kElement ! i, j and k indices for this element + iElement, jElement ! i and j indices for this element integer, intent(in) :: & itest, jtest, rtest ! 
coordinates of diagnostic point - real(dp), dimension(nNodesPerElement_3d,nNodesPerElement_3d), intent(in) :: & + real(dp), dimension(nNodesPerElement_2d,nNodesPerElement_2d), intent(in) :: & Kuu, Kuv, Kvu, Kvv ! element matrix - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & Auu, Auv, Avu, Avv ! assembled matrix - integer :: i, j, k, m - integer :: iA, jA, kA + integer :: i, j, m + integer :: iA, jA integer :: n, nr, nc - if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest .and. kElement==ktest) then - write(iulog,*) 'Element i, j, k:', iElement, jElement, kElement + if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest) then + write(iulog,*) 'Element i, j:', iElement, jElement write(iulog,*) 'Rows of Kuu:' - do n = 1, nNodesPerElement_3d + do n = 1, nNodesPerElement_2d write(iulog, '(8e12.4)') Kuu(n,:) enddo endif - !WHL - On a Mac I tried switching the loops to put nc on the outside, but - ! the one with nr on the outside is faster. - do nr = 1, nNodesPerElement_3d ! rows of K + do nr = 1, nNodesPerElement_2d ! rows of K - ! Determine row of A to be incremented by finding (k,i,j) for this node - ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). + ! Determine row of A to be incremented by finding (i,j) for this node + ! The reason for the '3' is that node 3, in the NE corner of this gridcell, has index (i,j). ! Indices for other nodes are computed relative to this node. - i = iElement + ishift(7,nr) - j = jElement + jshift(7,nr) - k = kElement + kshift(7,nr) + i = iElement + ishift(3,nr) + j = jElement + jshift(3,nr) - do nc = 1, nNodesPerElement_3d ! columns of K + do nc = 1, nNodesPerElement_2d ! columns of K ! Determine column of A to be incremented - kA = kshift(nr,nc) ! k index of A into which K(m,n) is summed - iA = ishift(nr,nc) ! 
similarly for i and j indices + iA = ishift(nr,nc) ! similarly for i and j indices jA = jshift(nr,nc) ! these indices can take values -1, 0 and 1 - m = indxA_3d(iA,jA,kA) + m = indxA_2d(iA,jA) ! Increment A - Auu(m,k,i,j) = Auu(m,k,i,j) + Kuu(nr,nc) - Auv(m,k,i,j) = Auv(m,k,i,j) + Kuv(nr,nc) - Avu(m,k,i,j) = Avu(m,k,i,j) + Kvu(nr,nc) - Avv(m,k,i,j) = Avv(m,k,i,j) + Kvv(nr,nc) + Auu(i,j,m) = Auu(i,j,m) + Kuu(nr,nc) + Auv(i,j,m) = Auv(i,j,m) + Kuv(nr,nc) + Avu(i,j,m) = Avu(i,j,m) + Kvu(nr,nc) + Avv(i,j,m) = Avv(i,j,m) + Kvv(nr,nc) - enddo ! nc + if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest) then + write(iulog,*) 'Increment Auu, element i, j, nr, nc =', iElement, jElement, nr, nc + write(iulog,*) ' i, j, m, Kuu, new Auu:', i, j, m, Kuu(nr,nc), Auu(i,j,m) +!! write(iulog,*) 'Increment Avv, element i, j, nr, nc =', iElement, jElement, nr, nc +!! write(iulog,*) ' i, j, m, Kvv, new Avv:', i, j, m, Kvv(nr,nc), Avv(i,j,m) + endif + enddo ! nc enddo ! nr - end subroutine element_to_global_matrix_3d - + end subroutine element_to_global_matrix_2d + !**************************************************************************** - subroutine element_to_global_matrix_2d(nx, ny, & - iElement, jElement, & - itest, jtest, rtest, & - Kuu, Kuv, & - Kvu, Kvv, & - Auu, Auv, & + subroutine element_to_global_matrix_3d(nx, ny, nz, & + iElement, jElement, kElement, & + itest, jtest, rtest, & + Kuu, Kuv, & + Kvu, Kvv, & + Auu, Auv, & Avu, Avv) - + ! Sum terms of element matrix K into dense assembled matrix A ! K is partitioned into Kuu, Kuv, Kvu, and Kvv, and similarly for A. integer, intent(in) :: & - nx, ny ! horizontal grid dimensions + nx, ny, & ! horizontal grid dimensions + nz ! number of vertical levels where velocity is computed integer, intent(in) :: & - iElement, jElement ! i and j indices for this element + iElement, jElement, kElement ! i, j and k indices for this element integer, intent(in) :: & itest, jtest, rtest ! 
coordinates of diagnostic point - real(dp), dimension(nNodesPerElement_2d,nNodesPerElement_2d), intent(in) :: & + real(dp), dimension(nNodesPerElement_3d,nNodesPerElement_3d), intent(in) :: & Kuu, Kuv, Kvu, Kvv ! element matrix - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) :: & Auu, Auv, Avu, Avv ! assembled matrix - integer :: i, j, m - integer :: iA, jA + integer :: i, j, k, m + integer :: iA, jA, kA integer :: n, nr, nc - if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest) then - write(iulog,*) 'Element i, j:', iElement, jElement + if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest .and. kElement==ktest) then + write(iulog,*) 'Element i, j, k:', iElement, jElement, kElement write(iulog,*) 'Rows of Kuu:' - do n = 1, nNodesPerElement_2d + do n = 1, nNodesPerElement_3d write(iulog, '(8e12.4)') Kuu(n,:) enddo endif - do nr = 1, nNodesPerElement_2d ! rows of K + !WHL - On a Mac I tried switching the loops to put nc on the outside, but + ! the one with nr on the outside is faster. + do nr = 1, nNodesPerElement_3d ! rows of K - ! Determine row of A to be incremented by finding (i,j) for this node - ! The reason for the '3' is that node 3, in the NE corner of this gridcell, has index (i,j). + ! Determine row of A to be incremented by finding (k,i,j) for this node + ! The reason for the '7' is that node 7, in the NE corner of the upper layer, has index (k,i,j). ! Indices for other nodes are computed relative to this node. - i = iElement + ishift(3,nr) - j = jElement + jshift(3,nr) + i = iElement + ishift(7,nr) + j = jElement + jshift(7,nr) + k = kElement + kshift(7,nr) - do nc = 1, nNodesPerElement_2d ! columns of K + do nc = 1, nNodesPerElement_3d ! columns of K ! Determine column of A to be incremented + kA = kshift(nr,nc) ! k index of A into which K(m,n) is summed iA = ishift(nr,nc) ! 
similarly for i and j indices jA = jshift(nr,nc) ! these indices can take values -1, 0 and 1 - m = indxA_2d(iA,jA) + m = indxA_3d(iA,jA,kA) ! Increment A - Auu(i,j,m) = Auu(i,j,m) + Kuu(nr,nc) - Auv(i,j,m) = Auv(i,j,m) + Kuv(nr,nc) - Avu(i,j,m) = Avu(i,j,m) + Kvu(nr,nc) - Avv(i,j,m) = Avv(i,j,m) + Kvv(nr,nc) - - if (verbose_matrix .and. this_rank==rtest .and. iElement==itest .and. jElement==jtest) then - write(iulog,*) 'Increment Auu, element i, j, nr, nc =', iElement, jElement, nr, nc - write(iulog,*) ' i, j, m, Kuu, new Auu:', i, j, m, Kuu(nr,nc), Auu(i,j,m) - endif + Auu(m,k,i,j) = Auu(m,k,i,j) + Kuu(nr,nc) + Auv(m,k,i,j) = Auv(m,k,i,j) + Kuv(nr,nc) + Avu(m,k,i,j) = Avu(m,k,i,j) + Kvu(nr,nc) + Avv(m,k,i,j) = Avv(m,k,i,j) + Kvv(nr,nc) enddo ! nc + enddo ! nr - end subroutine element_to_global_matrix_2d + end subroutine element_to_global_matrix_3d !**************************************************************************** !WHL, May 2025: @@ -8078,7 +8214,7 @@ end subroutine element_to_global_matrix_2d ! Set diva_slope_correction = F to reproduce older results. !TODO - Call this subroutine for both 2D and 3D solvers. - ! First need to switch the index order for 3D matrices. + ! First would need to switch the index order for 3D matrices. subroutine basal_sliding_bc_2d(nx, ny, & nNeighbors, nhalo, & parallel, & @@ -8091,7 +8227,6 @@ subroutine basal_sliding_bc_2d(nx, ny, & whichassemble_beta, & Auu, Avv) - !------------------------------------------------------------------------ ! Increment the Auu and Avv matrices with basal traction terms. ! Do a surface integral over all basal faces that contain at least one node with a stress BC. @@ -8164,6 +8299,7 @@ subroutine basal_sliding_bc_2d(nx, ny, & dphi_dx_2d, dphi_dy_2d, dphi_dz_2d ! derivatives of basis functions, evaluated at quad pts real(dp) :: & + increment, & ! incremental change in matrix element beta_qp, & ! beta evaluated at quadrature point detJ ! determinant of Jacobian for the transformation ! 
between the reference element and true element @@ -8171,24 +8307,23 @@ subroutine basal_sliding_bc_2d(nx, ny, & real(dp), dimension(nNodesPerElement_2d, nNodesPerElement_2d) :: & Kuu, Kvv ! components of element matrix associated with basal sliding - if (verbose_basal) then - call point_diag(beta, 'beta', itest, jtest, rtest, 7, 7, '(f10.0)') - endif - if (whichassemble_beta == HO_ASSEMBLE_BETA_LOCAL) then - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(0,0,0) - else ! 2D problem - m = indxA_2d(0,0) - endif - + m = indxA_2d(0,0) ! Sum over active vertices + !WHL, 12/13/25: The following minor change makes the results independent of processor count. + ! Without the commented code, the 1-core result differs from the 4-core result + ! for some cells near the 4-core processor boundary (i = 211 in a GrIS run). + ! The difference is in the last of 64 bits. + ! I don't know why the original results differ or why the change fixes it. do j = 1, ny-1 do i = 1, nx-1 if (active_vertex(i,j)) then - Auu(i,j,m) = Auu(i,j,m) + dx*dy/vol0 * beta(i,j) - Avv(i,j,m) = Avv(i,j,m) + dx*dy/vol0 * beta(i,j) +! Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta(i,j) +! Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta(i,j) + increment = (dx*dy/vol0) * beta(i,j) + Auu(i,j,m) = Auu(i,j,m) + increment + Avv(i,j,m) = Avv(i,j,m) + increment endif ! active_vertex enddo ! i enddo ! j @@ -8329,20 +8464,6 @@ subroutine basal_sliding_bc_2d(nx, ny, & endif ! whichassemble_beta - if (verbose_basal .and. this_rank==rtest) then - i = itest - j = jtest - if (nNeighbors == nNodeNeighbors_3d) then ! 
3D problem - m = indxA_3d(0,0,0) - else - m = indxA_2d(0,0) - endif - write(iulog,*) ' ' - write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m - write(iulog,*) 'New Auu diagonal:', Auu(i,j,m) - write(iulog,*) 'New Avv diagonal:', Avv(i,j,m) - endif - end subroutine basal_sliding_bc_2d !**************************************************************************** @@ -8455,11 +8576,7 @@ subroutine basal_sliding_bc_2d_diva(& if (whichassemble_beta == HO_ASSEMBLE_BETA_LOCAL) then - if (nNeighbors == nNodeNeighbors_3d) then ! 3D problem - m = indxA_3d(0,0,0) - else ! 2D problem - m = indxA_2d(0,0) - endif + m = indxA_2d(0,0) ! Average the lower ice surface elevation to vertices call glissade_stagger(& @@ -8481,16 +8598,16 @@ subroutine basal_sliding_bc_2d_diva(& call parallel_halo(theta_basal_slope_y, parallel) if (verbose_basal) then - call point_diag(theta_basal_slope_x*180.d0/pi, 'theta_basal_slope_x (deg)', itest, jtest, rtest, 7, 7, '(f10.0)') - call point_diag(theta_basal_slope_y*180.d0/pi, 'theta_basal_slope_y (deg)', itest, jtest, rtest, 7, 7, '(f10.0)') + call point_diag(theta_basal_slope_x*180.d0/pi, 'theta_basal_slope_x (deg)', itest, jtest, rtest, 7, 7) + call point_diag(theta_basal_slope_y*180.d0/pi, 'theta_basal_slope_y (deg)', itest, jtest, rtest, 7, 7) endif ! Sum over active vertices do j = 1, ny-1 do i = 1, nx-1 if (active_vertex(i,j)) then - Auu(i,j,m) = Auu(i,j,m) + dx*dy/vol0 * beta_eff_x(i,j) / cos(theta_basal_slope_x(i,j)) - Avv(i,j,m) = Avv(i,j,m) + dx*dy/vol0 * beta_eff_y(i,j) / cos(theta_basal_slope_y(i,j)) + Auu(i,j,m) = Auu(i,j,m) + (dx*dy/vol0) * beta_eff_x(i,j) / cos(theta_basal_slope_x(i,j)) + Avv(i,j,m) = Avv(i,j,m) + (dx*dy/vol0) * beta_eff_y(i,j) / cos(theta_basal_slope_y(i,j)) endif ! active_vertex enddo ! i enddo ! j @@ -8627,20 +8744,6 @@ subroutine basal_sliding_bc_2d_diva(& endif ! whichassemble_beta - if (verbose_basal .and. this_rank==rtest) then - i = itest - j = jtest - if (nNeighbors == nNodeNeighbors_3d) then ! 
3D problem - m = indxA_3d(0,0,0) - else - m = indxA_2d(0,0) - endif - write(iulog,*) ' ' - write(iulog,*) 'Basal BC: i, j, diagonal index =', i, j, m - write(iulog,*) 'New Auu diagonal:', Auu(i,j,m) - write(iulog,*) 'New Avv diagonal:', Avv(i,j,m) - endif - end subroutine basal_sliding_bc_2d_diva !**************************************************************************** @@ -8962,8 +9065,8 @@ end subroutine basal_sliding_bc_3d !**************************************************************************** - subroutine dirichlet_boundary_conditions_3d(nx, ny, & - nz, nhalo, & + subroutine dirichlet_boundary_conditions_2d(nx, ny, & + nhalo, & active_vertex, & umask_dirichlet, vmask_dirichlet, & uvel, vvel, & @@ -8984,183 +9087,158 @@ subroutine dirichlet_boundary_conditions_3d(nx, ny, & integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions - nz, & ! number of vertical levels where velocity is computed nhalo ! number of halo layers logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! true for active vertices (vertices of active cells) - integer, dimension(nz,nx-1,ny-1), intent(in) :: & - umask_dirichlet, &! Dirichlet mask for u velocity (if true, u is prescribed) - vmask_dirichlet ! Dirichlet mask for v velocity (if true, v is prescribed) + integer, dimension(nx-1,ny-1), intent(in) :: & + umask_dirichlet, &! Dirichlet mask for velocity (if true, u is prescribed) + vmask_dirichlet ! Dirichlet mask for velocity (if true, v is prescribed) - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & uvel, vvel ! velocity components - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts Avu, Avv - real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1), intent(inout) :: & bu, bv ! 
assembled load vector, divided into 2 parts !---------------------------------------------------------------- ! Local variables !---------------------------------------------------------------- - integer :: i, j, k ! Cartesian indices of nodes - integer :: iA, jA, kA ! i, j, and k offsets of neighboring nodes - integer :: m + integer :: i, j ! Cartesian indices of nodes + integer :: iA, jA ! i and j offsets of neighboring nodes + integer :: m, mm ! Loop over all vertices that border locally owned vertices. - ! For outflow BC, OK to skip vertices outside the global domain (i < nhalo or j < nhalo). + ! Locally owned vertices are (staggered_ilo:staggered_ihi, staggered_jlo_staggered_jhi). + ! OK to skip vertices outside the global domain (i < nhalo or j < nhalo). ! Note: Need nhalo >= 2 so as not to step out of bounds. - do j = nhalo, ny-nhalo+1 - do i = nhalo, nx-nhalo+1 - if (active_vertex(i,j)) then - do k = 1, nz + do jA = -1,1 + do iA = -1,1 + m = indxA_2d(iA,jA) + mm = indxA_2d(-iA,-jA) - if (umask_dirichlet(k,i,j) == 1) then + do j = nhalo, ny-nhalo+1 + do i = nhalo, nx-nhalo+1 + if (active_vertex(i,j)) then + + if (umask_dirichlet(i,j) == 1) then ! set the rhs to the prescribed velocity - bu(k,i,j) = uvel(k,i,j) + bu(i,j) = uvel(i,j) - ! loop through matrix values in the rows associated with this node + ! loop through matrix values in the rows associated with this vertex ! (Auu contains one row, Avu contains a second row) - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 - - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then - if (iA==0 .and. jA==0 .and. kA==0) then ! main diagonal + if ( (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then - ! Set Auu = 1 on the main diagonal - ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix - ! 
Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0)
-                            m = indxA_3d(0,0,0)
-                            Auu(m,k,i,j) = 1.d0
-                            Auv(m,k,i,j) = 0.d0
-                            Avu(m,k,i,j) = 0.d0
+                      if (iA==0 .and. jA==0) then   ! main diagonal

-                            !TODO - Set bu above, outside iA/jA loop
-                            ! Set the rhs to the prescribed velocity, forcing u = prescribed uvel for this vertex
-!!                            bu(k,i,j) = uvel(k,i,j)
-
-                         else   ! not on the diagonal
+                         ! Set Auu = 1 on the main diagonal
+                         ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix
+                         ! Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0)
+                         Auu(i,j,indxA_2d(0,0)) = 1.d0
+                         Auv(i,j,indxA_2d(0,0)) = 0.d0
+                         Avu(i,j,indxA_2d(0,0)) = 0.d0

-                            ! Zero out non-diagonal matrix terms in the rows associated with this node
-                            m = indxA_3d(iA,jA,kA)
-                            Auu(m, k, i, j) = 0.d0
-                            Auv(m, k, i, j) = 0.d0
+                      else   ! not on the diagonal

-                            ! Shift terms associated with this velocity to the rhs.
-                            ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix.
-                            m = indxA_3d(-iA,-jA,-kA)
+                         ! Zero out non-diagonal matrix terms in the row associated with this vertex
+                         Auu(i,j,m) = 0.d0
+                         Auv(i,j,m) = 0.d0

-                            if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then
-                               ! Move (Auu term) * uvel to rhs
-                               bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auu(m, k+kA, i+iA, j+jA) * uvel(k,i,j)
-                               Auu(m, k+kA, i+iA, j+jA) = 0.d0
-                            endif
+                         ! Shift terms associated with this velocity to the rhs.
+                         ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix.
+                         ! Recall mm = indxA_2d(-iA,-jA)

-                            if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then
-                               ! Move (Avu term) * uvel to rhs
-                               bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avu(m, k+kA, i+iA, j+jA) * uvel(k,i,j)
-                               Avu(m, k+kA, i+iA, j+jA) = 0.d0
-                            endif
+                         if (umask_dirichlet(i+iA, j+jA) /= 1) then
+                            ! Move (Auu term) * uvel to rhs
+                            bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auu(i+iA, j+jA, mm) * uvel(i,j)
+                            Auu(i+iA, j+jA, mm) = 0.d0
+                         endif

-                         endif    ! 
on the diagonal + if (vmask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Avu term) * uvel to rhs + bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avu(i+iA, j+jA, mm) * uvel(i,j) + Avu(i+iA, j+jA, mm) = 0.d0 + endif - endif ! i+iA, j+jA, and k+kA in bounds + endif ! on the diagonal - enddo ! kA - enddo ! iA - enddo ! jA + endif ! i+iA and j+jA in bounds - endif ! umask_dirichlet + endif ! umask_dirichlet - if (vmask_dirichlet(k,i,j) == 1) then + if (vmask_dirichlet(i,j) == 1) then ! set the rhs to the prescribed velocity - bv(k,i,j) = vvel(k,i,j) - - ! loop through matrix values in the rows associated with this node - ! (Auu contains one row, Avu contains a second row) - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 + bv(i,j) = vvel(i,j) - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then + ! loop through matrix values in the rows associated with this vertex + ! (Auv contains one row, Avv contains a second row) - if (iA==0 .and. jA==0 .and. kA==0) then ! main diagonal + if ( (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then - ! Set Avv = 1 on the main diagonal - ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix - ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) - m = indxA_3d(0,0,0) + if (iA==0 .and. jA==0) then ! main diagonal - Auv(m,k,i,j) = 0.d0 - Avu(m,k,i,j) = 0.d0 - Avv(m,k,i,j) = 1.d0 + ! Set Avv = 1 on the main diagonal + ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix + ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) + Auv(i,j,indxA_2d(0,0)) = 0.d0 + Avu(i,j,indxA_2d(0,0)) = 0.d0 + Avv(i,j,indxA_2d(0,0)) = 1.d0 - !TODO - Set bv above, outside iA/jA loop - ! Set the rhs to the prescribed velocity, forcing v = prescribed vvel for this node -!! bv(k,i,j) = vvel(k,i,j) - - else ! not on the diagonal + else ! not on the diagonal - ! 
Zero out non-diagonal matrix terms in the rows associated with this node - m = indxA_3d(iA,jA,kA) - Avu(m, k, i, j) = 0.d0 - Avv(m, k, i, j) = 0.d0 + ! Zero out non-diagonal matrix terms in the rows associated with this vertex + Avu(i,j,m) = 0.d0 + Avv(i,j,m) = 0.d0 - ! Shift terms associated with this velocity to the rhs. - ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. - m = indxA_3d(-iA,-jA,-kA) + ! Shift terms associated with this velocity to the rhs. + ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. + ! Recall mm = indxA_2d(-iA,-jA) - if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then - ! Move (Auv term) * vvel to rhs - bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) - Auv(m, k+kA, i+iA, j+jA) = 0.d0 - endif + if (umask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Auv term) * vvel to rhs + bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auv(i+iA, j+jA, mm) * vvel(i,j) + Auv(i+iA, j+jA, mm) = 0.d0 + endif - if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then - ! Move (Avv term) * vvel to rhs - bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) - Avv(m, k+kA, i+iA, j+jA) = 0.d0 - endif + if (vmask_dirichlet(i+iA, j+jA) /= 1) then + ! Move (Avv term) * vvel to rhs + bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avv(i+iA, j+jA, mm) * vvel(i,j) + Avv(i+iA, j+jA, mm) = 0.d0 + endif - endif ! on the diagonal + endif ! on the diagonal - endif ! i+iA, j+jA, and k+kA in bounds + endif ! i+iA and j+jA in bounds - enddo ! kA - enddo ! iA - enddo ! jA + endif ! vmask_dirichlet - endif ! vmask_dirichlet + endif ! active_vertex + enddo ! i + enddo ! j - enddo ! k - endif ! active_vertex - enddo ! i - enddo ! j + enddo ! iA + enddo ! 
jA - end subroutine dirichlet_boundary_conditions_3d + end subroutine dirichlet_boundary_conditions_2d !**************************************************************************** - subroutine dirichlet_boundary_conditions_2d(nx, ny, & - nhalo, & + subroutine dirichlet_boundary_conditions_3d(nx, ny, & + nz, nhalo, & active_vertex, & umask_dirichlet, vmask_dirichlet, & uvel, vvel, & @@ -9181,157 +9259,182 @@ subroutine dirichlet_boundary_conditions_2d(nx, ny, & integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions + nz, & ! number of vertical levels where velocity is computed nhalo ! number of halo layers logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! true for active vertices (vertices of active cells) - integer, dimension(nx-1,ny-1), intent(in) :: & - umask_dirichlet, &! Dirichlet mask for velocity (if true, u is prescribed) - vmask_dirichlet ! Dirichlet mask for velocity (if true, v is prescribed) + integer, dimension(nz,nx-1,ny-1), intent(in) :: & + umask_dirichlet, &! Dirichlet mask for u velocity (if true, u is prescribed) + vmask_dirichlet ! Dirichlet mask for v velocity (if true, v is prescribed) - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & uvel, vvel ! velocity components - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(inout) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts Avu, Avv - real(dp), dimension(nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & bu, bv ! assembled load vector, divided into 2 parts !---------------------------------------------------------------- ! Local variables !---------------------------------------------------------------- - integer :: i, j ! Cartesian indices of nodes - integer :: iA, jA ! i and j offsets of neighboring nodes - integer :: m, mm + integer :: i, j, k ! 
Cartesian indices of nodes + integer :: iA, jA, kA ! i, j, and k offsets of neighboring nodes + integer :: m ! Loop over all vertices that border locally owned vertices. - ! Locally owned vertices are (staggered_ilo:staggered_ihi, staggered_jlo_staggered_jhi). - ! OK to skip vertices outside the global domain (i < nhalo or j < nhalo). + ! For outflow BC, OK to skip vertices outside the global domain (i < nhalo or j < nhalo). ! Note: Need nhalo >= 2 so as not to step out of bounds. - do jA = -1,1 - do iA = -1,1 - m = indxA_2d(iA,jA) - mm = indxA_2d(-iA,-jA) - - do j = nhalo, ny-nhalo+1 - do i = nhalo, nx-nhalo+1 - if (active_vertex(i,j)) then + do j = nhalo, ny-nhalo+1 + do i = nhalo, nx-nhalo+1 + if (active_vertex(i,j)) then + do k = 1, nz - if (umask_dirichlet(i,j) == 1) then + if (umask_dirichlet(k,i,j) == 1) then ! set the rhs to the prescribed velocity - bu(i,j) = uvel(i,j) + bu(k,i,j) = uvel(k,i,j) - ! loop through matrix values in the rows associated with this vertex + ! loop through matrix values in the rows associated with this node ! (Auu contains one row, Avu contains a second row) + do kA = -1,1 + do jA = -1,1 + do iA = -1,1 + + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then - if ( (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then + if (iA==0 .and. jA==0 .and. kA==0) then ! main diagonal - if (iA==0 .and. jA==0) then ! main diagonal + ! Set Auu = 1 on the main diagonal + ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix + ! Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0) + m = indxA_3d(0,0,0) + Auu(m,k,i,j) = 1.d0 + Auv(m,k,i,j) = 0.d0 + Avu(m,k,i,j) = 0.d0 - ! Set Auu = 1 on the main diagonal - ! Set Auv term = 0; this term is off-diagonal for the fully assembled matrix - ! 
Set Avu term = 0 to preserve matrix symmetry (given that Auv term = 0) - Auu(i,j,indxA_2d(0,0)) = 1.d0 - Auv(i,j,indxA_2d(0,0)) = 1.d0 - Avu(i,j,indxA_2d(0,0)) = 1.d0 + !TODO - Set bu above, outside iA/jA loop + ! Set the rhs to the prescribed velocity, forcing u = prescribed uvel for this vertex +!! bu(k,i,j) = uvel(k,i,j) + + else ! not on the diagonal - else ! not on the diagonal + ! Zero out non-diagonal matrix terms in the rows associated with this node + m = indxA_3d(iA,jA,kA) + Auu(m, k, i, j) = 0.d0 + Auv(m, k, i, j) = 0.d0 - ! Zero out non-diagonal matrix terms in the row associated with this vertex - Auu(i,j,m) = 0.d0 - Auv(i,j,m) = 0.d0 + ! Shift terms associated with this velocity to the rhs. + ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. + m = indxA_3d(-iA,-jA,-kA) - ! Shift terms associated with this velocity to the rhs. - ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. - ! Recall mm = indxA_2d(-iA,-jA) + if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then + ! Move (Auu term) * uvel to rhs + bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auu(m, k+kA, i+iA, j+jA) * uvel(k,i,j) + Auu(m, k+kA, i+iA, j+jA) = 0.d0 + endif - if (umask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Auu term) * uvel to rhs - bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auu(i+iA, j+jA, mm) * uvel(i,j) - Auu(i+iA, j+jA, mm) = 0.d0 - endif + if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then + ! Move (Avu term) * uvel to rhs + bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avu(m, k+kA, i+iA, j+jA) * uvel(k,i,j) + Avu(m, k+kA, i+iA, j+jA) = 0.d0 + endif - if (vmask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Avu term) * uvel to rhs - bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avu(i+iA, j+jA, mm) * uvel(i,j) - Avu(i+iA, j+jA, mm) = 0.d0 - endif + endif ! on the diagonal - endif ! on the diagonal + endif ! i+iA, j+jA, and k+kA in bounds - endif ! i+iA and j+jA in bounds + enddo ! kA + enddo ! iA + enddo ! 
jA - endif ! umask_dirichlet + endif ! umask_dirichlet - if (vmask_dirichlet(i,j) == 1) then + if (vmask_dirichlet(k,i,j) == 1) then ! set the rhs to the prescribed velocity - bv(i,j) = vvel(i,j) + bv(k,i,j) = vvel(k,i,j) - ! loop through matrix values in the rows associated with this vertex - ! (Auv contains one row, Avv contains a second row) + ! loop through matrix values in the rows associated with this node + ! (Auu contains one row, Avu contains a second row) + do kA = -1,1 + do jA = -1,1 + do iA = -1,1 - if ( (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then - if (iA==0 .and. jA==0) then ! main diagonal + if (iA==0 .and. jA==0 .and. kA==0) then ! main diagonal - ! Set Avv = 1 on the main diagonal - ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix - ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) - Auv(i,j,indxA_2d(0,0)) = 0.d0 - Avu(i,j,indxA_2d(0,0)) = 0.d0 - Avv(i,j,indxA_2d(0,0)) = 1.d0 + ! Set Avv = 1 on the main diagonal + ! Set Avu term = 0; this term is off-diagonal for the fully assembled matrix + ! Set Auv term = 0 to preserve matrix symmetry (given that Avu term = 0) + m = indxA_3d(0,0,0) - else ! not on the diagonal + Auv(m,k,i,j) = 0.d0 + Avu(m,k,i,j) = 0.d0 + Avv(m,k,i,j) = 1.d0 - ! Zero out non-diagonal matrix terms in the rows associated with this vertex - Avu(i,j,m) = 0.d0 - Avv(i,j,m) = 0.d0 + !TODO - Set bv above, outside iA/jA loop + ! Set the rhs to the prescribed velocity, forcing v = prescribed vvel for this node +!! bv(k,i,j) = vvel(k,i,j) + + else ! not on the diagonal - ! Shift terms associated with this velocity to the rhs. - ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. - ! Recall mm = indxA_2d(-iA,-jA) + ! 
Zero out non-diagonal matrix terms in the rows associated with this node + m = indxA_3d(iA,jA,kA) + Avu(m, k, i, j) = 0.d0 + Avv(m, k, i, j) = 0.d0 - if (umask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Auv term) * vvel to rhs - bu(i+iA, j+jA) = bu(i+iA, j+jA) - Auv(i+iA, j+jA, mm) * vvel(i,j) - Auv(i+iA, j+jA, mm) = 0.d0 - endif + ! Shift terms associated with this velocity to the rhs. + ! Note: The remaining operations do not change the answer, but do restore symmetry to the matrix. + m = indxA_3d(-iA,-jA,-kA) - if (vmask_dirichlet(i+iA, j+jA) /= 1) then - ! Move (Avv term) * vvel to rhs - bv(i+iA, j+jA) = bv(i+iA, j+jA) - Avv(i+iA, j+jA, mm) * vvel(i,j) - Avv(i+iA, j+jA, mm) = 0.d0 - endif + if (umask_dirichlet(k+kA, i+iA, j+jA) /= 1) then + ! Move (Auv term) * vvel to rhs + bu(k+kA, i+iA, j+jA) = bu(k+kA, i+iA, j+jA) - Auv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) + Auv(m, k+kA, i+iA, j+jA) = 0.d0 + endif - endif ! on the diagonal + if (vmask_dirichlet(k+kA, i+iA, j+jA) /= 1) then + ! Move (Avv term) * vvel to rhs + bv(k+kA, i+iA, j+jA) = bv(k+kA, i+iA, j+jA) - Avv(m, k+kA, i+iA, j+jA) * vvel(k,i,j) + Avv(m, k+kA, i+iA, j+jA) = 0.d0 + endif - endif ! i+iA and j+jA in bounds + endif ! on the diagonal - endif ! vmask_dirichlet + endif ! i+iA, j+jA, and k+kA in bounds - endif ! active_vertex - enddo ! i - enddo ! j + enddo ! kA + enddo ! iA + enddo ! jA - enddo ! iA - enddo ! jA + endif ! vmask_dirichlet - end subroutine dirichlet_boundary_conditions_2d + enddo ! k + endif ! active_vertex + enddo ! i + enddo ! j + + end subroutine dirichlet_boundary_conditions_3d !**************************************************************************** - subroutine compute_residual_vector_3d(nx, ny, nz, & + subroutine compute_residual_vector_2d(nx, ny, & parallel, & itest, jtest, rtest, & active_vertex, & @@ -9345,9 +9448,12 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & ! Compute the residual vector Ax - b and its L2 norm. ! 
This subroutine assumes that the matrix is stored in structured (x/y/z) format. + !---------------------------------------------------------------- + ! Input/output arguments + !---------------------------------------------------------------- + integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions (for scalars) type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -9358,25 +9464,25 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 1st dimension = 3 (node and its nearest neighbors in x, y and z direction) - ! other dimensions = (z,x,y) indices + ! 3rd dimension = 9 (node and its nearest neighbors in x and y directions) + ! 1st and 2nd dimensions = (x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & uvel, vvel ! u and v components of velocity (m/yr) - real(dp), dimension(nz,nx-1,ny-1), intent(out) :: & + real(dp), dimension(nx-1,ny-1), intent(out) :: & resid_u, & ! residual vector, divided into 2 parts - resid_v ! + resid_v real(dp), intent(out) :: & L2_norm ! L2 norm of residual vector, |Ax - b| @@ -9384,12 +9490,17 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & real(dp), intent(out), optional :: & L2_norm_relative ! 
L2 norm of residual vector relative to rhs, |Ax - b| / |b| - real(dp), dimension(nz,nx-1,ny-1) :: & - resid_sq ! resid_u^2 + resid_v^2 + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- + + real(dp), dimension(nx-1,ny-1) :: & + worku, workv, & ! work arrays for global sums + resid_sq ! resid_u^2 + resid_v^2 real(dp) :: my_max_resid, global_max_resid - integer :: i, j, k, iA, jA, kA, m, iglobal, jglobal + integer :: i, j, iA, jA, m, iglobal, jglobal real(dp) :: L2_norm_rhs ! L2 norm of rhs vector, |b| @@ -9404,75 +9515,50 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & ! Compute u and v components of A*x - resid_u(:,:,:) = 0.d0 - resid_v(:,:,:) = 0.d0 + resid_u(:,:) = 0.d0 + resid_v(:,:) = 0.d0 - !TODO - Replace the following by a call to matvec_multiply_structured_3d ! Loop over locally owned vertices - - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - - if (active_vertex(i,j)) then - - do k = 1, nz - - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 - - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then - - m = indxA_3d(iA,jA,kA) - - resid_u(k,i,j) = resid_u(k,i,j) & - + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & - + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - - resid_v(k,i,j) = resid_v(k,i,j) & - + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & - + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) - - endif ! in bounds - - enddo ! kA - enddo ! iA - enddo ! jA - - enddo ! k - - endif ! active_vertex - - enddo ! i - enddo ! j + do jA = -1,1 + do iA = -1,1 + m = indxA_2d(iA,jA) + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + if ( (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. 
j+jA <= ny-1) ) then + resid_u(i,j) = resid_u(i,j) & + + Auu(i,j,m)*uvel(i+iA,j+jA) & + + Auv(i,j,m)*vvel(i+iA,j+jA) + resid_v(i,j) = resid_v(i,j) & + + Avu(i,j,m)*uvel(i+iA,j+jA) & + + Avv(i,j,m)*vvel(i+iA,j+jA) + endif ! in bounds + endif ! active_vertex + enddo ! i + enddo ! j + enddo ! iA + enddo ! jA ! Subtract b to get A*x - b - ! Sum up squared L2 norm as we go - - L2_norm = 0.d0 - resid_sq(:,:,:) = 0.0d0 + worku(:,:) = 0.0d0 + workv(:,:) = 0.0d0 ! Loop over locally owned vertices - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - do k = 1, nz - resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j) - resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j) - resid_sq(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) + resid_v(k,i,j)*resid_v(k,i,j) - L2_norm = L2_norm + resid_sq(k,i,j) - enddo ! k - endif ! active vertex - enddo ! i - enddo ! j + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + resid_u(i,j) = resid_u(i,j) - bu(i,j) + resid_v(i,j) = resid_v(i,j) - bv(i,j) + worku(i,j) = resid_u(i,j)*resid_u(i,j) + workv(i,j) = resid_v(i,j)*resid_v(i,j) + endif ! active vertex + enddo ! i + enddo ! j ! Take global sum, then take square root - L2_norm = parallel_reduce_sum(L2_norm) + L2_norm = parallel_global_sum_stagger(worku, parallel, workv) L2_norm = sqrt(L2_norm) if (verbose_residual) then @@ -9480,28 +9566,28 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & if (this_rank==rtest) then i = itest j = jtest - k = ktest - write(iulog,*) 'In compute_residual_vector_3d: task, i, j, k =', this_rank, i, j, k - write(iulog, '(a16, 2f13.7, 2e13.5)') & - ' u, v, ru, rv: ', uvel(k,i,j), vvel(k,i,j), resid_u(k,i,j), resid_v(k,i,j) + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' ' + write(iulog,*) 'In compute_residual_vector_2d: test ig, jg =', i, j + write(iulog, '(a15, 2f12.5, 2e13.5)') & + ' u, v, ru, rv:', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j) endif ! 
Compute max value of (squared) residual on this task. ! If this task owns the vertex with the global max residual, then print a diagnostic message. + resid_sq(:,:) = worku(:,:) + workv(:,:) my_max_resid = maxval(resid_sq) global_max_resid = parallel_reduce_max(my_max_resid) if (abs((my_max_resid - global_max_resid)/global_max_resid) < 1.0d-6) then do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - do k = 1, nz - if (abs((resid_sq(k,i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog, '(a24, 2i6, i4, 2e13.5, e16.8)') 'ig, jg, k, ru, rv, rmax:', & - iglobal, jglobal, k, resid_u(k,i,j), resid_v(k,i,j), sqrt(global_max_resid) - write(iulog,*) ' ' - endif - enddo + if (abs((resid_sq(i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, global rmax:', & + iglobal, jglobal, resid_u(i,j), resid_v(i,j), sqrt(global_max_resid) + write(iulog,*) ' ' + endif enddo enddo endif @@ -9510,35 +9596,36 @@ subroutine compute_residual_vector_3d(nx, ny, nz, & if (present(L2_norm_relative)) then ! compute L2_norm relative to rhs - L2_norm_rhs = 0.d0 + worku(:,:) = 0.0d0 + workv(:,:) = 0.0d0 + ! Loop over locally owned vertices do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - do k = 1, nz - L2_norm_rhs = L2_norm_rhs + bu(k,i,j)*bu(k,i,j) + bv(k,i,j)*bv(k,i,j) - enddo ! k - endif ! active vertex - enddo ! i - enddo ! j + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + worku(i,j) = bu(i,j)*bu(i,j) + workv(i,j) = bv(i,j)*bv(i,j) + endif ! active vertex + enddo ! i + enddo ! j ! 
Take global sum, then take square root - L2_norm_rhs = parallel_reduce_sum(L2_norm_rhs) + L2_norm_rhs = parallel_global_sum_stagger(worku, parallel, workv) L2_norm_rhs = sqrt(L2_norm_rhs) - if (L2_norm_rhs > 0.d0) then + if (L2_norm_rhs > 0.0d0) then L2_norm_relative = L2_norm / L2_norm_rhs else - L2_norm_relative = 0.d0 + L2_norm_relative = 0.0d0 endif endif - end subroutine compute_residual_vector_3d + end subroutine compute_residual_vector_2d !**************************************************************************** - subroutine compute_residual_vector_2d(nx, ny, & + subroutine compute_residual_vector_3d(nx, ny, nz, & parallel, & itest, jtest, rtest, & active_vertex, & @@ -9552,8 +9639,13 @@ subroutine compute_residual_vector_2d(nx, ny, & ! Compute the residual vector Ax - b and its L2 norm. ! This subroutine assumes that the matrix is stored in structured (x/y/z) format. + !---------------------------------------------------------------- + ! Input/output variables + !---------------------------------------------------------------- + integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) + nx, ny, & ! horizontal grid dimensions (for scalars) + nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -9564,25 +9656,25 @@ subroutine compute_residual_vector_2d(nx, ny, & logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 3rd dimension = 9 (node and its nearest neighbors in x and y directions) - ! 1st and 2nd dimensions = (x,y) indices + ! 1st dimension = 3 (node and its nearest neighbors in x, y and z direction) + ! other dimensions = (z,x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! 
| - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & uvel, vvel ! u and v components of velocity (m/yr) - real(dp), dimension(nx-1,ny-1), intent(out) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(out) :: & resid_u, & ! residual vector, divided into 2 parts - resid_v + resid_v ! real(dp), intent(out) :: & L2_norm ! L2 norm of residual vector, |Ax - b| @@ -9590,12 +9682,17 @@ subroutine compute_residual_vector_2d(nx, ny, & real(dp), intent(out), optional :: & L2_norm_relative ! L2 norm of residual vector relative to rhs, |Ax - b| / |b| - real(dp), dimension(nx-1,ny-1) :: & + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- + + real(dp), dimension(nz,nx-1,ny-1) :: & + worku, workv, & ! work arrays for global sums resid_sq ! resid_u^2 + resid_v^2 real(dp) :: my_max_resid, global_max_resid - integer :: i, j, iA, jA, m, iglobal, jglobal + integer :: i, j, k, iA, jA, kA, m, iglobal, jglobal real(dp) :: L2_norm_rhs ! L2 norm of rhs vector, |b| @@ -9610,54 +9707,62 @@ subroutine compute_residual_vector_2d(nx, ny, & ! Compute u and v components of A*x - resid_u(:,:) = 0.d0 - resid_v(:,:) = 0.d0 + resid_u(:,:,:) = 0.d0 + resid_v(:,:,:) = 0.d0 ! Loop over locally owned vertices - do jA = -1,1 - do iA = -1,1 - m = indxA_2d(iA,jA) - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - if ( (i+iA >= 1 .and. i+iA <= nx-1) & - .and. & - (j+jA >= 1 .and. j+jA <= ny-1) ) then - resid_u(i,j) = resid_u(i,j) & - + Auu(i,j,m)*uvel(i+iA,j+jA) & - + Auv(i,j,m)*vvel(i+iA,j+jA) - resid_v(i,j) = resid_v(i,j) & - + Avu(i,j,m)*uvel(i+iA,j+jA) & - + Avv(i,j,m)*vvel(i+iA,j+jA) - endif ! in bounds - endif ! 
active_vertex - enddo ! i - enddo ! j - enddo ! iA - enddo ! jA + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + do kA = -1,1 + do jA = -1,1 + do iA = -1,1 + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & + .and. & + (j+jA >= 1 .and. j+jA <= ny-1) ) then + + m = indxA_3d(iA,jA,kA) + + resid_u(k,i,j) = resid_u(k,i,j) & + + Auu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & + + Auv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) + + resid_v(k,i,j) = resid_v(k,i,j) & + + Avu(m,k,i,j)*uvel(k+kA,i+iA,j+jA) & + + Avv(m,k,i,j)*vvel(k+kA,i+iA,j+jA) + endif ! in bounds + enddo ! kA + enddo ! iA + enddo ! jA + enddo ! k + endif ! active_vertex + enddo ! i + enddo ! j ! Subtract b to get A*x - b - ! Sum up squared L2 norm as we go - L2_norm = 0.d0 - resid_sq(:,:) = 0.0d0 + worku(:,:,:) = 0.0d0 + workv(:,:,:) = 0.0d0 ! Loop over locally owned vertices - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - resid_u(i,j) = resid_u(i,j) - bu(i,j) - resid_v(i,j) = resid_v(i,j) - bv(i,j) - resid_sq(i,j) = resid_u(i,j)*resid_u(i,j) + resid_v(i,j)*resid_v(i,j) - L2_norm = L2_norm + resid_sq(i,j) - endif ! active vertex - enddo ! i + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + resid_u(k,i,j) = resid_u(k,i,j) - bu(k,i,j) + resid_v(k,i,j) = resid_v(k,i,j) - bv(k,i,j) + worku(k,i,j) = resid_u(k,i,j)*resid_u(k,i,j) + workv(k,i,j) = resid_v(k,i,j)*resid_v(k,i,j) + enddo ! k + endif ! active vertex + enddo ! i enddo ! j ! 
Take global sum, then take square root - - L2_norm = parallel_reduce_sum(L2_norm) + L2_norm = parallel_global_sum_stagger(worku, parallel, workv) L2_norm = sqrt(L2_norm) if (verbose_residual) then @@ -9665,9 +9770,10 @@ subroutine compute_residual_vector_2d(nx, ny, & if (this_rank==rtest) then i = itest j = jtest - write(iulog,*) 'In compute_residual_vector_2d: task, i, j =', this_rank, i, j + k = ktest + write(iulog,*) 'In compute_residual_vector_3d: task, i, j, k =', this_rank, i, j, k write(iulog, '(a16, 2f13.7, 2e13.5)') & - ' u, v, ru, rv: ', uvel(i,j), vvel(i,j), resid_u(i,j), resid_v(i,j) + ' u, v, ru, rv: ', uvel(k,i,j), vvel(k,i,j), resid_u(k,i,j), resid_v(k,i,j) endif ! Compute max value of (squared) residual on this task. @@ -9678,12 +9784,14 @@ subroutine compute_residual_vector_2d(nx, ny, & if (abs((my_max_resid - global_max_resid)/global_max_resid) < 1.0d-6) then do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs((resid_sq(i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog, '(a24, 2i6, 2e13.5, e16.8)') 'ig, jg, ru, rv, rmax:', & - iglobal, jglobal, resid_u(i,j), resid_v(i,j), sqrt(global_max_resid) - write(iulog,*) ' ' - endif + do k = 1, nz + if (abs((resid_sq(k,i,j) - global_max_resid)/global_max_resid) < 1.0d-6) then + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog, '(a24, 2i6, i4, 2e13.5, e16.8)') 'ig, jg, k, ru, rv, rmax:', & + iglobal, jglobal, k, resid_u(k,i,j), resid_v(k,i,j), sqrt(global_max_resid) + write(iulog,*) ' ' + endif + enddo enddo enddo endif @@ -9692,48 +9800,60 @@ subroutine compute_residual_vector_2d(nx, ny, & if (present(L2_norm_relative)) then ! 
compute L2_norm relative to rhs - L2_norm_rhs = 0.d0 + worku(:,:,:) = 0.0d0 + workv(:,:,:) = 0.0d0 do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - L2_norm_rhs = L2_norm_rhs + bu(i,j)*bu(i,j) + bv(i,j)*bv(i,j) - endif ! active vertex - enddo ! i + do i = staggered_ilo, staggered_ihi + if (active_vertex(i,j)) then + do k = 1, nz + worku(k,i,j) = bu(k,i,j)*bu(k,i,j) + workv(k,i,j) = bv(k,i,j)*bv(k,i,j) + enddo ! k + endif ! active vertex + enddo ! i enddo ! j ! Take global sum, then take square root - L2_norm_rhs = parallel_reduce_sum(L2_norm_rhs) + L2_norm_rhs = parallel_global_sum_stagger(worku, parallel, workv) L2_norm_rhs = sqrt(L2_norm_rhs) - if (L2_norm_rhs > 0.d0) then + if (L2_norm_rhs > 0.0d0) then L2_norm_relative = L2_norm / L2_norm_rhs else - L2_norm_relative = 0.d0 + L2_norm_relative = 0.0d0 endif endif - end subroutine compute_residual_vector_2d + end subroutine compute_residual_vector_3d !**************************************************************************** - subroutine evaluate_accelerated_picard_3d(& + subroutine evaluate_accelerated_picard_2d(& + whichapprox, rtest, & L2_norm, L2_norm_large, & L2_norm_alpha_sav, & alpha_accel, alpha_accel_max, & gamma_accel, resid_reduction_threshold, & - uvel, vvel, & - Auu, Auv, & - Avu, Avv, & - uvel_old, vvel_old, & - duvel, dvvel, & - uvel_sav, vvel_sav, & - Auu_sav, Auv_sav, & - Avu_sav, Avv_sav, & + uvel_2d, vvel_2d, & + Auu_2d, Auv_2d, & + Avu_2d, Avv_2d, & + uvel_2d_old, vvel_2d_old, & + duvel_2d, dvvel_2d, & + uvel_2d_sav, vvel_2d_sav, & + Auu_2d_sav, Auv_2d_sav, & + Avu_2d_sav, Avv_2d_sav, & beta_internal, beta_internal_sav, & + beta_eff_x, beta_eff_x_sav, & + beta_eff_y, beta_eff_y_sav, & + omega_k, omega_k_sav, & assembly_is_done) + integer, intent(in) :: & + rtest, & ! rank for diagnostic point + whichapprox ! which Stokes approximation to use (SIA, SSA, DIVA, L1L2, BP) + real(dp), intent(in) :: & L2_norm, & ! 
latest value of L2 norm of residual L2_norm_large, & ! large value for re-initializing the L2 norm @@ -9746,24 +9866,33 @@ subroutine evaluate_accelerated_picard_3d(& L2_norm_alpha_sav ! value of L2 norm of residual, given the previous alpha_accel real(dp), dimension(:,:), intent(inout) :: & + uvel_2d, vvel_2d, & ! latest guess for the velocity solution + uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration + duvel_2d, dvvel_2d, & ! difference between old velocity solution and latest solution + uvel_2d_sav, vvel_2d_sav, & ! best velocity solution so far, based on the residual norm beta_internal, & ! beta_internal as a function of uvel_2d and vvel_2d beta_internal_sav ! beta_internal as a function of uvel_2d_sav and vvel_2d_sav + real(dp), dimension(:,:), intent(inout) :: & + beta_eff_x, beta_eff_x_sav, & + beta_eff_y, beta_eff_y_sav + real(dp), dimension(:,:,:), intent(inout) :: & - uvel, vvel, & ! latest guess for the velocity solution - uvel_old, vvel_old, & ! velocity solution from previous nonlinear iteration - duvel, dvvel, & ! difference between old velocity solution and latest solution - uvel_sav, vvel_sav ! best velocity solution so far, based on the residual norm + omega_k, omega_k_sav - real(dp), dimension(:,:,:,:), intent(inout) :: & - Auu, Auv, & ! latest assembled matrices as a function of uvel_2d and vvel_2d - Avu, Avv, & - Auu_sav, Auv_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav - Avu_sav, Avv_sav + real(dp), dimension(:,:,:), intent(inout) :: & + Auu_2d, Auv_2d, & ! latest assembled matrices as a function of uvel_2d and vvel_2d + Avu_2d, Avv_2d, & + Auu_2d_sav, Auv_2d_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav + Avu_2d_sav, Avv_2d_sav logical, intent(inout) :: & assembly_is_done ! if true, then accept the current assembled matrices and proceed to solution + if (verbose_picard .and. 
this_rank == rtest) then + write(iulog,*) 'Evaluate Picard acceleration: L2_norm_sav, L2_norm:', L2_norm_alpha_sav, L2_norm + endif + if (L2_norm < resid_reduction_threshold*L2_norm_alpha_sav .and. & alpha_accel + gamma_accel <= alpha_accel_max) then @@ -9771,39 +9900,39 @@ subroutine evaluate_accelerated_picard_3d(& ! ("Substantially" is defined by the factor resid_reduction_threshold < 1.) ! Save the latest values of the solver inputs - - uvel_sav = uvel - vvel_sav = vvel - Auu_sav = Auu - Auv_sav = Auv - Avu_sav = Avu - Avv_sav = Avv + uvel_2d_sav = uvel_2d + vvel_2d_sav = vvel_2d + Auu_2d_sav = Auu_2d + Auv_2d_sav = Auv_2d + Avu_2d_sav = Avu_2d + Avv_2d_sav = Avv_2d beta_internal_sav = beta_internal + if (whichapprox == HO_APPROX_DIVA) then + beta_eff_x_sav = beta_eff_x + beta_eff_y_sav = beta_eff_y + omega_k_sav = omega_k + endif ! Increase alpha_accel and see if the residual keeps getting smaller. ! If not, we will back off to the saved values above. alpha_accel = alpha_accel + gamma_accel L2_norm_alpha_sav = L2_norm - if (verbose_picard .and. main_task) then - write(iulog,*) 'Keep going, alpha =', alpha_accel + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Increase alpha to', alpha_accel endif ! Since assembly_is_done = F, we now return to the start of the loop: - ! do while (.not.assembly_is_done) + ! do while (.not.assembly_is_done) elseif (L2_norm < L2_norm_alpha_sav) then ! The residual norm decreased only a little (or we have reached alpha_accel_max). ! Call it good and move on to the solver. - if (verbose_picard .and. main_task) then - write(iulog,*) 'Hold, alpha =', alpha_accel - endif - ! Save this velocity as the starting point for the next nonlinear iteration - uvel_old = uvel - vvel_old = vvel + uvel_2d_old = uvel_2d + vvel_2d_old = vvel_2d ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 @@ -9812,26 +9941,30 @@ subroutine evaluate_accelerated_picard_3d(& ! 
proceed to the matrix solution assembly_is_done = .true. - else + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Hold alpha at', alpha_accel + write(iulog,*) ' Continue to matrix solver' + endif + else ! The residual is larger than the previous value. ! Switch back to the previously saved velocity and matrix with the lower residual. - uvel = uvel_sav - vvel = vvel_sav - Auu = Auu_sav - Auv = Auv_sav - Avu = Avu_sav - Avv = Avv_sav + uvel_2d = uvel_2d_sav + vvel_2d = vvel_2d_sav + Auu_2d = Auu_2d_sav + Auv_2d = Auv_2d_sav + Avu_2d = Avu_2d_sav + Avv_2d = Avv_2d_sav beta_internal = beta_internal_sav + if (whichapprox == HO_APPROX_DIVA) then + beta_eff_x = beta_eff_x_sav + beta_eff_y = beta_eff_y_sav + omega_k = omega_k_sav + endif ! Save this velocity as the starting point for the next nonlinear iteration - uvel_old = uvel - vvel_old = vvel - - if (verbose_picard .and. main_task) then - write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel - write(iulog,*) 'Continue to matrix solver' - endif + uvel_2d_old = uvel_2d + vvel_2d_old = vvel_2d ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 @@ -9840,31 +9973,35 @@ subroutine evaluate_accelerated_picard_3d(& ! proceed to the matrix solution assembly_is_done = .true. - endif ! L2_norm of residual has reduced + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Reduce alpha to', alpha_accel - gamma_accel + write(iulog,*) 'Continue to matrix solver' + endif - end subroutine evaluate_accelerated_picard_3d + endif ! 
L2_norm of residual has reduced + end subroutine evaluate_accelerated_picard_2d + !**************************************************************************** - subroutine evaluate_accelerated_picard_2d(& - nx, ny, & + subroutine evaluate_accelerated_picard_3d(& + rtest, & L2_norm, L2_norm_large, & L2_norm_alpha_sav, & alpha_accel, alpha_accel_max, & gamma_accel, resid_reduction_threshold, & - uvel_2d, vvel_2d, & - Auu_2d, Auv_2d, & - Avu_2d, Avv_2d, & - uvel_2d_old, vvel_2d_old, & - duvel_2d, dvvel_2d, & - uvel_2d_sav, vvel_2d_sav, & - Auu_2d_sav, Auv_2d_sav, & - Avu_2d_sav, Avv_2d_sav, & + uvel, vvel, & + Auu, Auv, & + Avu, Avv, & + uvel_old, vvel_old, & + duvel, dvvel, & + uvel_sav, vvel_sav, & + Auu_sav, Auv_sav, & + Avu_sav, Avv_sav, & beta_internal, beta_internal_sav, & assembly_is_done) - integer, intent(in) :: & - nx, ny ! number of grid cells in each direction + integer, intent(in) :: rtest ! rank for diagnostic point real(dp), intent(in) :: & L2_norm, & ! latest value of L2 norm of residual @@ -9877,19 +10014,21 @@ subroutine evaluate_accelerated_picard_2d(& alpha_accel, & ! factor for extending the vector (duvel, dvvel) to reduce the residual L2_norm_alpha_sav ! value of L2 norm of residual, given the previous alpha_accel - real(dp), dimension(nx-1,ny-1), intent(inout) :: & - uvel_2d, vvel_2d, & ! latest guess for the velocity solution - uvel_2d_old, vvel_2d_old, & ! velocity solution from previous nonlinear iteration - duvel_2d, dvvel_2d, & ! difference between old velocity solution and latest solution - uvel_2d_sav, vvel_2d_sav, & ! best velocity solution so far, based on the residual norm + real(dp), dimension(:,:), intent(inout) :: & beta_internal, & ! beta_internal as a function of uvel_2d and vvel_2d beta_internal_sav ! beta_internal as a function of uvel_2d_sav and vvel_2d_sav - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & - Auu_2d, Auv_2d, & ! 
latest assembled matrices as a function of uvel_2d and vvel_2d - Avu_2d, Avv_2d, & - Auu_2d_sav, Auv_2d_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav - Avu_2d_sav, Avv_2d_sav + real(dp), dimension(:,:,:), intent(inout) :: & + uvel, vvel, & ! latest guess for the velocity solution + uvel_old, vvel_old, & ! velocity solution from previous nonlinear iteration + duvel, dvvel, & ! difference between old velocity solution and latest solution + uvel_sav, vvel_sav ! best velocity solution so far, based on the residual norm + + real(dp), dimension(:,:,:,:), intent(inout) :: & + Auu, Auv, & ! latest assembled matrices as a function of uvel_2d and vvel_2d + Avu, Avv, & + Auu_sav, Auv_sav, & ! assembled matrices as a function of uvel_2d_sav and vvel_2d_sav + Avu_sav, Avv_sav logical, intent(inout) :: & assembly_is_done ! if true, then accept the current assembled matrices and proceed to solution @@ -9902,12 +10041,12 @@ subroutine evaluate_accelerated_picard_2d(& ! Save the latest values of the solver inputs - uvel_2d_sav = uvel_2d - vvel_2d_sav = vvel_2d - Auu_2d_sav = Auu_2d - Auv_2d_sav = Auv_2d - Avu_2d_sav = Avu_2d - Avv_2d_sav = Avv_2d + uvel_sav = uvel + vvel_sav = vvel + Auu_sav = Auu + Auv_sav = Auv + Avu_sav = Avu + Avv_sav = Avv beta_internal_sav = beta_internal ! Increase alpha_accel and see if the residual keeps getting smaller. @@ -9915,8 +10054,8 @@ subroutine evaluate_accelerated_picard_2d(& alpha_accel = alpha_accel + gamma_accel L2_norm_alpha_sav = L2_norm - if (verbose_picard .and. main_task) then - write(iulog,*) 'Keep going, alpha =', alpha_accel + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Increase alpha to', alpha_accel endif ! Since assembly_is_done = F, we now return to the start of the loop: @@ -9932,8 +10071,8 @@ subroutine evaluate_accelerated_picard_2d(& endif ! 
Save this velocity as the starting point for the next nonlinear iteration - uvel_2d_old = uvel_2d - vvel_2d_old = vvel_2d + uvel_old = uvel + vvel_old = vvel ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 @@ -9942,26 +10081,26 @@ subroutine evaluate_accelerated_picard_2d(& ! proceed to the matrix solution assembly_is_done = .true. + if (verbose_picard .and. this_rank == rtest) then + write(iulog,*) 'Hold alpha at', alpha_accel + write(iulog,*) ' Continue to matrix solver' + endif + else ! The residual is larger than the previous value. ! Switch back to the previously saved velocity and matrix with the lower residual. - uvel_2d = uvel_2d_sav - vvel_2d = vvel_2d_sav - Auu_2d = Auu_2d_sav - Auv_2d = Auv_2d_sav - Avu_2d = Avu_2d_sav - Avv_2d = Avv_2d_sav + uvel = uvel_sav + vvel = vvel_sav + Auu = Auu_sav + Auv = Auv_sav + Avu = Avu_sav + Avv = Avv_sav beta_internal = beta_internal_sav ! Save this velocity as the starting point for the next nonlinear iteration - uvel_2d_old = uvel_2d - vvel_2d_old = vvel_2d - - if (verbose_picard .and. main_task) then - write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel - write(iulog,*) 'Continue to matrix solver' - endif + uvel_old = uvel + vvel_old = vvel ! Reset alpha_accel and L2_norm_alpha_sav for the next nonlinear iteration alpha_accel = 1.0d0 @@ -9970,9 +10109,140 @@ subroutine evaluate_accelerated_picard_2d(& ! proceed to the matrix solution assembly_is_done = .true. + if (verbose_picard .and. main_task) then + write(iulog,*) 'Back up to alpha =', alpha_accel - gamma_accel + write(iulog,*) 'Continue to matrix solver' + endif + endif ! 
L2_norm of residual has reduced - end subroutine evaluate_accelerated_picard_2d + end subroutine evaluate_accelerated_picard_3d + +!**************************************************************************** + + subroutine compute_residual_velocity_2d(whichresid, parallel, & + uvel, vvel, & + usav, vsav, & + resid_velo) + + integer, intent(in) :: & + whichresid ! option for method to use when calculating residual + + type(parallel_type), intent(in) :: & + parallel ! info for parallel communication + + real(dp), dimension(:,:), intent(in) :: & + uvel, vvel, & ! current guess for velocity + usav, vsav ! previous guess for velocity + + real(dp), intent(out) :: & + resid_velo ! quantity related to velocity convergence + + integer :: & + imaxdiff, jmaxdiff ! location of maximum speed difference + ! currently computed but not used + + integer :: i, j, count + + real(dp) :: & + speed, & ! current guess for ice speed + oldspeed, & ! previous guess for ice speed + diffspeed ! abs(speed-oldspeed) + + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + + ! Compute a residual quantity based on convergence of the velocity field. + + ! options for residual calculation method, as specified in configuration file + ! case(0): use max of abs( vel_old - vel ) / vel ) + ! case(1): use max of abs( vel_old - vel ) / vel ) but ignore basal vels + ! case(2): use mean of abs( vel_old - vel ) / vel ) + ! case(3): use max of abs( vel_old - vel ) / vel ) (in addition to L2 norm) + + resid_velo = 0.d0 + imaxdiff = 0 + jmaxdiff = 0 + + select case (whichresid) + + case(HO_RESID_MAXU_NO_UBAS) ! max speed difference, excluding the bed + + ! 
Loop over locally owned vertices + + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) + if (speed /= 0.d0) then + oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + if (diffspeed > resid_velo) then + resid_velo = diffspeed + imaxdiff = i + jmaxdiff = j + endif + endif + enddo + enddo + + ! take global max + resid_velo = parallel_reduce_max(resid_velo) + + case(HO_RESID_MEANU) ! mean relative speed difference + + count = 0 + + ! Loop over locally owned vertices + + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) + if (speed /= 0.d0) then + count = count+1 + oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + resid_velo = resid_velo + diffspeed + endif + enddo + enddo + + if (count > 0) resid_velo = resid_velo / count + + !TODO - Need to convert the mean residual to a global value. + ! (Or simply remove this case, which is rarely if ever used) + call not_parallel(__FILE__, __LINE__) + + case default ! max speed difference, including basal speeds + ! (case HO_RESID_MAXU or HO_RESID_L2NORM) + + ! 
Loop over locally owned vertices + + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) + if (speed /= 0.d0) then + oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) + diffspeed = abs((oldspeed - speed)/speed) + if (diffspeed > resid_velo) then + resid_velo = diffspeed + imaxdiff = i + jmaxdiff = j + endif + endif + enddo + enddo + + resid_velo = parallel_reduce_max(resid_velo) + + end select + + end subroutine compute_residual_velocity_2d !**************************************************************************** @@ -10068,7 +10338,7 @@ subroutine compute_residual_velocity_3d(whichresid, parallel, & if (speed /= 0.d0) then count = count+1 oldspeed = sqrt(usav(k,i,j)**2 + vsav(k,i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) + diffspeed = abs((oldspeed - speed)/speed) resid_velo = resid_velo + diffspeed endif enddo @@ -10112,129 +10382,66 @@ end subroutine compute_residual_velocity_3d !**************************************************************************** - subroutine compute_residual_velocity_2d(whichresid, parallel, & - uvel, vvel, & - usav, vsav, & - resid_velo) + subroutine count_nonzeros_2d(nx, ny, & + parallel, & + Auu, Auv, & + Avu, Avv, & + active_vertex, & + nNonzeros) + + !---------------------------------------------------------------- + ! Input-output arguments + !---------------------------------------------------------------- integer, intent(in) :: & - whichresid ! option for method to use when calculating residual + nx, ny ! number of grid cells in each direction type(parallel_type), intent(in) :: & parallel ! info for parallel communication - real(dp), dimension(:,:), intent(in) :: & - uvel, vvel, & ! current guess for velocity - usav, vsav ! previous guess for velocity - - real(dp), intent(out) :: & - resid_velo ! quantity related to velocity convergence - - integer :: & - imaxdiff, jmaxdiff ! location of maximum speed difference - ! 
currently computed but not used - - integer :: i, j, count - - real(dp) :: & - speed, & ! current guess for ice speed - oldspeed, & ! previous guess for ice speed - diffspeed ! abs(speed-oldspeed) - - integer :: & - staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid - staggered_jlo, staggered_jhi - - staggered_ilo = parallel%staggered_ilo - staggered_ihi = parallel%staggered_ihi - staggered_jlo = parallel%staggered_jlo - staggered_jhi = parallel%staggered_jhi - - ! Compute a residual quantity based on convergence of the velocity field. - - ! options for residual calculation method, as specified in configuration file - ! case(0): use max of abs( vel_old - vel ) / vel ) - ! case(1): use max of abs( vel_old - vel ) / vel ) but ignore basal vels - ! case(2): use mean of abs( vel_old - vel ) / vel ) - ! case(3): use max of abs( vel_old - vel ) / vel ) (in addition to L2 norm) - - resid_velo = 0.d0 - imaxdiff = 0 - jmaxdiff = 0 - - select case (whichresid) - - case(HO_RESID_MAXU_NO_UBAS) ! max speed difference, excluding the bed - - ! Loop over locally owned vertices - - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) - if (speed /= 0.d0) then - oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - if (diffspeed > resid_velo) then - resid_velo = diffspeed - imaxdiff = i - jmaxdiff = j - endif - endif - enddo - enddo - - ! take global max - resid_velo = parallel_reduce_max(resid_velo) - - case(HO_RESID_MEANU) ! mean relative speed difference + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & + Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts + Avu, Avv - count = 0 + logical, dimension(nx-1,ny-1), intent(in) :: & + active_vertex ! true for vertices of active cells - ! Loop over locally owned vertices + integer, intent(out) :: & + nNonzeros ! 
number of nonzero matrix elements - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) - if (speed /= 0.d0) then - count = count+1 - oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - resid_velo = resid_velo + diffspeed - endif - enddo - enddo + !---------------------------------------------------------------- + ! Local variables + !---------------------------------------------------------------- - if (count > 0) resid_velo = resid_velo / count + integer :: i, j, m - !TODO - Need to convert the mean residual to a global value. - ! (Or simply remove this case, which is rarely if ever used) - call not_parallel(__FILE__, __LINE__) + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi - case default ! max speed difference, including basal speeds - ! (case HO_RESID_MAXU or HO_RESID_L2NORM) + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi - ! Loop over locally owned vertices + nNonzeros = 0 + do m = 1, nNodeNeighbors_2d do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - speed = sqrt(uvel(i,j)**2 + vvel(i,j)**2) - if (speed /= 0.d0) then - oldspeed = sqrt(usav(i,j)**2 + vsav(i,j)**2) - diffspeed = abs((oldspeed - speed)/speed) - if (diffspeed > resid_velo) then - resid_velo = diffspeed - imaxdiff = i - jmaxdiff = j - endif - endif - enddo - enddo + if (active_vertex(i,j)) then + if (Auu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + if (Auv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + if (Avu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + if (Avv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 + endif ! active_vertex + enddo ! i + enddo ! j + enddo ! 
m - resid_velo = parallel_reduce_max(resid_velo) - - end select + nNonzeros = parallel_reduce_sum(nNonzeros) - end subroutine compute_residual_velocity_2d + end subroutine count_nonzeros_2d !**************************************************************************** @@ -10308,38 +10515,112 @@ end subroutine count_nonzeros_3d !**************************************************************************** - subroutine count_nonzeros_2d(nx, ny, & - parallel, & - Auu, Auv, & - Avu, Avv, & - active_vertex, & - nNonzeros) + subroutine check_symmetry_element_matrix(nNodesPerElement, & + Kuu, Kuv, Kvu, Kvv) - !---------------------------------------------------------------- - ! Input-output arguments - !---------------------------------------------------------------- + !------------------------------------------------------------------ + ! Check that the element stiffness matrix is symmetric. + ! This is true provided that (1) Kuu = (Kuu)^T + ! (2) Kvv = (Kvv)^T + ! (3) Kuv = (Kvu)^T + ! This subroutine works for either 2D or 3D elements. + ! A symmetry check should not be needed for production runs with a well-tested code, + ! but is included for now to help with debugging. + !------------------------------------------------------------------ + + integer, intent(in) :: nNodesPerElement ! number of nodes per element + + real(dp), dimension(nNodesPerElement, nNodesPerElement), intent(in) :: & + Kuu, Kuv, Kvu, Kvv ! component of element stiffness matrix + ! + ! Kuu | Kuv + ! _____|____ + ! Kvu | Kvv + ! | + + integer :: i, j + + ! make sure Kuu = (Kuu)^T + + do j = 1, nNodesPerElement + do i = j, nNodesPerElement + if (abs(Kuu(i,j) - Kuu(j,i)) > eps11) then + write(iulog,*) 'Kuu is not symmetric' + write(iulog,*) 'i, j, Kuu(i,j), Kuu(j,i):', i, j, Kuu(i,j), Kuu(j,i) + stop + endif + enddo + enddo + + ! 
check that Kvv = (Kvv)^T + + do j = 1, nNodesPerElement + do i = j, nNodesPerElement + if (abs(Kvv(i,j) - Kvv(j,i)) > eps11) then + write(iulog,*) 'Kvv is not symmetric' + write(iulog,*) 'i, j, Kvv(i,j), Kvv(j,i):', i, j, Kvv(i,j), Kvv(j,i) + stop + endif + enddo + enddo + + ! Check that Kuv = (Kvu)^T + + do j = 1, nNodesPerElement + do i = 1, nNodesPerElement + if (abs(Kuv(i,j) - Kvu(j,i)) > eps11) then + write(iulog,*) 'Kuv /= (Kvu)^T' + write(iulog,*) 'i, j, Kuv(i,j), Kvu(j,i):', i, j, Kuv(i,j), Kvu(j,i) + stop + endif + enddo + enddo + + end subroutine check_symmetry_element_matrix + +!**************************************************************************** + + subroutine check_symmetry_assembled_matrix_2d(nx, ny, & + parallel, & + active_vertex, & + Auu, Auv, Avu, Avv) + + !------------------------------------------------------------------ + ! Check that the assembled stiffness matrix is symmetric. + ! This is true provided that (1) Auu = (Auu)^T + ! (2) Avv = (Avv)^T + ! (3) Auv = (Avu)^T + ! The A matrices are assembled in a dense fashion to save storage + ! and preserve the i/j/k structure of the grid. + ! + ! There can be small differences from perfect symmetry due to roundoff error. + ! These differences are fixed provided they are small enough. + !------------------------------------------------------------------ integer, intent(in) :: & - nx, ny ! number of grid cells in each direction + nx, ny ! horizontal grid dimensions type(parallel_type), intent(in) :: & parallel ! info for parallel communication - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & - Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv + logical, dimension(nx-1,ny-1), intent(in) :: & + active_vertex ! T for columns (i,j) where velocity is computed, else F - logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! 
true for vertices of active cells + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & + Auu, Auv, Avu, Avv ! components of assembled stiffness matrix + ! + ! Auu | Auv + ! _____|____ + ! | + ! Avu | Avv - integer, intent(out) :: & - nNonzeros ! number of nonzero matrix elements + integer :: i, j, iA, jA, m, mm, iglobal, jglobal - !---------------------------------------------------------------- - ! Local variables - !---------------------------------------------------------------- + real(dp) :: val1, val2 ! values of matrix coefficients - integer :: i, j, m + real(dp) :: maxdiff, global_maxdiff, diag_entry, avg_val + + integer :: rmax, imax, jmax, mmax integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -10350,89 +10631,158 @@ subroutine count_nonzeros_2d(nx, ny, & staggered_jlo = parallel%staggered_jlo staggered_jhi = parallel%staggered_jhi - nNonzeros = 0 + ! Check matrix for symmetry + + ! Here we correct for small differences from symmetry due to roundoff error. + ! The maximum departure from symmetry is set to be a small fraction + ! of the diagonal entry for the row. + ! If the departure from symmetry is larger than this, then the model prints a warning + ! and/or aborts. + + maxdiff = 0.d0 + rmax = 0; imax = 0; jmax = 0; mmax = 0 + + ! Loop over locally owned vertices. + ! Each active vertex is associate with 2*nz matrix rows belonging to this processor. + + do jA = -1, 1 + do iA = -1, 1 + m = indxA_2d( iA, jA) + mm = indxA_2d(-iA,-jA) - do m = 1, nNodeNeighbors_2d do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi if (active_vertex(i,j)) then - if (Auu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - if (Auv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - if (Avu(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - if (Avv(i,j,m) /= 0.d0) nNonzeros = nNonzeros + 1 - endif ! active_vertex - enddo ! i - enddo ! j - enddo ! 
m - - nNonzeros = parallel_reduce_sum(nNonzeros) - end subroutine count_nonzeros_2d + ! Check Auu and Auv for symmetry + diag_entry = Auu(i,j,indxA_2d(0,0)) -!**************************************************************************** + !WHL - debug + if (diag_entry /= diag_entry) then + write(iulog,*) 'WARNING: Diagonal NaN: i, j =', i, j + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif - subroutine check_symmetry_element_matrix(nNodesPerElement, & - Kuu, Kuv, Kvu, Kvv) + ! Check that Auu = Auu^T + val1 = Auu(i, j, m ) ! value of Auu(row,col) + val2 = Auu(i+iA, j+jA, mm) ! value of Auu(col,row) + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + !WHL - Here and below, I commented out the code to average asymmetric values. + ! The hope is that the asymmetries are too small to matter. + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Auu(i, j, m ) = avg_val +! Auu(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Auu is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Auu(row,col), Auu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif + endif ! val2 /= val1 - !------------------------------------------------------------------ - ! Check that the element stiffness matrix is symmetric. - ! This is true provided that (1) Kuu = (Kuu)^T - ! (2) Kvv = (Kvv)^T - ! (3) Kuv = (Kvu)^T - ! This subroutine works for either 2D or 3D elements. - ! A symmetry check should not be needed for production runs with a well-tested code, - ! 
but is included for now to help with debugging. - !------------------------------------------------------------------ + ! Check that Auv = (Avu)^T + val1 = Auv(i, j, m ) ! value of Auv(row,col) + val2 = Avu(i+iA, j+jA, mm) ! value of Avu(col,row) + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Auv(i, j, m ) = avg_val +! Avu(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Auv is not equal to (Avu)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Auv(row,col), Avu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif + endif ! val2 /= val1 - integer, intent(in) :: nNodesPerElement ! number of nodes per element + ! Now check Avu and Avv + diag_entry = Avv(i,j,indxA_2d(0,0)) - real(dp), dimension(nNodesPerElement, nNodesPerElement), intent(in) :: & - Kuu, Kuv, Kvu, Kvv ! component of element stiffness matrix - ! - ! Kuu | Kuv - ! _____|____ - ! Kvu | Kvv - ! | + ! check that Avv = (Avv)^T + val1 = Avv(i, j, m ) ! value of Avv(row,col) + val2 = Avv(i+iA, j+jA, mm) ! value of Avv(col,row) - integer :: i, j + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Avv(i, j, m ) = avg_val +! 
Avv(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Avv is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Avv(row,col), Avv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif - ! make sure Kuu = (Kuu)^T + endif ! val2 /= val1 - do j = 1, nNodesPerElement - do i = j, nNodesPerElement - if (abs(Kuu(i,j) - Kuu(j,i)) > eps10) then - write(iulog,*) 'Kuu is not symmetric' - write(iulog,*) 'i, j, Kuu(i,j), Kuu(j,i):', i, j, Kuu(i,j), Kuu(j,i) - stop - endif - enddo - enddo + ! Check that Avu = (Auv)^T + val1 = Avu(i, j, m ) ! value of Avu(row,col) + val2 = Auv(i+iA, j+jA, mm) ! value of Auv(col,row) - ! check that Kvv = (Kvv)^T + if (val2 /= val1) then + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; mmax = m + endif + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then +! avg_val = 0.5d0 * (val1 + val2) +! Avu(i, j, m ) = avg_val +! Auv(i+iA, j+jA, mm) = avg_val + else + write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA + write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif - do j = 1, nNodesPerElement - do i = j, nNodesPerElement - if (abs(Kvv(i,j) - Kvv(j,i)) > eps10) then - write(iulog,*) 'Kvv is not symmetric' - write(iulog,*) 'i, j, Kvv(i,j), Kvv(j,i):', i, j, Kvv(i,j), Kvv(j,i) - stop - endif - enddo - enddo + endif ! val2 /= val1 - ! Check that Kuv = (Kvu)^T + endif ! active_vertex + enddo ! i + enddo ! j + enddo ! iA + enddo ! 
jA - do j = 1, nNodesPerElement - do i = 1, nNodesPerElement - if (abs(Kuv(i,j) - Kvu(j,i)) > eps10) then - write(iulog,*) 'Kuv /= (Kvu)^T' - write(iulog,*) 'i, j, Kuv(i,j), Kvu(j,i):', i, j, Kuv(i,j), Kvu(j,i) - stop - endif - enddo - enddo + if (verbose_matrix) then + global_maxdiff = parallel_reduce_max(maxdiff) + if (global_maxdiff > 0.0d0 .and. maxdiff == global_maxdiff) then + ! maxdiff is on this processor; compute and broadcast the global index + call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) + write(iulog,*) 'Max asymmetry =', global_maxdiff + write(iulog,*) ' ig, jg, m =', iglobal, jglobal, mmax + endif + endif - end subroutine check_symmetry_element_matrix + end subroutine check_symmetry_assembled_matrix_2d !**************************************************************************** @@ -10476,12 +10826,14 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & real(dp) :: val1, val2 ! values of matrix coefficients - real(dp) :: maxdiff, diag_entry, avg_val + real(dp) :: maxdiff, global_maxdiff, diag_entry, avg_val integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi + integer :: rmax, imax, jmax, kmax, mmax + staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi staggered_jlo = parallel%staggered_jlo @@ -10496,6 +10848,7 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & ! and/or aborts. maxdiff = 0.d0 + rmax = 0; imax = 0; jmax = 0; kmax = 0; mmax = 0 ! Loop over locally owned vertices. ! Each active vertex is associate with 2*nz matrix rows belonging to this processor. @@ -10534,12 +10887,15 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif ! 
if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Auu( m, k, i, j ) = avg_val Auu(mm, k+kA,i+iA,j+jA) = avg_val @@ -10561,12 +10917,15 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Auv( m, k, i, j ) = avg_val Avu(mm, k+kA,i+iA,j+jA) = avg_val @@ -10616,12 +10975,15 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif ! if difference is small, then fix the asymmetry by averaging values ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then avg_val = 0.5d0 * (val1 + val2) Avv( m, k, i, j ) = avg_val Avv(mm, k+kA,i+iA,j+jA) = avg_val @@ -10641,278 +11003,93 @@ subroutine check_symmetry_assembled_matrix_3d(nx, ny, nz, & val1 = Avu( m, k, i, j) ! value of Avu(row,col) val2 = Auv(mm, k+kA, i+iA, j+jA) ! value of Auv(col,row) - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + if (abs(val2 - val1) > maxdiff) then + maxdiff = abs(val2 - val1) + rmax = this_rank; imax = i; jmax = j; kmax = k; mmax = m + endif if (val2 /= val1) then - ! if difference is small, then fix the asymmetry by averaging values - ! 
else print a warning and abort - - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Avu( m, k, i, j ) = avg_val - Auv(mm, k+kA,i+iA,j+jA) = avg_val - else - write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, k, iA, jA, kA =', & - this_rank, i, j, k, iA, jA, kA - write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - - endif ! val2 /= val1 - - endif ! k+kA in bounds - - enddo ! kA - enddo ! iA - enddo ! jA - - enddo ! k - endif ! active_vertex - enddo ! i - enddo ! j - - if (verbose_matrix) maxdiff = parallel_reduce_max(maxdiff) - - if (verbose_matrix .and. main_task) then - write(iulog,*) ' ' - write(iulog,*) 'Max difference from symmetry =', maxdiff - endif - - end subroutine check_symmetry_assembled_matrix_3d - -!**************************************************************************** - - subroutine check_symmetry_assembled_matrix_2d(nx, ny, & - parallel, & - active_vertex, & - Auu, Auv, Avu, Avv) - - !------------------------------------------------------------------ - ! Check that the assembled stiffness matrix is symmetric. - ! This is true provided that (1) Auu = (Auu)^T - ! (2) Avv = (Avv)^T - ! (3) Auv = (Avu)^T - ! The A matrices are assembled in a dense fashion to save storage - ! and preserve the i/j/k structure of the grid. - ! - ! There can be small differences from perfect symmetry due to roundoff error. - ! These differences are fixed provided they are small enough. - !------------------------------------------------------------------ - - integer, intent(in) :: & - nx, ny ! horizontal grid dimensions - - type(parallel_type), intent(in) :: & - parallel ! info for parallel communication - - logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! 
T for columns (i,j) where velocity is computed, else F - - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(inout) :: & - Auu, Auv, Avu, Avv ! components of assembled stiffness matrix - ! - ! Auu | Auv - ! _____|____ - ! | - ! Avu | Avv - - integer :: i, j, iA, jA, m, mm, iglobal, jglobal - - real(dp) :: val1, val2 ! values of matrix coefficients - - real(dp) :: maxdiff, diag_entry, avg_val - - integer :: & - staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid - staggered_jlo, staggered_jhi - - staggered_ilo = parallel%staggered_ilo - staggered_ihi = parallel%staggered_ihi - staggered_jlo = parallel%staggered_jlo - staggered_jhi = parallel%staggered_jhi - - ! Check matrix for symmetry - - ! Here we correct for small differences from symmetry due to roundoff error. - ! The maximum departure from symmetry is set to be a small fraction - ! of the diagonal entry for the row. - ! If the departure from symmetry is larger than this, then the model prints a warning - ! and/or aborts. - - maxdiff = 0.d0 - - ! Loop over locally owned vertices. - ! Each active vertex is associate with 2*nz matrix rows belonging to this processor. - - do jA = -1, 1 - do iA = -1, 1 - m = indxA_2d( iA, jA) - mm = indxA_2d(-iA,-jA) - - do j = staggered_jlo, staggered_jhi - do i = staggered_ilo, staggered_ihi - if (active_vertex(i,j)) then - - ! Check Auu and Auv for symmetry - diag_entry = Auu(i,j,indxA_2d(0,0)) - - !WHL - debug - if (diag_entry /= diag_entry) then - write(iulog,*) 'WARNING: Diagonal NaN: i, j =', i, j - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - - ! Check that Auu = Auu^T - val1 = Auu(i, j, m ) ! value of Auu(row,col) - val2 = Auu(i+iA, j+jA, mm) ! value of Auu(col,row) - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) - ! if difference is small, then fix the asymmetry by averaging values - ! 
else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Auu(i, j, m ) = avg_val - Auu(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Auu is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Auu(row,col), Auu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - endif ! val2 /= val1 - - ! Check that Auv = (Avu)^T - val1 = Auv(i, j, m ) ! value of Auv(row,col) - val2 = Avu(i+iA, j+jA, mm) ! value of Avu(col,row) - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) - ! if difference is small, then fix the asymmetry by averaging values - ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Auv(i, j, m ) = avg_val - Avu(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Auv is not equal to (Avu)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Auv(row,col), Avu(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - endif ! val2 /= val1 - - ! Now check Avu and Avv - diag_entry = Avv(i,j,indxA_2d(0,0)) - - ! check that Avv = (Avv)^T - val1 = Avv(i, j, m ) ! value of Avv(row,col) - val2 = Avv(i+iA, j+jA, mm) ! value of Avv(col,row) - - if (val2 /= val1) then - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) - - ! if difference is small, then fix the asymmetry by averaging values - ! 
else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Avv(i, j, m ) = avg_val - Avv(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Avv is not symmetric: this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Avv(row,col), Avv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif - - endif ! val2 /= val1 - - ! Check that Avu = (Auv)^T - val1 = Avu(i, j, m ) ! value of Avu(row,col) - val2 = Auv(i+iA, j+jA, mm) ! value of Auv(col,row) - - if (abs(val2 - val1) > maxdiff) maxdiff = abs(val2 - val1) + ! if difference is small, then fix the asymmetry by averaging values + ! else print a warning and abort - if (val2 /= val1) then + if ( abs(val2-val1) < eps11*abs(diag_entry) ) then + avg_val = 0.5d0 * (val1 + val2) + Avu( m, k, i, j ) = avg_val + Auv(mm, k+kA,i+iA,j+jA) = avg_val + else + write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, k, iA, jA, kA =', & + this_rank, i, j, k, iA, jA, kA + write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry + call parallel_globalindex(i, j, iglobal, jglobal, parallel) + write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal +!! stop + endif - ! if difference is small, then fix the asymmetry by averaging values - ! else print a warning and abort - if ( abs(val2-val1) < eps08*abs(diag_entry) ) then - avg_val = 0.5d0 * (val1 + val2) - Avu(i, j, m ) = avg_val - Auv(i+iA, j+jA, mm) = avg_val - else - write(iulog,*) 'WARNING: Avu is not equal to (Auv)^T, this_rank, i, j, iA, jA =', this_rank, i, j, iA, jA - write(iulog,*) 'Avu(row,col), Auv(col,row), diff/diag:', val1, val2, (val2 - val1)/diag_entry - call parallel_globalindex(i, j, iglobal, jglobal, parallel) - write(iulog,*) ' iglobal, jglobal:', iglobal, jglobal -!! stop - endif + endif ! 
val2 /= val1 - endif ! val2 /= val1 + endif ! k+kA in bounds - endif ! active_vertex - enddo ! i - enddo ! j - enddo ! iA - enddo ! jA + enddo ! kA + enddo ! iA + enddo ! jA - if (verbose_matrix) maxdiff = parallel_reduce_max(maxdiff) + enddo ! k + endif ! active_vertex + enddo ! i + enddo ! j - if (verbose_matrix .and. main_task) then - write(iulog,*) ' ' - write(iulog,*) 'Max difference from symmetry =', maxdiff + if (verbose_matrix) then + global_maxdiff = parallel_reduce_max(maxdiff) + if (global_maxdiff > 0.0d0 .and. maxdiff == global_maxdiff) then + ! maxdiff is on this processor; compute and broadcast the global index + call parallel_globalindex(imax, jmax, iglobal, jglobal, parallel) + write(iulog,*) 'Max asymmetry =', global_maxdiff + write(iulog,*) ' i, j, ig, jg, k, m =', imax, jmax, iglobal, jglobal, kmax, mmax + endif endif - end subroutine check_symmetry_assembled_matrix_2d + end subroutine check_symmetry_assembled_matrix_3d !**************************************************************************** - subroutine write_matrix_elements_3d(nx, ny, nz, & - nNodesSolve, nodeID, & - iNodeIndex, jNodeIndex, & - kNodeIndex, & - Auu, Auv, & - Avu, Avv, & - bu, bv) + subroutine write_matrix_elements_2d(nx, ny, & + nVerticesSolve, vertexID, & + iVertexIndex, jVertexIndex, & + Auu, Auv, & + Avu, Avv, & + bu, bv) ! Write matrix elements to text files. ! Note: Does not work when running on more than one task. integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions - nz, & ! number of vertical levels at which velocity is computed - nNodesSolve ! number of nodes where we solve for velocity + nVerticesSolve ! number of vertices where we solve for velocity - integer, dimension(nz,nx-1,ny-1), intent(in) :: & - nodeID ! ID for each node + integer, dimension(nx-1,ny-1), intent(in) :: & + vertexID ! ID for each vertex integer, dimension(:), intent(in) :: & - iNodeIndex, jNodeIndex, kNodeIndex ! i, j and k indices of active nodes + iVertexIndex, jVertexIndex ! 
i and j indices of active vertices - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv ! 1st dimension = node and its nearest neighbors in x, y and z direction - ! other dimensions = (k,i,j) indices + Avu, Avv ! 1st dimension = vertex and its nearest neighbors in x and y direction + ! other dimensions = (i,j) indices - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts ! Local variables integer :: rowA, colA - integer :: i, j, k, m, iA, jA, kA + integer :: i, j, m, iA, jA - real(dp), dimension(nNodesSolve, nNodesSolve) :: & + real(dp), dimension(nVerticesSolve, nVerticesSolve) :: & Auu_val, Auv_val, Avu_val, Avv_val ! dense matrices - real(dp), dimension(nNodesSolve) :: nonzeros + real(dp), dimension(nVerticesSolve) :: nonzeros if (tasks > 1) then call write_log('Error: Cannot write matrix elements to files when tasks > 1', GM_FATAL) @@ -10923,35 +11100,29 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & Avu_val(:,:) = 0.d0 Avv_val(:,:) = 0.d0 - do rowA = 1, nNodesSolve - - i = iNodeIndex(rowA) - j = jNodeIndex(rowA) - k = kNodeIndex(rowA) + do rowA = 1, nVerticesSolve - do kA = -1, 1 + i = iVertexIndex(rowA) + j = jVertexIndex(rowA) do jA = -1, 1 do iA = -1, 1 - if ( (k+kA >= 1 .and. k+kA <= nz) & - .and. & - (i+iA >= 1 .and. i+iA <= nx-1) & + if ( (i+iA >= 1 .and. i+iA <= nx-1) & .and. & (j+jA >= 1 .and. j+jA <= ny-1) ) then - colA = nodeID(k+kA, i+iA, j+jA) ! ID for neighboring node - m = indxA_3d(iA,jA,kA) + colA = vertexID(i+iA, j+jA) ! 
ID for neighboring vertex + m = indxA_2d(iA,jA) if (colA > 0) then - Auu_val(rowA, colA) = Auu(m,k,i,j) - Auv_val(rowA, colA) = Auv(m,k,i,j) - Avu_val(rowA, colA) = Avu(m,k,i,j) - Avv_val(rowA, colA) = Avv(m,k,i,j) + Auu_val(rowA, colA) = Auu(i,j,m) + Auv_val(rowA, colA) = Auv(i,j,m) + Avu_val(rowA, colA) = Avu(i,j,m) + Avv_val(rowA, colA) = Avv(i,j,m) endif - endif ! i+iA, j+jA, and k+kA in bounds + endif ! i+iA and j+jA in bounds - enddo ! kA enddo ! iA enddo ! jA @@ -10960,9 +11131,9 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & !WHL - bug check write(iulog,*) ' ' write(iulog,*) 'nonzeros per row:' - do rowA = 1, nNodesSolve + do rowA = 1, nVerticesSolve nonzeros(rowA) = 0 - do colA = 1, nNodesSolve + do colA = 1, nVerticesSolve if (abs(Auu_val(rowA,colA)) > 1.d-11) then nonzeros(rowA) = nonzeros(rowA) + 1 endif @@ -10979,12 +11150,12 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & open(unit=12, file='Avu.'//matrix_label, status='unknown') open(unit=13, file='Avv.'//matrix_label, status='unknown') - do rowA = 1, nNodesSolve + do rowA = 1, nVerticesSolve write(10,'(i6)',advance='no') rowA write(11,'(i6)',advance='no') rowA write(12,'(i6)',advance='no') rowA write(13,'(i6)',advance='no') rowA - do colA = 1, nNodesSolve + do colA = 1, nVerticesSolve write(10,'(e16.8)',advance='no') Auu_val(rowA,colA) write(11,'(e16.8)',advance='no') Auv_val(rowA,colA) write(12,'(e16.8)',advance='no') Avu_val(rowA,colA) @@ -11006,57 +11177,58 @@ subroutine write_matrix_elements_3d(nx, ny, nz, & ! 
write load vectors to file open(unit=14, file='bu.'//matrix_label, status='unknown') open(unit=15, file='bv.'//matrix_label, status='unknown') - do rowA = 1, nNodesSolve - i = iNodeIndex(rowA) - j = jNodeIndex(rowA) - k = kNodeIndex(rowA) - write(14,'(i6, e16.8)') rowA, bu(k,i,j) - write(15,'(i6, e16.8)') rowA, bv(k,i,j) + do rowA = 1, nVerticesSolve + i = iVertexIndex(rowA) + j = jVertexIndex(rowA) + write(14,'(i6, e16.8)') rowA, bu(i,j) + write(15,'(i6, e16.8)') rowA, bv(i,j) enddo close(14) close(15) - end subroutine write_matrix_elements_3d - + end subroutine write_matrix_elements_2d + !**************************************************************************** - subroutine write_matrix_elements_2d(nx, ny, & - nVerticesSolve, vertexID, & - iVertexIndex, jVertexIndex, & - Auu, Auv, & - Avu, Avv, & - bu, bv) + subroutine write_matrix_elements_3d(nx, ny, nz, & + nNodesSolve, nodeID, & + iNodeIndex, jNodeIndex, & + kNodeIndex, & + Auu, Auv, & + Avu, Avv, & + bu, bv) ! Write matrix elements to text files. ! Note: Does not work when running on more than one task. integer, intent(in) :: & nx, ny, & ! horizontal grid dimensions - nVerticesSolve ! number of vertices where we solve for velocity + nz, & ! number of vertical levels at which velocity is computed + nNodesSolve ! number of nodes where we solve for velocity - integer, dimension(nx-1,ny-1), intent(in) :: & - vertexID ! ID for each vertex + integer, dimension(nz,nx-1,ny-1), intent(in) :: & + nodeID ! ID for each node integer, dimension(:), intent(in) :: & - iVertexIndex, jVertexIndex ! i and j indices of active vertices + iNodeIndex, jNodeIndex, kNodeIndex ! i, j and k indices of active nodes - real(dp), dimension(nx-1,ny-1,nNodeNeighbors_2d), intent(in) :: & + real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & Auu, Auv, & ! assembled stiffness matrix, divided into 4 parts - Avu, Avv ! 1st dimension = vertex and its nearest neighbors in x and y direction - ! 
other dimensions = (i,j) indices + Avu, Avv ! 1st dimension = node and its nearest neighbors in x, y and z direction + ! other dimensions = (k,i,j) indices - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts ! Local variables integer :: rowA, colA - integer :: i, j, m, iA, jA + integer :: i, j, k, m, iA, jA, kA - real(dp), dimension(nVerticesSolve, nVerticesSolve) :: & + real(dp), dimension(nNodesSolve, nNodesSolve) :: & Auu_val, Auv_val, Avu_val, Avv_val ! dense matrices - real(dp), dimension(nVerticesSolve) :: nonzeros + real(dp), dimension(nNodesSolve) :: nonzeros if (tasks > 1) then call write_log('Error: Cannot write matrix elements to files when tasks > 1', GM_FATAL) @@ -11067,29 +11239,35 @@ subroutine write_matrix_elements_2d(nx, ny, & Avu_val(:,:) = 0.d0 Avv_val(:,:) = 0.d0 - do rowA = 1, nVerticesSolve + do rowA = 1, nNodesSolve - i = iVertexIndex(rowA) - j = jVertexIndex(rowA) + i = iNodeIndex(rowA) + j = jNodeIndex(rowA) + k = kNodeIndex(rowA) + + do kA = -1, 1 do jA = -1, 1 do iA = -1, 1 - if ( (i+iA >= 1 .and. i+iA <= nx-1) & + if ( (k+kA >= 1 .and. k+kA <= nz) & + .and. & + (i+iA >= 1 .and. i+iA <= nx-1) & .and. & (j+jA >= 1 .and. j+jA <= ny-1) ) then - colA = vertexID(i+iA, j+jA) ! ID for neighboring vertex - m = indxA_2d(iA,jA) + colA = nodeID(k+kA, i+iA, j+jA) ! ID for neighboring node + m = indxA_3d(iA,jA,kA) if (colA > 0) then - Auu_val(rowA, colA) = Auu(i,j,m) - Auv_val(rowA, colA) = Auv(i,j,m) - Avu_val(rowA, colA) = Avu(i,j,m) - Avv_val(rowA, colA) = Avv(i,j,m) + Auu_val(rowA, colA) = Auu(m,k,i,j) + Auv_val(rowA, colA) = Auv(m,k,i,j) + Avu_val(rowA, colA) = Avu(m,k,i,j) + Avv_val(rowA, colA) = Avv(m,k,i,j) endif - endif ! i+iA and j+jA in bounds + endif ! i+iA, j+jA, and k+kA in bounds + enddo ! kA enddo ! iA enddo ! 
jA @@ -11098,9 +11276,9 @@ subroutine write_matrix_elements_2d(nx, ny, & !WHL - bug check write(iulog,*) ' ' write(iulog,*) 'nonzeros per row:' - do rowA = 1, nVerticesSolve + do rowA = 1, nNodesSolve nonzeros(rowA) = 0 - do colA = 1, nVerticesSolve + do colA = 1, nNodesSolve if (abs(Auu_val(rowA,colA)) > 1.d-11) then nonzeros(rowA) = nonzeros(rowA) + 1 endif @@ -11117,12 +11295,12 @@ subroutine write_matrix_elements_2d(nx, ny, & open(unit=12, file='Avu.'//matrix_label, status='unknown') open(unit=13, file='Avv.'//matrix_label, status='unknown') - do rowA = 1, nVerticesSolve + do rowA = 1, nNodesSolve write(10,'(i6)',advance='no') rowA write(11,'(i6)',advance='no') rowA write(12,'(i6)',advance='no') rowA write(13,'(i6)',advance='no') rowA - do colA = 1, nVerticesSolve + do colA = 1, nNodesSolve write(10,'(e16.8)',advance='no') Auu_val(rowA,colA) write(11,'(e16.8)',advance='no') Auv_val(rowA,colA) write(12,'(e16.8)',advance='no') Avu_val(rowA,colA) @@ -11144,100 +11322,18 @@ subroutine write_matrix_elements_2d(nx, ny, & ! write load vectors to file open(unit=14, file='bu.'//matrix_label, status='unknown') open(unit=15, file='bv.'//matrix_label, status='unknown') - do rowA = 1, nVerticesSolve - i = iVertexIndex(rowA) - j = jVertexIndex(rowA) - write(14,'(i6, e16.8)') rowA, bu(i,j) - write(15,'(i6, e16.8)') rowA, bv(i,j) + do rowA = 1, nNodesSolve + i = iNodeIndex(rowA) + j = jNodeIndex(rowA) + k = kNodeIndex(rowA) + write(14,'(i6, e16.8)') rowA, bu(k,i,j) + write(15,'(i6, e16.8)') rowA, bv(k,i,j) enddo close(14) close(15) - end subroutine write_matrix_elements_2d - -!**************************************************************************** - !TODO - Either delete this subroutine, or switch the indices. Not currently used. - subroutine compress_3d_to_2d(nx, ny, nz, & - Auu, Auv, & - Avu, Avv, & - bu, bv, & - Auu_2d, Auv_2d, & - Avu_2d, Avv_2d, & - bu_2d, bv_2d) - - !---------------------------------------------------------------- - ! 
Form the 2D matrix and rhs by combining terms from the 3D matrix and rhs. - ! This combination is based on the assumption of no vertical shear; - ! i.e., uvel and vvel have the same value at each level in a given column. - !---------------------------------------------------------------- - - !---------------------------------------------------------------- - ! Input-output arguments - !---------------------------------------------------------------- - - integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions - nz ! number of vertical levels where velocity is computed - - real(dp), dimension(nNodeNeighbors_3d,nz,nx-1,ny-1), intent(in) :: & - Auu, Auv, & ! assembled 3D stiffness matrix, divided into 4 parts - Avu, Avv - - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & - bu, bv ! assembled 3D rhs vector, divided into 2 parts - - real(dp), dimension(nNodeNeighbors_2d,nx-1,ny-1), intent(out) :: & - Auu_2d, Auv_2d, &! assembled 2D (SSA) stiffness matrix, divided into 4 parts - Avu_2d, Avv_2d - - real(dp), dimension(nx-1,ny-1), intent(out) :: & - bu_2d, bv_2d ! assembled 2D (SSA) rhs vector, divided into 2 parts - - !---------------------------------------------------------------- - ! Local variables - !---------------------------------------------------------------- - - integer :: i, j, k, iA, jA, kA, m, m2 - - ! Initialize 2D matrix and rhs - - Auu_2d(:,:,:) = 0.d0 - Auv_2d(:,:,:) = 0.d0 - Avu_2d(:,:,:) = 0.d0 - Avv_2d(:,:,:) = 0.d0 - bu_2d(:,:) = 0.d0 - bv_2d(:,:) = 0.d0 - - ! Form 2D matrix and rhs - - do j = 1, ny-1 - do i = 1, nx-1 - do k = 1, nz - - ! matrix - do kA = -1,1 - do jA = -1,1 - do iA = -1,1 - m = indxA_3d(iA,jA,kA) - m2 = indxA_2d(iA,jA) - Auu_2d(m2,i,j) = Auu_2d(m2,i,j) + Auu(m,k,i,j) - Auv_2d(m2,i,j) = Auv_2d(m2,i,j) + Auv(m,k,i,j) - Avu_2d(m2,i,j) = Avu_2d(m2,i,j) + Avu(m,k,i,j) - Avv_2d(m2,i,j) = Avv_2d(m2,i,j) + Avv(m,k,i,j) - enddo ! iA - enddo ! jA - enddo ! kA - - ! 
rhs - bu_2d(i,j) = bu_2d(i,j) + bu(k,i,j) - bv_2d(i,j) = bv_2d(i,j) + bv(k,i,j) - - enddo ! k - enddo ! i - enddo ! j - - end subroutine compress_3d_to_2d - + end subroutine write_matrix_elements_3d + !**************************************************************************** end module glissade_velo_higher diff --git a/libglissade/glissade_velo_higher_pcg.F90 b/libglissade/glissade_velo_higher_pcg.F90 index 23c9ceaf..57d1cebe 100644 --- a/libglissade/glissade_velo_higher_pcg.F90 +++ b/libglissade/glissade_velo_higher_pcg.F90 @@ -48,28 +48,25 @@ module glissade_velo_higher_pcg use glimmer_log use profile, only: t_startf, t_stopf use cism_parallel, only: this_rank, main_task, & - parallel_type, staggered_parallel_halo, parallel_reduce_sum, & - parallel_global_sum_staggered + parallel_type, staggered_parallel_halo, parallel_global_sum_stagger implicit none private - public :: pcg_solver_standard_3d, pcg_solver_standard_2d, & - pcg_solver_chrongear_3d, pcg_solver_chrongear_2d, & + public :: pcg_solver_standard_2d, pcg_solver_standard_3d, & + pcg_solver_chrongear_2d, pcg_solver_chrongear_3d, & matvec_multiply_structured_3d logical, parameter :: verbose_pcg = .false. logical, parameter :: verbose_tridiag = .false. -!! logical, parameter :: verbose_pcg = .true. -!! logical, parameter :: verbose_tridiag = .true. contains !**************************************************************************** - subroutine pcg_solver_standard_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + subroutine pcg_solver_standard_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -84,50 +81,25 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! to solve the equation $Ax=b$. ! Convergence is checked every {\em linear_solve_ncheck} steps. ! - ! It is based on the barotropic solver in the POP ocean model - ! (author Phil Jones, LANL). Input and output arrays are located - ! 
on a structured (i,j,k) grid as defined in the glissade_velo_higher - ! module. The global matrix is sparse, but its nonzero elements - ! are stored in four dense matrices called Auu, Avv, Auv, and Avu. - ! Each matrix has 3x3x3 = 27 potential nonzero elements per - ! node (i,j,k). + ! It is similar to subroutine pcg_solver_standard_3d, but modified + ! to solve for x and y at a single horizontal level, as in the + ! shallow-shelf approximation. See the comments in that subroutine + ! (above) for more details on data structure and solver methods. + ! + ! Input and output arrays are located on a structured (i,j) grid + ! as defined in the glissade_velo_higher module. The global matrix + ! is sparse, but its nonzero element are stored in four dense matrices + ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential + ! nonzero elements per node (i,j). ! ! The current preconditioning options are ! (0) no preconditioning ! (1) diagonal preconditioning - ! (2) preconditioning using a physics-based SIA solver - ! - ! For the dome test case with higher-order dynamics, option (2) is best. - ! - ! Here is a schematic of the method implemented below for solving Ax = b: - ! - ! halo_update(x0) - ! r0 = b - A*x0 - ! d0 = 0 - ! eta0 = 1 ! - ! while (not converged) - ! solve Mz = r for z - ! eta1 = (r,z) - ! beta = eta1/eta0 - ! d = z + beta*d - ! halo_update(d) - ! eta0 = eta1 - ! q = Ad - ! eta2 = (d,q) - ! alpha = eta1/eta2 - ! x = x + alpha*d - ! r = r - alpha*q (or occasionally, r = b - Ax) - ! Check for convergence: err = sqrt(r,r)/sqrt(b,b) < tolerance - ! end while - ! - ! where x = solution (initial value = x0) - ! d = conjugate direction vector (initial value = d0) - ! r = residual vector (initial value = r0) - ! M = preconditioning matrix - ! (r,z) = dot product of vectors r and z - ! and similarly for (d,q) - ! + ! The SIA-based preconditioning optional is not available for a 2D solve. + ! + ! 
TODO: Add a tridiagonal preconditioning option to this subroutine, + ! as for subroutine pcg_solver_chrongear_2d. !--------------------------------------------------------------- !--------------------------------------------------------------- @@ -135,65 +107,61 @@ subroutine pcg_solver_standard_3d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) type(parallel_type), intent(in) :: & parallel ! info for parallel communication - integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & - indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 + integer, dimension(-1:1,-1:1), intent(in) :: & + indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! T for columns (i,j) where velocity is computed, else F + active_vertex ! T for vertices (i,j) where velocity is computed, else F - real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1,9), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) - ! other dimensions = (z,x,y) indices + ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) + ! 1st and 2nd dimensions = (x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! 
= 0 for no preconditioning ! = 1 for diagonal preconditioning (best option for SSA-dominated flow) - ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, k ! grid indices - integer :: iA, jA, kA ! grid offsets ranging from -1 to 1 - integer :: m ! matrix element index + integer :: i, j, ii, jj ! grid indices integer :: iter ! iteration counter real(dp) :: & @@ -202,7 +170,7 @@ subroutine pcg_solver_standard_3d(nx, ny, & beta ! eta1/eta0 = term in expression for new direction vector ! vectors (each of these is split into u and v components) - real(dp), dimension(nz,nx-1,ny-1) :: & + real(dp), dimension(nx-1,ny-1) :: & Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector @@ -215,118 +183,262 @@ subroutine pcg_solver_standard_3d(nx, ny, & L2_rhs ! L2 norm of rhs vector b ! 
solver converges when L2_resid/L2_rhs < tolerance - real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & - Muu, Mvv ! simplified SIA matrices for preconditioning + ! tridiagonal matrix elements + real(dp), dimension(:,:), allocatable :: & + Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning + Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning + + real(dp), dimension(:,:), allocatable :: & + omega_u, omega_v, & ! work arrays for tridiagonal solve + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v + + real(dp), dimension(:,:), allocatable :: & + b_u, b_v, x_u, x_v + + real(dp), dimension(:,:), allocatable :: & + gather_data_row, & ! arrays for gathering data from every task on a row or column + gather_data_col + + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + + integer :: & + tasks_row, & ! number of tasks per row and column for tridiagonal solve + tasks_col + + logical :: first_time ! true on the first subroutine call (iter = 1), false thereafter + + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi + + !TODO - Apply the following for tridiag PCs? +! integer, parameter :: & +! maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, +! ! which generally leads to faster convergence than diagonal preconditioning + + !WHL - debug + integer :: iu_max, ju_max, iv_max, jv_max + real(dp) :: ru_max, rv_max + + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi + + tasks_row = parallel%tasks_row + tasks_col = parallel%tasks_col if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (standard)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond endif + ! 
Compute array sizes for locally owned vertices + ilocal = staggered_ihi - staggered_ilo + 1 + jlocal = staggered_jhi - staggered_jlo + 1 + ! Set up matrices for preconditioning - !TODO - Add tridiagonal options call t_startf("pcg_precond_init") - if (precond == HO_PRECOND_DIAG) then - - call setup_preconditioner_diag_3d(nx, ny, & - nz, indxA_3d, & - Auu, Avv, & - Adiagu, Adiagv) + if (precond == HO_PRECOND_NONE) then ! no preconditioner - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - i = itest - j = jtest - write(iulog,*) 'i, j, r =', i, j, this_rank - write(iulog,*) 'Auu diag =', Adiagu(:,i,j) - write(iulog,*) 'Avu diag =', Adiagv(:,i,j) + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Using no preconditioner' endif - !TODO - Create a separate setup for tridiag_local - ! For this setup: Pass in Auu and Avv - ! Return Adiag/subdiag/supdiag for u and v in halo - ! Return omega and denom in halo - ! Then M*z = r can compute z in halo + elseif (precond == HO_PRECOND_DIAG) then - elseif (precond == HO_PRECOND_SIA) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Using diagonal matrix for preconditioning' + endif ! verbose_pcg - call setup_preconditioner_sia_3d(nx, ny, & - nz, indxA_3d, & - Auu, Avv, & - Muu, Mvv) + call setup_preconditioner_diag_2d(nx, ny, & + indxA_2d, & + Auu, Avv, & + Adiagu, Adiagv) - else ! no preconditioner + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Using no preconditioner' + !WHL - debug + if (verbose_tridiag .and. this_rank==rtest) then + i = itest + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'r, i, j =', this_rank, i, j + write(iulog,*) 'Auu =', Auu(i,j,:) + write(iulog,*) 'Avv =', Avv(i,j,:) endif - endif ! 
precond + allocate(Adiag_u (nx-1,ny-1)) + allocate(Asubdiag_u(nx-1,ny-1)) + allocate(Asupdiag_u(nx-1,ny-1)) + allocate(omega_u (nx-1,ny-1)) + allocate(denom_u (nx-1,ny-1)) - call t_stopf("pcg_precond_init") + allocate(Adiag_v (nx-1,ny-1)) + allocate(Asubdiag_v(nx-1,ny-1)) + allocate(Asupdiag_v(nx-1,ny-1)) + allocate(omega_v (nx-1,ny-1)) + allocate(denom_v (nx-1,ny-1)) - ! Compute initial residual and initialize the direction vector d - ! Note: The matrix A must be complete for all rows corresponding to locally - ! owned vertices, and x must have the correct values in - ! halo vertices bordering the locally owned vertices. - ! Then y = Ax will be correct for locally owned vertices. + call setup_preconditioner_tridiag_local_2d(& + nx, ny, & + parallel, indxA_2d, & + itest, jtest, rtest, & + Auu, Avv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v) - ! Halo update for x (initial guess for velocity solution) + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - call t_startf("pcg_halo_init") - call staggered_parallel_halo(xu, parallel) - call staggered_parallel_halo(xv, parallel) - call t_stopf("pcg_halo_init") + ! Allocate tridiagonal matrices + ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. - ! 
Compute A*x (use z as a temp vector for A*x) + allocate(Adiag_u (ilocal,jlocal)) + allocate(Asubdiag_u(ilocal,jlocal)) + allocate(Asupdiag_u(ilocal,jlocal)) + allocate(omega_u(ilocal,jlocal)) + allocate(denom_u(ilocal,jlocal)) + allocate(xuh_u(ilocal,jlocal)) + allocate(xlh_u(ilocal,jlocal)) + allocate(b_u(ilocal,jlocal)) + allocate(x_u(ilocal,jlocal)) - call t_startf("pcg_matmult_init") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & - Auu, Auv, & - Avu, Avv, & - xu, xv, & - zu, zv) - call t_stopf("pcg_matmult_init") + allocate(Adiag_v (jlocal,ilocal)) + allocate(Asubdiag_v(jlocal,ilocal)) + allocate(Asupdiag_v(jlocal,ilocal)) + allocate(omega_v(jlocal,ilocal)) + allocate(denom_v(jlocal,ilocal)) + allocate(xuh_v(jlocal,ilocal)) + allocate(xlh_v(jlocal,ilocal)) + allocate(b_v(jlocal,ilocal)) + allocate(x_v(jlocal,ilocal)) - ! Compute the initial residual r(0) = b - Ax(0) - ! This will be correct for locally owned vertices. + ! These two matrices are for gathering data from all tasks on a given row or column. + allocate(gather_data_row(8*tasks_row,jlocal)) + allocate(gather_data_col(8*tasks_col,ilocal)) + gather_data_row = 0.0d0 + gather_data_col = 0.0d0 - call t_startf("pcg_vecupdate_init") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) - call t_stopf("pcg_vecupdate_init") + ! Compute the entries of the tridiagonal matrices - ! Initialize scalars and vectors + ! Extract tridiagonal matrix entries from Auu + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements + Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements + Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the u solve in each matrix row + call setup_preconditioner_tridiag_global_2d(& + ilocal, jlocal, & +!! 
itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u) + + ! Extract tridiagonal matrix entries from Avv + + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements + Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements + Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the v solve in each matrix column + ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + + call setup_preconditioner_tridiag_global_2d(& + jlocal, ilocal, & +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v) + + endif ! precond + + if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' + + call t_stopf("pcg_precond_init") + + ! Compute initial residual and initialize the direction vector d + ! Note: The matrix A must be complete for all rows corresponding to locally + ! owned vertices, and x must have the correct values in + ! halo vertices bordering the locally owned vertices. + ! Then y = Ax will be correct for locally owned vertices. + + ! Halo update for x (initial guess for velocity solution) + + call t_startf("pcg_halo_init") + call staggered_parallel_halo(xu, parallel) + call staggered_parallel_halo(xv, parallel) + call t_stopf("pcg_halo_init") + + ! 
Compute A*x (use z as a temp vector for A*x) + + call t_startf("pcg_matmult_init") + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & + Auu, Auv, & + Avu, Avv, & + xu, xv, & + zu, zv) + call t_stopf("pcg_matmult_init") + + ! Compute the initial residual r(0) = b - Ax(0) + ! This will be correct for locally owned vertices. + + call t_startf("pcg_vecupdate_init") + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) + call t_stopf("pcg_vecupdate_init") + + ! Initialize scalars and vectors niters = maxiters eta0 = 1.d0 - du(:,:,:) = 0.d0 - dv(:,:,:) = 0.d0 + du(:,:) = 0.d0 + dv(:,:) = 0.d0 - zu(:,:,:) = 0.d0 - zv(:,:,:) = 0.d0 + zu(:,:) = 0.d0 + zv(:,:) = 0.d0 ! Compute the L2 norm of the RHS vectors ! (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - work0u(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) - work0v(:,:,:) = bv(:,:,:)*bv(:,:,:) + work0u(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) + work0v(:,:) = bv(:,:)*bv(:,:) call t_stopf("pcg_dotprod") ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - L2_rhs, & - work0u, work0v) + L2_rhs = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_init") ! take square root @@ -337,44 +449,173 @@ subroutine pcg_solver_standard_3d(nx, ny, & iter_loop: do iter = 1, maxiters + if (verbose_pcg .and. main_task) then +! write(iulog,*) 'iter =', iter + endif + call t_startf("pcg_precond") ! Compute PC(r) = solution z of Mz = r if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:,:) = ru(:,:,:) ! PC(r) = r - zv(:,:,:) = rv(:,:,:) ! PC(r) = r + zu(:,:) = ru(:,:) ! PC(r) = r + zv(:,:) = rv(:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG) then ! 
diagonal preconditioning do j = 1, ny-1 do i = 1, nx-1 - do k = 1, nz - if (Adiagu(k,i,j) /= 0.d0) then - zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A + if (Adiagu(i,j) /= 0.d0) then + zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(k,i,j) = 0.d0 + zu(i,j) = 0.d0 endif - if (Adiagv(k,i,j) /= 0.d0) then - zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + if (Adiagv(i,j) /= 0.d0) then + zv(i,j) = rv(i,j) / Adiagv(i,j) else - zv(k,i,j) = 0.d0 + zv(i,j) = 0.d0 endif - enddo ! k enddo ! i enddo ! j - elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning + elseif(precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Muu, ru, zu) ! solve Muu*zu = ru for zu + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest + write(iulog,*) 'Residual:' + write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) + write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Mvv, rv, zv) ! solve Mvv*zv = rv for zv + if (verbose_pcg .and. main_task) then + write(iulog,*) 'call tridiag_solver_local_2d' + endif + + ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) + !TODO - Test a local solver that can compute zu and zv in the halo + ! (to avoid the halo update below) + + call tridiag_solver_local_2d(& + nx, ny, & + parallel, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + ru, rv, & ! right hand side + zu, zv) ! solution + + !WHL - debug + if (verbose_pcg .and. 
this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + + ! Use a global tridiagonal solver to find an approximate solution of A*z = r + if (iter == 1) then + first_time = .true. + else + first_time = .false. + endif + + ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo + + ! Solve M*z = r, where M is a global tridiagonal matrix + + call tridiag_solver_global_2d(& + ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + first_time, & + gather_data_row) + + ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo + + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo + + call tridiag_solver_global_2d(& + jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column + !! 
itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + first_time, & + gather_data_col) + + ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) + endif ! precond call t_stopf("pcg_precond") @@ -382,23 +623,19 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute the dot product eta1 = (r, PC(r)) call t_startf("pcg_dotprod") - work0u(:,:,:) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r, PC(r)) - work0v(:,:,:) = rv(:,:,:)*zv(:,:,:) + work0u(:,:) = ru(:,:)*zu(:,:) ! terms of dot product (r, PC(r)) + work0v(:,:) = rv(:,:)*zv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - eta1, & - work0u, work0v) + eta1 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") !WHL - If the SIA solver has failed due to singular matrices, ! then eta1 will be NaN. if (eta1 /= eta1) then ! eta1 is NaN - call write_log('PCG solver has failed, alpha = NaN', GM_FATAL) + call write_log('PCG solver has failed, eta1 = NaN', GM_FATAL) endif ! Update the conjugate direction vector d @@ -406,13 +643,13 @@ subroutine pcg_solver_standard_3d(nx, ny, & beta = eta1/eta0 call t_startf("pcg_vecupdate") - du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! 
d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - ! Initially eta0 = 1 - ! For n >=2, eta0 = old eta1 + du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:) = zv(:,:) + beta*dv(:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! where beta_(i+1) = -------------------- + ! (r_i, PC(r_i)) + ! Initially eta0 = 1 + ! For n >=2, eta0 = old eta1 call t_stopf("pcg_vecupdate") ! Halo update for d @@ -426,9 +663,9 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! This is the one matvec multiply required for each iteration call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -442,16 +679,12 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute the dot product eta2 = (d, A*d) call t_startf("pcg_dotprod") - work0u(:,:,:) = du(:,:,:) * qu(:,:,:) ! terms of dot product (d, Ad) - work0v(:,:,:) = dv(:,:,:) * qv(:,:,:) + work0u(:,:) = du(:,:) * qu(:,:) ! terms of dot product (d, Ad) + work0v(:,:) = dv(:,:) * qv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - eta2, & - work0u, work0v) + eta2 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") ! Compute alpha @@ -469,11 +702,11 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute the new solution and residual call t_startf("pcg_vecupdate") - xu(:,:,:) = xu(:,:,:) + alpha * du(:,:,:) ! new solution, x_(i+1) = x_i + alpha*d - xv(:,:,:) = xv(:,:,:) + alpha * dv(:,:,:) + xu(:,:) = xu(:,:) + alpha * du(:,:) ! new solution, x_(i+1) = x_i + alpha*d + xv(:,:) = xv(:,:) + alpha * dv(:,:) - ru(:,:,:) = ru(:,:,:) - alpha * qu(:,:,:) ! 
new residual, r_(i+1) = r_i - alpha*(Ad) - rv(:,:,:) = rv(:,:,:) - alpha * qv(:,:,:) + ru(:,:) = ru(:,:) - alpha * qu(:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) + rv(:,:) = rv(:,:) - alpha * qv(:,:) call t_stopf("pcg_vecupdate") ! Check for convergence every linear_solve_ncheck iterations. @@ -483,7 +716,6 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! For convergence check, use r = b - Ax if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then -!! if (mod(iter, linear_solve_ncheck) == 0 .or. iter == linear_solve_ncheck/2) then ! Halo update for x @@ -495,9 +727,9 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -507,23 +739,19 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! Compute residual r = b - Ax call t_startf("pcg_vecupdate") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") ! Compute squared L2 norm of (r, r) call t_startf("pcg_dotprod") - work0u(:,:,:) = ru(:,:,:)*ru(:,:,:) ! terms of dot product (r, r) - work0v(:,:,:) = rv(:,:,:)*rv(:,:,:) + work0u(:,:) = ru(:,:)*ru(:,:) ! terms of dot product (r, r) + work0v(:,:) = rv(:,:)*rv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - L2_resid, & - work0u, work0v) + L2_resid = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_resid") ! take square root @@ -532,36 +760,74 @@ subroutine pcg_solver_standard_3d(nx, ny, & ! compute normalized error err = L2_resid/L2_rhs - if (verbose_pcg .and. main_task) then -! write(iulog,*) ' ' -! 
write(iulog,*) 'iter, L2_resid, error =', iter, L2_resid, err + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + ru_max = 0.d0 + rv_max = 0.d0 + iu_max = 0 + ju_max = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (abs(ru(i,j)) > ru_max) then + ru_max = ru(i,j) + iu_max = i + ju_max = j + endif + if (abs(rv(i,j)) > rv_max) then + rv_max = rv(i,j) + iv_max = i + jv_max = j + endif + enddo + enddo + write(iulog,*) 'r, i, j, ru_max:', this_rank, iu_max, ju_max, ru_max + write(iulog,*) 'r, i, j, rv_max:', this_rank, iv_max, jv_max, rv_max endif + ! If converged, then exit the loop. + ! Note: Without good preconditioning, convergence can be slow, + ! but the solution after maxiters might be good enough. + if (err < tolerance) then niters = iter + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver has converged, iter =', niters + write(iulog,*) ' ' + endif exit iter_loop - endif + elseif (iter == maxiters) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance + write(iulog,*) ' ' + endif + endif endif ! linear_solve_ncheck enddo iter_loop -!WHL - Without good preconditioning, convergence can be slow, but the solution after maxiters might be good enough. - - if (niters == maxiters) then - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Glissade PCG solver not converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance - endif - endif
Clean up + if (allocated(Adiag_u)) deallocate(Adiag_u, Adiag_v) + if (allocated(Asubdiag_u)) deallocate(Asubdiag_u, Asubdiag_v) + if (allocated(Asupdiag_u)) deallocate(Asupdiag_u, Asupdiag_v) + if (allocated(omega_u)) deallocate(omega_u, omega_v) + if (allocated(denom_u)) deallocate(denom_u, denom_v) + if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) + if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(b_u)) deallocate(b_u, b_v) + if (allocated(x_u)) deallocate(x_u, x_v) + if (allocated(gather_data_row)) deallocate(gather_data_row) + if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_standard_3d + end subroutine pcg_solver_standard_2d !**************************************************************************** - subroutine pcg_solver_standard_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + subroutine pcg_solver_standard_3d(nx, ny, & + nz, parallel, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -576,25 +842,50 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! to solve the equation $Ax=b$. ! Convergence is checked every {\em linear_solve_ncheck} steps. ! - ! It is similar to subroutine pcg_solver_standard_3d, but modified - ! to solve for x and y at a single horizontal level, as in the - ! shallow-shelf approximation. See the comments in that subroutine - ! (above) for more details on data structure and solver methods. - ! - ! Input and output arrays are located on a structured (i,j) grid - ! as defined in the glissade_velo_higher module. The global matrix - ! is sparse, but its nonzero element are stored in four dense matrices - ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential - ! nonzero elements per node (i,j). + ! It is based on the barotropic solver in the POP ocean model + ! (author Phil Jones, LANL). Input and output arrays are located + ! on a structured (i,j,k) grid as defined in the glissade_velo_higher + ! module. 
The global matrix is sparse, but its nonzero elements + ! are stored in four dense matrices called Auu, Avv, Auv, and Avu. + ! Each matrix has 3x3x3 = 27 potential nonzero elements per + ! node (i,j,k). ! ! The current preconditioning options are ! (0) no preconditioning ! (1) diagonal preconditioning - ! - ! The SIA-based preconditioning optional is not available for a 2D solve. + ! (2) preconditioning using a physics-based SIA solver ! - ! TODO: Add a tridiagonal preconditioning option to this subroutine, - ! as for subroutine pcg_solver_chrongear_2d. + ! For the dome test case with higher-order dynamics, option (2) is best. + ! + ! Here is a schematic of the method implemented below for solving Ax = b: + ! + ! halo_update(x0) + ! r0 = b - A*x0 + ! d0 = 0 + ! eta0 = 1 + ! + ! while (not converged) + ! solve Mz = r for z + ! eta1 = (r,z) + ! beta = eta1/eta0 + ! d = z + beta*d + ! halo_update(d) + ! eta0 = eta1 + ! q = Ad + ! eta2 = (d,q) + ! alpha = eta1/eta2 + ! x = x + alpha*d + ! r = r - alpha*q (or occasionally, r = b - Ax) + ! Check for convergence: err = sqrt(r,r)/sqrt(b,b) < tolerance + ! end while + ! + ! where x = solution (initial value = x0) + ! d = conjugate direction vector (initial value = d0) + ! r = residual vector (initial value = r0) + ! M = preconditioning matrix + ! (r,z) = dot product of vectors r and z + ! and similarly for (d,q) + ! !--------------------------------------------------------------- !--------------------------------------------------------------- @@ -602,62 +893,70 @@ subroutine pcg_solver_standard_2d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) + nx, ny, & ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) + nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & - parallel ! 
info for parallel communication + parallel ! info for parallel communication integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 + integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & + indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 + logical, dimension(nx-1,ny-1), intent(in) :: & - active_vertex ! T for vertices (i,j) where velocity is computed, else F + active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nx-1,ny-1,9), intent(in) :: & + real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) - ! 1st and 2nd dimensions = (x,y) indices + ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) + ! other dimensions = (z,x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! = 1 for diagonal preconditioning (best option for SSA-dominated flow) + ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! 
tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j ! grid indices - integer :: iter ! iteration counter + integer :: i, j, k ! grid indices + integer :: iA, jA, kA ! grid offsets ranging from -1 to 1 + integer :: m ! matrix element index + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + integer :: iter ! iteration counter real(dp) :: & eta0, eta1, eta2, &! scalar inner product results @@ -665,7 +964,7 @@ subroutine pcg_solver_standard_2d(nx, ny, & beta ! eta1/eta0 = term in expression for new direction vector ! vectors (each of these is split into u and v components) - real(dp), dimension(nx-1,ny-1) :: & + real(dp), dimension(nz,nx-1,ny-1) :: & Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector @@ -678,86 +977,243 @@ subroutine pcg_solver_standard_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector b ! solver converges when L2_resid/L2_rhs < tolerance - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Using native PCG solver (standard)' - write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond - endif + real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & + Muu, Mvv ! simplified SIA matrices for preconditioning - ! Set up matrices for preconditioning + ! arrays for tridiagonal preconditioning + ! 
Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above + real(dp), dimension(:,:), allocatable :: & + Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning + Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning - !TODO - Add tridiagonal option + real(dp), dimension(:,:), allocatable :: & + omega_u, omega_v, & ! work arrays for tridiagonal solve + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Using diagonal matrix for preconditioning' - endif ! verbose_pcg + ! Note: These two matrices are global in the EW and NS dimensions, respectively. + ! Each holds 8 pieces of information for each task on each row or column. + ! Since only 2 of these 8 pieces of information change from one iteration to the next, + ! it is more efficient to gather the remaining information once and pass the arrays + ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d + ! and gather all the information every time the subroutine is called. + ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? - call t_startf("pcg_precond_init") - call setup_preconditioner_diag_2d(nx, ny, & - indxA_2d, & - Auu, Avv, & - Adiagu, Adiagv) - call t_stopf("pcg_precond_init") + real(dp), dimension(:,:), allocatable :: & + gather_data_row, & ! arrays for gathering data from every task on a row or column + gather_data_col - ! Compute initial residual and initialize the direction vector d - ! Note: The matrix A must be complete for all rows corresponding to locally - ! owned vertices, and x must have the correct values in - ! halo vertices bordering the locally owned vertices. - ! Then y = Ax will be correct for locally owned vertices. + integer :: & + tasks_row, & ! number of tasks per row and column for tridiagonal solve + tasks_col - ! 
Halo update for x (initial guess for velocity solution) + logical :: first_time ! true on the first subroutine call (iter = 1), false thereafter - call t_startf("pcg_halo_init") - call staggered_parallel_halo(xu, parallel) - call staggered_parallel_halo(xv, parallel) - call t_stopf("pcg_halo_init") + integer :: & + staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid + staggered_jlo, staggered_jhi - ! Compute A*x (use z as a temp vector for A*x) + !WHL - debug + integer :: iu_max, ju_max, iv_max, jv_max + real(dp) :: ru_max, rv_max - call t_startf("pcg_matmult_init") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & - Auu, Auv, & - Avu, Avv, & - xu, xv, & - zu, zv) - call t_stopf("pcg_matmult_init") + staggered_ilo = parallel%staggered_ilo + staggered_ihi = parallel%staggered_ihi + staggered_jlo = parallel%staggered_jlo + staggered_jhi = parallel%staggered_jhi - ! Compute the initial residual r(0) = b - Ax(0) - ! This will be correct for locally owned vertices. + tasks_row = parallel%tasks_row + tasks_col = parallel%tasks_col - call t_startf("pcg_vecupdate_init") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) - call t_stopf("pcg_vecupdate_init") + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Using native PCG solver (standard)' + write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters, precond + endif - ! Initialize scalars and vectors + ! Compute array sizes for locally owned vertices + ilocal = staggered_ihi - staggered_ilo + 1 + jlocal = staggered_jhi - staggered_jlo + 1 - niters = maxiters - eta0 = 1.d0 + ! Set up matrices for preconditioning - du(:,:) = 0.d0 - dv(:,:) = 0.d0 + call t_startf("pcg_precond_init") - zu(:,:) = 0.d0 - zv(:,:) = 0.d0 + if (precond == HO_PRECOND_NONE) then ! no preconditioner - ! Compute the L2 norm of the RHS vectors - ! (Goal is to obtain L2_resid/L2_rhs < tolerance) + if (verbose_pcg .and. 
this_rank == rtest) then + write(iulog,*) 'Using no preconditioner' + endif - call t_startf("pcg_dotprod") - work0u(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) - work0v(:,:) = bv(:,:)*bv(:,:) - call t_stopf("pcg_dotprod") + elseif (precond == HO_PRECOND_DIAG) then - ! find global sum of the squared L2 norm + call setup_preconditioner_diag_3d(nx, ny, & + nz, indxA_3d, & + Auu, Avv, & + Adiagu, Adiagv) + + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + i = itest + j = jtest + write(iulog,*) 'i, j, r =', i, j, this_rank + write(iulog,*) 'Auu diag =', Adiagu(:,i,j) + write(iulog,*) 'Avv diag =', Adiagv(:,i,j) + endif + + elseif (precond == HO_PRECOND_SIA) then + + call setup_preconditioner_sia_3d(nx, ny, & + nz, indxA_3d, & + Auu, Avv, & + Muu, Mvv) + + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' + do i = staggered_ihi, staggered_ilo, -1 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) + enddo + enddo ! i + endif + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + ! Allocate tridiagonal preconditioning matrices + allocate(Adiag_u (nx-1,ny-1)) + allocate(Asubdiag_u(nx-1,ny-1)) + allocate(Asupdiag_u(nx-1,ny-1)) + allocate(omega_u (nx-1,ny-1)) + allocate(denom_u (nx-1,ny-1)) + + allocate(Adiag_v (nx-1,ny-1)) + allocate(Asubdiag_v(nx-1,ny-1)) + allocate(Asupdiag_v(nx-1,ny-1)) + allocate(omega_v (nx-1,ny-1)) + allocate(denom_v (nx-1,ny-1)) + + ! Compute arrays for tridiagonal preconditioning + + call setup_preconditioner_tridiag_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v) + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + + ! Allocate tridiagonal preconditioning matrices + ! 
Note: (i,j) indices are switched for the A_v matrices to reduce striding. + allocate(Adiag_u (ilocal,jlocal)) + allocate(Asubdiag_u(ilocal,jlocal)) + allocate(Asupdiag_u(ilocal,jlocal)) + allocate(omega_u(ilocal,jlocal)) + allocate(denom_u(ilocal,jlocal)) + allocate(xuh_u(ilocal,jlocal)) + allocate(xlh_u(ilocal,jlocal)) + + allocate(Adiag_v (jlocal,ilocal)) + allocate(Asubdiag_v(jlocal,ilocal)) + allocate(Asupdiag_v(jlocal,ilocal)) + allocate(omega_v(jlocal,ilocal)) + allocate(denom_v(jlocal,ilocal)) + allocate(xuh_v(jlocal,ilocal)) + allocate(xlh_v(jlocal,ilocal)) + + ! These two matrices are for gathering data from all tasks on a given row or column. + allocate(gather_data_row(8*tasks_row,jlocal)) + allocate(gather_data_col(8*tasks_col,ilocal)) + gather_data_row = 0.0d0 + gather_data_col = 0.0d0 + + call setup_preconditioner_tridiag_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + ilocal, jlocal, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v) + + endif ! precond + + call t_stopf("pcg_precond_init") + + ! Compute initial residual and initialize the direction vector d + ! Note: The matrix A must be complete for all rows corresponding to locally + ! owned vertices, and x must have the correct values in + ! halo vertices bordering the locally owned vertices. + ! Then y = Ax will be correct for locally owned vertices. + + ! Halo update for x (initial guess for velocity solution) + + call t_startf("pcg_halo_init") + call staggered_parallel_halo(xu, parallel) + call staggered_parallel_halo(xv, parallel) + call t_stopf("pcg_halo_init") + + ! 
Compute A*x (use z as a temp vector for A*x) + + call t_startf("pcg_matmult_init") + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & + Auu, Auv, & + Avu, Avv, & + xu, xv, & + zu, zv) + call t_stopf("pcg_matmult_init") + + ! Compute the initial residual r(0) = b - Ax(0) + ! This will be correct for locally owned vertices. + + call t_startf("pcg_vecupdate_init") + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + call t_stopf("pcg_vecupdate_init") + + ! Initialize scalars and vectors + + niters = maxiters + eta0 = 1.d0 + + du(:,:,:) = 0.d0 + dv(:,:,:) = 0.d0 + + zu(:,:,:) = 0.d0 + zv(:,:,:) = 0.d0 + + ! Compute the L2 norm of the RHS vectors + ! (Goal is to obtain L2_resid/L2_rhs < tolerance) + + call t_startf("pcg_dotprod") + work0u(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) + work0v(:,:,:) = bv(:,:,:)*bv(:,:,:) + call t_stopf("pcg_dotprod") + + ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - L2_rhs, & - work0u, work0v) + L2_rhs = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_init") ! take square root @@ -768,36 +1224,92 @@ subroutine pcg_solver_standard_2d(nx, ny, & iter_loop: do iter = 1, maxiters - if (verbose_pcg .and. main_task) then - write(iulog,*) 'iter =', iter - endif - call t_startf("pcg_precond") ! Compute PC(r) = solution z of Mz = r if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:) = ru(:,:) ! PC(r) = r - zv(:,:) = rv(:,:) ! PC(r) = r + zu(:,:,:) = ru(:,:,:) ! PC(r) = r + zv(:,:,:) = rv(:,:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning do j = 1, ny-1 do i = 1, nx-1 - if (Adiagu(i,j) /= 0.d0) then - zu(i,j) = ru(i,j) / Adiagu(i,j) ! 
PC(r), where PC is formed from diagonal elements of A + do k = 1, nz + if (Adiagu(k,i,j) /= 0.d0) then + zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(i,j) = 0.d0 + zu(k,i,j) = 0.d0 endif - if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) + if (Adiagv(k,i,j) /= 0.d0) then + zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) else - zv(i,j) = 0.d0 + zv(k,i,j) = 0.d0 endif + enddo ! k enddo ! i enddo ! j + elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning + + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Muu, ru, zu) ! solve Muu*zu = ru for zu + + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Mvv, rv, zv) ! solve Mvv*zv = rv for zv + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + ! Use a local tridiagonal solver to find an approximate solution of A*z = r + + call tridiag_solver_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + + ! Use a global tridiagonal solver to find an approximate solution of A*z = r + + if (iter == 1) then + first_time = .true. + else + first_time = .false. + endif + + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + first_time, & + ru, rv, & ! 3D residual + zu, zv) ! 
approximate solution of Az = r + endif ! precond call t_stopf("pcg_precond") @@ -805,23 +1317,19 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute the dot product eta1 = (r, PC(r)) call t_startf("pcg_dotprod") - work0u(:,:) = ru(:,:)*zu(:,:) ! terms of dot product (r, PC(r)) - work0v(:,:) = rv(:,:)*zv(:,:) + work0u(:,:,:) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r, PC(r)) + work0v(:,:,:) = rv(:,:,:)*zv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - eta1, & - work0u, work0v) + eta1 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") !WHL - If the SIA solver has failed due to singular matrices, ! then eta1 will be NaN. if (eta1 /= eta1) then ! eta1 is NaN - call write_log('PCG solver has failed, eta1 = NaN', GM_FATAL) + call write_log('PCG solver has failed, alpha = NaN', GM_FATAL) endif ! Update the conjugate direction vector d @@ -829,13 +1337,13 @@ subroutine pcg_solver_standard_2d(nx, ny, & beta = eta1/eta0 call t_startf("pcg_vecupdate") - du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:) = zv(:,:) + beta*dv(:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - ! Initially eta0 = 1 - ! For n >=2, eta0 = old eta1 + du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! where beta_(i+1) = -------------------- + ! (r_i, PC(r_i)) + ! Initially eta0 = 1 + ! For n >=2, eta0 = old eta1 call t_stopf("pcg_vecupdate") ! Halo update for d @@ -849,9 +1357,9 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! 
This is the one matvec multiply required for each iteration call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -865,16 +1373,12 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute the dot product eta2 = (d, A*d) call t_startf("pcg_dotprod") - work0u(:,:) = du(:,:) * qu(:,:) ! terms of dot product (d, Ad) - work0v(:,:) = dv(:,:) * qv(:,:) + work0u(:,:,:) = du(:,:,:) * qu(:,:,:) ! terms of dot product (d, Ad) + work0v(:,:,:) = dv(:,:,:) * qv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - eta2, & - work0u, work0v) + eta2 = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_iter") ! Compute alpha @@ -892,13 +1396,26 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute the new solution and residual call t_startf("pcg_vecupdate") - xu(:,:) = xu(:,:) + alpha * du(:,:) ! new solution, x_(i+1) = x_i + alpha*d - xv(:,:) = xv(:,:) + alpha * dv(:,:) + xu(:,:,:) = xu(:,:,:) + alpha * du(:,:,:) ! new solution, x_(i+1) = x_i + alpha*d + xv(:,:,:) = xv(:,:,:) + alpha * dv(:,:,:) - ru(:,:) = ru(:,:) - alpha * qu(:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) - rv(:,:) = rv(:,:) - alpha * qv(:,:) + ru(:,:,:) = ru(:,:,:) - alpha * qu(:,:,:) ! new residual, r_(i+1) = r_i - alpha*(Ad) + rv(:,:,:) = rv(:,:,:) - alpha * qv(:,:,:) call t_stopf("pcg_vecupdate") + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' + do i = itest-3, itest+3 + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i + endif + ! Check for convergence every linear_solve_ncheck iterations. ! 
Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -906,7 +1423,11 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! For convergence check, use r = b - Ax if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then -!! if (mod(iter, linear_solve_ncheck) == 0 .or. iter == linear_solve_ncheck/2) then + + if (verbose_pcg .and. main_task) then + write(iulog,*) ' ' + write(iulog,*) 'Check convergence, iter =', iter + endif ! Halo update for x @@ -918,9 +1439,9 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -930,23 +1451,19 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! Compute residual r = b - Ax call t_startf("pcg_vecupdate") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate") ! Compute squared L2 norm of (r, r) call t_startf("pcg_dotprod") - work0u(:,:) = ru(:,:)*ru(:,:) ! terms of dot product (r, r) - work0v(:,:) = rv(:,:)*rv(:,:) + work0u(:,:,:) = ru(:,:,:)*ru(:,:,:) ! terms of dot product (r, r) + work0v(:,:,:) = rv(:,:,:)*rv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - L2_resid, & - work0u, work0v) + L2_resid = parallel_global_sum_stagger(work0u, parallel, work0v) call t_stopf("pcg_glbsum_resid") ! take square root @@ -955,32 +1472,75 @@ subroutine pcg_solver_standard_2d(nx, ny, & ! compute normalized error err = L2_resid/L2_rhs + if (verbose_pcg .and. 
main_task) then + write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err + endif + + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + ru_max = 0.d0 + rv_max = 0.d0 + iu_max = 0 + ju_max = 0 + do j = staggered_jlo, staggered_jhi + do i = staggered_ilo, staggered_ihi + if (sum(abs(ru(:,i,j))) > ru_max) then + ru_max = sum(abs(ru(:,i,j))) + iu_max = i + ju_max = j + endif + if (sum(abs(rv(:,i,j))) > rv_max) then + rv_max = sum(abs(rv(:,i,j))) + iv_max = i + jv_max = j + endif + enddo + enddo + write(iulog,*) 'r, i, j, ru_max:', this_rank, iu_max, ju_max, ru_max + write(iulog,*) 'r, i, j, rv_max:', this_rank, iv_max, jv_max, rv_max + endif + + ! If converged, then exit the loop. + ! Note: Without good preconditioning, convergence can be slow, + ! but the solution after maxiters_chrongear might be good enough. + if (err < tolerance) then niters = iter + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver has converged, iter =', niters + write(iulog,*) ' ' + endif exit iter_loop - endif + elseif (iter == maxiters) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance + write(iulog,*) ' ' + endif + endif endif ! linear_solve_ncheck enddo iter_loop -!WHL - Without good preconditioning, convergence can be slow, but the solution after maxiters might be good enough. - - if (niters == maxiters) then - if (verbose_pcg .and. main_task) then - write(iulog,*) 'Glissade PCG solver not converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance - endif - endif + ! 
Clean up + if (allocated(Adiag_u)) deallocate(Adiag_u, Adiag_v) + if (allocated(Asubdiag_u)) deallocate(Asubdiag_u, Asubdiag_v) + if (allocated(Asupdiag_u)) deallocate(Asupdiag_u, Asupdiag_v) + if (allocated(omega_u)) deallocate(omega_u, omega_v) + if (allocated(denom_u)) deallocate(denom_u, denom_v) + if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) + if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(gather_data_row)) deallocate(gather_data_row) + if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_standard_2d + end subroutine pcg_solver_standard_3d !**************************************************************************** - subroutine pcg_solver_chrongear_3d(nx, ny, & - nz, parallel, & - indxA_2d, indxA_3d, & - active_vertex, & + subroutine pcg_solver_chrongear_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -992,104 +1552,26 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !--------------------------------------------------------------- ! This subroutine uses a Chronopoulos-Gear preconditioned conjugate-gradient - ! algorithm to solve the equation $Ax=b$. - ! - ! It is based on the Chronopoulos-Gear PCG solver in the POP ocean model - ! (author Frank Bryan, NCAR). It is a rearranged conjugate gradient solver - ! that reduces the number of global reductions per iteration from two to one - ! (not counting the convergence check). Convergence is checked every - ! {\em linear_solve_ncheck} steps. - ! - ! References are: - ! - ! Chronopoulos, A.T., A Class of Parallel Iterative Methods Implemented on Multiprocessors, - ! Ph.D. thesis, Technical Report UIUCDCS-R-86-1267, Department of Computer Science, - ! University of Illinois, Urbana, Illinois, pp. 1-116, 1986. - ! - ! Chronopoulos, A.T., and C.W. Gear. s-step iterative methods - ! for symmetric linear systems. J. Comput. Appl. Math., 25(2), - ! 153-168, 1989. - ! - ! Dongarra, J. and V. Eijkhout. LAPACK Working Note 159. 
- ! Finite-choice algorithm optimization in conjugate gradients. - ! Tech. Rep. ut-cs-03-502. Computer Science Department. - ! University of Tennessee, Knoxville. 2003. + ! algorithm to solve the equation $Ax=b$. (See references in subroutine above.) ! - ! D Azevedo, E.F., V.L. Eijkhout, and C.H. Romine. LAPACK Working - ! Note 56. Conjugate gradient algorithms with reduced - ! synchronization overhead on distributed memory multiprocessors. - ! Tech. Rep. CS-93-185. Computer Science Department. - ! University of Tennessee, Knoxville. 1993. - !--------------------------------------------------------------- + ! It is similar to subroutine pcg_solver_chrongear_3d, but modified + ! to solve for x and y at a single horizontal level, as in the + ! shallow-shelf approximation. See the comments in that subroutine + ! (above) for more details on data structure and solver methods. ! - ! The input and output arrays are located on a structured (i,j,k) grid - ! as defined in the glissade_velo_higher module. - ! The global matrix is sparse, but its nonzero elements are stored in - ! four dense matrices called Auu, Avv, Auv, and Avu. - ! Each matrix has 3x3x3 = 27 potential nonzero elements per node (i,j,k). + ! Input and output arrays are located on a structured (i,j) grid + ! as defined in the glissade_velo_higher module. The global matrix + ! is sparse, but its nonzero element are stored in four dense matrices + ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential + ! nonzero elements per node (i,j). ! - ! The current preconditioning options are + ! The current preconditioning options for the solver are ! (0) no preconditioning ! (1) diagonal preconditioning - ! (2) preconditioning using a physics-based SIA solver - ! - ! For the dome test case with higher-order dynamics, option (2) is best. - ! - ! Here is a schematic of the method implemented below for solving Ax = b: - ! - ! Set up preconditioner M - ! work0 = (b,b) - ! bb = global_sum(work0) - ! - ! 
First pass of algorithm: - ! halo_update(x) - ! r = b - A*x - ! halo_update(r) - ! solve Mz = r for z - ! work(1) = (r,z) - ! d = z - ! q = A*d - ! work(2) = (d,q) - ! halo_update(q) - ! rho_old = global_sum(work(1)) - ! sigma = global_sum(work(2)) - ! alpha = rho_old/sigma - ! x = x + alpha*d - ! r = r - alpha*q - ! - ! Iterative loop: - ! while (not converged) - ! solve Mz = r for z - ! Az = A*z - ! work(1) = (r,z) - ! work(2) = (Az,z) - ! halo_update(Az) - ! rho = global_sum(work(1)) - ! delta = global_sum(work(2)) - ! beta = rho/rho_old - ! sigma = delta - beta^2 * sigma - ! alpha = rho/sigma - ! rho_old = rho - ! d = z + beta*d - ! q = Az + beta*q - ! x = x + alpha*d - ! r = r - alpha*q - ! if (time to check convergence) then - ! r = b - A*x - ! work0 = (r,r) - ! halo_update(r) - ! rr = global_sum(work0) - ! if (sqrt(r,r)/sqrt(b,b) < tolerance) exit - ! endif - ! end while + ! (3) local tridiagonal preconditioning + ! (4) global tridiagonal preconditioning + ! The SIA-based preconditioning option is not available for a 2D solve. ! - ! where x = solution vector - ! d = conjugate direction vector - ! r = residual vector - ! M = preconditioning matrix - ! (r,z) = dot product of vectors r and z - ! and similarly for (Az,z), etc. - ! !--------------------------------------------------------------- !--------------------------------------------------------------- @@ -1097,70 +1579,63 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny, & ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) - nz ! number of vertical levels where velocity is computed + nx, ny ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) type(parallel_type), intent(in) :: & - parallel ! info for parallel communication + parallel ! info for parallel communication integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! 
maps relative (x,y) coordinates to an index between 1 and 9 - integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & - indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 - logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & - Auu, Auv, Avu, Avv ! four components of assembled matrix - ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) - ! other dimensions = (z,x,y) indices + real(dp), dimension(nx-1,ny-1,9), intent(in) :: & + Auu, Auv, & ! four components of assembled matrix + Avu, Avv ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) + ! 1st and 2nd dimensions = (x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & + real(dp), dimension(nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! = 1 for diagonal preconditioning (best option for SSA-dominated flow) - ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! 
error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j, k, m ! grid indices - integer :: ii, jj - integer :: ilocal, jlocal ! number of locally owned vertices in each direction - integer :: iter ! iteration counter - integer :: maxiters_chrongear ! max number of linear iterations before quitting + integer :: i, j, ii, jj ! grid indices + integer :: m ! matrix element index + integer :: iter ! iteration counter real(dp) :: & alpha, &! rho/sigma = term in expression for new residual and solution @@ -1174,8 +1649,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & gsum ! result of global sum for dot products ! vectors (each of these is split into u and v components) - real(dp), dimension(nz,nx-1,ny-1) :: & - Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv + real(dp), dimension(nx-1,ny-1) :: & ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector zu, zv, &! solution of Mz = r @@ -1183,22 +1657,21 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & Azu, Azv, &! result of matvec multiply A*z worku, workv ! intermediate results - real(dp), dimension(nz,nx-1,ny-1,2) :: & + real(dp), dimension(nx-1,ny-1,2) :: & work2u, work2v ! intermediate results real(dp) :: & rr, &! dot product (r,r) bb, &! dot product (b,b) L2_resid, &! L2 norm of residual = sqrt(r,r) - L2_rhs ! L2 norm of rhs vector = sqrt(b,b) - ! solver is converged when L2_resid/L2_rhs < tolerance - - real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & - Muu, Mvv ! simplified SIA matrices for preconditioning + L2_rhs ! L2 norm of rhs vector = sqrt(b,b) + ! 
solver is converged when L2_resid/L2_rhs < tolerance - ! arrays for tridiagonal preconditioning - ! Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above + ! diagonal matrix elements + real(dp), dimension(nx-1,ny-1) :: & + Adiagu, Adiagv ! diagonal terms of matrices Auu and Avv + ! tridiagonal matrix elements real(dp), dimension(:,:), allocatable :: & Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning @@ -1212,18 +1685,14 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & real(dp), dimension(:,:), allocatable :: & b_u, b_v, x_u, x_v - ! Note: These two matrices are global in the EW and NS dimensions, respectively. - ! Each holds 8 pieces of information for each task on each row or column. - ! Since only 2 of these 8 pieces of information change from one iteration to the next, - ! it is more efficient to gather the remaining information once and pass the arrays - ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d - ! and gather all the information every time the subroutine is called. - ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? - real(dp), dimension(:,:), allocatable :: & gather_data_row, & ! arrays for gathering data from every task on a row or column gather_data_col + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + + integer :: maxiters_chrongear ! max number of linear iterations before quitting + integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid staggered_jlo, staggered_jhi @@ -1232,9 +1701,17 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & tasks_row, & ! number of tasks per row and column for tridiagonal solve tasks_col + integer, parameter :: & + maxiters_tridiag = 100 ! 
max number of linear iterations for tridiagonal preconditioning, + ! which generally leads to faster convergence than diagonal preconditioning + + !WHL - debug + real(dp) :: usum, usum_global, vsum, vsum_global + !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max + real(dp) :: sum_temp staggered_ilo = parallel%staggered_ilo staggered_ihi = parallel%staggered_ihi @@ -1244,22 +1721,17 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & tasks_row = parallel%tasks_row tasks_col = parallel%tasks_col - ! Note: maxiters_tridiag commented out here, because the BP tridiagonal solver - ! tends not to converge as well as the 2D version. - ! TODO: Make maxiters a config option. - ! Set the maximum number of linear iterations. ! Typically allow up to 200 iterations with diagonal preconditioning, but only 100 ! with tridiagonal, which usually converges faster. - !TODO - Test whether maxiters_tridiag (currently = 100) is sufficient for convergence with 3D solver -!! if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. precond == HO_PRECOND_TRIDIAG_GLOBAL) then -!! maxiters_chrongear = maxiters_tridiag -!! else + if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. precond == HO_PRECOND_TRIDIAG_GLOBAL) then + maxiters_chrongear = maxiters_tridiag + else maxiters_chrongear = maxiters -!! endif + endif - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using native PCG solver (Chronopoulos-Gear)' write(iulog,*) 'tolerance, maxiters, precond =', tolerance, maxiters_chrongear, precond endif @@ -1268,26 +1740,20 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ilocal = staggered_ihi - staggered_ilo + 1 jlocal = staggered_jhi - staggered_jlo + 1 - !WHL - debug - if (verbose_pcg .and. 
this_rank == rtest) then - write(iulog,*) 'stag_ihi, stag_ilo, ilocal:', staggered_ihi, staggered_ilo, ilocal - write(iulog,*) 'stag_jhi, stag_jlo, jlocal:', staggered_jhi, staggered_jlo, jlocal - endif - !---- Set up matrices for preconditioning call t_startf("pcg_precond_init") if (precond == HO_PRECOND_NONE) then ! no preconditioner - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Using no preconditioner' endif elseif (precond == HO_PRECOND_DIAG) then - call setup_preconditioner_diag_3d(nx, ny, & - nz, indxA_3d, & + call setup_preconditioner_diag_2d(nx, ny, & + indxA_2d, & Auu, Avv, & Adiagu, Adiagv) @@ -1296,32 +1762,22 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & i = itest j = jtest write(iulog,*) 'i, j, r =', i, j, this_rank - write(iulog,*) 'Auu diag =', Adiagu(:,i,j) - write(iulog,*) 'Avu diag =', Adiagv(:,i,j) + write(iulog,*) 'Au diag =', Adiagu(i,j) + write(iulog,*) 'Av diag =', Adiagv(i,j) endif - elseif (precond == HO_PRECOND_SIA) then - - call setup_preconditioner_sia_3d(nx, ny, & - nz, indxA_3d, & - Auu, Avv, & - Muu, Mvv) + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - if (verbose_pcg .and. this_rank == rtest) then + !WHL - debug + if (verbose_tridiag .and. this_rank==rtest) then + i = itest j = jtest write(iulog,*) ' ' - write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' - do i = staggered_ihi, staggered_ilo, -1 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) - enddo - enddo ! i + write(iulog,*) 'r, i, j =', this_rank, i, j + write(iulog,*) 'Auu =', Auu(i,j,:) + write(iulog,*) 'Avv =', Avv(i,j,:) endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - - ! Allocate tridiagonal preconditioning matrices allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) allocate(Asupdiag_u(nx-1,ny-1)) @@ -1334,26 +1790,22 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & allocate(omega_v (nx-1,ny-1)) allocate(denom_v (nx-1,ny-1)) - ! 
Compute arrays for tridiagonal preconditioning - - call setup_preconditioner_tridiag_local_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - indxA_2d, indxA_3d, & - itest, jtest, rtest, & - Auu, Avv, & - Muu, Mvv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & + call setup_preconditioner_tridiag_local_2d(& + nx, ny, & + parallel, indxA_2d, & + itest, jtest, rtest, & + Auu, Avv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & denom_u, denom_v) - + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! Allocate tridiagonal preconditioning matrices - ! Note: (i,j) indices are switched for the A_v matrices to reduce striding. + ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. + allocate(Adiag_u (ilocal,jlocal)) allocate(Asubdiag_u(ilocal,jlocal)) allocate(Asupdiag_u(ilocal,jlocal)) @@ -1361,6 +1813,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & allocate(denom_u(ilocal,jlocal)) allocate(xuh_u(ilocal,jlocal)) allocate(xlh_u(ilocal,jlocal)) + allocate(b_u(ilocal,jlocal)) + allocate(x_u(ilocal,jlocal)) allocate(Adiag_v (jlocal,ilocal)) allocate(Asubdiag_v(jlocal,ilocal)) @@ -1369,6 +1823,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & allocate(denom_v(jlocal,ilocal)) allocate(xuh_v(jlocal,ilocal)) allocate(xlh_v(jlocal,ilocal)) + allocate(b_v(jlocal,ilocal)) + allocate(x_v(jlocal,ilocal)) ! These two matrices are for gathering data from all tasks on a given row or column. allocate(gather_data_row(8*tasks_row,jlocal)) @@ -1376,63 +1832,93 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & gather_data_row = 0.0d0 gather_data_col = 0.0d0 - ! Compute arrays for tridiagonal preconditioning + ! 
Compute the entries of the tridiagonal matrices - call setup_preconditioner_tridiag_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - indxA_2d, indxA_3d, & - ilocal, jlocal, & - itest, jtest, rtest, & - Auu, Avv, & - Muu, Mvv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v) + ! Extract tridiagonal matrix entries from Auu + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements + Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements + Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the u solve in each matrix row + call setup_preconditioner_tridiag_global_2d(& + ilocal, jlocal, & +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u) + + ! Extract tridiagonal matrix entries from Avv + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements + Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements + Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements + enddo + enddo + + ! compute work arrays for the v solve in each matrix column + ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + + call setup_preconditioner_tridiag_global_2d(& + jlocal, ilocal, & +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! 
itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v) endif ! precond + !WHL - debug + if (verbose_pcg .and. main_task) write(iulog,*) 'Done in PC setup' + call t_stopf("pcg_precond_init") !---- Initialize scalars and vectors niters = maxiters_chrongear - ru(:,:,:) = 0.d0 - rv(:,:,:) = 0.d0 - du(:,:,:) = 0.d0 - dv(:,:,:) = 0.d0 - zu(:,:,:) = 0.d0 - zv(:,:,:) = 0.d0 - qu(:,:,:) = 0.d0 - qv(:,:,:) = 0.d0 - Azu(:,:,:) = 0.d0 - Azv(:,:,:) = 0.d0 - worku(:,:,:) = 0.d0 - workv(:,:,:) = 0.d0 - work2u(:,:,:,:) = 0.d0 - work2v(:,:,:,:) = 0.d0 + ru(:,:) = 0.d0 + rv(:,:) = 0.d0 + du(:,:) = 0.d0 + dv(:,:) = 0.d0 + zu(:,:) = 0.d0 + zv(:,:) = 0.d0 + qu(:,:) = 0.d0 + qv(:,:) = 0.d0 + Azu(:,:) = 0.d0 + Azv(:,:) = 0.d0 + worku(:,:) = 0.d0 + workv(:,:) = 0.d0 + work2u(:,:,:) = 0.d0 + work2v(:,:,:) = 0.d0 !---- Compute the L2 norm of the RHS vectors !---- (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - worku(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) - workv(:,:,:) = bv(:,:,:)*bv(:,:,:) + worku(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) + workv(:,:) = bv(:,:)*bv(:,:) call t_stopf("pcg_dotprod") ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - bb, & - worku, workv) + bb = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_init") ! take square root @@ -1458,9 +1944,9 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_init") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -1471,8 +1957,8 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- This is correct for locally owned nodes. 
call t_startf("pcg_vecupdate") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") !---- Halo update for residual @@ -1488,103 +1974,169 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! From here on, call timers with 'iter' suffix because this can be considered the first iteration call t_startf("pcg_precond_iter") - if (precond == HO_PRECOND_NONE) then ! no preconditioning + if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:,:) = ru(:,:,:) ! PC(r) = r - zv(:,:,:) = rv(:,:,:) ! PC(r) = r + zu(:,:) = ru(:,:) ! PC(r) = r + zv(:,:) = rv(:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + ! Solve Mz = r, where M is a diagonal matrix do j = 1, ny-1 do i = 1, nx-1 - do k = 1, nz - if (Adiagu(k,i,j) /= 0.d0) then - zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A + if (Adiagu(i,j) /= 0.d0) then + zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A else - zu(k,i,j) = 0.d0 + zu(i,j) = 0.d0 endif - if (Adiagv(k,i,j) /= 0.d0) then - zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + if (Adiagv(i,j) /= 0.d0) then + zv(i,j) = rv(i,j) / Adiagv(i,j) else - zv(k,i,j) = 0.d0 + zv(i,j) = 0.d0 endif - enddo ! k enddo ! i enddo ! j - elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then +! i = itest +! write(iulog,*) ' ' +! write(iulog,*) 'zv solve with diagonal precond, this_rank, i =', this_rank, i +! write(iulog,*) 'j, active, Adiagv, rv, zv, xv:' +! do j = staggered_jhi, staggered_jlo, -1 +! write(iulog,'(i4, l4, 2f12.3, e12.3, f12.3)') j, active_vertex(i,j), Adiagv(i,j), rv(i,j), zv(i,j), xv(i,j) +! enddo + endif + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! 
local + + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest + write(iulog,*) 'Residual:' + write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) + write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif + + if (verbose_pcg .and. main_task) then + write(iulog,*) 'call tridiag_solver_local_2d' + endif - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Muu, ru, zu) ! solve Muu*zu = ru for zu + ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Mvv, rv, zv) ! solve Mvv*zv = rv for zv + !TODO - Test a local solver that can compute zu and zv in the halo + ! (to avoid the halo update below) + + call tridiag_solver_local_2d(nx, ny, & + parallel, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + ru, rv, & ! right hand side + zu, zv) ! solution !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest - write(iulog,*) 'Standard SIA PC:' write(iulog,*) ' ' - write(iulog,*) 'i, zu_sia(1), zu_sia(nz):' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(i4, 2f16.10)') i, zu(1,i,j), zu(nz,i,j) - enddo - enddo ! i - write(iulog,*) ' ' - write(iulog,*) 'i, zv_sia(1), zv_sia(nz):' - do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(i4, 2f16.10)') i, zv(1,i,j), zv(nz,i,j) - enddo - enddo ! 
i + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) - ! Use a local tridiagonal solver to find an approximate solution of A*z = r + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - call tridiag_solver_local_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - Muu, Mvv, & ! entries of SIA matrix - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then + ! Solve M*z = r, where M is a global tridiagonal matrix - ! Use a global tridiagonal solver to find an approximate solution of A*z = r + call tridiag_solver_global_2d(ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + .true., & ! first_time + gather_data_row) - call tridiag_solver_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - ilocal, jlocal, & - tasks_row, tasks_col, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! 
entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v, & - Muu, Mvv, & ! entries of SIA matrix - gather_data_row, gather_data_col, & - .true., & ! first_time = T (first iteration) - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo + + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo + + call tridiag_solver_global_2d(jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + .true., & ! first_time + gather_data_col) + + ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) endif ! 
precond @@ -1593,23 +2145,23 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute intermediate result for dot product (r,z) call t_startf("pcg_dotprod") - work2u(:,:,:,1) = ru(:,:,:) * zu(:,:,:) - work2v(:,:,:,1) = rv(:,:,:) * zv(:,:,:) + work2u(:,:,1) = ru(:,:) * zu(:,:) + work2v(:,:,1) = rv(:,:) * zv(:,:) call t_stopf("pcg_dotprod") !---- Compute the conjugate direction vector d !---- Since z is correct in halo, so is d - du(:,:,:) = zu(:,:,:) - dv(:,:,:) = zv(:,:,:) + du(:,:) = zu(:,:) + dv(:,:) = zv(:,:) !---- Compute q = A*d - !---- q is correct for locally owned nodes + !---- q is correct for locally owned nodes, provided d extends one layer into the halo call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & @@ -1619,18 +2171,14 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute intermediate result for dot product (d,q) = (d,Ad) call t_startf("pcg_dotprod") - work2u(:,:,:,2) = du(:,:,:) * qu(:,:,:) - work2v(:,:,:,2) = dv(:,:,:) * qv(:,:,:) + work2u(:,:,2) = du(:,:) * qu(:,:) + work2v(:,:,2) = dv(:,:) * qv(:,:) call t_stopf("pcg_dotprod") !---- Find global sums of (r,z) and (d,q) call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! nflds = 2 call t_stopf("pcg_glbsum_iter") !---- Halo update for q @@ -1654,150 +2202,241 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- These are correct in halo call t_startf("pcg_vecupdate") - xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) - xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) + xu(:,:) = xu(:,:) + alpha*du(:,:) + xv(:,:) = xv(:,:) + alpha*dv(:,:) - ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) ! 
q = A*d - rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) + ru(:,:) = ru(:,:) - alpha*qu(:,:) ! q = A*d + rv(:,:) = rv(:,:) - alpha*qv(:,:) call t_stopf("pcg_vecupdate") !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest - write(iulog,*) ' ' - write(iulog,*) 'alpha =', alpha - write(iulog,*) 'iter = 1: i, k, xu, xv, ru, rv:' - do i = staggered_ilo, staggered_ihi -!! do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) - enddo - enddo +!! write(iulog,*) ' ' +!! write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' + do i = itest-3, itest+3 +!! write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) + enddo ! i endif + !--------------------------------------------------------------- ! Iterate to solution !--------------------------------------------------------------- iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'iter =', iter + if (verbose_pcg .and. main_task) then +! write(iulog,*) 'iter =', iter endif - !---- Compute PC(r) = solution z of Mz = r - !---- z is correct in halo + !---- Compute PC(r) = solution z of Mz = r + !---- z is correct in halo + + call t_startf("pcg_precond_iter") + + if (precond == HO_PRECOND_NONE) then ! no preconditioning + + zu(:,:) = ru(:,:) ! PC(r) = r + zv(:,:) = rv(:,:) ! PC(r) = r + + elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + + do j = 1, ny-1 + do i = 1, nx-1 + if (Adiagu(i,j) /= 0.d0) then + zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A + else + zu(i,j) = 0.d0 + endif + if (Adiagv(i,j) /= 0.d0) then + zv(i,j) = rv(i,j) / Adiagv(i,j) + else + zv(i,j) = 0.d0 + endif + enddo ! i + enddo ! j + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! tridiagonal preconditioning with local solve + + ! 
Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) + + !TODO - Test a local solver that can compute zu and zv in the halo + ! (to avoid the halo update below) + + !WHL - debug + if (verbose_tridiag .and. this_rank == rtest) then + i = itest + j = jtest +! write(iulog,*) 'Residual:' +! write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) +! write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) +! write(iulog,*) ' ' +! write(iulog,*) 'jtest =', jtest +! write(iulog,*) 'i, ru, rv:' +! do i = staggered_ihi, staggered_ilo, -1 +! write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) +! enddo + endif + + + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + write(iulog,*) ' call tridiag_solver_local_2d' + endif + + call tridiag_solver_local_2d(nx, ny, & + parallel, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + ru, rv, & ! right hand side + zu, zv) ! solution + + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) - call t_startf("pcg_precond_iter") + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) + enddo + endif - if (precond == HO_PRECOND_NONE) then ! no preconditioning + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - zu(:,:,:) = ru(:,:,:) ! PC(r) = r - zv(:,:,:) = rv(:,:,:) ! PC(r) = r + !WHL - debug + if (verbose_tridiag .and. 
this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'jtest =', jtest + write(iulog,*) 'i, ru, rv:' + do i = itest-3, itest+3 + write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) + enddo + endif - elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning + ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) - do j = 1, ny-1 - do i = 1, nx-1 - do k = 1, nz - if (Adiagu(k,i,j) /= 0.d0) then - zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! PC(r), where PC is formed from diagonal elements of A - else - zu(k,i,j) = 0.d0 - endif - if (Adiagv(k,i,j) /= 0.d0) then - zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) - else - zv(k,i,j) = 0.d0 - endif - enddo ! k - enddo ! i - enddo ! j + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + b_u(i,j) = ru(ii,jj) + enddo + enddo - elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning + !WHL - debug + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'Before global tridiag PC u solve, r, j =', rtest, jtest + write(iulog,*) ' ' + write(iulog,*) 'i, Adiag_u, Asubdiag_u, Asupdiag_u, b_u:' + do i = itest-3, itest+3 + write(iulog,'(i4, 4e16.8)') i, Adiag_u(i,j), Asubdiag_u(i,j), Asupdiag_u(i,j), b_u(i,j) + enddo + endif - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Muu, ru, zu) ! solve Muu*zu = ru for zu + call tridiag_solver_global_2d(ilocal, jlocal, & + parallel, tasks_row, & + 'row', & ! tridiagonal solve for each row +!! itest, jtest, rtest, & + itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates + jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates + rtest, & + Adiag_u, & + Asubdiag_u, Asupdiag_u, & + omega_u, denom_u, & + xuh_u, xlh_u, & + b_u, x_u, & + .false., & ! first_time + gather_data_row) - call easy_sia_solver(nx, ny, nz, & - active_vertex, & - Mvv, rv, zv) ! solve Mvv*zv = rv for zv + ! 
convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) + zu(:,:) = 0.0d0 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + zu(ii,jj) = x_u(i,j) + enddo + enddo - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) - ! Use a local tridiagonal solver to find an approximate solution of A*z = r + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + b_v(j,i) = rv(ii,jj) + enddo + enddo - call tridiag_solver_local_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - Muu, Mvv, & ! entries of SIA matrix - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + call tridiag_solver_global_2d(jlocal, ilocal, & + parallel, tasks_col, & + 'col', & ! tridiagonal solve for each column +!! itest, jtest, rtest, & + jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates + itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates + rtest, & + Adiag_v, & + Asubdiag_v, Asupdiag_v, & + omega_v, denom_v, & + xuh_v, xlh_v, & + b_v, x_v, & + .false., & ! first_time + gather_data_col) - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) - ! Use a global tridiagonal solver to find an approximate solution of A*z = r + zv(:,:) = 0.0d0 + do i = 1, ilocal + ii = i + staggered_ilo - 1 + do j = 1, jlocal + jj = j + staggered_jlo - 1 + zv(ii,jj) = x_v(j,i) + enddo + enddo - call tridiag_solver_global_3d(& - nx, ny, & - nz, parallel, & - active_vertex, & - ilocal, jlocal, & - tasks_row, tasks_col, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! 
entries of 2D preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v, & - Muu, Mvv, & ! entries of SIA matrix - gather_data_row, gather_data_col, & - .false., & ! first_time = F (iteration 2+) - ru, rv, & ! 3D residual - zu, zv) ! approximate solution of Az = r + !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells + !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? + call staggered_parallel_halo(zu, parallel) + call staggered_parallel_halo(zv, parallel) endif ! precond - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) write(iulog,*) 'L1' - call t_stopf("pcg_precond_iter") !---- Compute Az = A*z !---- This is the one matvec multiply required per iteration - !---- Az is correct for local owned nodes and needs a halo update (below) + !---- Az is correct for locally owned nodes and needs a halo update (below) call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & zu, zv, & Azu, Azv) call t_stopf("pcg_matmult_iter") - !---- Compute intermediate results for the dot products (r,z) and (Az,z) call t_startf("pcg_dotprod") - work2u(:,:,:,1) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r,z) - work2v(:,:,:,1) = rv(:,:,:)*zv(:,:,:) + work2u(:,:,1) = ru(:,:)*zu(:,:) ! terms of dot product (r,z) + work2v(:,:,1) = rv(:,:)*zv(:,:) - work2u(:,:,:,2) = Azu(:,:,:)*zu(:,:,:) ! terms of dot product (A*z,z) - work2v(:,:,:,2) = Azv(:,:,:)*zv(:,:,:) + work2u(:,:,2) = Azu(:,:)*zu(:,:) ! terms of dot product (A*z,z) + work2v(:,:,2) = Azv(:,:)*zv(:,:) call t_stopf("pcg_dotprod") ! Take the global sums of (r,z) and (Az,z) @@ -1805,15 +2444,10 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ! 
this is the one MPI global reduction per iteration. call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) ! nflds = 2 call t_stopf("pcg_glbsum_iter") !---- Halo update for Az - !---- This is the one halo update required per iteration call t_startf("pcg_halo_iter") call staggered_parallel_halo(Azu, parallel) @@ -1832,7 +2466,7 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (alpha /= alpha) then ! alpha is NaN !! write(iulog,*) 'rho, sigma, alpha:', rho, sigma, alpha - call write_log('Chron-Gear PCG solver has failed, alpha = NaN', GM_FATAL) + call write_log('Chron_Gear PCG solver has failed, alpha = NaN', GM_FATAL) endif !---- Update d and q @@ -1840,37 +2474,25 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & call t_startf("pcg_vecupdate") - du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - qu(:,:,:) = Azu(:,:,:) + beta*qu(:,:,:) - qv(:,:,:) = Azv(:,:,:) + beta*qv(:,:,:) + du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:) = zv(:,:) + beta*dv(:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! where beta_(i+1) = -------------------- + ! (r_i, PC(r_i)) + qu(:,:) = Azu(:,:) + beta*qu(:,:) + qv(:,:) = Azv(:,:) + beta*qv(:,:) !---- Update solution and residual !---- These are correct in halo - xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) - xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) + xu(:,:) = xu(:,:) + alpha*du(:,:) + xv(:,:) = xv(:,:) + alpha*dv(:,:) - ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) - rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) + ru(:,:) = ru(:,:) - alpha*qu(:,:) + rv(:,:) = rv(:,:) - alpha*qv(:,:) call t_stopf("pcg_vecupdate") - if (verbose_pcg .and. 
this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'i, k, xu, xv, ru, rv:' - do i = itest-3, itest+3 - write(iulog,*) ' ' - do k = 1, nz - write(iulog,'(i4, 4f16.10)') i, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) - enddo - enddo ! i - endif - ! Check for convergence every linear_solve_ncheck iterations. ! Also check at iter = 5, to reduce iterations when the nonlinear solver is close to convergence. ! TODO: Check at iter = linear_solve_ncheck/2 instead of 5? This would be answer-changing. @@ -1879,17 +2501,21 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) ' ' - write(iulog,*) 'Check convergence, iter =', iter + write(iulog,*) ' check convergence, iter =', iter endif !---- Compute z = A*x (use z as a temp vector for A*x) + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(xu, parallel) +!! call staggered_parallel_halo(xv, parallel) + call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_3d(nx, ny, & - nz, parallel, & - indxA_3d, active_vertex, & + call matvec_multiply_structured_2d(nx, ny, & + parallel, & + indxA_2d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -1898,33 +2524,41 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & !---- Compute residual r = b - A*x + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(bu, parallel) +!! call staggered_parallel_halo(bv, parallel) + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(zu, parallel) +!! 
call staggered_parallel_halo(zv, parallel) + call t_startf("pcg_vecupdate") - ru(:,:,:) = bu(:,:,:) - zu(:,:,:) - rv(:,:,:) = bv(:,:,:) - zv(:,:,:) + ru(:,:) = bu(:,:) - zu(:,:) + rv(:,:) = bv(:,:) - zv(:,:) call t_stopf("pcg_vecupdate") + !WHL - debug - don't think this is needed, but try just in case +!! call staggered_parallel_halo(ru, parallel) +!! call staggered_parallel_halo(rv, parallel) + !---- Compute dot product (r, r) call t_startf("pcg_dotprod") - worku(:,:,:) = ru(:,:,:)*ru(:,:,:) - workv(:,:,:) = rv(:,:,:)*rv(:,:,:) + worku(:,:) = ru(:,:)*ru(:,:) + workv(:,:) = rv(:,:)*rv(:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - nz, parallel, & - rr, & - worku, workv) + rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") L2_resid = sqrt(rr) ! L2 norm of residual err = L2_resid/L2_rhs ! normalized error - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err endif + !WHL - debug if (verbose_pcg .and. this_rank == rtest) then ru_max = 0.d0 rv_max = 0.d0 @@ -1932,13 +2566,13 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs(sum(ru(:,i,j))) > ru_max) then - ru_max = sum(ru(:,i,j)) + if (abs(ru(i,j)) > ru_max) then + ru_max = ru(i,j) iu_max = i ju_max = j endif - if (abs(sum(rv(:,i,j))) > rv_max) then - rv_max = sum(rv(:,i,j)) + if (abs(rv(i,j)) > rv_max) then + rv_max = rv(i,j) iv_max = i jv_max = j endif @@ -1954,15 +2588,15 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (err < tolerance) then niters = iter - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. 
main_task) then write(iulog,*) 'Glissade PCG solver has converged, iter =', niters write(iulog,*) ' ' endif exit iter_loop - elseif (niters == maxiters_chrongear) then - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) 'Glissade PCG solver not converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance + elseif (iter == maxiters_chrongear) then + if (verbose_pcg .and. main_task) then + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance write(iulog,*) ' ' endif endif @@ -1986,14 +2620,19 @@ subroutine pcg_solver_chrongear_3d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) + if (allocated(b_u)) deallocate(b_u, b_v) + if (allocated(x_u)) deallocate(x_u, x_v) + if (allocated(gather_data_row)) deallocate(gather_data_row) + if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_chrongear_3d + end subroutine pcg_solver_chrongear_2d !**************************************************************************** - - subroutine pcg_solver_chrongear_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + + subroutine pcg_solver_chrongear_3d(nx, ny, & + nz, parallel, & + indxA_2d, indxA_3d, & + active_vertex, & Auu, Auv, & Avu, Avv, & bu, bv, & @@ -2005,26 +2644,104 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !--------------------------------------------------------------- ! This subroutine uses a Chronopoulos-Gear preconditioned conjugate-gradient - ! algorithm to solve the equation $Ax=b$. (See references in subroutine above.) + ! algorithm to solve the equation $Ax=b$. ! - ! It is similar to subroutine pcg_solver_chrongear_3d, but modified - ! to solve for x and y at a single horizontal level, as in the - ! shallow-shelf approximation. See the comments in that subroutine - ! 
(above) for more details on data structure and solver methods. + ! It is based on the Chronopoulos-Gear PCG solver in the POP ocean model + ! (author Frank Bryan, NCAR). It is a rearranged conjugate gradient solver + ! that reduces the number of global reductions per iteration from two to one + ! (not counting the convergence check). Convergence is checked every + ! {\em linear_solve_ncheck} steps. ! - ! Input and output arrays are located on a structured (i,j) grid - ! as defined in the glissade_velo_higher module. The global matrix - ! is sparse, but its nonzero element are stored in four dense matrices - ! called Auu, Avv, Auv, and Avu. Each matrix has 3x3 = 9 potential - ! nonzero elements per node (i,j). + ! References are: ! - ! The current preconditioning options for the solver are + ! Chronopoulos, A.T., A Class of Parallel Iterative Methods Implemented on Multiprocessors, + ! Ph.D. thesis, Technical Report UIUCDCS-R-86-1267, Department of Computer Science, + ! University of Illinois, Urbana, Illinois, pp. 1-116, 1986. + ! + ! Chronopoulos, A.T., and C.W. Gear. s-step iterative methods + ! for symmetric linear systems. J. Comput. Appl. Math., 25(2), + ! 153-168, 1989. + ! + ! Dongarra, J. and V. Eijkhout. LAPACK Working Note 159. + ! Finite-choice algorithm optimization in conjugate gradients. + ! Tech. Rep. ut-cs-03-502. Computer Science Department. + ! University of Tennessee, Knoxville. 2003. + ! + ! D Azevedo, E.F., V.L. Eijkhout, and C.H. Romine. LAPACK Working + ! Note 56. Conjugate gradient algorithms with reduced + ! synchronization overhead on distributed memory multiprocessors. + ! Tech. Rep. CS-93-185. Computer Science Department. + ! University of Tennessee, Knoxville. 1993. + !--------------------------------------------------------------- + ! + ! The input and output arrays are located on a structured (i,j,k) grid + ! as defined in the glissade_velo_higher module. + ! The global matrix is sparse, but its nonzero elements are stored in + ! 
four dense matrices called Auu, Avv, Auv, and Avu. + ! Each matrix has 3x3x3 = 27 potential nonzero elements per node (i,j,k). + ! + ! The current preconditioning options are ! (0) no preconditioning ! (1) diagonal preconditioning - ! (3) local tridiagonal preconditioning - ! (4) global tridiagonal preconditioning - ! The SIA-based preconditioning option is not available for a 2D solve. + ! (2) preconditioning using a physics-based SIA solver + ! + ! For the dome test case with higher-order dynamics, option (2) is best. ! + ! Here is a schematic of the method implemented below for solving Ax = b: + ! + ! Set up preconditioner M + ! work0 = (b,b) + ! bb = global_sum(work0) + ! + ! First pass of algorithm: + ! halo_update(x) + ! r = b - A*x + ! halo_update(r) + ! solve Mz = r for z + ! work(1) = (r,z) + ! d = z + ! q = A*d + ! work(2) = (d,q) + ! halo_update(q) + ! rho_old = global_sum(work(1)) + ! sigma = global_sum(work(2)) + ! alpha = rho_old/sigma + ! x = x + alpha*d + ! r = r - alpha*q + ! + ! Iterative loop: + ! while (not converged) + ! solve Mz = r for z + ! Az = A*z + ! work(1) = (r,z) + ! work(2) = (Az,z) + ! halo_update(Az) + ! rho = global_sum(work(1)) + ! delta = global_sum(work(2)) + ! beta = rho/rho_old + ! sigma = delta - beta^2 * sigma + ! alpha = rho/sigma + ! rho_old = rho + ! d = z + beta*d + ! q = Az + beta*q + ! x = x + alpha*d + ! r = r - alpha*q + ! if (time to check convergence) then + ! r = b - A*x + ! work0 = (r,r) + ! halo_update(r) + ! rr = global_sum(work0) + ! if (sqrt(r,r)/sqrt(b,b) < tolerance) exit + ! endif + ! end while + ! + ! where x = solution vector + ! d = conjugate direction vector + ! r = residual vector + ! M = preconditioning matrix + ! (r,z) = dot product of vectors r and z + ! and similarly for (Az,z), etc. + ! 
!--------------------------------------------------------------- !--------------------------------------------------------------- @@ -2032,8 +2749,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !--------------------------------------------------------------- integer, intent(in) :: & - nx, ny ! horizontal grid dimensions (for scalars) - ! velocity grid has dimensions (nx-1,ny-1) + nx, ny, & ! horizontal grid dimensions (for scalars) + ! velocity grid has dimensions (nx-1,ny-1) + nz ! number of vertical levels where velocity is computed type(parallel_type), intent(in) :: & parallel ! info for parallel communication @@ -2041,54 +2759,60 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & integer, dimension(-1:1,-1:1), intent(in) :: & indxA_2d ! maps relative (x,y) coordinates to an index between 1 and 9 + integer, dimension(-1:1,-1:1,-1:1), intent(in) :: & + indxA_3d ! maps relative (x,y,z) coordinates to an index between 1 and 27 + logical, dimension(nx-1,ny-1), intent(in) :: & active_vertex ! T for columns (i,j) where velocity is computed, else F - real(dp), dimension(nx-1,ny-1,9), intent(in) :: & - Auu, Auv, & ! four components of assembled matrix - Avu, Avv ! 3rd dimension = 9 (node and its nearest neighbors in x and y direction) - ! 1st and 2nd dimensions = (x,y) indices + real(dp), dimension(27,nz,nx-1,ny-1), intent(in) :: & + Auu, Auv, Avu, Avv ! four components of assembled matrix + ! 1st dimension = 27 (node and its nearest neighbors in x, y and z direction) + ! other dimensions = (z,x,y) indices ! ! Auu | Auv ! _____|____ ! Avu | Avv ! | - real(dp), dimension(nx-1,ny-1), intent(in) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(in) :: & bu, bv ! assembled load (rhs) vector, divided into 2 parts - real(dp), dimension(nx-1,ny-1), intent(inout) :: & + real(dp), dimension(nz,nx-1,ny-1), intent(inout) :: & xu, xv ! u and v components of solution (i.e., uvel and vvel) integer, intent(in) :: & precond ! = 0 for no preconditioning ! 
= 1 for diagonal preconditioning (best option for SSA-dominated flow) + ! = 2 for preconditioning with SIA solver (works well for SIA-dominated flow) integer, intent(in) :: & - linear_solve_ncheck ! number of iterations between convergence checks in the linear solver + linear_solve_ncheck ! number of iterations between convergence checks in the linear solver integer, intent(in) :: & - maxiters ! max number of linear iterations before quitting + maxiters ! max number of linear iterations before quitting real(dp), intent(in) :: & - tolerance ! tolerance for linear solver + tolerance ! tolerance for linear solver real(dp), intent(out) :: & - err ! error (L2 norm of residual) in final solution + err ! error (L2 norm of residual) in final solution integer, intent(out) :: & - niters ! iterations needed to solution + niters ! iterations needed to solution integer, intent(in) :: & - itest, jtest, rtest ! point for debugging diagnostics + itest, jtest, rtest ! point for debugging diagnostics !--------------------------------------------------------------- ! Local variables and parameters !--------------------------------------------------------------- - integer :: i, j ! grid indices - integer :: m ! matrix element index - integer :: iter ! iteration counter + integer :: i, j, k, m ! grid indices + integer :: ii, jj + integer :: ilocal, jlocal ! number of locally owned vertices in each direction + integer :: iter ! iteration counter + integer :: maxiters_chrongear ! max number of linear iterations before quitting real(dp) :: & alpha, &! rho/sigma = term in expression for new residual and solution @@ -2101,26 +2825,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & real(dp), dimension(2) :: & gsum ! result of global sum for dot products - ! diagonal matrix elements - real(dp), dimension(nx-1,ny-1) :: & - Adiagu, Adiagv ! diagonal terms of matrices Auu and Avv - - ! tridiagonal matrix elements - real(dp), dimension(:,:), allocatable :: & - Asubdiag_u, Adiag_u, Asupdiag_u, & ! 
matrix entries from Auu for tridiagonal preconditioning - Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning - - real(dp), dimension(:,:), allocatable :: & - omega_u, omega_v, & ! work arrays for tridiagonal solve - denom_u, denom_v, & - xuh_u, xuh_v, & - xlh_u, xlh_v - - real(dp), dimension(:,:), allocatable :: & - b_u, b_v, x_u, x_v - ! vectors (each of these is split into u and v components) - real(dp), dimension(nx-1,ny-1) :: & + real(dp), dimension(nz,nx-1,ny-1) :: & + Adiagu, Adiagv, &! diagonal terms of matrices Auu and Avv ru, rv, &! residual vector (b-Ax) du, dv, &! conjugate direction vector zu, zv, &! solution of Mz = r @@ -2128,7 +2835,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & Azu, Azv, &! result of matvec multiply A*z worku, workv ! intermediate results - real(dp), dimension(nx-1,ny-1,2) :: & + real(dp), dimension(nz,nx-1,ny-1,2) :: & work2u, work2v ! intermediate results real(dp) :: & @@ -2138,14 +2845,33 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & L2_rhs ! L2 norm of rhs vector = sqrt(b,b) ! solver is converged when L2_resid/L2_rhs < tolerance + real(dp), dimension(-1:1,nz,nx-1,ny-1) :: & + Muu, Mvv ! simplified SIA matrices for preconditioning + + ! arrays for tridiagonal preconditioning + ! Note: 2D diagonal entries are Adiag_u and Adiag_v; distinct from 3D Adiagu and Adiagv above + real(dp), dimension(:,:), allocatable :: & - gather_data_row, & ! arrays for gathering data from every task on a row or column - gather_data_col + Asubdiag_u, Adiag_u, Asupdiag_u, & ! matrix entries from Auu for tridiagonal preconditioning + Asubdiag_v, Adiag_v, Asupdiag_v ! matrix entries from Avv for tridiagonal preconditioning - integer :: ilocal, jlocal ! number of locally owned vertices in each direction + real(dp), dimension(:,:), allocatable :: & + omega_u, omega_v, & ! work arrays for tridiagonal solve + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v - integer :: ii, jj - integer :: maxiters_chrongear ! 
max number of linear iterations before quitting + ! Note: These two matrices are global in the EW and NS dimensions, respectively. + ! Each holds 8 pieces of information for each task on each row or column. + ! Since only 2 of these 8 pieces of information change from one iteration to the next, + ! it is more efficient to gather the remaining information once and pass the arrays + ! with intent(inout), than to declare the arrays in subroutine tridiag_solver_global_2d + ! and gather all the information every time the subroutine is called. + ! TODO: Revisit this. Is the efficiency gain large enough to justify the extra complexity? + + real(dp), dimension(:,:), allocatable :: & + gather_data_row, & ! arrays for gathering data from every task on a row or column + gather_data_col integer :: & staggered_ilo, staggered_ihi, & ! bounds of locally owned vertices on staggered grid @@ -2155,13 +2881,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & tasks_row, & ! number of tasks per row and column for tridiagonal solve tasks_col - integer, parameter :: & - maxiters_tridiag = 100 ! max number of linear iterations for tridiagonal preconditioning, - ! which generally leads to faster convergence than diagonal preconditioning - - !WHL - debug - real(dp) :: usum, usum_global, vsum, vsum_global - !WHL - debug integer :: iu_max, ju_max, iv_max, jv_max real(dp) :: ru_max, rv_max @@ -2174,15 +2893,20 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & tasks_row = parallel%tasks_row tasks_col = parallel%tasks_col + ! Note: maxiters_tridiag commented out here, because the BP tridiagonal solver + ! tends not to converge as well as the 2D version. + ! TODO: Make maxiters a config option. + ! Set the maximum number of linear iterations. ! Typically allow up to 200 iterations with diagonal preconditioning, but only 100 ! with tridiagonal, which usually converges faster. - if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. 
precond == HO_PRECOND_TRIDIAG_GLOBAL) then - maxiters_chrongear = maxiters_tridiag - else + !TODO - Test whether maxiters_tridiag (currently = 100) is sufficient for convergence with 3D solver +!! if (precond == HO_PRECOND_TRIDIAG_LOCAL .or. precond == HO_PRECOND_TRIDIAG_GLOBAL) then +!! maxiters_chrongear = maxiters_tridiag +!! else maxiters_chrongear = maxiters - endif +!! endif if (verbose_pcg .and. this_rank == rtest) then write(iulog,*) 'Using native PCG solver (Chronopoulos-Gear)' @@ -2205,8 +2929,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & elseif (precond == HO_PRECOND_DIAG) then - call setup_preconditioner_diag_2d(nx, ny, & - indxA_2d, & + call setup_preconditioner_diag_3d(nx, ny, & + nz, indxA_3d, & Auu, Avv, & Adiagu, Adiagv) @@ -2215,22 +2939,32 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & i = itest j = jtest write(iulog,*) 'i, j, r =', i, j, this_rank - write(iulog,*) 'Au diag =', Adiagu(i,j) - write(iulog,*) 'Av diag =', Adiagv(i,j) + write(iulog,*) 'Auu diag =', Adiagu(:,i,j) + write(iulog,*) 'Avv diag =', Adiagv(:,i,j) endif - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + elseif (precond == HO_PRECOND_SIA) then - !WHL - debug - if (verbose_tridiag .and. this_rank==rtest) then - i = itest - j = jtest + call setup_preconditioner_sia_3d(nx, ny, & + nz, indxA_3d, & + Auu, Avv, & + Muu, Mvv) + + if (verbose_pcg .and. this_rank == rtest) then + j = jtest + write(iulog,*) ' ' + write(iulog,*) 'i, k, Muu_sia, Mvv_sia:' + do i = staggered_ihi, staggered_ilo, -1 write(iulog,*) ' ' - write(iulog,*) 'r, i, j =', this_rank, i, j - write(iulog,*) 'Auu =', Auu(i,j,:) - write(iulog,*) 'Avv =', Avv(i,j,:) - endif + do k = 1, nz + write(iulog,'(2i4, 6e13.5)') i, k, Muu(:,k,i,j), Mvv(:,k,i,j) + enddo + enddo ! i + endif + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + ! 
Allocate tridiagonal preconditioning matrices allocate(Adiag_u (nx-1,ny-1)) allocate(Asubdiag_u(nx-1,ny-1)) allocate(Asupdiag_u(nx-1,ny-1)) @@ -2243,22 +2977,26 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(omega_v (nx-1,ny-1)) allocate(denom_v (nx-1,ny-1)) - call setup_preconditioner_tridiag_local_2d(& - nx, ny, & - parallel, indxA_2d, & - itest, jtest, rtest, & - Auu, Avv, & - Adiag_u, Adiag_v, & - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v) + ! Compute arrays for tridiagonal preconditioning + call setup_preconditioner_tridiag_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v) + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - ! Allocate tridiagonal matrices - ! Note: (i,j) indices are switced for the A_v matrices to reduce striding. - + ! Allocate tridiagonal preconditioning matrices + ! Note: (i,j) indices are switched for the A_v matrices to reduce striding. allocate(Adiag_u (ilocal,jlocal)) allocate(Asubdiag_u(ilocal,jlocal)) allocate(Asupdiag_u(ilocal,jlocal)) @@ -2266,8 +3004,6 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(denom_u(ilocal,jlocal)) allocate(xuh_u(ilocal,jlocal)) allocate(xlh_u(ilocal,jlocal)) - allocate(b_u(ilocal,jlocal)) - allocate(x_u(ilocal,jlocal)) allocate(Adiag_v (jlocal,ilocal)) allocate(Asubdiag_v(jlocal,ilocal)) @@ -2276,100 +3012,66 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & allocate(denom_v(jlocal,ilocal)) allocate(xuh_v(jlocal,ilocal)) allocate(xlh_v(jlocal,ilocal)) - allocate(b_v(jlocal,ilocal)) - allocate(x_v(jlocal,ilocal)) - - ! Compute the entries of the tridiagonal matrices - - ! 
Extract tridiagonal matrix entries from Auu - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - Asubdiag_u(i,j) = Auu(ii,jj,indxA_2d(-1,0)) ! subdiagonal elements - Adiag_u (i,j) = Auu(ii,jj,indxA_2d( 0,0)) ! diagonal elements - Asupdiag_u(i,j) = Auu(ii,jj,indxA_2d( 1,0)) ! superdiagonal elements - enddo - enddo - - ! compute work arrays for the u solve in each matrix row - call setup_preconditioner_tridiag_global_2d(& - ilocal, jlocal, & -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u) - ! Extract tridiagonal matrix entries from Avv - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - Asubdiag_v(j,i) = Avv(ii,jj,indxA_2d(0,-1)) ! subdiagonal elements - Adiag_v (j,i) = Avv(ii,jj,indxA_2d(0, 0)) ! diagonal elements - Asupdiag_v(j,i) = Avv(ii,jj,indxA_2d(0, 1)) ! superdiagonal elements - enddo - enddo + ! These two matrices are for gathering data from all tasks on a given row or column. + allocate(gather_data_row(8*tasks_row,jlocal)) + allocate(gather_data_col(8*tasks_col,ilocal)) + gather_data_row = 0.0d0 + gather_data_col = 0.0d0 - ! compute work arrays for the v solve in each matrix column - ! Note: The *_v arrays have dimensions (jlocal,ilocal) to reduce strides + ! Compute arrays for tridiagonal preconditioning - call setup_preconditioner_tridiag_global_2d(& - jlocal, ilocal, & -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! 
itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v) + call setup_preconditioner_tridiag_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + indxA_2d, indxA_3d, & + ilocal, jlocal, & + itest, jtest, rtest, & + Auu, Avv, & + Muu, Mvv, & + Adiag_u, Adiag_v, & + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v) endif ! precond - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) write(iulog,*) 'Done in PC setup' - call t_stopf("pcg_precond_init") !---- Initialize scalars and vectors niters = maxiters_chrongear - ru(:,:) = 0.d0 - rv(:,:) = 0.d0 - du(:,:) = 0.d0 - dv(:,:) = 0.d0 - zu(:,:) = 0.d0 - zv(:,:) = 0.d0 - qu(:,:) = 0.d0 - qv(:,:) = 0.d0 - Azu(:,:) = 0.d0 - Azv(:,:) = 0.d0 - worku(:,:) = 0.d0 - workv(:,:) = 0.d0 - work2u(:,:,:) = 0.d0 - work2v(:,:,:) = 0.d0 + ru(:,:,:) = 0.d0 + rv(:,:,:) = 0.d0 + du(:,:,:) = 0.d0 + dv(:,:,:) = 0.d0 + zu(:,:,:) = 0.d0 + zv(:,:,:) = 0.d0 + qu(:,:,:) = 0.d0 + qv(:,:,:) = 0.d0 + Azu(:,:,:) = 0.d0 + Azv(:,:,:) = 0.d0 + worku(:,:,:) = 0.d0 + workv(:,:,:) = 0.d0 + work2u(:,:,:,:) = 0.d0 + work2v(:,:,:,:) = 0.d0 !---- Compute the L2 norm of the RHS vectors !---- (Goal is to obtain L2_resid/L2_rhs < tolerance) call t_startf("pcg_dotprod") - worku(:,:) = bu(:,:)*bu(:,:) ! terms of dot product (b, b) - workv(:,:) = bv(:,:)*bv(:,:) + worku(:,:,:) = bu(:,:,:)*bu(:,:,:) ! terms of dot product (b, b) + workv(:,:,:) = bv(:,:,:)*bv(:,:,:) call t_stopf("pcg_dotprod") ! find global sum of the squared L2 norm call t_startf("pcg_glbsum_init") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - bb, & - worku, workv) + bb = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_init") ! take square root @@ -2380,6 +3082,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! 
First pass of algorithm !--------------------------------------------------------------- + iter = 1 + ! Note: The matrix A must be complete for all rows corresponding to locally ! owned nodes, and x must have the correct values in ! halo nodes bordering the locally owned nodes. @@ -2395,9 +3099,9 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_init") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -2408,8 +3112,8 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- This is correct for locally owned nodes. call t_startf("pcg_vecupdate") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate") !---- Halo update for residual @@ -2425,177 +3129,81 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! From here on, call timers with 'iter' suffix because this can be considered the first iteration call t_startf("pcg_precond_iter") - if (precond == HO_PRECOND_NONE) then ! no preconditioning + if (precond == HO_PRECOND_NONE) then ! no preconditioning - zu(:,:) = ru(:,:) ! PC(r) = r - zv(:,:) = rv(:,:) ! PC(r) = r + zu(:,:,:) = ru(:,:,:) ! PC(r) = r + zv(:,:,:) = rv(:,:,:) ! PC(r) = r - elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning + elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning - ! Solve Mz = r, where M is a diagonal matrix do j = 1, ny-1 do i = 1, nx-1 - if (Adiagu(i,j) /= 0.d0) then - zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A - else - zu(i,j) = 0.d0 - endif - if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) + do k = 1, nz + if (Adiagu(k,i,j) /= 0.d0) then + zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! 
PC(r), where PC is formed from diagonal elements of A else - zv(i,j) = 0.d0 - endif - enddo ! i - enddo ! j - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - i = itest -! write(iulog,*) ' ' -! write(iulog,*) 'zv solve with diagonal precond, this_rank, i =', this_rank, i -! write(iulog,*) 'j, active, Adiagv, rv, zv, xv:' -! do j = staggered_jhi, staggered_jlo, -1 -! write(iulog,'(i4, l4, 2f12.3, e12.3, f12.3)') j, active_vertex(i,j), Adiagv(i,j), rv(i,j), zv(i,j), xv(i,j) -! enddo - endif - - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! local - - if (verbose_tridiag .and. this_rank == rtest) then - i = itest - j = jtest - write(iulog,*) 'Residual:' - write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) - write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) - write(iulog,*) ' ' - write(iulog,*) 'jtest =', jtest - write(iulog,*) 'i, ru, rv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) - enddo - endif - - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) 'call tridiag_solver_local_2d' - endif - - ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - - !TODO - Test a local solver that can compute zu and zv in the halo - ! (to avoid the halo update below) - - call tridiag_solver_local_2d(nx, ny, & - parallel, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - ru, rv, & ! right hand side - zu, zv) ! solution - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) - enddo + zu(k,i,j) = 0.d0 endif + if (Adiagv(k,i,j) /= 0.d0) then + zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + else + zv(k,i,j) = 0.d0 + endif + enddo ! k + enddo ! i + enddo ! 
j - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) - - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - - ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - b_u(i,j) = ru(ii,jj) - enddo - enddo - - ! Initialize the array for gathering information on each row of tasks - allocate(gather_data_row(8*tasks_row,jlocal)) - gather_data_row = 0.0d0 - - ! Solve M*z = r, where M is a global tridiagonal matrix - - call tridiag_solver_global_2d(ilocal, jlocal, & - parallel, tasks_row, & - 'row', & ! tridiagonal solve for each row -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u, & - b_u, x_u, & - .true., & ! first_time - gather_data_row) + elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning - ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) - zu(:,:) = 0.0d0 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - zu(ii,jj) = x_u(i,j) - enddo - enddo + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Muu, ru, zu) ! solve Muu*zu = ru for zu - ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Mvv, rv, zv) ! solve Mvv*zv = rv for zv - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - b_v(j,i) = rv(ii,jj) - enddo - enddo + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then - ! 
Initialize the array for gathering information on each column of tasks - allocate(gather_data_col(8*tasks_col,ilocal)) - gather_data_col = 0.0d0 + ! Use a local tridiagonal solver to find an approximate solution of A*z = r - call tridiag_solver_global_2d(jlocal, ilocal, & - parallel, tasks_col, & - 'col', & ! tridiagonal solve for each column -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v, & - b_v, x_v, & - .true., & ! first_time - gather_data_col) + call tridiag_solver_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r - ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then - zv(:,:) = 0.0d0 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - zv(ii,jj) = x_v(j,i) - enddo - enddo + ! Use a global tridiagonal solver to find an approximate solution of A*z = r - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag_solver_local_2d could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! 
entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + .true., & ! first_time = T (first iteration) + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r endif ! precond @@ -2604,60 +3212,42 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute intermediate result for dot product (r,z) call t_startf("pcg_dotprod") - work2u(:,:,1) = ru(:,:) * zu(:,:) - work2v(:,:,1) = rv(:,:) * zv(:,:) + work2u(:,:,:,1) = ru(:,:,:) * zu(:,:,:) + work2v(:,:,:,1) = rv(:,:,:) * zv(:,:,:) call t_stopf("pcg_dotprod") !---- Compute the conjugate direction vector d !---- Since z is correct in halo, so is d - du(:,:) = zu(:,:) - dv(:,:) = zv(:,:) + du(:,:,:) = zu(:,:,:) + dv(:,:,:) = zv(:,:,:) !---- Compute q = A*d - !---- q is correct for locally owned nodes, provided d extends one layer into the halo + !---- q is correct for locally owned nodes call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & du, dv, & qu, qv) call t_stopf("pcg_matmult_iter") - !WHL - debug - usum = sum(qu(staggered_ilo:staggered_ihi,staggered_jlo:staggered_jhi)) - usum_global = parallel_reduce_sum(usum) - vsum = sum(qv(staggered_ilo:staggered_ihi,staggered_jlo:staggered_jhi)) - vsum_global = parallel_reduce_sum(vsum) - - if (verbose_pcg .and. this_rank == rtest) then -!! 
write(iulog,*) 'Prep: sum(qu), sum(qv) =', usum_global, vsum_global - endif - !---- Compute intermediate result for dot product (d,q) = (d,Ad) call t_startf("pcg_dotprod") - work2u(:,:,2) = du(:,:) * qu(:,:) - work2v(:,:,2) = dv(:,:) * qv(:,:) + work2u(:,:,:,2) = du(:,:,:) * qu(:,:,:) + work2v(:,:,:,2) = dv(:,:,:) * qv(:,:,:) call t_stopf("pcg_dotprod") !---- Find global sums of (r,z) and (d,q) call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) call t_stopf("pcg_glbsum_iter") - if (verbose_pcg .and. this_rank == rtest) then -!! write(iulog,*) 'Prep: gsum(1), gsum(2) =', gsum(1), gsum(2) - endif - !---- Halo update for q call t_startf("pcg_halo_iter") @@ -2679,227 +3269,112 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- These are correct in halo call t_startf("pcg_vecupdate") - xu(:,:) = xu(:,:) + alpha*du(:,:) - xv(:,:) = xv(:,:) + alpha*dv(:,:) - - ru(:,:) = ru(:,:) - alpha*qu(:,:) ! q = A*d - rv(:,:) = rv(:,:) - alpha*qv(:,:) - call t_stopf("pcg_vecupdate") - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'iter = 1: i, xu, xv, ru, rv:' -!! do i = itest-3, itest+3 - do i = staggered_ilo, staggered_ihi - write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) - enddo ! i - endif - - !--------------------------------------------------------------- - ! Iterate to solution - !--------------------------------------------------------------- - - iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) ' ' - write(iulog,*) 'iter =', iter - endif - - !---- Compute PC(r) = solution z of Mz = r - !---- z is correct in halo - - call t_startf("pcg_precond_iter") - - if (precond == HO_PRECOND_NONE) then ! no preconditioning - - zu(:,:) = ru(:,:) ! 
PC(r) = r - zv(:,:) = rv(:,:) ! PC(r) = r - - elseif (precond == HO_PRECOND_DIAG) then ! diagonal preconditioning - - do j = 1, ny-1 - do i = 1, nx-1 - if (Adiagu(i,j) /= 0.d0) then - zu(i,j) = ru(i,j) / Adiagu(i,j) ! PC(r), where PC is formed from diagonal elements of A - else - zu(i,j) = 0.d0 - endif - if (Adiagv(i,j) /= 0.d0) then - zv(i,j) = rv(i,j) / Adiagv(i,j) - else - zv(i,j) = 0.d0 - endif - enddo ! i - enddo ! j - - elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then ! tridiagonal preconditioning with local solve - - ! Solve M*z = r, where M is a local tridiagonal matrix (one matrix per task) - - !TODO - Test a local solver that can compute zu and zv in the halo - ! (to avoid the halo update below) - - !WHL - debug - if (verbose_tridiag .and. this_rank == rtest) then - i = itest - j = jtest -! write(iulog,*) 'Residual:' -! write(iulog,*) 'r, i, j, ru:', this_rank, i, j, ru(i,j) -! write(iulog,*) 'r, i, j, rv:', this_rank, i, j, rv(i,j) -! write(iulog,*) ' ' -! write(iulog,*) 'jtest =', jtest -! write(iulog,*) 'i, ru, rv:' -! do i = staggered_ihi, staggered_ilo, -1 -! write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) -! enddo - endif - - - !WHL - debug - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) ' call tridiag_solver_local_2d' - endif - - call tridiag_solver_local_2d(nx, ny, & - parallel, & - itest, jtest, rtest, & - Adiag_u, Adiag_v, & ! entries of preconditioning matrix - Asubdiag_u, Asubdiag_v, & - Asupdiag_u, Asupdiag_v, & - omega_u, omega_v, & - denom_u, denom_v, & - ru, rv, & ! right hand side - zu, zv) ! solution - - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) - - if (verbose_pcg .and. 
this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'tridiag solve: i, ru, rv, zu, zv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, ru(i,j), rv(i,j), zu(i,j), zv(i,j) - enddo - endif - - elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve - - !WHL - debug - if (verbose_tridiag .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'jtest =', jtest - write(iulog,*) 'i, ru, rv:' - do i = itest-3, itest+3 - write(iulog,'(i4, 2f15.10)') i, ru(i,j), rv(i,j) - enddo - endif - - ! convert ru(nx-1,ny-1) to b_u(ilocal,jlocal) + xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) + xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - b_u(i,j) = ru(ii,jj) - enddo - enddo + ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) ! q = A*d + rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) + call t_stopf("pcg_vecupdate") - !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest write(iulog,*) ' ' - write(iulog,*) 'Before global tridiag PC u solve, r, j =', rtest, jtest - write(iulog,*) ' ' - write(iulog,*) 'i, Adiag_u, Asubdiag_u, Asupdiag_u, b_u:' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' do i = itest-3, itest+3 - write(iulog,'(i4, 4e16.8)') i, Adiag_u(i,j), Asubdiag_u(i,j), Asupdiag_u(i,j), b_u(i,j) - enddo + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i endif + !--------------------------------------------------------------- + ! Iterate to solution + !--------------------------------------------------------------- - call tridiag_solver_global_2d(ilocal, jlocal, & - parallel, tasks_row, & - 'row', & ! tridiagonal solve for each row -!! itest, jtest, rtest, & - itest - staggered_ilo + 1, & ! itest referenced to (ilocal,jlocal) coordinates - jtest - staggered_jlo + 1, & ! 
jtest referenced to (ilocal,jlocal) coordinates - rtest, & - Adiag_u, & - Asubdiag_u, Asupdiag_u, & - omega_u, denom_u, & - xuh_u, xlh_u, & - b_u, x_u, & - .false., & ! first_time - gather_data_row) + iter_loop: do iter = 2, maxiters_chrongear ! first iteration done above - ! convert x_u(ilocal,jlocal) to zu(nx-1,ny-1) - zu(:,:) = 0.0d0 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - zu(ii,jj) = x_u(i,j) - enddo - enddo + !---- Compute PC(r) = solution z of Mz = r + !---- z is correct in halo - ! convert rv(nx-1,ny-1) to b_v(jlocal,ilocal) + call t_startf("pcg_precond_iter") - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - b_v(j,i) = rv(ii,jj) - enddo - enddo + if (precond == HO_PRECOND_NONE) then ! no preconditioning - if (verbose_pcg .and. this_rank == rtest) then - j = jtest - write(iulog,*) ' ' - write(iulog,*) 'Before global tridiag PC v solve, r, j =', rtest, jtest - write(iulog,*) ' ' - write(iulog,*) 'i, Adiag_v, Asubdiag_v, Asupdiag_v, b_v:' - do i = itest-3, itest+3 - write(iulog,'(i4, 4e16.8)') i, Adiag_v(i,j), Asubdiag_v(i,j), Asupdiag_v(i,j), b_v(i,j) - enddo - endif + zu(:,:,:) = ru(:,:,:) ! PC(r) = r + zv(:,:,:) = rv(:,:,:) ! PC(r) = r - call tridiag_solver_global_2d(jlocal, ilocal, & - parallel, tasks_col, & - 'col', & ! tridiagonal solve for each column -!! itest, jtest, rtest, & - jtest - staggered_jlo + 1, & ! jtest referenced to (jlocal,ilocal) coordinates - itest - staggered_ilo + 1, & ! itest referenced to (jlocal,ilocal) coordinates - rtest, & - Adiag_v, & - Asubdiag_v, Asupdiag_v, & - omega_v, denom_v, & - xuh_v, xlh_v, & - b_v, x_v, & - .false., & ! first_time - gather_data_col) + elseif (precond == HO_PRECOND_DIAG ) then ! diagonal preconditioning - ! convert x_v(jlocal,ilocal) to zv(nx-1,ny-1) + do j = 1, ny-1 + do i = 1, nx-1 + do k = 1, nz + if (Adiagu(k,i,j) /= 0.d0) then + zu(k,i,j) = ru(k,i,j) / Adiagu(k,i,j) ! 
PC(r), where PC is formed from diagonal elements of A + else + zu(k,i,j) = 0.d0 + endif + if (Adiagv(k,i,j) /= 0.d0) then + zv(k,i,j) = rv(k,i,j) / Adiagv(k,i,j) + else + zv(k,i,j) = 0.d0 + endif + enddo ! k + enddo ! i + enddo ! j - zv(:,:) = 0.0d0 - do i = 1, ilocal - ii = i + staggered_ilo - 1 - do j = 1, jlocal - jj = j + staggered_jlo - 1 - zv(ii,jj) = x_v(j,i) - enddo - enddo + elseif (precond == HO_PRECOND_SIA) then ! local vertical shallow-ice solver for preconditioning - !Note: Need zu and zv in a row of halo cells so that q = A*d is correct in all locally owned cells - !TODO: See whether tridiag solvers could be modified to provide zu and zv in halo cells? - call staggered_parallel_halo(zu, parallel) - call staggered_parallel_halo(zv, parallel) + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Muu, ru, zu) ! solve Muu*zu = ru for zu + + call easy_sia_solver(nx, ny, nz, & + active_vertex, & + Mvv, rv, zv) ! solve Mvv*zv = rv for zv + + elseif (precond == HO_PRECOND_TRIDIAG_LOCAL) then + + ! Use a local tridiagonal solver to find an approximate solution of A*z = r + + call tridiag_solver_local_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + Muu, Mvv, & ! entries of SIA matrix + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r + + elseif (precond == HO_PRECOND_TRIDIAG_GLOBAL) then ! tridiagonal preconditioning with global solve + + ! Use a global tridiagonal solver to find an approximate solution of A*z = r + + call tridiag_solver_global_3d(& + nx, ny, & + nz, parallel, & + active_vertex, & + ilocal, jlocal, & + tasks_row, tasks_col, & + itest, jtest, rtest, & + Adiag_u, Adiag_v, & ! 
entries of 2D preconditioning matrix + Asubdiag_u, Asubdiag_v, & + Asupdiag_u, Asupdiag_v, & + omega_u, omega_v, & + denom_u, denom_v, & + xuh_u, xuh_v, & + xlh_u, xlh_v, & + Muu, Mvv, & ! entries of SIA matrix + gather_data_row, gather_data_col, & + .false., & ! first_time = F (iteration 2+) + ru, rv, & ! 3D residual + zu, zv) ! approximate solution of Az = r endif ! precond @@ -2907,26 +3382,27 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute Az = A*z !---- This is the one matvec multiply required per iteration - !---- Az is correct for locally owned nodes and needs a halo update (below) + !---- Az is correct for local owned nodes and needs a halo update (below) call t_startf("pcg_matmult_iter") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & zu, zv, & Azu, Azv) call t_stopf("pcg_matmult_iter") + !---- Compute intermediate results for the dot products (r,z) and (Az,z) call t_startf("pcg_dotprod") - work2u(:,:,1) = ru(:,:)*zu(:,:) ! terms of dot product (r,z) - work2v(:,:,1) = rv(:,:)*zv(:,:) + work2u(:,:,:,1) = ru(:,:,:)*zu(:,:,:) ! terms of dot product (r,z) + work2v(:,:,:,1) = rv(:,:,:)*zv(:,:,:) - work2u(:,:,2) = Azu(:,:)*zu(:,:) ! terms of dot product (A*z,z) - work2v(:,:,2) = Azv(:,:)*zv(:,:) + work2u(:,:,:,2) = Azu(:,:,:)*zu(:,:,:) ! terms of dot product (A*z,z) + work2v(:,:,:,2) = Azv(:,:,:)*zv(:,:,:) call t_stopf("pcg_dotprod") ! Take the global sums of (r,z) and (Az,z) @@ -2934,14 +3410,11 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! this is the one MPI global reduction per iteration. 
call t_startf("pcg_glbsum_iter") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - gsum, & - work2u, work2v) + gsum = parallel_global_sum_stagger(work2u, 2, parallel, work2v) call t_stopf("pcg_glbsum_iter") !---- Halo update for Az + !---- This is the one halo update required per iteration call t_startf("pcg_halo_iter") call staggered_parallel_halo(Azu, parallel) @@ -2960,7 +3433,7 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (alpha /= alpha) then ! alpha is NaN !! write(iulog,*) 'rho, sigma, alpha:', rho, sigma, alpha - call write_log('Chron_Gear PCG solver has failed, alpha = NaN', GM_FATAL) + call write_log('Chron-Gear PCG solver has failed, alpha = NaN', GM_FATAL) endif !---- Update d and q @@ -2968,33 +3441,36 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & call t_startf("pcg_vecupdate") - du(:,:) = zu(:,:) + beta*du(:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i - dv(:,:) = zv(:,:) + beta*dv(:,:) ! - ! (r_(i+1), PC(r_(i+1))) - ! where beta_(i+1) = -------------------- - ! (r_i, PC(r_i)) - qu(:,:) = Azu(:,:) + beta*qu(:,:) - qv(:,:) = Azv(:,:) + beta*qv(:,:) + du(:,:,:) = zu(:,:,:) + beta*du(:,:,:) ! d_(i+1) = PC(r_(i+1)) + beta_(i+1)*d_i + dv(:,:,:) = zv(:,:,:) + beta*dv(:,:,:) ! + ! (r_(i+1), PC(r_(i+1))) + ! where beta_(i+1) = -------------------- + ! (r_i, PC(r_i)) + qu(:,:,:) = Azu(:,:,:) + beta*qu(:,:,:) + qv(:,:,:) = Azv(:,:,:) + beta*qv(:,:,:) !---- Update solution and residual !---- These are correct in halo - xu(:,:) = xu(:,:) + alpha*du(:,:) - xv(:,:) = xv(:,:) + alpha*dv(:,:) + xu(:,:,:) = xu(:,:,:) + alpha*du(:,:,:) + xv(:,:,:) = xv(:,:,:) + alpha*dv(:,:,:) - ru(:,:) = ru(:,:) - alpha*qu(:,:) - rv(:,:) = rv(:,:) - alpha*qv(:,:) + ru(:,:,:) = ru(:,:,:) - alpha*qu(:,:,:) + rv(:,:,:) = rv(:,:,:) - alpha*qv(:,:,:) call t_stopf("pcg_vecupdate") - !WHL - debug if (verbose_pcg .and. this_rank == rtest) then j = jtest - write(iulog,*) 'i, xu, xv, ru, rv:' -!! 
do i = staggered_ihi, staggered_ilo, -1 + write(iulog,*) ' ' + write(iulog,*) 'iter =', iter + write(iulog,*) 'i, k, xu, xv, ru, rv:' do i = itest-3, itest+3 - write(iulog,'(i4, 4f16.10)') i, xu(i,j), xv(i,j), ru(i,j), rv(i,j) - enddo + write(iulog,*) ' ' + do k = 1, nz + write(iulog,'(2i4, 4f16.10)') i, k, xu(k,i,j), xv(k,i,j), ru(k,i,j), rv(k,i,j) + enddo + enddo ! i endif ! Check for convergence every linear_solve_ncheck iterations. @@ -3004,19 +3480,18 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ! For convergence check, use r = b - Ax if (mod(iter, linear_solve_ncheck) == 0 .or. iter == 5) then -!! if (mod(iter, linear_solve_ncheck) == 0 .or. iter == linear_solve_ncheck/2) then - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) ' ' - write(iulog,*) ' check convergence, iter =', iter + write(iulog,*) 'Check convergence, iter =', iter endif !---- Compute z = A*x (use z as a temp vector for A*x) call t_startf("pcg_matmult_resid") - call matvec_multiply_structured_2d(nx, ny, & - parallel, & - indxA_2d, active_vertex, & + call matvec_multiply_structured_3d(nx, ny, & + nz, parallel, & + indxA_3d, active_vertex, & Auu, Auv, & Avu, Avv, & xu, xv, & @@ -3026,32 +3501,32 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & !---- Compute residual r = b - A*x call t_startf("pcg_vecupdate") - ru(:,:) = bu(:,:) - zu(:,:) - rv(:,:) = bv(:,:) - zv(:,:) + ru(:,:,:) = bu(:,:,:) - zu(:,:,:) + rv(:,:,:) = bv(:,:,:) - zv(:,:,:) call t_stopf("pcg_vecupdate") !---- Compute dot product (r, r) call t_startf("pcg_dotprod") - worku(:,:) = ru(:,:)*ru(:,:) - workv(:,:) = rv(:,:)*rv(:,:) + worku(:,:,:) = ru(:,:,:)*ru(:,:,:) + workv(:,:,:) = rv(:,:,:)*rv(:,:,:) call t_stopf("pcg_dotprod") call t_startf("pcg_glbsum_resid") - call parallel_global_sum_staggered(& - nx, ny, & - parallel, & - rr, & - worku, workv) + rr = parallel_global_sum_stagger(worku, parallel, workv) call t_stopf("pcg_glbsum_resid") - L2_resid = sqrt(rr) ! 
L2 norm of residual - err = L2_resid/L2_rhs ! normalized error + ! take square root + L2_resid = sqrt(rr) - if (verbose_pcg .and. this_rank == rtest) then + ! compute normalized error + err = L2_resid/L2_rhs + + if (verbose_pcg .and. main_task) then write(iulog,*) 'iter, L2_resid, L2_rhs, error =', iter, L2_resid, L2_rhs, err endif + !WHL - debug if (verbose_pcg .and. this_rank == rtest) then ru_max = 0.d0 rv_max = 0.d0 @@ -3059,13 +3534,13 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & ju_max = 0 do j = staggered_jlo, staggered_jhi do i = staggered_ilo, staggered_ihi - if (abs(ru(i,j)) > ru_max) then - ru_max = ru(i,j) + if (sum(abs(ru(:,i,j))) > ru_max) then + ru_max = sum(abs(ru(:,i,j))) iu_max = i ju_max = j endif - if (abs(rv(i,j)) > rv_max) then - rv_max = rv(i,j) + if (sum(abs(rv(:,i,j))) > rv_max) then + rv_max = sum(abs(rv(:,i,j))) iv_max = i jv_max = j endif @@ -3081,14 +3556,16 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (err < tolerance) then niters = iter - if (verbose_pcg .and. this_rank == rtest) then + if (verbose_pcg .and. main_task) then write(iulog,*) 'Glissade PCG solver has converged, iter =', niters + write(iulog,*) ' ' endif exit iter_loop - elseif (niters == maxiters_chrongear) then - if (verbose_pcg .and. this_rank == rtest) then - write(iulog,*) 'Glissade PCG solver not converged' - write(iulog,*) 'niters, err, tolerance:', niters, err, tolerance + elseif (iter == maxiters_chrongear) then + if (verbose_pcg .and. 
main_task) then + write(iulog,*) 'Glissade PCG solver did not converge' + write(iulog,*) 'iter, err, tolerance:', iter, err, tolerance + write(iulog,*) ' ' endif endif @@ -3111,12 +3588,10 @@ subroutine pcg_solver_chrongear_2d(nx, ny, & if (allocated(denom_u)) deallocate(denom_u, denom_v) if (allocated(xuh_u)) deallocate(xuh_u, xuh_v) if (allocated(xlh_u)) deallocate(xlh_u, xlh_v) - if (allocated(b_u)) deallocate(b_u, b_v) - if (allocated(x_u)) deallocate(x_u, x_v) if (allocated(gather_data_row)) deallocate(gather_data_row) if (allocated(gather_data_col)) deallocate(gather_data_col) - end subroutine pcg_solver_chrongear_2d + end subroutine pcg_solver_chrongear_3d !**************************************************************************** @@ -3957,6 +4432,7 @@ subroutine setup_preconditioner_tridiag_global_2d(ilocal, jlocal, & !WHL - debug if (verbose_tridiag .and. this_rank == rtest) then + write(iulog,*) write(iulog,*) 'In setup_preconditioner_tridiag_global_2d: itest, jtest, rtest =', itest, jtest, rtest endif @@ -4637,9 +5113,8 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & first_time, gather_data) use glimmer_utils, only: tridiag - use cism_parallel, only: distributed_gather_var_row, distributed_gather_var_col, & - distributed_gather_all_var_row, distributed_gather_all_var_col, & - distributed_scatter_var_row, distributed_scatter_var_col + use cism_parallel, only: gather_var_row, gather_var_col, & + gather_all_var_row, gather_all_var_col, scatter_var_row, scatter_var_col integer, intent(in) :: & ilocal, jlocal ! 
size of input/output arrays; number of locally owned vertices in each direction @@ -4864,11 +5339,11 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & if (tridiag_solver_flag == 'row') then call t_startf("pcg_tridiag_gather_row") - call distributed_gather_all_var_row(outdata, gather_data2, parallel) + call gather_all_var_row(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_row") elseif (tridiag_solver_flag == 'col') then call t_startf("pcg_tridiag_gather_col") - call distributed_gather_all_var_col(outdata, gather_data2, parallel) + call gather_all_var_col(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_col") endif @@ -4879,11 +5354,11 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & if (tridiag_solver_flag == 'row') then call t_startf("pcg_tridiag_gather_row") - call distributed_gather_var_row(outdata, gather_data2, parallel) + call gather_var_row(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_row") elseif (tridiag_solver_flag == 'col') then call t_startf("pcg_tridiag_gather_col") - call distributed_gather_var_col(outdata, gather_data2, parallel) + call gather_var_col(outdata, gather_data2, parallel) call t_stopf ("pcg_tridiag_gather_col") endif @@ -4998,11 +5473,11 @@ subroutine tridiag_solver_global_2d(ilocal, jlocal, & if (tridiag_solver_flag == 'row') then call t_startf("pcg_tridiag_scatter_row") - call distributed_scatter_var_row(local_coeffs, global_coeffs, parallel) + call scatter_var_row(local_coeffs, global_coeffs, parallel) call t_stopf ("pcg_tridiag_scatter_row") elseif (tridiag_solver_flag == 'col') then call t_startf("pcg_tridiag_scatter_col") - call distributed_scatter_var_col(local_coeffs, global_coeffs, parallel) + call scatter_var_col(local_coeffs, global_coeffs, parallel) call t_stopf ("pcg_tridiag_scatter_col") endif diff --git a/utils/build/generate_ncvars.py b/utils/build/generate_ncvars.py index 993c2c0a..e791670d 100755 --- a/utils/build/generate_ncvars.py +++ 
b/utils/build/generate_ncvars.py @@ -193,9 +193,9 @@ def write(self,vars): self.print_warning() for l in self.infile.readlines(): for token in self.handletoken: - if l.find(token) is not -1: + if l.find(token) != -1: break - if l.find(token) is not -1: + if l.find(token) != -1: for v in vars.keys(): self.handletoken[token](vars[v]) else: @@ -267,9 +267,9 @@ def write(self,vars): for k in module.keys(): l = l.replace(k.upper(),module[k]) for token in self.handletoken: - if l.find(token) is not -1: + if l.find(token) != -1: break - if l.find(token) is not -1: + if l.find(token) != -1: for v in vars.keys(): self.handletoken[token](vars[v]) elif '!GENVAR_DIMS!' in l: