diff --git a/CMakeLists.txt b/CMakeLists.txt index 398dd09..f19940d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ set_property(CACHE SWM_DEVICE PROPERTY STRINGS cpu gpu) option(SWM_C "Enable the C versions of the mini-app" ON) option(SWM_FORTRAN "Enable the Fortran versions of the mini-app" ON) option(SWM_AMREX "Enable the AMReX version of the mini-app" OFF) +option(SWM_KOKKOS "Enable the Kokkos version of the mini-app" OFF) option(SWM_OPENACC "Enable the OpenACC versions of the mini-app" OFF) option(SWM_OPENMP "Enable the OpenMP versions of the mini-app" OFF) @@ -35,3 +36,7 @@ endif() if(SWM_AMREX) add_subdirectory(swm_amrex) endif() + +if(SWM_KOKKOS) + add_subdirectory(swm_kokkos) +endif() \ No newline at end of file diff --git a/swm_c/CMakeLists.txt b/swm_c/CMakeLists.txt index 0132322..2d31d32 100644 --- a/swm_c/CMakeLists.txt +++ b/swm_c/CMakeLists.txt @@ -4,7 +4,7 @@ set(CMAKE_C_STANDARD_REQUIRED ON) # Add optimization flag #add_compile_options(-O2) # Maybe move this to be set if CMAKE_BUILD_TYPE is Release -set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O2") +set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3") # use -O3 for performance evaluation # Flags that get used when building debug flags set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wall -g") diff --git a/swm_kokkos/CMakeLists.txt b/swm_kokkos/CMakeLists.txt new file mode 100644 index 0000000..ae448d3 --- /dev/null +++ b/swm_kokkos/CMakeLists.txt @@ -0,0 +1,14 @@ +set(CMAKE_C_STANDARD 17) +set(CMAKE_C_STANDARD_REQUIRED ON) + +find_package(Kokkos REQUIRED) + +# Use -O2 optimization for release builds to be consistent with the C implementation +# Use -O3 for performance testing +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + +# Flags that get used when building debug flags +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -g") + +add_subdirectory(various_policy_impls) +add_subdirectory(kokkos_fortran_openacc) \ No newline at end of file diff --git a/swm_kokkos/kokkos_fortran_openacc/shallow_fortran_acc.F90 b/swm_kokkos/kokkos_fortran_openacc/shallow_fortran_acc.F90 new file mode 100644 index 0000000..03eaf7b --- /dev/null +++ b/swm_kokkos/kokkos_fortran_openacc/shallow_fortran_acc.F90 @@ -0,0 +1,244 @@ +module shallow_fortran_acc + use, intrinsic :: iso_c_binding + implicit none + private + public :: fortran_time_loop + +contains + + subroutine fortran_time_loop(cu, cv, z, h, u, unew, uold, & + v, vnew, vold, p, pnew, pold, & + M_LEN, N_LEN, M, N, fsdx, fsdy, & + dx, dy, alpha, tdt, time, itmax) bind (C, name="fortran_time_loop") + + integer(c_int), intent(in), value :: M_LEN, N_LEN, M, N, itmax + real(c_double), intent(in), value :: fsdx, fsdy, dx, dy, alpha + real(c_double), intent(inout) :: tdt, time + type(c_ptr), value :: cu, cv, z, h, u, unew, uold, v, vnew, vold, p, pnew, pold + + ! Local variables + integer :: i, j + real(c_double) :: tdts8, tdtsdx, tdtsdy + real(c_double), dimension(:,:), pointer :: cu_f_ptr => null(), cv_f_ptr => null(), & + z_f_ptr => null(), h_f_ptr => null(), & + u_f_ptr => null(), unew_f_ptr => null(), & + uold_f_ptr => null(), v_f_ptr => null(), & + vnew_f_ptr => null(), vold_f_ptr => null(), & + p_f_ptr => null(), pnew_f_ptr => null(), & + pold_f_ptr => null() + + ! Convert C pointers (raw addresses) to Fortran Array Pointers + call c_f_pointer(cu, cu_f_ptr, [M_LEN, N_LEN]) + call c_f_pointer(cv, cv_f_ptr, [M_LEN, N_LEN]) + call c_f_pointer(z, z_f_ptr, [M_LEN, N_LEN]) + call c_f_pointer(h, h_f_ptr, [M_LEN, N_LEN]) + call c_f_pointer(u, u_f_ptr, [M_LEN, N_LEN]) + call c_f_pointer(unew, unew_f_ptr, [M_LEN, N_LEN]) + call c_f_pointer(uold, uold_f_ptr, [M_LEN, N_LEN]) + call c_f_pointer(v, v_f_ptr, [M_LEN, N_LEN]) + call c_f_pointer(vnew, vnew_f_ptr, [M_LEN, N_LEN]) + call c_f_pointer(vold, vold_f_ptr, [M_LEN, N_LEN]) + call c_f_pointer(p, p_f_ptr, [M_LEN, N_LEN]) + call c_f_pointer(pnew, pnew_f_ptr, [M_LEN, N_LEN]) + call c_f_pointer(pold, pold_f_ptr, [M_LEN, N_LEN]) + + ! ** Start of time loop ** + do i = 1, itmax + call compute_cu_cv_z_h(M_LEN, N_LEN, M, N, p_f_ptr, u_f_ptr, v_f_ptr, & + cu_f_ptr, cv_f_ptr, z_f_ptr, h_f_ptr, fsdx, fsdy) + call apply_bc_cu_cv_z_h(M_LEN, N_LEN, M, N, cu_f_ptr, cv_f_ptr, z_f_ptr, h_f_ptr) + tdts8 = tdt / 8.d0 + tdtsdx = tdt / dx + tdtsdy = tdt / dy + call update_unew_vnew_pnew(M_LEN, N_LEN, M, N, unew_f_ptr, uold_f_ptr, vnew_f_ptr, & + vold_f_ptr, pnew_f_ptr, pold_f_ptr, z_f_ptr, cu_f_ptr, & + cv_f_ptr, h_f_ptr, tdts8, tdtsdx, tdtsdy) + call apply_bc_unew_vnew_pnew(M_LEN, N_LEN, M, N, unew_f_ptr, vnew_f_ptr, pnew_f_ptr) + time = time + tdt + if ( i > 1 ) then + call time_smoothing(M_LEN, N_LEN, alpha, u_f_ptr, unew_f_ptr, uold_f_ptr, & + v_f_ptr, vnew_f_ptr, vold_f_ptr, p_f_ptr, pnew_f_ptr, & + pold_f_ptr) + else + tdt = tdt + tdt + call first_cycle_copy(M_LEN, N_LEN, u_f_ptr, v_f_ptr, p_f_ptr, uold_f_ptr, & + vold_f_ptr, pold_f_ptr) + end if + !$acc wait(1) + call dswap(u_f_ptr, unew_f_ptr) + call dswap(v_f_ptr, vnew_f_ptr) + call dswap(p_f_ptr, pnew_f_ptr) + end do + ! ** End of time loop ** + end subroutine + + subroutine compute_cu_cv_z_h(M_LEN, N_LEN, M, N, p, u, v, cu, cv, z, h, fsdx, fsdy) + + integer(c_int), intent(in) :: M_LEN, N_LEN, M, N + real(c_double), intent(in) :: fsdx, fsdy + real(c_double), dimension(:,:), pointer, intent(in) :: p, u, v + real(c_double), dimension(:,:), pointer, intent(out) :: cu, cv, z, h + + ! Local variables + integer :: i, j + + !$acc parallel loop gang vector collapse(2) deviceptr(p,u,v,cu,cv,z,h) async(1) + do j = 1, N + do i = 1, M + cu(i+1,j) = 0.5d0 * (p(i+1,j) + p(i,j)) * u(i+1,j) + cv(i,j+1) = 0.5d0 * (p(i,j+1) + p(i,j)) * v(i,j+1) + z(i+1,j+1) = (fsdx * (v(i+1,j+1) - v(i,j+1)) - fsdy * (u(i+1,j+1) - u(i+1,j))) / & + (p(i,j) + p(i+1,j) + p(i+1,j+1) + p(i,j+1)) + h(i,j) = p(i,j) + 0.25d0 * (u(i+1,j) * u(i+1,j) + u(i,j) * u(i,j) + & + v(i,j+1) * v(i,j+1) + v(i,j) * v(i,j)) + end do + end do + !$acc end parallel + end subroutine + + subroutine apply_bc_cu_cv_z_h(M_LEN, N_LEN, M, N, cu, cv, z, h) + + integer(c_int), intent(in) :: M_LEN, N_LEN, M, N + real(c_double), dimension(:,:), pointer, intent(inout) :: cu, cv, z, h + + ! Local variables + integer :: i, j + + !$acc parallel loop gang vector deviceptr(cu,cv,z,h) async(1) + do j = 1, N + cu(1,j) = cu(M_LEN,j) + cv(M_LEN,j+1) = cv(1,j+1) + z(1,j+1) = z(M_LEN,j+1) + h(M_LEN,j) = h(1,j) + end do + !$acc end parallel + + !$acc parallel loop gang vector deviceptr(cu,cv,z,h) async(1) + do i = 1, M + cu(i+1,N_LEN) = cu(i+1,1) + cv(i,1) = cv(i,N_LEN) + z(i+1,1) = z(i+1,N_LEN) + h(i,N_LEN) = h(i,1) + end do + !$acc end parallel + + !$acc kernels deviceptr(cu,cv,z,h) async(1) + cu(1,N_LEN) = cu(M_LEN,1) + cv(M_LEN,1) = cv(1,N_LEN) + z(1,1) = z(M_LEN,N_LEN) + h(M_LEN,N_LEN) = h(1,1) + !$acc end kernels + end subroutine + + subroutine update_unew_vnew_pnew(M_LEN, N_LEN, M, N, unew, uold, vnew, vold, & + pnew, pold, z, cu, cv, h, tdts8, tdtsdx, tdtsdy) + + integer(c_int), intent(in) :: M_LEN, N_LEN, M, N + real(c_double), dimension(:,:), pointer, intent(out) :: unew, vnew, pnew + real(c_double), dimension(:,:), pointer, intent(in) :: uold, vold, pold, z, cu, cv, h + real(c_double), intent(in) :: tdts8, tdtsdx, tdtsdy + + ! Local variables + integer :: i, j + + !$acc parallel loop gang vector collapse(2) deviceptr(unew,vnew,pnew,uold,vold,pold,z,cu,cv,h) async(1) + do j = 1, N + do i = 1, M + unew(i+1,j) = uold(i+1,j) + & + tdts8 * (z(i+1,j+1) + z(i+1,j)) * (cv(i+1,j+1) + cv(i,j+1) + cv(i,j) + cv(i+1,j)) - & + tdtsdx * (h(i+1,j) - h(i,j)) + vnew(i,j+1) = vold(i,j+1) - & + tdts8 * (z(i+1,j+1) + z(i,j+1)) * (cu(i+1,j+1) + cu(i,j+1) + cu(i,j) + cu(i+1,j)) - & + tdtsdy * (h(i,j+1) - h(i,j)) + pnew(i,j) = pold(i,j) - tdtsdx * (cu(i+1,j) - cu(i,j)) - tdtsdy * (cv(i,j+1) - cv(i,j)) + end do + end do + !$acc end parallel + end subroutine + + subroutine apply_bc_unew_vnew_pnew(M_LEN, N_LEN, M, N, unew, vnew, pnew) + + integer(c_int), intent(in) :: M_LEN, N_LEN, M, N + real(c_double), dimension(:,:), pointer, intent(inout) :: unew, vnew, pnew + + ! Local variables + integer :: i, j + + !$acc parallel loop gang vector deviceptr(unew, vnew, pnew) async(1) + do j = 1, N + unew(1,j) = unew(M_LEN,j) + vnew(M_LEN,j+1) = vnew(1,j+1) + pnew(M_LEN,j) = pnew(1,j) + end do + !$acc end parallel + + !$acc parallel loop gang vector deviceptr(unew, vnew, pnew) async(1) + do i = 1, M + unew(i+1,N_LEN) = unew(i+1,1) + vnew(i,1) = vnew(i,N_LEN) + pnew(i,N_LEN) = pnew(i,1) + end do + !$acc end parallel + + !$acc kernels deviceptr(unew, vnew, pnew) async(1) + unew(1,N_LEN) = unew(M_LEN,1) + vnew(M_LEN,1) = vnew(1,N_LEN) + pnew(M_LEN,N_LEN) = pnew(1,1) + !$acc end kernels + end subroutine + + subroutine time_smoothing(M_LEN, N_LEN, alpha, u, unew, uold, v, vnew, vold, p, pnew, pold) + + integer(c_int), intent(in) :: M_LEN, N_LEN + real(c_double), intent(in) :: alpha + real(c_double), dimension(:,:), pointer, intent(in) :: u, unew, v, vnew, p, pnew + real(c_double), dimension(:,:), pointer, intent(inout) :: uold, vold, pold + + ! Local variables + integer :: i, j + + !$acc parallel loop gang vector collapse(2) deviceptr(u,unew,uold,v,vnew,vold,p,pnew,pold) async(1) + do j = 1, N_LEN + do i = 1, M_LEN + uold(i,j) = u(i,j) + alpha * (unew(i,j) - 2.d0 * u(i,j) + uold(i,j)) + vold(i,j) = v(i,j) + alpha * (vnew(i,j) - 2.d0 * v(i,j) + vold(i,j)) + pold(i,j) = p(i,j) + alpha * (pnew(i,j) - 2.d0 * p(i,j) + pold(i,j)) + end do + end do + !$acc end parallel + end subroutine + + subroutine first_cycle_copy(M_LEN, N_LEN, u, v, p, uold, vold, pold) + + integer(c_int), intent(in) :: M_LEN, N_LEN + real(c_double), dimension(:,:), pointer, intent(in) :: u, v, p + real(c_double), dimension(:,:), pointer, intent(out) :: uold, vold, pold + + ! Local variables + integer :: i, j + + !$acc parallel loop gang vector collapse(2) deviceptr(u, v, p, uold, vold, pold) async(1) + do j = 1, N_LEN + do i = 1, M_LEN + uold(i,j) = u(i,j) + vold(i,j) = v(i,j) + pold(i,j) = p(i,j) + end do + end do + !$acc end parallel + end subroutine + + subroutine dswap(f_ptr1, f_ptr2) + + real(c_double), dimension(:,:), pointer, intent(inout) :: f_ptr1, f_ptr2 + + ! Local variable to hold a pointer + real(c_double), dimension(:,:), pointer :: f_ptr_tmp + + ! Perform the swap of the addresses + f_ptr_tmp => f_ptr1 + f_ptr1 => f_ptr2 + f_ptr2 => f_ptr_tmp + end subroutine dswap + +end module shallow_fortran_acc \ No newline at end of file diff --git a/swm_kokkos/kokkos_fortran_openacc/shallow_mdrangepolicy.cpp b/swm_kokkos/kokkos_fortran_openacc/shallow_mdrangepolicy.cpp new file mode 100644 index 0000000..1b3e213 --- /dev/null +++ b/swm_kokkos/kokkos_fortran_openacc/shallow_mdrangepolicy.cpp @@ -0,0 +1,310 @@ +/* Code converted from shallow_swap.c to use Kokkos for parallelism. */ + +#include +#include +#include +#include +#include + +#define MIN(x,y) ((x)>(y)?(y):(x)) +#define MAX(x,y) ((x)>(y)?(x):(y)) + +#define M 256 +#define N 256 +#define M_LEN (M + 1) +#define N_LEN (N + 1) +#define ITMAX 4000 +#define L_OUT true +#define VAL_OUT false + +using Layout = Kokkos::LayoutLeft; +using ExecSpace = Kokkos::DefaultExecutionSpace; +using MemSpace = ExecSpace::memory_space; +using ViewMatrixType = Kokkos::View>; +using HostViewMatrixType = Kokkos::View; + +void write_to_file(auto array, int tM, int tN, const char *filename); +void print_to_file(auto array, int tM, int tN, const char *filename); + +// C-style function prototype for the Fortran subroutine. +extern "C" { + void fortran_time_loop(double* cu_raw_ptr, double* cv_raw_ptr, double* z_raw_ptr, + double* h_raw_ptr, double* u_raw_ptr, double* unew_raw_ptr, + double* uold_raw_ptr, double* v_raw_ptr, double* vnew_raw_ptr, + double* vold_raw_ptr, double* p_raw_ptr, double* pnew_raw_ptr, + double* pold_raw_ptr, int m_len, int n_len, int m, int n, + double fsdx, double fsdy, double dx, double dy, double alpha, + double& tdt, double& time, int itmax); +} + +int main(int argc, char **argv) { + + // Initialize Kokkos + Kokkos::initialize( argc, argv ); + { + // Number of variables to allocate + constexpr int num_vars = 14; + + // Declare views before conditional so they are accessible later + ViewMatrixType big_data; + ViewMatrixType u, v, p, unew, vnew, pnew, uold, vold, pold, cu, cv, z, h, psi; + + if constexpr (std::is_same_v) { + // Allocate a large contiguous block for all variables + big_data = ViewMatrixType("big_data", M_LEN, N_LEN * num_vars); + // Create 2-D subviews for each variable + u = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(0 * N_LEN, 1 * N_LEN)); + v = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(1 * N_LEN, 2 * N_LEN)); + p = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(2 * N_LEN, 3 * N_LEN)); + unew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(3 * N_LEN, 4 * N_LEN)); + vnew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(4 * N_LEN, 5 * N_LEN)); + pnew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(5 * N_LEN, 6 * N_LEN)); + uold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(6 * N_LEN, 7 * N_LEN)); + vold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(7 * N_LEN, 8 * N_LEN)); + pold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(8 * N_LEN, 9 * N_LEN)); + cu = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(9 * N_LEN, 10 * N_LEN)); + cv = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(10 * N_LEN, 11 * N_LEN)); + z = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(11 * N_LEN, 12 * N_LEN)); + h = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(12 * N_LEN, 13 * N_LEN)); + psi = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(13 * N_LEN, 14 * N_LEN)); + } + else if constexpr (std::is_same_v) { + big_data = ViewMatrixType("big_data", M_LEN * num_vars, N_LEN); + u = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(0, N_LEN)); + v = Kokkos::subview(big_data, Kokkos::make_pair(M_LEN, 2 * M_LEN), Kokkos::make_pair(0, N_LEN)); + p = Kokkos::subview(big_data, Kokkos::make_pair(2 * M_LEN, 3 * M_LEN), Kokkos::make_pair(0, N_LEN)); + unew = Kokkos::subview(big_data, Kokkos::make_pair(3 * M_LEN, 4 * M_LEN), Kokkos::make_pair(0, N_LEN)); + vnew = Kokkos::subview(big_data, Kokkos::make_pair(4 * M_LEN, 5 * M_LEN), Kokkos::make_pair(0, N_LEN)); + pnew = Kokkos::subview(big_data, Kokkos::make_pair(5 * M_LEN, 6 * M_LEN), Kokkos::make_pair(0, N_LEN)); + uold = Kokkos::subview(big_data, Kokkos::make_pair(6 * M_LEN, 7 * M_LEN), Kokkos::make_pair(0, N_LEN)); + vold = Kokkos::subview(big_data, Kokkos::make_pair(7 * M_LEN, 8 * M_LEN), Kokkos::make_pair(0, N_LEN)); + pold = Kokkos::subview(big_data, Kokkos::make_pair(8 * M_LEN, 9 * M_LEN), Kokkos::make_pair(0, N_LEN)); + cu = Kokkos::subview(big_data, Kokkos::make_pair(9 * M_LEN, 10 * M_LEN), Kokkos::make_pair(0, N_LEN)); + cv = Kokkos::subview(big_data, Kokkos::make_pair(10 * M_LEN, 11 * M_LEN), Kokkos::make_pair(0, N_LEN)); + z = Kokkos::subview(big_data, Kokkos::make_pair(11 * M_LEN, 12 * M_LEN), Kokkos::make_pair(0, N_LEN)); + h = Kokkos::subview(big_data, Kokkos::make_pair(12 * M_LEN, 13 * M_LEN), Kokkos::make_pair(0, N_LEN)); + psi = Kokkos::subview(big_data, Kokkos::make_pair(13 * M_LEN, 14 * M_LEN), Kokkos::make_pair(0, N_LEN)); + } + else { + printf("Using unknown layout\n"); + return -1; + } + + double dt,tdt,dx,dy,a,alpha,el,pi; + double tpi,di,dj,pcf; + double fsdx,fsdy; + + int mnmin; + + // timer variables + double ctime,tcyc,time,ptime; + + // ** Initialisations ** + + // Note below that two delta t (tdt) is set to dt on the first + // cycle after which it is reset to dt+dt. + dt = 90.; + tdt = dt; + + dx = 100000.; + dy = 100000.; + fsdx = 4. / dx; + fsdy = 4. / dy; + + a = 1000000.; + alpha = .001; + + el = N * dx; + pi = 4. * atan(1.); + tpi = pi + pi; + di = tpi / M; + dj = tpi / N; + pcf = pi * pi * a * a / (el * el); + + // Initial values of the stream function and p + Kokkos::parallel_for("init_psi_p", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + psi(i,j) = a * std::sin((i + .5) * di) * std::sin((j + .5) * dj); + p(i,j) = pcf * (std::cos(2. * (i) * di) + std::cos(2. * (j) * dj)) + 50000.; + }); + + // Initialize velocities + Kokkos::parallel_for("init_u_v", Kokkos::MDRangePolicy>({0,0}, {M,N}), KOKKOS_LAMBDA(const int i, const int j) { + u(i+1,j) = -(psi(i+1,j+1) - psi(i+1,j)) / dy; + v(i,j+1) = (psi(i+1,j+1) - psi(i,j+1)) / dx; + }); + + // Periodic continuation + Kokkos::parallel_for("periodic_top_bottom_init", Kokkos::RangePolicy(0,N), KOKKOS_LAMBDA(const int j) { + u(0,j) = u(M,j); + v(M,j+1) = v(0,j+1); + }); + + Kokkos::parallel_for("periodic_left_right_init", Kokkos::RangePolicy(0,M), KOKKOS_LAMBDA(const int i) { + u(i+1,N) = u(i+1,0); + v(i,0) = v(i,N); + }); + + Kokkos::parallel_for("periodic_corners_init", Kokkos::RangePolicy(0,1), KOKKOS_LAMBDA(const int) { + u(0,N) = u(M,0); + v(M,0) = v(0,N); + }); + + Kokkos::parallel_for("init_old_arrays", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + uold(i,j) = u(i,j); + vold(i,j) = v(i,j); + pold(i,j) = p(i,j); + }); + + // Create host mirrors for output + HostViewMatrixType u_host = Kokkos::create_mirror_view(u); + HostViewMatrixType v_host = Kokkos::create_mirror_view(v); + HostViewMatrixType p_host = Kokkos::create_mirror_view(p); + + // Print initial values + if ( L_OUT ) { + printf(" number of points in the x direction %d\n", N); + printf(" number of points in the y direction %d\n", M); + printf(" grid spacing in the x direction %f\n", dx); + printf(" grid spacing in the y direction %f\n", dy); + printf(" time step %f\n", dt); + printf(" time filter parameter %f\n", alpha); + + mnmin = MIN(M,N); + if constexpr (!std::is_same_v) { + Kokkos::deep_copy(u_host, u); + Kokkos::deep_copy(v_host, v); + Kokkos::deep_copy(p_host, p); + } + // printf(" initial diagonal elements of p\n"); + // for (int i=0; i)` to use swap function + // for the host space, but somehow it is not compiled correctly for the device space. + // Just use deep_copy for both spaces. + Kokkos::deep_copy(u_host, u); + Kokkos::deep_copy(v_host, v); + Kokkos::deep_copy(p_host, p); + + // Output p, u, v fields and run times. + if(VAL_OUT) { + write_to_file(p_host, M_LEN, N_LEN, "p.bin"); + write_to_file(u_host, M_LEN, N_LEN, "u.bin"); + write_to_file(v_host, M_LEN, N_LEN, "v.bin"); + print_to_file(p_host, M_LEN, N_LEN, "p.txt"); + print_to_file(u_host, M_LEN, N_LEN, "u.txt"); + print_to_file(v_host, M_LEN, N_LEN, "v.txt"); + } + + if (L_OUT) { + ptime = time / 3600.; + printf(" cycle number %d model time in hours %f\n", ITMAX, ptime); + + // printf(" diagonal elements of p\n"); + // for (int i=0; i ") + sys.exit(1) + +file1 = sys.argv[1] +file2 = sys.argv[2] + +# Read matrices +matrix1 = read_matrix(file1) +matrix2 = read_matrix(file2) + +debug = False +# Compare matrices +if matrix1.shape != matrix2.shape: + print("Matrices have different dimensions.") +else: + diff = matrix1 - matrix2 + if (debug): + print("Difference matrix:") + print(diff) + if np.all(diff == 0): + print("Matrices are the same.") + else: + is_close = np.isclose(matrix1, matrix2, rtol=1e-10, atol=1e-18) + if np.all(is_close): + print("Matrices are approximately equal (element-wise isclose).") + else: + print("Matrices differ (element-wise isclose).") \ No newline at end of file diff --git a/swm_kokkos/utils/comparison_results.sh b/swm_kokkos/utils/comparison_results.sh new file mode 100755 index 0000000..994435d --- /dev/null +++ b/swm_kokkos/utils/comparison_results.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# This script is used to compare the results of SWM_C and SWM_KOKKOS implementations. +# Works for Derecho only. + +# 1. Enable the conda environment on Derecho +module purge +module load conda +conda activate npl + +# 2. Define the paths to the output files from both implementations +path1="/glade/derecho/scratch/sunjian/SWM_KOKKOS_CUDA_BUILD/swm_c/c/" +path2="/glade/derecho/scratch/sunjian/SWM_KOKKOS_CUDA_BUILD/swm_kokkos/kokkos_fortran_openacc/" + +# 3. Define output file names to compare +file_lists=("p.txt" "u.txt" "v.txt") + +# 4. Loop through each file and compare +for file in "${file_lists[@]}"; do + echo "...Comparing $file between SWM_C and SWM_KOKKOS..." + python comparison_results.py "${path1}${file}" "${path2}${file}" + echo "" +done diff --git a/swm_kokkos/utils/data/O2/kokkos_mdrangepolicy.csv b/swm_kokkos/utils/data/O2/kokkos_mdrangepolicy.csv new file mode 100644 index 0000000..1021b54 --- /dev/null +++ b/swm_kokkos/utils/data/O2/kokkos_mdrangepolicy.csv @@ -0,0 +1,7 @@ +1.153737,4.445689,17.948940,70.982036,301.08363 +1.165415,4.287222,16.920554,67.578802,293.102897 +1.807277,5.777004,22.961752,88.147086,246.243280 +1.066065,3.120944,11.669130,40.764783,145.451626 +0.812044,2.378748,6.805383,21.920536,96.145355 +2.059632,2.177646,2.546136,6.252255,47.941654 +0.316204,0.387476,0.676249,1.892022,7.654405 \ No newline at end of file diff --git a/swm_kokkos/utils/data/O2/kokkos_rangepolicy.csv b/swm_kokkos/utils/data/O2/kokkos_rangepolicy.csv new file mode 100644 index 0000000..e66ec8f --- /dev/null +++ b/swm_kokkos/utils/data/O2/kokkos_rangepolicy.csv @@ -0,0 +1,7 @@ +0.696168,2.854797,11.685417,50.216090,221.208513 +0.851402,3.189540,12.566059,52.828283,229.454574 +0.671949,1.828187,6.652000,30.819137,157.641167 +0.513564,1.109654,3.547551,12.936015,73.718690 +0.495914,0.817680,2.074875,6.832389,39.465916 +2.036877,2.126532,2.273968,2.902701,5.037373 +1.687364,3.182164,7.714989,25.029478,53.057170 diff --git a/swm_kokkos/utils/data/O2/kokkos_teampolicy.csv b/swm_kokkos/utils/data/O2/kokkos_teampolicy.csv new file mode 100644 index 0000000..8409574 --- /dev/null +++ b/swm_kokkos/utils/data/O2/kokkos_teampolicy.csv @@ -0,0 +1,7 @@ +1.311518,5.056343,20.684632,86.576951,362.847894 +1.391423,5.243258,21.197661,87.124148,380.461152 +0.967286,2.992369,11.370942,47.705114,224.860345 +0.677241,1.782859,6.392049,23.841825,108.352790 +0.654278,1.274317,3.813094,13.959125,59.963317 +2.175769,2.956420,5.386826,14.980434,40.661794 +0.274158,0.318020,0.413193,1.287064,4.624398 diff --git a/swm_kokkos/utils/plot.py b/swm_kokkos/utils/plot.py new file mode 100644 index 0000000..989dccf --- /dev/null +++ b/swm_kokkos/utils/plot.py @@ -0,0 +1,74 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +xt=["128x128","256x256","512x512","1024x1024","2048x2048"] # x axis labels +xx=[128*128,256*256,512*512,1024*1024,2048*2048] # x axis values +y0=[0.584025,2.319167,21.536601,87.819181,320.092663] # C, -O2 + +# Read the CSV file into a numpy array +# y1 = pd.read_csv('./data/O2/kokkos_rangepolicy.csv', header=None, sep=',').values +# y1 = pd.read_csv('./data/O2/kokkos_teampolicy.csv', header=None, sep=',').values +y1 = pd.read_csv('./data/O2/kokkos_mdrangepolicy.csv', header=None, sep=',').values +# y4 = pd.read_csv('./data/O3/kokkos_mdrangepolicy.csv', header=None, sep=',').values + +# Statistics: compute element-wise ratio and the average ratio +ratios = np.array(y0) / np.array(y1[5,:]) +avg_ratio = np.mean(ratios) +print("C/Kokkos ratio:", ratios) +print("Average C/Kokkos ratio:", avg_ratio) + +# Figure 1: C vs. Kokkos RangePolicy (serial) +plt.figure(figsize=(8, 6)) +plt.plot(xx, y0, marker='o', linewidth=2., color='k', label='C') +plt.plot(xx, y1[0,:], marker='s', linewidth=2., color='r', label='Kokkos (Serial)') +plt.xscale('log') +plt.yscale('log') +plt.xticks(xx, xt) +plt.ylim(0.1, 400) +plt.xlabel('Domain Size') +plt.ylabel('Execution Time (s)') +plt.title('Performance comparison on a single AMD Milan CPU core (log-log scale)') +plt.legend() +plt.grid(True) +plt.tight_layout() +plt.savefig("figure1.pdf") + +# Figure 2: Kokkos RangePolicy (multi-threaded) +plt.figure(figsize=(8, 6)) +plt.plot(xx, y0, marker='o', linewidth=2., color='k', label='C') +markers = ['s', '^', 'v', 'D', 'P', '*'] +colors = ['r', 'g', 'c', 'm', 'y', 'b'] +labels = ['Kokkos (Serial)', 'Kokkos (OpenMP, 1 thread)', 'Kokkos (OpenMP, 2 threads)', + 'Kokkos (OpenMP, 4 threads)', 'Kokkos (OpenMP, 8 threads)', 'Kokkos (OpenMP, 128 threads)'] +for i in range(6): + plt.plot(xx, y1[i,:], marker=markers[i], linewidth=2., color=colors[i], label=labels[i]) +plt.xscale('log') +plt.yscale('log') +plt.xticks(xx, xt) +plt.ylim(0.1, 400) +plt.xlabel('Domain Size') +plt.ylabel('Execution Time (s)') +plt.title('Performance comparison on an AMD Milan CPU node (log-log scale)') +plt.legend() +plt.grid(True) +plt.tight_layout() +plt.savefig("figure2.pdf") + +# Figure 3: Kokkos RangePolicy (CUDA) +plt.figure(figsize=(8, 6)) +plt.plot(xx, y0, marker='o', linewidth=2., color='k', label='C') +plt.plot(xx, y1[0,:], marker='s', linewidth=2., color='r', label='Kokkos (Serial)') +plt.plot(xx, y1[5,:], marker='^', linewidth=2., color='g', label='Kokkos (OpenMP, 128 threads)') +plt.plot(xx, y1[6,:], marker='v', linewidth=2., color='c', label='Kokkos (CUDA, 1 GPU)') +plt.xscale('log') +plt.yscale('log') +plt.xticks(xx, xt) +plt.ylim(0.1, 400) +plt.xlabel('Domain Size') +plt.ylabel('Execution Time (s)') +plt.title('Performance comparison on an AMD Milan CPU node or one NVIDIA A100 GPU (log-log scale)') +plt.legend() +plt.grid(True) +plt.tight_layout() +plt.savefig("figure3.pdf") \ No newline at end of file diff --git a/swm_kokkos/various_policy_impls/shallow_mdrangepolicy.cpp b/swm_kokkos/various_policy_impls/shallow_mdrangepolicy.cpp new file mode 100644 index 0000000..81b2183 --- /dev/null +++ b/swm_kokkos/various_policy_impls/shallow_mdrangepolicy.cpp @@ -0,0 +1,383 @@ +/* Code converted from shallow_swap.c to use Kokkos for parallelism. */ + +#include +#include +#include +#include + +#define MIN(x,y) ((x)>(y)?(y):(x)) +#define MAX(x,y) ((x)>(y)?(x):(y)) + +#define M 2048 +#define N M +#define M_LEN (M + 1) +#define N_LEN (N + 1) +#define SIZE ((M_LEN)*(N_LEN)) +#define ITMAX 4000 +#define L_OUT true +#define VAL_OUT false + +using Layout = Kokkos::LayoutRight; +using ExecSpace = Kokkos::DefaultExecutionSpace; +using MemSpace = ExecSpace::memory_space; +using ViewMatrixType = Kokkos::View; +using HostViewMatrixType = Kokkos::View; + +void write_to_file(auto array, int tM, int tN, const char *filename); +void print_to_file(auto array, int tM, int tN, const char *filename); + +int main(int argc, char **argv) { + + // Initialize Kokkos + Kokkos::initialize( argc, argv ); + { + // Number of variables to allocate + constexpr int num_vars = 14; + + // Declare views before conditional so they are accessible later + ViewMatrixType big_data; + ViewMatrixType u, v, p, unew, vnew, pnew, uold, vold, pold, cu, cv, z, h, psi; + + if constexpr (std::is_same_v) { + // Allocate a large contiguous block for all variables + big_data = ViewMatrixType("big_data", M_LEN, N_LEN * num_vars); + // Create 2-D subviews for each variable + u = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(0 * N_LEN, 1 * N_LEN)); + v = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(1 * N_LEN, 2 * N_LEN)); + p = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(2 * N_LEN, 3 * N_LEN)); + unew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(3 * N_LEN, 4 * N_LEN)); + vnew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(4 * N_LEN, 5 * N_LEN)); + pnew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(5 * N_LEN, 6 * N_LEN)); + uold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(6 * N_LEN, 7 * N_LEN)); + vold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(7 * N_LEN, 8 * N_LEN)); + pold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(8 * N_LEN, 9 * N_LEN)); + cu = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(9 * N_LEN, 10 * N_LEN)); + cv = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(10 * N_LEN, 11 * N_LEN)); + z = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(11 * N_LEN, 12 * N_LEN)); + h = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(12 * N_LEN, 13 * N_LEN)); + psi = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(13 * N_LEN, 14 * N_LEN)); + } + else if constexpr (std::is_same_v) { + big_data = ViewMatrixType("big_data", M_LEN * num_vars, N_LEN); + u = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(0, N_LEN)); + v = Kokkos::subview(big_data, Kokkos::make_pair(M_LEN, 2 * M_LEN), Kokkos::make_pair(0, N_LEN)); + p = Kokkos::subview(big_data, Kokkos::make_pair(2 * M_LEN, 3 * M_LEN), Kokkos::make_pair(0, N_LEN)); + unew = Kokkos::subview(big_data, Kokkos::make_pair(3 * M_LEN, 4 * M_LEN), Kokkos::make_pair(0, N_LEN)); + vnew = Kokkos::subview(big_data, Kokkos::make_pair(4 * M_LEN, 5 * M_LEN), Kokkos::make_pair(0, N_LEN)); + pnew = Kokkos::subview(big_data, Kokkos::make_pair(5 * M_LEN, 6 * M_LEN), Kokkos::make_pair(0, N_LEN)); + uold = Kokkos::subview(big_data, Kokkos::make_pair(6 * M_LEN, 7 * M_LEN), Kokkos::make_pair(0, N_LEN)); + vold = Kokkos::subview(big_data, Kokkos::make_pair(7 * M_LEN, 8 * M_LEN), Kokkos::make_pair(0, N_LEN)); + pold = Kokkos::subview(big_data, Kokkos::make_pair(8 * M_LEN, 9 * M_LEN), Kokkos::make_pair(0, N_LEN)); + cu = Kokkos::subview(big_data, Kokkos::make_pair(9 * M_LEN, 10 * M_LEN), Kokkos::make_pair(0, N_LEN)); + cv = Kokkos::subview(big_data, Kokkos::make_pair(10 * M_LEN, 11 * M_LEN), Kokkos::make_pair(0, N_LEN)); + z = Kokkos::subview(big_data, Kokkos::make_pair(11 * M_LEN, 12 * M_LEN), Kokkos::make_pair(0, N_LEN)); + h = Kokkos::subview(big_data, Kokkos::make_pair(12 * M_LEN, 13 * M_LEN), Kokkos::make_pair(0, N_LEN)); + psi = Kokkos::subview(big_data, Kokkos::make_pair(13 * M_LEN, 14 * M_LEN), Kokkos::make_pair(0, N_LEN)); + } + else { + printf("Using unknown layout\n"); + return -1; + } + + double dt,tdt,dx,dy,a,alpha,el,pi; + double tpi,di,dj,pcf; + double tdts8,tdtsdx,tdtsdy,fsdx,fsdy; + + int mnmin,ncycle; + + // timer variables + double ctime,tcyc,time,ptime; + + // ** Initialisations ** + + // Note below that two delta t (tdt) is set to dt on the first + // cycle after which it is reset to dt+dt. + dt = 90.; + tdt = dt; + + dx = 100000.; + dy = 100000.; + fsdx = 4. / dx; + fsdy = 4. / dy; + + a = 1000000.; + alpha = .001; + + el = N * dx; + pi = 4. * atan(1.); + tpi = pi + pi; + di = tpi / M; + dj = tpi / N; + pcf = pi * pi * a * a / (el * el); + + // Initial values of the stream function and p + Kokkos::parallel_for("init_psi_p", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + psi(i,j) = a * std::sin((i + .5) * di) * std::sin((j + .5) * dj); + p(i,j) = pcf * (std::cos(2. * (i) * di) + std::cos(2. * (j) * dj)) + 50000.; + }); + + // Initialize velocities + Kokkos::parallel_for("init_u_v", Kokkos::MDRangePolicy>({0,0}, {M,N}), KOKKOS_LAMBDA(const int i, const int j) { + u(i+1,j) = -(psi(i+1,j+1) - psi(i+1,j)) / dy; + v(i,j+1) = (psi(i+1,j+1) - psi(i,j+1)) / dx; + }); + + // Periodic continuation + Kokkos::parallel_for("periodic_top_bottom_init", Kokkos::RangePolicy(0,N), KOKKOS_LAMBDA(const int j) { + u(0,j) = u(M,j); + v(M,j+1) = v(0,j+1); + }); + + Kokkos::parallel_for("periodic_left_right_init", Kokkos::RangePolicy(0,M), KOKKOS_LAMBDA(const int i) { + u(i+1,N) = u(i+1,0); + v(i,0) = v(i,N); + }); + + Kokkos::parallel_for("periodic_corners_init", Kokkos::RangePolicy(0,1), KOKKOS_LAMBDA(const int) { + u(0,N) = u(M,0); + v(M,0) = v(0,N); + }); + + Kokkos::parallel_for("init_old_arrays", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + uold(i,j) = u(i,j); + vold(i,j) = v(i,j); + pold(i,j) = p(i,j); + }); + + // Create host mirrors for output + HostViewMatrixType u_host = Kokkos::create_mirror_view(u); + HostViewMatrixType v_host = Kokkos::create_mirror_view(v); + HostViewMatrixType p_host = Kokkos::create_mirror_view(p); + + // Print initial values + if ( L_OUT ) { + printf(" number of points in the x direction %d\n", N); + printf(" number of points in the y direction %d\n", M); + printf(" grid spacing in the x direction %f\n", dx); + printf(" grid spacing in the y direction %f\n", dy); + printf(" time step %f\n", dt); + printf(" time filter parameter %f\n", alpha); + + mnmin = MIN(M,N); + if constexpr (!std::is_same_v) { + Kokkos::deep_copy(u_host, u); + Kokkos::deep_copy(v_host, v); + Kokkos::deep_copy(p_host, p); + } + // printf(" initial diagonal elements of p\n"); + // for (int i=0; i>({1,0}, {M_LEN,N}), KOKKOS_LAMBDA(const int i, const int j) { + cu(i,j) = 0.5 * (p(i,j) + p(i-1,j)) * u(i,j); + }); + + // Compute cv + Kokkos::parallel_for("compute_cv", Kokkos::MDRangePolicy>({0,1}, {M,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + cv(i,j) = 0.5 * (p(i,j) + p(i,j-1)) * v(i,j); + }); + + // Compute z + Kokkos::parallel_for("compute_z", Kokkos::MDRangePolicy>({1,1}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + z(i,j) = (fsdx * (v(i,j) - v(i-1,j)) - fsdy * (u(i,j) - u(i,j-1))) / (p(i-1,j-1) + p(i,j-1) + p(i,j) + p(i-1,j)); + }); + + // Compute h + Kokkos::parallel_for("compute_h", Kokkos::MDRangePolicy>({0,0}, {M,N}), KOKKOS_LAMBDA(const int i, const int j) { + h(i,j) = p(i,j) + 0.25 * (u(i+1,j) * u(i+1,j) + u(i,j) * u(i,j) + v(i,j+1) * v(i,j+1) + v(i,j) * v(i,j)); + }); + + // Periodic continuation + Kokkos::parallel_for("periodic_top_bottom_cu_cv_z_h", Kokkos::RangePolicy(0,N), KOKKOS_LAMBDA(const int j) { + cu(0,j) = cu(M,j); + cv(M,j+1) = cv(0,j+1); + z(0,j+1) = z(M,j+1); + h(M,j) = h(0,j); + }); + + Kokkos::parallel_for("periodic_left_right_cu_cv_z_h", Kokkos::RangePolicy(0,M), KOKKOS_LAMBDA(const int i) { + cu(i+1,N) = cu(i+1,0); + cv(i,0) = cv(i,N); + z(i+1,0) = z(i+1,N); + h(i,N) = h(i,0); + }); + + Kokkos::parallel_for("periodic_corner_cu_cv_z_h", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int) { + cu(0,N) = cu(M,0); + cv(M,0) = cv(0,N); + z(0,0) = z(M,N); + h(M,N) = h(0,0); + }); + + // Compute new values u,v and p + + tdts8 = tdt / 8.; + tdtsdx = tdt / dx; + tdtsdy = tdt / dy; + + // Compute unew + Kokkos::parallel_for("compute_unew", Kokkos::MDRangePolicy>({1,0}, {M_LEN,N}), KOKKOS_LAMBDA(const int i, const int j) { + unew(i,j) = uold(i,j) + tdts8 * (z(i,j+1) + z(i,j)) * (cv(i,j+1) + cv(i-1,j+1) + cv(i-1,j) + cv(i,j)) - tdtsdx * (h(i,j) - h(i-1,j)); + }); + + // Compute vnew + Kokkos::parallel_for("compute_vnew", Kokkos::MDRangePolicy>({0,1}, {M,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + vnew(i,j) = vold(i,j) - tdts8 * (z(i+1,j) + z(i,j)) * (cu(i+1,j) + cu(i,j) + cu(i,j-1) + cu(i+1,j-1)) - tdtsdy * (h(i,j) - h(i,j-1)); + }); + + // Compute pnew + Kokkos::parallel_for("compute_pnew", Kokkos::MDRangePolicy>({0,0}, {M,N}), KOKKOS_LAMBDA(const int i, const int j) { + pnew(i,j) = pold(i,j) - tdtsdx * (cu(i+1,j) - cu(i,j)) - tdtsdy * (cv(i,j+1) - cv(i,j)); + }); + + // Periodic continuation + Kokkos::parallel_for("periodic_top_bottom_unew_vnew_pnew", Kokkos::RangePolicy(0,N), KOKKOS_LAMBDA(const int j) { + unew(0,j) = unew(M,j); + vnew(M,j+1) = vnew(0,j+1); + pnew(M,j) = pnew(0,j); + }); + + Kokkos::parallel_for("periodic_left_right_unew_vnew_pnew", Kokkos::RangePolicy(0,M), KOKKOS_LAMBDA(const int i) { + unew(i+1,N) = unew(i+1,0); + vnew(i,0) = vnew(i,N); + pnew(i,N) = pnew(i,0); + }); + + Kokkos::parallel_for("periodic_corner_unew_vnew_pnew", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int) { + unew(0,N) = unew(M,0); + vnew(M,0) = vnew(0,N); + pnew(M,N) = pnew(0,0); + }); + + time = time + dt; + + // Time smoothing and update for next cycle + + if ( ncycle > 1 ) { + Kokkos::parallel_for("time_smoothing_u", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + uold(i,j) = u(i,j) + alpha * (unew(i,j) - 2. * u(i,j) + uold(i,j)); + }); + + Kokkos::parallel_for("time_smoothing_v", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + vold(i,j) = v(i,j) + alpha * (vnew(i,j) - 2. * v(i,j) + vold(i,j)); + }); + + Kokkos::parallel_for("time_smoothing_p", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + pold(i,j) = p(i,j) + alpha * (pnew(i,j) - 2. * p(i,j) + pold(i,j)); + }); + } + else { + tdt = tdt + tdt; + Kokkos::parallel_for("first_cycle_copy", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + uold(i,j) = u(i,j); + vold(i,j) = v(i,j); + pold(i,j) = p(i,j); + }); + } + // My test shows it is better to use fence and then swap for device views, rather than doing a device copy without fence. + if constexpr (!std::is_same_v) { + Kokkos::fence(); + } + // Swap the views + std::swap(u, unew); + std::swap(v, vnew); + std::swap(p, pnew); + } // ** End of time loop ** + + // Try to use `if constexpr (!std::is_same_v)` to use swap function + // for the host space, but somehow it is not compiled correctly for the device space. + // Just use deep_copy for both spaces. + Kokkos::deep_copy(u_host, u); + Kokkos::deep_copy(v_host, v); + Kokkos::deep_copy(p_host, p); + + // Output p, u, v fields and run times. + if(VAL_OUT) { + write_to_file(p_host, M_LEN, N_LEN, "p.bin"); + write_to_file(u_host, M_LEN, N_LEN, "u.bin"); + write_to_file(v_host, M_LEN, N_LEN, "v.bin"); + print_to_file(p_host, M_LEN, N_LEN, "p.txt"); + print_to_file(u_host, M_LEN, N_LEN, "u.txt"); + print_to_file(v_host, M_LEN, N_LEN, "v.txt"); + } + + if (L_OUT) { + ptime = time / 3600.; + printf(" cycle number %d model time in hours %f\n", ITMAX, ptime); + + // printf(" diagonal elements of p\n"); + // for (int i=0; i +#include +#include +#include + +#define MIN(x,y) ((x)>(y)?(y):(x)) +#define MAX(x,y) ((x)>(y)?(x):(y)) + +#define M 128 +#define N 256 +#define M_LEN (M + 1) +#define N_LEN (N + 1) +#define SIZE ((M_LEN)*(N_LEN)) +#define ITMAX 4000 +#define L_OUT true +#define VAL_OUT false + +using Layout = Kokkos::LayoutRight; +using ExecSpace = Kokkos::DefaultExecutionSpace; +using MemSpace = ExecSpace::memory_space; +using ViewMatrixType = Kokkos::View; +using HostViewMatrixType = Kokkos::View; + +void write_to_file(auto array, int tM, int tN, const char *filename); +void print_to_file(auto array, int tM, int tN, const char *filename); + +int main(int argc, char **argv) { + + // Initialize Kokkos + Kokkos::initialize( argc, argv ); + { + // Number of variables to allocate + constexpr int num_vars = 14; + + // Declare views before conditional so they are accessible later + ViewMatrixType big_data; + ViewMatrixType u, v, p, unew, vnew, pnew, uold, vold, pold, cu, cv, z, h, psi; + + if constexpr (std::is_same_v) { + // Allocate a large contiguous block for all variables + big_data = ViewMatrixType("big_data", M_LEN, N_LEN * num_vars); + // Create 2-D subviews for each variable + u = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(0 * N_LEN, 1 * N_LEN)); + v = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(1 * N_LEN, 2 * N_LEN)); + p = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(2 * N_LEN, 3 * N_LEN)); + unew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(3 * N_LEN, 4 * N_LEN)); + vnew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(4 * N_LEN, 5 * N_LEN)); + pnew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(5 * N_LEN, 6 * N_LEN)); + uold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(6 * N_LEN, 7 * N_LEN)); + vold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(7 * N_LEN, 8 * N_LEN)); + pold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(8 * N_LEN, 9 * N_LEN)); + cu = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(9 * N_LEN, 10 * N_LEN)); + cv = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(10 * N_LEN, 11 * N_LEN)); + z = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(11 * N_LEN, 12 * N_LEN)); + h = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(12 * N_LEN, 13 * N_LEN)); + psi = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(13 * N_LEN, 14 * N_LEN)); + } + else if constexpr (std::is_same_v) { + big_data = ViewMatrixType("big_data", M_LEN * num_vars, N_LEN); + u = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(0, N_LEN)); + v = Kokkos::subview(big_data, Kokkos::make_pair(M_LEN, 2 * M_LEN), Kokkos::make_pair(0, N_LEN)); + p = Kokkos::subview(big_data, Kokkos::make_pair(2 * M_LEN, 3 * M_LEN), Kokkos::make_pair(0, N_LEN)); + unew = Kokkos::subview(big_data, Kokkos::make_pair(3 * M_LEN, 4 * M_LEN), Kokkos::make_pair(0, N_LEN)); + vnew = Kokkos::subview(big_data, Kokkos::make_pair(4 * M_LEN, 5 * M_LEN), Kokkos::make_pair(0, N_LEN)); + pnew = Kokkos::subview(big_data, Kokkos::make_pair(5 * M_LEN, 6 * M_LEN), Kokkos::make_pair(0, N_LEN)); + uold = Kokkos::subview(big_data, Kokkos::make_pair(6 * M_LEN, 7 * M_LEN), Kokkos::make_pair(0, N_LEN)); + vold = Kokkos::subview(big_data, Kokkos::make_pair(7 * M_LEN, 8 * M_LEN), Kokkos::make_pair(0, N_LEN)); + pold = Kokkos::subview(big_data, Kokkos::make_pair(8 * M_LEN, 9 * M_LEN), Kokkos::make_pair(0, N_LEN)); + cu = Kokkos::subview(big_data, Kokkos::make_pair(9 * M_LEN, 10 * M_LEN), Kokkos::make_pair(0, N_LEN)); + cv = Kokkos::subview(big_data, Kokkos::make_pair(10 * M_LEN, 11 * M_LEN), Kokkos::make_pair(0, N_LEN)); + z = Kokkos::subview(big_data, Kokkos::make_pair(11 * M_LEN, 12 * M_LEN), Kokkos::make_pair(0, N_LEN)); + h = Kokkos::subview(big_data, Kokkos::make_pair(12 * M_LEN, 13 * M_LEN), Kokkos::make_pair(0, N_LEN)); + psi = Kokkos::subview(big_data, Kokkos::make_pair(13 * M_LEN, 14 * M_LEN), Kokkos::make_pair(0, N_LEN)); + } + else { + printf("Using unknown layout\n"); + return -1; + } + + double dt,tdt,dx,dy,a,alpha,el,pi; + double tpi,di,dj,pcf; + double tdts8,tdtsdx,tdtsdy,fsdx,fsdy; + + int mnmin,ncycle; + + // timer variables + double ctime,tcyc,time,ptime; + + // ** Initialisations ** + + // Note below that two delta t (tdt) is set to dt on the first + // cycle after which it is reset to dt+dt. + dt = 90.; + tdt = dt; + + dx = 100000.; + dy = 100000.; + fsdx = 4. / dx; + fsdy = 4. / dy; + + a = 1000000.; + alpha = .001; + + el = N * dx; + pi = 4. * atan(1.); + tpi = pi + pi; + di = tpi / M; + dj = tpi / N; + pcf = pi * pi * a * a / (el * el); + + // Initial values of the stream function and p + Kokkos::parallel_for("init_psi_p", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + psi(i,j) = a * std::sin((i + .5) * di) * std::sin((j + .5) * dj); + p(i,j) = pcf * (std::cos(2. * (i) * di) + std::cos(2. * (j) * dj)) + 50000.; + }); + + // Initialize velocities + Kokkos::parallel_for("init_u_v", Kokkos::MDRangePolicy>({0,0}, {M,N}), KOKKOS_LAMBDA(const int i, const int j) { + u(i+1,j) = -(psi(i+1,j+1) - psi(i+1,j)) / dy; + v(i,j+1) = (psi(i+1,j+1) - psi(i,j+1)) / dx; + }); + + // Periodic continuation + Kokkos::parallel_for("periodic_top_bottom_init", Kokkos::RangePolicy(0,N), KOKKOS_LAMBDA(const int j) { + u(0,j) = u(M,j); + v(M,j+1) = v(0,j+1); + }); + + Kokkos::parallel_for("periodic_left_right_init", Kokkos::RangePolicy(0,M), KOKKOS_LAMBDA(const int i) { + u(i+1,N) = u(i+1,0); + v(i,0) = v(i,N); + }); + + Kokkos::parallel_for("periodic_corners_init", Kokkos::RangePolicy(0,1), KOKKOS_LAMBDA(const int) { + u(0,N) = u(M,0); + v(M,0) = v(0,N); + }); + + Kokkos::parallel_for("init_old_arrays", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + uold(i,j) = u(i,j); + vold(i,j) = v(i,j); + pold(i,j) = p(i,j); + }); + + // Create host mirrors for output + HostViewMatrixType u_host = Kokkos::create_mirror_view(u); + HostViewMatrixType v_host = Kokkos::create_mirror_view(v); + HostViewMatrixType p_host = Kokkos::create_mirror_view(p); + + // Print initial values + if ( L_OUT ) { + printf(" number of points in the x direction %d\n", N); + printf(" number of points in the y direction %d\n", M); + printf(" grid spacing in the x direction %f\n", dx); + printf(" grid spacing in the y direction %f\n", dy); + printf(" time step %f\n", dt); + printf(" time filter parameter %f\n", alpha); + + mnmin = MIN(M,N); + if constexpr (!std::is_same_v) { + Kokkos::deep_copy(u_host, u); + Kokkos::deep_copy(v_host, v); + Kokkos::deep_copy(p_host, p); + } + // printf(" initial diagonal elements of p\n"); + // for (int i=0; i(1, M_LEN), KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < N; ++j) { + cu(i,j) = 0.5 * (p(i,j) + p(i-1,j)) * u(i,j); + } + }); + + // Compute cv + Kokkos::parallel_for("compute_cv", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) { + for (int j = 1; j < N_LEN; ++j) { + cv(i,j) = 0.5 * (p(i,j) + p(i,j-1)) * v(i,j); + } + }); + + // Compute z + Kokkos::parallel_for("compute_z", Kokkos::RangePolicy(1, M_LEN), KOKKOS_LAMBDA(const int i) { + for (int j = 1; j < N_LEN; ++j) { + z(i,j) = (fsdx * (v(i,j) - v(i-1,j)) - fsdy * (u(i,j) - u(i,j-1))) / (p(i-1,j-1) + p(i,j-1) + p(i,j) + p(i-1,j)); + } + }); + + // Compute h + Kokkos::parallel_for("compute_h", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < N; ++j) { + h(i,j) = p(i,j) + 0.25 * ( u(i+1,j) * u(i+1,j) + u(i,j) * u(i,j) + v(i,j+1) * v(i,j+1) + v(i,j) * v(i,j) ); + } + }); + + // Periodic continuation + Kokkos::parallel_for("periodic_top_bottom_cu_cv_z_h", Kokkos::RangePolicy(0,N), KOKKOS_LAMBDA(const int j) { + cu(0,j) = cu(M,j); + cv(M,j+1) = cv(0,j+1); + z(0,j+1) = z(M,j+1); + h(M,j) = h(0,j); + }); + + Kokkos::parallel_for("periodic_left_right_cu_cv_z_h", Kokkos::RangePolicy(0,M), KOKKOS_LAMBDA(const int i) { + cu(i+1,N) = cu(i+1,0); + cv(i,0) = cv(i,N); + z(i+1,0) = z(i+1,N); + h(i,N) = h(i,0); + }); + + Kokkos::parallel_for("periodic_corner_cu_cv_z_h", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int) { + cu(0,N) = cu(M,0); + cv(M,0) = cv(0,N); + z(0,0) = z(M,N); + h(M,N) = h(0,0); + }); + + // Compute new values u,v and p + + tdts8 = tdt / 8.; + tdtsdx = tdt / dx; + tdtsdy = tdt / dy; + + // Compute unew + Kokkos::parallel_for("compute_unew", Kokkos::RangePolicy(1, M_LEN), KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < N; ++j) { + unew(i,j) = uold(i,j) + tdts8 * (z(i,j+1) + z(i,j)) * (cv(i,j+1) + cv(i-1,j+1) + cv(i-1,j) + cv(i,j)) - tdtsdx * (h(i,j) - h(i-1,j)); + } + }); + + // Compute vnew + Kokkos::parallel_for("compute_vnew", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) { + for (int j = 1; j < N_LEN; ++j) { + vnew(i,j) = vold(i,j) - tdts8 * (z(i+1,j) + z(i,j)) * (cu(i+1,j) + cu(i,j) + cu(i,j-1) + cu(i+1,j-1)) - tdtsdy * (h(i,j) - h(i,j-1)); + } + }); + + // Compute pnew + Kokkos::parallel_for("compute_pnew", Kokkos::RangePolicy(0, M), KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < N; ++j) { + pnew(i,j) = pold(i,j) - tdtsdx * (cu(i+1,j) - cu(i,j)) - tdtsdy * (cv(i,j+1) - cv(i,j)); + } + }); + + // Periodic continuation + + Kokkos::parallel_for("periodic_top_bottom_unew_vnew_pnew", Kokkos::RangePolicy(0,N), KOKKOS_LAMBDA(const int j) { + unew(0,j) = unew(M,j); + vnew(M,j+1) = vnew(0,j+1); + pnew(M,j) = pnew(0,j); + }); + + Kokkos::parallel_for("periodic_left_right_unew_vnew_pnew", Kokkos::RangePolicy(0,M), KOKKOS_LAMBDA(const int i) { + unew(i+1,N) = unew(i+1,0); + vnew(i,0) = vnew(i,N); + pnew(i,N) = pnew(i,0); + }); + + Kokkos::parallel_for("periodic_corner_unew_vnew_pnew", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int) { + unew(0,N) = unew(M,0); + vnew(M,0) = vnew(0,N); + pnew(M,N) = pnew(0,0); + }); + + time = time + dt; + + // Time smoothing and update for next cycle + + if ( ncycle > 1 ) { + Kokkos::parallel_for("time_smoothing_u", Kokkos::RangePolicy(0,M_LEN), KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < N_LEN; ++j) { + uold(i,j) = u(i,j) + alpha * (unew(i,j) - 2. * u(i,j) + uold(i,j)); + } + }); + + Kokkos::parallel_for("time_smoothing_v", Kokkos::RangePolicy(0,M_LEN), KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < N_LEN; ++j) { + vold(i,j) = v(i,j) + alpha * (vnew(i,j) - 2. * v(i,j) + vold(i,j)); + } + }); + + Kokkos::parallel_for("time_smoothing_p", Kokkos::RangePolicy(0,M_LEN), KOKKOS_LAMBDA(const int i) { + for (int j = 0; j < N_LEN; ++j) { + pold(i,j) = p(i,j) + alpha * (pnew(i,j) - 2. * p(i,j) + pold(i,j)); + } + }); + } + else { + tdt = tdt + tdt; + Kokkos::parallel_for("first_cycle_copy", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + uold(i,j) = u(i,j); + vold(i,j) = v(i,j); + pold(i,j) = p(i,j); + }); + } + // My test shows it is better to use fence and then swap for device views, rather than doing a device copy without fence. + if constexpr (!std::is_same_v) { + Kokkos::fence(); + } + // Swap the views + std::swap(u, unew); + std::swap(v, vnew); + std::swap(p, pnew); + } // ** End of time loop ** + + // Try to use `if constexpr (!std::is_same_v)` to use swap function + // for the host space, but somehow it is not compiled correctly for the device space. + // Just use deep_copy for both spaces. + Kokkos::deep_copy(u_host, u); + Kokkos::deep_copy(v_host, v); + Kokkos::deep_copy(p_host, p); + + // Output p, u, v fields and run times. + if(VAL_OUT) { + write_to_file(p_host, M_LEN, N_LEN, "p.bin"); + write_to_file(u_host, M_LEN, N_LEN, "u.bin"); + write_to_file(v_host, M_LEN, N_LEN, "v.bin"); + print_to_file(p_host, M_LEN, N_LEN, "p.txt"); + print_to_file(u_host, M_LEN, N_LEN, "u.txt"); + print_to_file(v_host, M_LEN, N_LEN, "v.txt"); + } + + if (L_OUT) { + ptime = time / 3600.; + printf(" cycle number %d model time in hours %f\n", ITMAX, ptime); + + // printf(" diagonal elements of p\n"); + // for (int i=0; i +#include +#include +#include + +#define MIN(x,y) ((x)>(y)?(y):(x)) +#define MAX(x,y) ((x)>(y)?(x):(y)) + +#define M 256 +#define N M +#define M_LEN (M + 1) +#define N_LEN (N + 1) +#define SIZE ((M_LEN)*(N_LEN)) +#define ITMAX 4000 +#define L_OUT true +#define VAL_OUT false + +using Layout = Kokkos::LayoutRight; +using ExecSpace = Kokkos::DefaultExecutionSpace; +using MemSpace = ExecSpace::memory_space; +using ViewMatrixType = Kokkos::View; +using HostViewMatrixType = Kokkos::View; + +void write_to_file(auto array, int tM, int tN, const char *filename); +void print_to_file(auto array, int tM, int tN, const char *filename); + +int main(int argc, char **argv) { + + // Initialize Kokkos + Kokkos::initialize( argc, argv ); + { + // Number of variables to allocate + constexpr int num_vars = 14; + + // Declare views before conditional so they are accessible later + ViewMatrixType big_data; + ViewMatrixType u, v, p, unew, vnew, pnew, uold, vold, pold, cu, cv, z, h, psi; + + if constexpr (std::is_same_v) { + // Allocate a large contiguous block for all variables + big_data = ViewMatrixType("big_data", M_LEN, N_LEN * num_vars); + // Create 2-D subviews for each variable + u = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(0 * N_LEN, 1 * N_LEN)); + v = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(1 * N_LEN, 2 * N_LEN)); + p = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(2 * N_LEN, 3 * N_LEN)); + unew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(3 * N_LEN, 4 * N_LEN)); + vnew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(4 * N_LEN, 5 * N_LEN)); + pnew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(5 * N_LEN, 6 * N_LEN)); + uold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(6 * N_LEN, 7 * N_LEN)); + vold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(7 * N_LEN, 8 * N_LEN)); + pold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(8 * N_LEN, 9 * N_LEN)); + cu = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(9 * N_LEN, 10 * N_LEN)); + cv = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(10 * N_LEN, 11 * N_LEN)); + z = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(11 * N_LEN, 12 * N_LEN)); + h = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(12 * N_LEN, 13 * N_LEN)); + psi = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(13 * N_LEN, 14 * N_LEN)); + } + else if constexpr (std::is_same_v) { + big_data = ViewMatrixType("big_data", M_LEN * num_vars, N_LEN); + u = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(0, N_LEN)); + v = Kokkos::subview(big_data, Kokkos::make_pair(M_LEN, 2 * M_LEN), Kokkos::make_pair(0, N_LEN)); + p = Kokkos::subview(big_data, Kokkos::make_pair(2 * M_LEN, 3 * M_LEN), Kokkos::make_pair(0, N_LEN)); + unew = Kokkos::subview(big_data, Kokkos::make_pair(3 * M_LEN, 4 * M_LEN), Kokkos::make_pair(0, N_LEN)); + vnew = Kokkos::subview(big_data, Kokkos::make_pair(4 * M_LEN, 5 * M_LEN), Kokkos::make_pair(0, N_LEN)); + pnew = Kokkos::subview(big_data, Kokkos::make_pair(5 * M_LEN, 6 * M_LEN), Kokkos::make_pair(0, N_LEN)); + uold = Kokkos::subview(big_data, Kokkos::make_pair(6 * M_LEN, 7 * M_LEN), Kokkos::make_pair(0, N_LEN)); + vold = Kokkos::subview(big_data, Kokkos::make_pair(7 * M_LEN, 8 * M_LEN), Kokkos::make_pair(0, N_LEN)); + pold = Kokkos::subview(big_data, Kokkos::make_pair(8 * M_LEN, 9 * M_LEN), Kokkos::make_pair(0, N_LEN)); + cu = Kokkos::subview(big_data, Kokkos::make_pair(9 * M_LEN, 10 * M_LEN), Kokkos::make_pair(0, N_LEN)); + cv = Kokkos::subview(big_data, Kokkos::make_pair(10 * M_LEN, 11 * M_LEN), Kokkos::make_pair(0, N_LEN)); + z = Kokkos::subview(big_data, Kokkos::make_pair(11 * M_LEN, 12 * M_LEN), Kokkos::make_pair(0, N_LEN)); + h = Kokkos::subview(big_data, Kokkos::make_pair(12 * M_LEN, 13 * M_LEN), Kokkos::make_pair(0, N_LEN)); + psi = Kokkos::subview(big_data, Kokkos::make_pair(13 * M_LEN, 14 * M_LEN), Kokkos::make_pair(0, N_LEN)); + } + else { + printf("Using unknown layout\n"); + return -1; + } + + double dt,tdt,dx,dy,a,alpha,el,pi; + double tpi,di,dj,pcf; + double tdts8,tdtsdx,tdtsdy,fsdx,fsdy; + + int mnmin,ncycle; + + // timer variables + double ctime,tcyc,time,ptime; + + // ** Initialisations ** + + // Note below that two delta t (tdt) is set to dt on the first + // cycle after which it is reset to dt+dt. + dt = 90.; + tdt = dt; + + dx = 100000.; + dy = 100000.; + fsdx = 4. / dx; + fsdy = 4. / dy; + + a = 1000000.; + alpha = .001; + + el = N * dx; + pi = 4. * atan(1.); + tpi = pi + pi; + di = tpi / M; + dj = tpi / N; + pcf = pi * pi * a * a / (el * el); + + // Initial values of the stream function and p + Kokkos::parallel_for("init_psi_p", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + psi(i,j) = a * std::sin((i + .5) * di) * std::sin((j + .5) * dj); + p(i,j) = pcf * (std::cos(2. * (i) * di) + std::cos(2. * (j) * dj)) + 50000.; + }); + + // Initialize velocities + Kokkos::parallel_for("init_u_v", Kokkos::MDRangePolicy>({0,0}, {M,N}), KOKKOS_LAMBDA(const int i, const int j) { + u(i+1,j) = -(psi(i+1,j+1) - psi(i+1,j)) / dy; + v(i,j+1) = (psi(i+1,j+1) - psi(i,j+1)) / dx; + }); + + // Periodic continuation + Kokkos::parallel_for("periodic_top_bottom_init", Kokkos::RangePolicy(0,N), KOKKOS_LAMBDA(const int j) { + u(0,j) = u(M,j); + v(M,j+1) = v(0,j+1); + }); + + Kokkos::parallel_for("periodic_left_right_init", Kokkos::RangePolicy(0,M), KOKKOS_LAMBDA(const int i) { + u(i+1,N) = u(i+1,0); + v(i,0) = v(i,N); + }); + + Kokkos::parallel_for("periodic_corners_init", Kokkos::RangePolicy(0,1), KOKKOS_LAMBDA(const int) { + u(0,N) = u(M,0); + v(M,0) = v(0,N); + }); + + Kokkos::parallel_for("init_old_arrays", Kokkos::MDRangePolicy>({0,0}, {M_LEN,N_LEN}), KOKKOS_LAMBDA(const int i, const int j) { + uold(i,j) = u(i,j); + vold(i,j) = v(i,j); + pold(i,j) = p(i,j); + }); + + // Create host mirrors for output + HostViewMatrixType u_host = Kokkos::create_mirror_view(u); + HostViewMatrixType v_host = Kokkos::create_mirror_view(v); + HostViewMatrixType p_host = Kokkos::create_mirror_view(p); + + // Print initial values + if ( L_OUT ) { + printf(" number of points in the x direction %d\n", N); + printf(" number of points in the y direction %d\n", M); + printf(" grid spacing in the x direction %f\n", dx); + printf(" grid spacing in the y direction %f\n", dy); + printf(" time step %f\n", dt); + printf(" time filter parameter %f\n", alpha); + + mnmin = MIN(M,N); + if constexpr (!std::is_same_v) { + Kokkos::deep_copy(u_host, u); + Kokkos::deep_copy(v_host, v); + Kokkos::deep_copy(p_host, p); + } + // printf(" initial diagonal elements of p\n"); + // for (int i=0; i(0, M * N), KOKKOS_LAMBDA(const int idx) { + int i = idx / N + 1; + int j = idx % N; + cu(i,j) = 0.5 * (p(i,j) + p(i-1,j)) * u(i,j); + }); + + // Compute cv + Kokkos::parallel_for("compute_cv", Kokkos::RangePolicy(0, M * N), KOKKOS_LAMBDA(const int idx) { + int i = idx / N; + int j = idx % N + 1; + cv(i,j) = 0.5 * (p(i,j) + p(i,j-1)) * v(i,j); + }); + + // Compute z + Kokkos::parallel_for("compute_z", Kokkos::RangePolicy(0, M * N), KOKKOS_LAMBDA(const int idx) { + int i = idx / N + 1; + int j = idx % N + 1; + z(i,j) = (fsdx * (v(i,j) - v(i-1,j)) - fsdy * (u(i,j) - u(i,j-1))) / (p(i-1,j-1) + p(i,j-1) + p(i,j) + p(i-1,j)); + }); + + // Compute h + Kokkos::parallel_for("compute_h", Kokkos::RangePolicy(0, M * N), KOKKOS_LAMBDA(const int idx) { + int i = idx / N; + int j = idx % N; + h(i,j) = p(i,j) + 0.25 * ( u(i+1,j) * u(i+1,j) + u(i,j) * u(i,j) + v(i,j+1) * v(i,j+1) + v(i,j) * v(i,j) ); + }); + + // Periodic continuation + Kokkos::parallel_for("periodic_top_bottom_cu_cv_z_h", Kokkos::RangePolicy(0,N), KOKKOS_LAMBDA(const int j) { + cu(0,j) = cu(M,j); + cv(M,j+1) = cv(0,j+1); + z(0,j+1) = z(M,j+1); + h(M,j) = h(0,j); + }); + + Kokkos::parallel_for("periodic_left_right_cu_cv_z_h", Kokkos::RangePolicy(0,M), KOKKOS_LAMBDA(const int i) { + cu(i+1,N) = cu(i+1,0); + cv(i,0) = cv(i,N); + z(i+1,0) = z(i+1,N); + h(i,N) = h(i,0); + }); + + Kokkos::parallel_for("periodic_corner_cu_cv_z_h", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int) { + cu(0,N) = cu(M,0); + cv(M,0) = cv(0,N); + z(0,0) = z(M,N); + h(M,N) = h(0,0); + }); + + // Compute new values u,v and p + tdts8 = tdt / 8.; + tdtsdx = tdt / dx; + tdtsdy = tdt / dy; + + // Compute unew + Kokkos::parallel_for("compute_unew", Kokkos::RangePolicy(0, M * N), KOKKOS_LAMBDA(const int idx) { + int i = idx / N + 1; + int j = idx % N; + unew(i,j) = uold(i,j) + tdts8 * (z(i,j+1) + z(i,j)) * (cv(i,j+1) + cv(i-1,j+1) + cv(i-1,j) + cv(i,j)) - tdtsdx * (h(i,j) - h(i-1,j)); + }); + + // Compute vnew + Kokkos::parallel_for("compute_vnew", Kokkos::RangePolicy(0, M * N), KOKKOS_LAMBDA(const int idx) { + int i = idx / N; + int j = (idx % N) + 1; + vnew(i,j) = vold(i,j) - tdts8 * (z(i+1,j) + z(i,j)) * (cu(i+1,j) + cu(i,j) + cu(i,j-1) + cu(i+1,j-1)) - tdtsdy * (h(i,j) - h(i,j-1)); + }); + + // Compute pnew + Kokkos::parallel_for("compute_pnew", Kokkos::RangePolicy(0, M * N), KOKKOS_LAMBDA(const int idx) { + int i = idx / N; + int j = idx % N; + pnew(i,j) = pold(i,j) - tdtsdx * (cu(i+1,j) - cu(i,j)) - tdtsdy * (cv(i,j+1) - cv(i,j)); + }); + + // Periodic continuation + Kokkos::parallel_for("periodic_top_bottom_unew_vnew_pnew", Kokkos::RangePolicy(0,N), KOKKOS_LAMBDA(const int j) { + unew(0,j) = unew(M,j); + vnew(M,j+1) = vnew(0,j+1); + pnew(M,j) = pnew(0,j); + }); + + Kokkos::parallel_for("periodic_left_right_unew_vnew_pnew", Kokkos::RangePolicy(0,M), KOKKOS_LAMBDA(const int i) { + unew(i+1,N) = unew(i+1,0); + vnew(i,0) = vnew(i,N); + pnew(i,N) = pnew(i,0); + }); + + Kokkos::parallel_for("periodic_corner_unew_vnew_pnew", Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int) { + unew(0,N) = unew(M,0); + vnew(M,0) = vnew(0,N); + pnew(M,N) = pnew(0,0); + }); + + time = time + dt; + + // Time smoothing and update for next cycle + if ( ncycle > 1 ) { + Kokkos::parallel_for("time_smoothing_u", Kokkos::RangePolicy(0, M_LEN * N_LEN), KOKKOS_LAMBDA(const int idx) { + int i = idx / N_LEN; + int j = idx % N_LEN; + uold(i,j) = u(i,j) + alpha * (unew(i,j) - 2. * u(i,j) + uold(i,j)); + }); + + Kokkos::parallel_for("time_smoothing_v", Kokkos::RangePolicy(0, M_LEN * N_LEN), KOKKOS_LAMBDA(const int idx) { + int i = idx / N_LEN; + int j = idx % N_LEN; + vold(i,j) = v(i,j) + alpha * (vnew(i,j) - 2. * v(i,j) + vold(i,j)); + }); + + Kokkos::parallel_for("time_smoothing_p", Kokkos::RangePolicy(0, M_LEN * N_LEN), KOKKOS_LAMBDA(const int idx) { + int i = idx / N_LEN; + int j = idx % N_LEN; + pold(i,j) = p(i,j) + alpha * (pnew(i,j) - 2. * p(i,j) + pold(i,j)); + }); + } + else { + tdt = tdt + tdt; + Kokkos::parallel_for("first_cycle_copy_outer", Kokkos::RangePolicy(0, M_LEN * N_LEN), KOKKOS_LAMBDA(const int idx) { + int i = idx / N_LEN; + int j = idx % N_LEN; + uold(i,j) = u(i,j); + vold(i,j) = v(i,j); + pold(i,j) = p(i,j); + }); + } + // My test shows it is better to use fence and then swap for device views, rather than doing a device copy without fence. + if constexpr (!std::is_same_v) { + Kokkos::fence(); + } + // Swap the views + std::swap(u, unew); + std::swap(v, vnew); + std::swap(p, pnew); + } // ** End of time loop ** + + // Try to use `if constexpr (!std::is_same_v)` to use swap function + // for the host space, but somehow it is not compiled correctly for the device space. + // Just use deep_copy for both spaces. + Kokkos::deep_copy(u_host, u); + Kokkos::deep_copy(v_host, v); + Kokkos::deep_copy(p_host, p); + + // Output p, u, v fields and run times. + if(VAL_OUT) { + write_to_file(p_host, M_LEN, N_LEN, "p.bin"); + write_to_file(u_host, M_LEN, N_LEN, "u.bin"); + write_to_file(v_host, M_LEN, N_LEN, "v.bin"); + print_to_file(p_host, M_LEN, N_LEN, "p.txt"); + print_to_file(u_host, M_LEN, N_LEN, "u.txt"); + print_to_file(v_host, M_LEN, N_LEN, "v.txt"); + } + + if (L_OUT) { + ptime = time / 3600.; + printf(" cycle number %d model time in hours %f\n", ITMAX, ptime); + + // printf(" diagonal elements of p\n"); + // for (int i=0; i +#include +#include +#include + +#define MIN(x,y) ((x)>(y)?(y):(x)) +#define MAX(x,y) ((x)>(y)?(x):(y)) + +#define M 256 +#define N 256 +#define M_LEN (M + 1) +#define N_LEN (N + 1) +#define SIZE ((M_LEN)*(N_LEN)) +#define ITMAX 4000 +#define L_OUT true +#define VAL_OUT false + +using Layout = Kokkos::LayoutRight; +using ExecSpace = Kokkos::DefaultExecutionSpace; +using MemSpace = ExecSpace::memory_space; +using ViewMatrixType = Kokkos::View; +using HostViewMatrixType = Kokkos::View; +using teamPolicy = Kokkos::TeamPolicy; +using memberType = teamPolicy::member_type; + +void write_to_file(auto array, int tM, int tN, const char *filename); +void print_to_file(auto array, int tM, int tN, const char *filename); + +int main(int argc, char **argv) { + + // Initialize Kokkos + Kokkos::initialize( argc, argv ); + { + // Number of variables to allocate + constexpr int num_vars = 14; + + // Declare views before conditional so they are accessible later + ViewMatrixType big_data; + ViewMatrixType u, v, p, unew, vnew, pnew, uold, vold, pold, cu, cv, z, h, psi; + + if constexpr (std::is_same_v) { + // Allocate a large contiguous block for all variables + big_data = ViewMatrixType("big_data", M_LEN, N_LEN * num_vars); + // Create 2-D subviews for each variable + u = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(0 * N_LEN, 1 * N_LEN)); + v = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(1 * N_LEN, 2 * N_LEN)); + p = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(2 * N_LEN, 3 * N_LEN)); + unew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(3 * N_LEN, 4 * N_LEN)); + vnew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(4 * N_LEN, 5 * N_LEN)); + pnew = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(5 * N_LEN, 6 * N_LEN)); + uold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(6 * N_LEN, 7 * N_LEN)); + vold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(7 * N_LEN, 8 * N_LEN)); + pold = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(8 * N_LEN, 9 * N_LEN)); + cu = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(9 * N_LEN, 10 * N_LEN)); + cv = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(10 * N_LEN, 11 * N_LEN)); + z = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(11 * N_LEN, 12 * N_LEN)); + h = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(12 * N_LEN, 13 * N_LEN)); + psi = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(13 * N_LEN, 14 * N_LEN)); + } + else if constexpr (std::is_same_v) { + big_data = ViewMatrixType("big_data", M_LEN * num_vars, N_LEN); + u = Kokkos::subview(big_data, Kokkos::make_pair(0, M_LEN), Kokkos::make_pair(0, N_LEN)); + v = Kokkos::subview(big_data, Kokkos::make_pair(M_LEN, 2 * M_LEN), Kokkos::make_pair(0, N_LEN)); + p = Kokkos::subview(big_data, Kokkos::make_pair(2 * M_LEN, 3 * M_LEN), Kokkos::make_pair(0, N_LEN)); + unew = Kokkos::subview(big_data, Kokkos::make_pair(3 * M_LEN, 4 * M_LEN), Kokkos::make_pair(0, N_LEN)); + vnew = Kokkos::subview(big_data, Kokkos::make_pair(4 * M_LEN, 5 * M_LEN), Kokkos::make_pair(0, N_LEN)); + pnew = Kokkos::subview(big_data, Kokkos::make_pair(5 * M_LEN, 6 * M_LEN), Kokkos::make_pair(0, N_LEN)); + uold = Kokkos::subview(big_data, Kokkos::make_pair(6 * M_LEN, 7 * M_LEN), Kokkos::make_pair(0, N_LEN)); + vold = Kokkos::subview(big_data, Kokkos::make_pair(7 * M_LEN, 8 * M_LEN), Kokkos::make_pair(0, N_LEN)); + pold = Kokkos::subview(big_data, Kokkos::make_pair(8 * M_LEN, 9 * M_LEN), Kokkos::make_pair(0, N_LEN)); + cu = Kokkos::subview(big_data, Kokkos::make_pair(9 * M_LEN, 10 * M_LEN), Kokkos::make_pair(0, N_LEN)); + cv = Kokkos::subview(big_data, Kokkos::make_pair(10 * M_LEN, 11 * M_LEN), Kokkos::make_pair(0, N_LEN)); + z = Kokkos::subview(big_data, Kokkos::make_pair(11 * M_LEN, 12 * M_LEN), Kokkos::make_pair(0, N_LEN)); + h = Kokkos::subview(big_data, Kokkos::make_pair(12 * M_LEN, 13 * M_LEN), Kokkos::make_pair(0, N_LEN)); + psi = Kokkos::subview(big_data, Kokkos::make_pair(13 * M_LEN, 14 * M_LEN), Kokkos::make_pair(0, N_LEN)); + } + else { + printf("Using unknown layout\n"); + return -1; + } + + double dt,tdt,dx,dy,a,alpha,el,pi; + double tpi,di,dj,pcf; + double tdts8,tdtsdx,tdtsdy,fsdx,fsdy; + + int mnmin,ncycle; + + // timer variables + double ctime,tcyc,time,ptime; + + // ** Initialisations ** + + // Note below that two delta t (tdt) is set to dt on the first + // cycle after which it is reset to dt+dt. + dt = 90.; + tdt = dt; + + dx = 100000.; + dy = 100000.; + fsdx = 4. / dx; + fsdy = 4. / dy; + + a = 1000000.; + alpha = .001; + + el = N * dx; + pi = 4. * atan(1.); + tpi = pi + pi; + di = tpi / M; + dj = tpi / N; + pcf = pi * pi * a * a / (el * el); + + // Initial values of the stream function and p + Kokkos::parallel_for("init_psi_p", teamPolicy(M_LEN, Kokkos::AUTO), KOKKOS_LAMBDA(const memberType& team) { + const int i = team.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N_LEN), [&](const int j) { + psi(i,j) = a * std::sin((i + .5) * di) * std::sin((j + .5) * dj); + p(i,j) = pcf * (std::cos(2. * (i) * di) + std::cos(2. * (j) * dj)) + 50000.; + }); + }); + + // Initialize velocities + Kokkos::parallel_for("init_u_v", teamPolicy(M, Kokkos::AUTO), KOKKOS_LAMBDA(const memberType& team) { + const int i = team.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int j) { + u(i+1,j) = -(psi(i+1,j+1) - psi(i+1,j)) / dy; + v(i,j+1) = (psi(i+1,j+1) - psi(i,j+1)) / dx; + }); + }); + + // Periodic continuation + Kokkos::parallel_for("periodic_top_bottom_init", teamPolicy(N, Kokkos::AUTO), KOKKOS_LAMBDA(const memberType& team) { + const int j = team.league_rank(); + u(0,j) = u(M,j); + v(M,j+1) = v(0,j+1); + }); + + Kokkos::parallel_for("periodic_left_right_init", teamPolicy(M, Kokkos::AUTO), KOKKOS_LAMBDA(const memberType& team) { + const int i = team.league_rank(); + u(i+1,N) = u(i+1,0); + v(i,0) = v(i,N); + }); + + Kokkos::parallel_for("periodic_corners_init", teamPolicy(1, Kokkos::AUTO), KOKKOS_LAMBDA(const memberType& team) { + u(0,N) = u(M,0); + v(M,0) = v(0,N); + }); + + Kokkos::parallel_for("init_old_arrays", teamPolicy(M_LEN, Kokkos::AUTO), KOKKOS_LAMBDA(const memberType& team) { + const int i = team.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N_LEN), [&](const int j) { + uold(i,j) = u(i,j); + vold(i,j) = v(i,j); + pold(i,j) = p(i,j); + }); + }); + + // Create host mirrors for output + HostViewMatrixType u_host = Kokkos::create_mirror_view(u); + HostViewMatrixType v_host = Kokkos::create_mirror_view(v); + HostViewMatrixType p_host = Kokkos::create_mirror_view(p); + + // Print initial values + if ( L_OUT ) { + printf(" number of points in the x direction %d\n", N); + printf(" number of points in the y direction %d\n", M); + printf(" grid spacing in the x direction %f\n", dx); + printf(" grid spacing in the y direction %f\n", dy); + printf(" time step %f\n", dt); + printf(" time filter parameter %f\n", alpha); + + mnmin = MIN(M,N); + if constexpr (!std::is_same_v) { + Kokkos::deep_copy(u_host, u); + Kokkos::deep_copy(v_host, v); + Kokkos::deep_copy(p_host, p); + } + // printf(" initial diagonal elements of p\n"); + // for (int i=0; i 1 ) { + Kokkos::parallel_for("time_smoothing_u", teamPolicy(M_LEN, Kokkos::AUTO), KOKKOS_LAMBDA(const memberType& team) { + const int i = team.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N_LEN), [&](const int j) { + uold(i,j) = u(i,j) + alpha * (unew(i,j) - 2. * u(i,j) + uold(i,j)); + }); + }); + + Kokkos::parallel_for("time_smoothing_v", teamPolicy(M_LEN, Kokkos::AUTO), KOKKOS_LAMBDA(const memberType& team) { + const int i = team.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N_LEN), [&](const int j) { + vold(i,j) = v(i,j) + alpha * (vnew(i,j) - 2. * v(i,j) + vold(i,j)); + }); + }); + + Kokkos::parallel_for("time_smoothing_p", teamPolicy(M_LEN, Kokkos::AUTO), KOKKOS_LAMBDA(const memberType& team) { + const int i = team.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N_LEN), [&](const int j) { + pold(i,j) = p(i,j) + alpha * (pnew(i,j) - 2. * p(i,j) + pold(i,j)); + }); + }); + } + else { + tdt = tdt + tdt; + Kokkos::parallel_for("first_cycle_copy", teamPolicy(M_LEN, Kokkos::AUTO), KOKKOS_LAMBDA(const memberType& team) { + const int i = team.league_rank(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N_LEN), [&](const int j) { + uold(i,j) = u(i,j); + vold(i,j) = v(i,j); + pold(i,j) = p(i,j); + }); + }); + } + // My test shows it is better to use fence and then swap for device views, rather than doing a device copy without fence. + if constexpr (!std::is_same_v) { + Kokkos::fence(); + } + // Swap the views + std::swap(u, unew); + std::swap(v, vnew); + std::swap(p, pnew); + } // ** End of time loop ** + + // Try to use `if constexpr (!std::is_same_v)` to use swap function + // for the host space, but somehow it is not compiled correctly for the device space. + // Just use deep_copy for both spaces. + Kokkos::deep_copy(u_host, u); + Kokkos::deep_copy(v_host, v); + Kokkos::deep_copy(p_host, p); + + // Output p, u, v fields and run times. + if(VAL_OUT) { + write_to_file(p_host, M_LEN, N_LEN, "p.bin"); + write_to_file(u_host, M_LEN, N_LEN, "u.bin"); + write_to_file(v_host, M_LEN, N_LEN, "v.bin"); + print_to_file(p_host, M_LEN, N_LEN, "p.txt"); + print_to_file(u_host, M_LEN, N_LEN, "u.txt"); + print_to_file(v_host, M_LEN, N_LEN, "v.txt"); + } + + if (L_OUT) { + ptime = time / 3600.; + printf(" cycle number %d model time in hours %f\n", ITMAX, ptime); + + // printf(" diagonal elements of p\n"); + // for (int i=0; i