From 82c985da3f94034729b24fbdbd754a36e5872bdc Mon Sep 17 00:00:00 2001 From: huiyuxie Date: Thu, 30 Jan 2025 15:05:56 -1000 Subject: [PATCH] Complete --- src/solvers/dg_1d.jl | 927 +-------------- src/solvers/dg_1d_kernel.jl | 926 +++++++++++++++ src/solvers/dg_2d.jl | 1469 +----------------------- src/solvers/dg_2d_kernel.jl | 1468 ++++++++++++++++++++++++ src/solvers/dg_3d.jl | 2139 +---------------------------------- src/solvers/dg_3d_kernel.jl | 2138 ++++++++++++++++++++++++++++++++++ 6 files changed, 4535 insertions(+), 4532 deletions(-) create mode 100644 src/solvers/dg_1d_kernel.jl create mode 100644 src/solvers/dg_2d_kernel.jl create mode 100644 src/solvers/dg_3d_kernel.jl diff --git a/src/solvers/dg_1d.jl b/src/solvers/dg_1d.jl index 55f3e5b..e146065 100644 --- a/src/solvers/dg_1d.jl +++ b/src/solvers/dg_1d.jl @@ -1,932 +1,7 @@ # Everything related to a DG semidiscretization in 1D. -################################################################################################# -# Functions that end with `_kernel` are CUDA kernels that are going to be launched by -# the @cuda macro with parameters from the kernel configurator. They are purely run on -# the device (i.e., GPU). - -# Kernel for calculating fluxes along normal direction -function flux_kernel!(flux_arr, u, equations::AbstractEquations{1}, flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2) && k <= size(u, 3)) - u_node = get_node_vars(u, equations, j, k) - flux_node = flux(u_node, 1, equations) - - for ii in axes(u, 1) - @inbounds flux_arr[ii, j, k] = flux_node[ii] - end - end - - return nothing -end - -# Kernel for calculating weak form -function weak_form_kernel!(du, derivative_dhat, flux_arr) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) - @inbounds du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros - for ii in axes(du, 2) - @inbounds du[i, j, k] += derivative_dhat[j, ii] * flux_arr[i, ii, k] - end - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating volume integrals with weak form -function flux_weak_form_kernel!(du, u, derivative_dhat, - equations::AbstractEquations{1}, flux::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_dhat = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_flux = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width), offset) - - # Get thread and block indices only we need to save registers - tx, ty = threadIdx().x, threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - # Tile the computation (restrict to one tile here) - value = zero(eltype(du)) - - # Load global `derivative_dhat` into shared memory - for ty2 in axes(du, 2) - # Transposed load - @inbounds shmem_dhat[ty2, ty] = derivative_dhat[ty, ty2] - end - - # Compute flux values - u_node = get_node_vars(u, equations, ty, k) - flux_node = flux(u_node, 1, equations) - - @inbounds shmem_flux[tx, ty] = flux_node[tx] - - sync_threads() - - # Loop within one block to get weak form - # TODO: Avoid potential bank conflicts - for thread in 1:tile_width - @inbounds value += shmem_dhat[thread, ty] * shmem_flux[tx, thread] - end - - # Synchronization is not needed here if we use only one tile - # sync_threads() - - # Finalize the weak form - @inbounds du[tx, ty, k] = value - - return nothing -end - -# Kernel for calculating volume fluxes -function volume_flux_kernel!(volume_flux_arr, u, equations::AbstractEquations{1}, - volume_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^2 && k <= size(u, 3)) - j1 = div(j - 1, size(u, 2)) + 1 - j2 = rem(j - 1, size(u, 2)) + 1 - - u_node = get_node_vars(u, equations, j1, k) - u_node1 = get_node_vars(u, equations, j2, k) - - volume_flux_node = volume_flux(u_node, u_node1, 1, equations) - - for ii in axes(u, 1) - @inbounds volume_flux_arr[ii, j1, j2, k] = volume_flux_node[ii] - end - end - - return nothing -end - -# Kernel for calculating volume integrals -function volume_integral_kernel!(du, derivative_split, volume_flux_arr, - equations::AbstractEquations{1}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) - @inbounds du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros - for ii in axes(du, 2) - @inbounds du[i, j, k] += derivative_split[j, ii] * (1 - isequal(j, ii)) * # set diagonal elements to zeros - volume_flux_arr[i, j, ii, k] - end - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating volume integrals without conservative terms -function volume_flux_integral_kernel!(du, u, derivative_split, - equations::AbstractEquations{1}, volume_flux::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width), offset) - - # Get thread and block indices only we need to save registers - ty = threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - # Tile the computation (set to one tile here) - # Initialize the values - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty] = zero(eltype(du)) - end - - # Load global `derivative_split` into shared memory - for ty2 in axes(du, 2) - # Transposed load - @inbounds shmem_split[ty2, ty] = derivative_split[ty, ty2] * - (1 - isequal(ty, ty2)) # set diagonal elements to zeros - end - - # Synchronization is not needed here given the access pattern - # sync_threads() - - # Compute volume fluxes - # How to store nodes in shared memory? - for thread in 1:tile_width - # Volume flux is heavy in computation so we should try best to avoid redundant - # computation, i.e., use for loop along x direction here - volume_flux_node = volume_flux(get_node_vars(u, equations, ty, k), - get_node_vars(u, equations, thread, k), - 1, equations) - - # TODO: Avoid potential bank conflicts - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty] += shmem_split[thread, ty] * volume_flux_node[tx] - end - end - - # Synchronization is not needed here if we use only one tile - # sync_threads() - - # Finalize the values - for tx in axes(du, 1) - @inbounds du[tx, ty, k] = shmem_value[tx, ty] - end - - return nothing -end - -# Kernel for calculating symmetric and nonconservative fluxes -function noncons_volume_flux_kernel!(symmetric_flux_arr, noncons_flux_arr, u, derivative_split, - equations::AbstractEquations{1}, symmetric_flux::Any, - nonconservative_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^2 && k <= size(u, 3)) - j1 = div(j - 1, size(u, 2)) + 1 - j2 = rem(j - 1, size(u, 2)) + 1 - - u_node = get_node_vars(u, equations, j1, k) - u_node1 = get_node_vars(u, equations, j2, k) - - symmetric_flux_node = symmetric_flux(u_node, u_node1, 1, equations) - noncons_flux_node = nonconservative_flux(u_node, u_node1, 1, equations) - - for ii in axes(u, 1) - @inbounds begin - symmetric_flux_arr[ii, j1, j2, k] = symmetric_flux_node[ii] * derivative_split[j1, j2] * - (1 - isequal(j1, j2)) # set diagonal elements to zeros - noncons_flux_arr[ii, j1, j2, k] = noncons_flux_node[ii] - end - end - end - - return nothing -end - -# Kernel for calculating symmetric and nonconservative volume integrals -function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr, noncons_flux_arr) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) - @inbounds du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros - - for ii in axes(du, 2) - @inbounds du[i, j, k] += symmetric_flux_arr[i, j, ii, k] + - 0.5f0 * - derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k] - end - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating volume integrals with conservative terms -function volume_flux_integral_kernel!(du, u, derivative_split, - equations::AbstractEquations{1}, - symmetric_flux::Any, nonconservative_flux::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width), offset) - - # Get thread and block indices only we need to save registers - ty = threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - # Tile the computation (set to one tile here) - # Initialize the values - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty] = zero(eltype(du)) - end - - # Load data from global memory into shared memory - for ty2 in axes(du, 2) - # Transposed load - @inbounds shmem_split[ty2, ty] = derivative_split[ty, ty2] - end - - # Synchronization is not needed here given the access pattern - # sync_threads() - - # Compute volume fluxes - # How to store nodes in shared memory? - for thread in 1:tile_width - # Volume flux is heavy in computation so we should try best to avoid redundant - # computation, i.e., use for loop along x direction here - u_node = get_node_vars(u, equations, ty, k) - symmetric_flux_node = symmetric_flux(u_node, - get_node_vars(u, equations, thread, k), - 1, equations) - noncons_flux_node = nonconservative_flux(u_node, - get_node_vars(u, equations, thread, k), - 1, equations) - - # TODO: Avoid potential bank conflicts - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty] += symmetric_flux_node[tx] * shmem_split[thread, ty] * - (1 - isequal(ty, thread)) + # set diagonal elements to zeros - 0.5f0 * - noncons_flux_node[tx] * shmem_split[thread, ty] - end - end - - # Synchronization is not needed here if we use only one tile - # sync_threads() - - # Finalize the values - for tx in axes(du, 1) - @inbounds du[tx, ty, k] = shmem_value[tx, ty] - end - - return nothing -end - -# Kernel for calculating pure DG and DG-FV volume fluxes -function volume_flux_dgfv_kernel!(volume_flux_arr, fstar1_L, fstar1_R, u, - alpha, atol, equations::AbstractEquations{1}, - volume_flux_dg::Any, volume_flux_fv::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^2 && k <= size(u, 3)) - j1 = div(j - 1, size(u, 2)) + 1 - j2 = rem(j - 1, size(u, 2)) + 1 - - dg_only = isapprox(alpha[k], 0, atol = atol) - - u_node = get_node_vars(u, equations, j1, k) - u_node1 = get_node_vars(u, equations, j2, k) - - volume_flux_node = volume_flux_dg(u_node, u_node1, 1, equations) - - for ii in axes(u, 1) - @inbounds volume_flux_arr[ii, j1, j2, k] = volume_flux_node[ii] - - # Small optimization, no much performance gain - if isequal(j1 + 1, j2) # avoid race condition - flux_fv_node = volume_flux_fv(u_node, u_node1, 1, equations) - - @inbounds begin - fstar1_L[ii, j2, k] = flux_fv_node[ii] * (1 - dg_only) - fstar1_R[ii, j2, k] = flux_fv_node[ii] * (1 - dg_only) - end - end - end - - # if j1 != 1 && j2 == 1 # bad - # u_ll = get_node_vars(u, equations, j1 - 1, k) - # u_rr = get_node_vars(u, equations, j1, k) - # flux_fv_node = volume_flux_fv(u_ll, u_rr, 1, equations) - - # for ii in axes(u, 1) - # @inbounds begin - # fstar1_L[ii, j1, k] = flux_fv_node[ii] * (1 - dg_only) - # fstar1_R[ii, j1, k] = flux_fv_node[ii] * (1 - dg_only) - # end - # end - # end - end - - return nothing -end - -# Kernel for calculating pure DG and DG-FV volume integrals -function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights, - volume_flux_arr, fstar1_L, fstar1_R, atol, - equations::AbstractEquations{1}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) - @inbounds begin - du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros - alpha_element = alpha[k] - end - - dg_only = isapprox(alpha_element, 0, atol = atol) - - for ii in axes(du, 2) - @inbounds du[i, j, k] += derivative_split[j, ii] * - (1 - isequal(j, ii)) * # set diagonal elements to zeros - volume_flux_arr[i, j, ii, k] * dg_only + - (1 - alpha_element) * derivative_split[j, ii] * - (1 - isequal(j, ii)) * # set diagonal elements to zeros - volume_flux_arr[i, j, ii, k] * (1 - dg_only) - end - - @inbounds du[i, j, k] += alpha_element * inverse_weights[j] * - (fstar1_L[i, j + 1, k] - fstar1_R[i, j, k]) * (1 - dg_only) - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating pure DG and DG-FV volume integrals without conservative terms -function volume_flux_integral_dgfv_kernel!(du, u, alpha, atol, derivative_split, inverse_weights, - equations::AbstractEquations{1}, - volume_flux_dg::Any, volume_flux_fv::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_fstar1 = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width + 1), offset) - offset += sizeof(eltype(du)) * size(du, 1) * (tile_width + 1) - shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width), offset) - - # Get thread and block indices only we need to save registers - ty = threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - # Load global `derivative_split` into shared memory - for ty2 in axes(du, 2) - # Transposed load - @inbounds shmem_split[ty2, ty] = derivative_split[ty, ty2] - end - - # Get variables for computation - @inbounds alpha_element = alpha[k] - dg_only = isapprox(alpha_element, 0, atol = atol) - - # Compute FV volume fluxes - u_node = get_node_vars(u, equations, ty, k) - if ty + 1 <= tile_width - flux_fv_node = volume_flux_fv(u_node, - get_node_vars(u, equations, ty + 1, k), - 1, equations) - end - - # Initialize the values - for tx in axes(du, 1) - @inbounds begin - # Initialize `du` with zeros - shmem_value[tx, ty] = zero(eltype(du)) - # Initialize `fstar` side columes with zeros - shmem_fstar1[tx, 1] = zero(eltype(du)) - shmem_fstar1[tx, tile_width + 1] = zero(eltype(du)) - end - - if ty + 1 <= tile_width - # Set with FV volume fluxes - @inbounds shmem_fstar1[tx, ty + 1] = flux_fv_node[tx] * (1 - dg_only) - end - end - - sync_threads() - - # Contribute FV to the volume integrals - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty] += alpha_element * inverse_weights[ty] * - (shmem_fstar1[tx, ty + 1] - shmem_fstar1[tx, ty]) * (1 - dg_only) - end - - # Compute DG volume fluxes - for thread in 1:tile_width - volume_flux_node = volume_flux_dg(u_node, - get_node_vars(u, equations, thread, k), - 1, equations) - - # Contribute DG to the volume integrals - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty] += shmem_split[thread, ty] * - (1 - isequal(ty, thread)) * # set diagonal elements to zeros - volume_flux_node[tx] * dg_only + - (1 - alpha_element) * shmem_split[thread, ty] * - (1 - isequal(ty, thread)) * # set diagonal elements to zeros - volume_flux_node[tx] * (1 - dg_only) - end - end - - # Finalize the values - for tx in axes(du, 1) - @inbounds du[tx, ty, k] = shmem_value[tx, ty] - end - - return nothing -end - -# Kernel for calculating pure DG and DG-FV volume fluxes -function volume_flux_dgfv_kernel!(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, - u, alpha, atol, derivative_split, - equations::AbstractEquations{1}, - volume_flux_dg::Any, noncons_flux_dg::Any, - volume_flux_fv::Any, noncons_flux_fv::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^2 && k <= size(u, 3)) - j1 = div(j - 1, size(u, 2)) + 1 - j2 = rem(j - 1, size(u, 2)) + 1 - - dg_only = isapprox(alpha[k], 0, atol = atol) - - u_node = get_node_vars(u, equations, j1, k) - u_node1 = get_node_vars(u, equations, j2, k) - - volume_flux_node = volume_flux_dg(u_node, u_node1, 1, equations) - noncons_flux_node = noncons_flux_dg(u_node, u_node1, 1, equations) - - for ii in axes(u, 1) - @inbounds begin - volume_flux_arr[ii, j1, j2, k] = volume_flux_node[ii] * derivative_split[j1, j2] * - (1 - isequal(j1, j2)) # set diagonal elements to zeros - noncons_flux_arr[ii, j1, j2, k] = noncons_flux_node[ii] - end - - # Small optimization, no much performance gain - if isequal(j1 + 1, j2) # avoid race condition - f1_node = volume_flux_fv(u_node, u_node1, 1, equations) - f1_L_node = noncons_flux_fv(u_node, u_node1, 1, equations) - f1_R_node = noncons_flux_fv(u_node1, u_node, 1, equations) - - @inbounds begin - fstar1_L[ii, j2, k] = (f1_node[ii] + 0.5f0 * f1_L_node[ii]) * (1 - dg_only) - fstar1_R[ii, j2, k] = (f1_node[ii] + 0.5f0 * f1_R_node[ii]) * (1 - dg_only) - end - end - end - - # if j1 != 1 && j2 == 1 # bad - # u_ll = get_node_vars(u, equations, j1 - 1, k) - # u_rr = get_node_vars(u, equations, j1, k) - - # f1_node = volume_flux_fv(u_ll, u_rr, 1, equations) - - # f1_L_node = noncons_flux_fv(u_ll, u_rr, 1, equations) - # f1_R_node = noncons_flux_fv(u_rr, u_ll, 1, equations) - - # for ii in axes(u, 1) - # @inbounds begin - # fstar1_L[ii, j1, k] = (f1_node[ii] + 0.5f0 * f1_L_node[ii]) * (1 - dg_only) - # fstar1_R[ii, j1, k] = (f1_node[ii] + 0.5f0 * f1_R_node[ii]) * (1 - dg_only) - # end - # end - # end - end - - return nothing -end - -# Kernel for calculating pure DG and DG-FV volume integrals -function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights, - volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, - atol, equations::AbstractEquations{1}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) - @inbounds begin - du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros - alpha_element = alpha[k] - end - - dg_only = isapprox(alpha_element, 0, atol = atol) - - for ii in axes(du, 2) - @inbounds du[i, j, k] += (volume_flux_arr[i, j, ii, k] + - 0.5f0 * - derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k]) * dg_only + - ((1 - alpha_element) * volume_flux_arr[i, j, ii, k] + - 0.5f0 * (1 - alpha_element) * - derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k]) * (1 - dg_only) - end - - @inbounds du[i, j, k] += alpha_element * inverse_weights[j] * - (fstar1_L[i, j + 1, k] - fstar1_R[i, j, k]) * (1 - dg_only) - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating pure DG and DG-FV volume integrals with conservative terms -function volume_flux_integral_dgfv_kernel!(du, u, alpha, atol, derivative_split, inverse_weights, - equations::AbstractEquations{1}, - volume_flux_dg::Any, noncons_flux_dg::Any, - volume_flux_fv::Any, noncons_flux_fv::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_fstar1 = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width + 1, 2), offset) - offset += sizeof(eltype(du)) * size(du, 1) * (tile_width + 1) * 2 - shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width), offset) - - # Get thread and block indices only we need to save registers - ty = threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - # Load global `derivative_split` into shared memory - for ty2 in axes(du, 2) - # Transposed load - @inbounds shmem_split[ty2, ty] = derivative_split[ty, ty2] - end - - # Get variables for computation - @inbounds alpha_element = alpha[k] - dg_only = isapprox(alpha_element, 0, atol = atol) - - # Compute FV volume fluxes - u_node = get_node_vars(u, equations, ty, k) - if ty + 1 <= tile_width - f1_node = volume_flux_fv(u_node, - get_node_vars(u, equations, ty + 1, k), - 1, equations) - f1_L_node = noncons_flux_fv(u_node, - get_node_vars(u, equations, ty + 1, k), - 1, equations) - f1_R_node = noncons_flux_fv(get_node_vars(u, equations, ty + 1, k), - u_node, - 1, equations) - end - - # Initialize the values - for tx in axes(du, 1) - @inbounds begin - # Initialize `du` with zeros - shmem_value[tx, ty] = zero(eltype(du)) - - # TODO: Remove shared memory for `fstar` and use local memory - - # Initialize `fstar` side columes with zeros (1: left) - shmem_fstar1[tx, 1, 1] = zero(eltype(du)) - shmem_fstar1[tx, tile_width + 1, 1] = zero(eltype(du)) - - # Initialize `fstar` side columes with zeros (2: right) - shmem_fstar1[tx, 1, 2] = zero(eltype(du)) - shmem_fstar1[tx, tile_width + 1, 2] = zero(eltype(du)) - end - - if ty + 1 <= tile_width - # Set with FV volume fluxes - @inbounds begin - shmem_fstar1[tx, ty + 1, 1] = (f1_node[tx] + 0.5f0 * f1_L_node[tx]) * (1 - dg_only) - shmem_fstar1[tx, ty + 1, 2] = (f1_node[tx] + 0.5f0 * f1_R_node[tx]) * (1 - dg_only) - end - end - end - - sync_threads() - - # Contribute FV to the volume integrals - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty] += alpha_element * inverse_weights[ty] * - (shmem_fstar1[tx, ty + 1, 1] - shmem_fstar1[tx, ty, 2]) * (1 - dg_only) - end - - # Compute DG volume fluxes - for thread in 1:tile_width - volume_flux_node = volume_flux_dg(u_node, - get_node_vars(u, equations, thread, k), - 1, equations) - noncons_flux_node = noncons_flux_dg(u_node, - get_node_vars(u, equations, thread, k), - 1, equations) - - # Contribute DG to the volume integrals - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty] += (volume_flux_node[tx] * shmem_split[thread, ty] * - (1 - isequal(ty, thread)) + # set diagonal elements to zeros - 0.5f0 * - shmem_split[thread, ty] * noncons_flux_node[tx]) * dg_only + - ((1 - alpha_element) * volume_flux_node[tx] * shmem_split[thread, ty] * - (1 - isequal(ty, thread)) + # set diagonal elements to zeros - 0.5f0 * (1 - alpha_element) * - shmem_split[thread, ty] * noncons_flux_node[tx]) * (1 - dg_only) - end - end - - # Finalize the values - for tx in axes(du, 1) - @inbounds du[tx, ty, k] = shmem_value[tx, ty] - end - - return nothing -end - -# Kernel for prolonging two interfaces -function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(interfaces_u, 2) && k <= size(interfaces_u, 3)) - @inbounds begin - left_element = neighbor_ids[1, k] - right_element = neighbor_ids[2, k] - - interfaces_u[1, j, k] = u[j, size(u, 2), left_element] - interfaces_u[2, j, k] = u[j, 1, right_element] - end - end - - return nothing -end - -# Kernel for calculating surface fluxes -function surface_flux_kernel!(surface_flux_arr, interfaces_u, equations::AbstractEquations{1}, - surface_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - if (j <= size(surface_flux_arr, 2)) - u_ll, u_rr = get_surface_node_vars(interfaces_u, equations, j) - - surface_flux_node = surface_flux(u_ll, u_rr, 1, equations) - - for ii in axes(surface_flux_arr, 1) - @inbounds surface_flux_arr[ii, j] = surface_flux_node[ii] - end - end - - return nothing -end - -# Kernel for calculating surface and both nonconservative fluxes -function surface_noncons_flux_kernel!(surface_flux_arr, noncons_left_arr, noncons_right_arr, - interfaces_u, equations::AbstractEquations{1}, - surface_flux::Any, nonconservative_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - if (j <= size(surface_flux_arr, 2)) - u_ll, u_rr = get_surface_node_vars(interfaces_u, equations, j) - - surface_flux_node = surface_flux(u_ll, u_rr, 1, equations) - noncons_left_node = nonconservative_flux(u_ll, u_rr, 1, equations) - noncons_right_node = nonconservative_flux(u_rr, u_ll, 1, equations) - - for ii in axes(surface_flux_arr, 1) - @inbounds begin - surface_flux_arr[ii, j] = surface_flux_node[ii] - noncons_left_arr[ii, j] = noncons_left_node[ii] - noncons_right_arr[ii, j] = noncons_right_node[ii] - end - end - end - - return nothing -end - -# Kernel for setting interface fluxes -function interface_flux_kernel!(surface_flux_values, surface_flux_arr, neighbor_ids) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_arr, 2)) - @inbounds begin - left_id = neighbor_ids[1, j] - right_id = neighbor_ids[2, j] - - surface_flux_values[i, 2, left_id] = surface_flux_arr[i, j] - surface_flux_values[i, 1, right_id] = surface_flux_arr[i, j] - end - end - - return nothing -end - -# Kernel for setting interface fluxes -function interface_flux_kernel!(surface_flux_values, surface_flux_arr, noncons_left_arr, - noncons_right_arr, neighbor_ids) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_arr, 2)) - @inbounds begin - left_id = neighbor_ids[1, j] - right_id = neighbor_ids[2, j] - - surface_flux_values[i, 2, left_id] = surface_flux_arr[i, j] + - 0.5f0 * noncons_left_arr[i, j] - surface_flux_values[i, 1, right_id] = surface_flux_arr[i, j] + - 0.5f0 * noncons_right_arr[i, j] - end - end - - return nothing -end - -# Kernel for prolonging two boundaries -function prolong_boundaries_kernel!(boundaries_u, u, neighbor_ids, neighbor_sides) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(boundaries_u, 2) && k <= size(boundaries_u, 3)) - @inbounds begin - element = neighbor_ids[k] - side = neighbor_sides[k] - - boundaries_u[1, j, k] = u[j, size(u, 2), element] * (2 - side) # set to 0 instead of NaN - boundaries_u[2, j, k] = u[j, 1, element] * (side - 1) # set to 0 instead of NaN - end - end - - return nothing -end - -# Kernel for calculating boundary fluxes -function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, - indices_arr, neighbor_ids, neighbor_sides, orientations, - boundary_conditions::NamedTuple, equations::AbstractEquations{1}, - surface_flux::Any) - k = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - if (k <= length(boundary_arr)) - @inbounds begin - boundary = boundary_arr[k] - direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) - - neighbor = neighbor_ids[boundary] - side = neighbor_sides[boundary] - orientation = orientations[boundary] - end - - u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, boundary) - u_inner = (2 - side) * u_ll + (side - 1) * u_rr - x = get_node_coords(node_coordinates, equations, boundary) - - # TODO: Improve this part - if direction == 1 - boundary_flux_node = boundary_conditions[1](u_inner, orientation, - direction, x, t, surface_flux, equations) - else - boundary_flux_node = boundary_conditions[2](u_inner, orientation, - direction, x, t, surface_flux, equations) - end - - for ii in axes(surface_flux_values, 1) - # `boundary_flux_node` can be nothing if periodic boundary condition is applied - @inbounds surface_flux_values[ii, direction, neighbor] = isnothing(boundary_flux_node) ? # bad - surface_flux_values[ii, - direction, - neighbor] : - boundary_flux_node[ii] - end - end - - return nothing -end - -# Kernel for calculating boundary fluxes -function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, - indices_arr, neighbor_ids, neighbor_sides, orientations, - boundary_conditions::NamedTuple, equations::AbstractEquations{1}, - surface_flux::Any, nonconservative_flux::Any) - k = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - if (k <= length(boundary_arr)) - @inbounds begin - boundary = boundary_arr[k] - direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) - - neighbor = neighbor_ids[boundary] - side = neighbor_sides[boundary] - orientation = orientations[boundary] - end - - u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, boundary) - u_inner = (2 - side) * u_ll + (side - 1) * u_rr - x = get_node_coords(node_coordinates, equations, boundary) - - # TODO: Improve this part - if direction == 1 - flux_node = boundary_conditions[1](u_inner, orientation, direction, x, t, surface_flux, - equations) - noncons_flux_node = boundary_conditions[1](u_inner, orientation, direction, x, t, - nonconservative_flux, equations) - else - flux_node = boundary_conditions[2](u_inner, orientation, direction, x, t, surface_flux, - equations) - noncons_flux_node = boundary_conditions[2](u_inner, orientation, direction, x, t, - nonconservative_flux, equations) - end - - for ii in axes(surface_flux_values, 1) - @inbounds surface_flux_values[ii, direction, neighbor] = flux_node[ii] + - 0.5f0 * noncons_flux_node[ii] - end - end - - return nothing -end - -# Kernel for calculating surface integrals -function surface_integral_kernel!(du, factor_arr, surface_flux_values, - equations::AbstractEquations{1}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) - @inbounds begin - du[i, j, k] -= surface_flux_values[i, 1, k] * isequal(j, 1) * factor_arr[1] - du[i, j, k] += surface_flux_values[i, 2, k] * isequal(j, size(du, 2)) * factor_arr[2] - end - end - - return nothing -end - -# Kernel for applying inverse Jacobian -function jacobian_kernel!(du, inverse_jacobian, equations::AbstractEquations{1}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) - @inbounds du[i, j, k] *= -inverse_jacobian[k] - end - - return nothing -end - -# Kernel for calculating source terms -function source_terms_kernel!(du, u, node_coordinates, t, equations::AbstractEquations{1}, - source_terms::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(du, 2) && k <= size(du, 3)) - u_local = get_node_vars(u, equations, j, k) - x_local = get_node_coords(node_coordinates, equations, j, k) - - source_terms_node = source_terms(u_local, x_local, t, equations) - - for ii in axes(du, 1) - @inbounds du[ii, j, k] += source_terms_node[ii] - end - end - - return nothing -end +include("dg_1d_kernel.jl") -################################################################################################# # Functions that begin with `cuda_` are the functions that pack CUDA kernels together to do # partial work in semidiscretization. They are used to invoke kernels from the host (i.e., CPU) # and run them on the device (i.e., GPU). diff --git a/src/solvers/dg_1d_kernel.jl b/src/solvers/dg_1d_kernel.jl new file mode 100644 index 0000000..73fcc56 --- /dev/null +++ b/src/solvers/dg_1d_kernel.jl @@ -0,0 +1,926 @@ +# GPU kernels related to a DG semidiscretization in 1D. + +# Functions that end with `_kernel` are CUDA kernels that are going to be launched by +# the @cuda macro with parameters from the kernel configurator. They are purely run on +# the device (i.e., GPU). + +# Kernel for calculating fluxes along normal direction +function flux_kernel!(flux_arr, u, equations::AbstractEquations{1}, flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2) && k <= size(u, 3)) + u_node = get_node_vars(u, equations, j, k) + flux_node = flux(u_node, 1, equations) + + for ii in axes(u, 1) + @inbounds flux_arr[ii, j, k] = flux_node[ii] + end + end + + return nothing +end + +# Kernel for calculating weak form +function weak_form_kernel!(du, derivative_dhat, flux_arr) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) + @inbounds du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros + for ii in axes(du, 2) + @inbounds du[i, j, k] += derivative_dhat[j, ii] * flux_arr[i, ii, k] + end + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating volume integrals with weak form +function flux_weak_form_kernel!(du, u, derivative_dhat, + equations::AbstractEquations{1}, flux::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_dhat = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_flux = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width), offset) + + # Get thread and block indices only we need to save registers + tx, ty = threadIdx().x, threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + # Tile the computation (restrict to one tile here) + value = zero(eltype(du)) + + # Load global `derivative_dhat` into shared memory + for ty2 in axes(du, 2) + # Transposed load + @inbounds shmem_dhat[ty2, ty] = derivative_dhat[ty, ty2] + end + + # Compute flux values + u_node = get_node_vars(u, equations, ty, k) + flux_node = flux(u_node, 1, equations) + + @inbounds shmem_flux[tx, ty] = flux_node[tx] + + sync_threads() + + # Loop within one block to get weak form + # TODO: Avoid potential bank conflicts + for thread in 1:tile_width + @inbounds value += shmem_dhat[thread, ty] * shmem_flux[tx, thread] + end + + # Synchronization is not needed here if we use only one tile + # sync_threads() + + # Finalize the weak form + @inbounds du[tx, ty, k] = value + + return nothing +end + +# Kernel for calculating volume fluxes +function volume_flux_kernel!(volume_flux_arr, u, equations::AbstractEquations{1}, + volume_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^2 && k <= size(u, 3)) + j1 = div(j - 1, size(u, 2)) + 1 + j2 = rem(j - 1, size(u, 2)) + 1 + + u_node = get_node_vars(u, equations, j1, k) + u_node1 = get_node_vars(u, equations, j2, k) + + volume_flux_node = volume_flux(u_node, u_node1, 1, equations) + + for ii in axes(u, 1) + @inbounds volume_flux_arr[ii, j1, j2, k] = volume_flux_node[ii] + end + end + + return nothing +end + +# Kernel for calculating volume integrals +function volume_integral_kernel!(du, derivative_split, volume_flux_arr, + equations::AbstractEquations{1}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) + @inbounds du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros + for ii in axes(du, 2) + @inbounds du[i, j, k] += derivative_split[j, ii] * (1 - isequal(j, ii)) * # set diagonal elements to zeros + volume_flux_arr[i, j, ii, k] + end + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating volume integrals without conservative terms +function volume_flux_integral_kernel!(du, u, derivative_split, + equations::AbstractEquations{1}, volume_flux::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width), offset) + + # Get thread and block indices only we need to save registers + ty = threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + # Tile the computation (set to one tile here) + # Initialize the values + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty] = zero(eltype(du)) + end + + # Load global `derivative_split` into shared memory + for ty2 in axes(du, 2) + # Transposed load + @inbounds shmem_split[ty2, ty] = derivative_split[ty, ty2] * + (1 - isequal(ty, ty2)) # set diagonal elements to zeros + end + + # Synchronization is not needed here given the access pattern + # sync_threads() + + # Compute volume fluxes + # How to store nodes in shared memory? + for thread in 1:tile_width + # Volume flux is heavy in computation so we should try best to avoid redundant + # computation, i.e., use for loop along x direction here + volume_flux_node = volume_flux(get_node_vars(u, equations, ty, k), + get_node_vars(u, equations, thread, k), + 1, equations) + + # TODO: Avoid potential bank conflicts + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty] += shmem_split[thread, ty] * volume_flux_node[tx] + end + end + + # Synchronization is not needed here if we use only one tile + # sync_threads() + + # Finalize the values + for tx in axes(du, 1) + @inbounds du[tx, ty, k] = shmem_value[tx, ty] + end + + return nothing +end + +# Kernel for calculating symmetric and nonconservative fluxes +function noncons_volume_flux_kernel!(symmetric_flux_arr, noncons_flux_arr, u, derivative_split, + equations::AbstractEquations{1}, symmetric_flux::Any, + nonconservative_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^2 && k <= size(u, 3)) + j1 = div(j - 1, size(u, 2)) + 1 + j2 = rem(j - 1, size(u, 2)) + 1 + + u_node = get_node_vars(u, equations, j1, k) + u_node1 = get_node_vars(u, equations, j2, k) + + symmetric_flux_node = symmetric_flux(u_node, u_node1, 1, equations) + noncons_flux_node = nonconservative_flux(u_node, u_node1, 1, equations) + + for ii in axes(u, 1) + @inbounds begin + symmetric_flux_arr[ii, j1, j2, k] = symmetric_flux_node[ii] * derivative_split[j1, j2] * + (1 - isequal(j1, j2)) # set diagonal elements to zeros + noncons_flux_arr[ii, j1, j2, k] = noncons_flux_node[ii] + end + end + end + + return nothing +end + +# Kernel for calculating symmetric and nonconservative volume integrals +function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr, noncons_flux_arr) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) + @inbounds du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros + + for ii in axes(du, 2) + @inbounds du[i, j, k] += symmetric_flux_arr[i, j, ii, k] + + 0.5f0 * + derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k] + end + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating volume integrals with conservative terms +function volume_flux_integral_kernel!(du, u, derivative_split, + equations::AbstractEquations{1}, + symmetric_flux::Any, nonconservative_flux::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width), offset) + + # Get thread and block indices only we need to save registers + ty = threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + # Tile the computation (set to one tile here) + # Initialize the values + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty] = zero(eltype(du)) + end + + # Load data from global memory into shared memory + for ty2 in axes(du, 2) + # Transposed load + @inbounds shmem_split[ty2, ty] = derivative_split[ty, ty2] + end + + # Synchronization is not needed here given the access pattern + # sync_threads() + + # Compute volume fluxes + # How to store nodes in shared memory? + for thread in 1:tile_width + # Volume flux is heavy in computation so we should try best to avoid redundant + # computation, i.e., use for loop along x direction here + u_node = get_node_vars(u, equations, ty, k) + symmetric_flux_node = symmetric_flux(u_node, + get_node_vars(u, equations, thread, k), + 1, equations) + noncons_flux_node = nonconservative_flux(u_node, + get_node_vars(u, equations, thread, k), + 1, equations) + + # TODO: Avoid potential bank conflicts + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty] += symmetric_flux_node[tx] * shmem_split[thread, ty] * + (1 - isequal(ty, thread)) + # set diagonal elements to zeros + 0.5f0 * + noncons_flux_node[tx] * shmem_split[thread, ty] + end + end + + # Synchronization is not needed here if we use only one tile + # sync_threads() + + # Finalize the values + for tx in axes(du, 1) + @inbounds du[tx, ty, k] = shmem_value[tx, ty] + end + + return nothing +end + +# Kernel for calculating pure DG and DG-FV volume fluxes +function volume_flux_dgfv_kernel!(volume_flux_arr, fstar1_L, fstar1_R, u, + alpha, atol, equations::AbstractEquations{1}, + volume_flux_dg::Any, volume_flux_fv::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^2 && k <= size(u, 3)) + j1 = div(j - 1, size(u, 2)) + 1 + j2 = rem(j - 1, size(u, 2)) + 1 + + dg_only = isapprox(alpha[k], 0, atol = atol) + + u_node = get_node_vars(u, equations, j1, k) + u_node1 = get_node_vars(u, equations, j2, k) + + volume_flux_node = volume_flux_dg(u_node, u_node1, 1, equations) + + for ii in axes(u, 1) + @inbounds volume_flux_arr[ii, j1, j2, k] = volume_flux_node[ii] + + # Small optimization, no much performance gain + if isequal(j1 + 1, j2) # avoid race condition + flux_fv_node = volume_flux_fv(u_node, u_node1, 1, equations) + + @inbounds begin + fstar1_L[ii, j2, k] = flux_fv_node[ii] * (1 - dg_only) + fstar1_R[ii, j2, k] = flux_fv_node[ii] * (1 - dg_only) + end + end + end + + # if j1 != 1 && j2 == 1 # bad + # u_ll = get_node_vars(u, equations, j1 - 1, k) + # u_rr = get_node_vars(u, equations, j1, k) + # flux_fv_node = volume_flux_fv(u_ll, u_rr, 1, equations) + + # for ii in axes(u, 1) + # @inbounds begin + # fstar1_L[ii, j1, k] = flux_fv_node[ii] * (1 - dg_only) + # fstar1_R[ii, j1, k] = flux_fv_node[ii] * (1 - dg_only) + # end + # end + # end + end + + return nothing +end + +# Kernel for calculating pure DG and DG-FV volume integrals +function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights, + volume_flux_arr, fstar1_L, fstar1_R, atol, + equations::AbstractEquations{1}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) + @inbounds begin + du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros + alpha_element = alpha[k] + end + + dg_only = isapprox(alpha_element, 0, atol = atol) + + for ii in axes(du, 2) + @inbounds du[i, j, k] += derivative_split[j, ii] * + (1 - isequal(j, ii)) * # set diagonal elements to zeros + volume_flux_arr[i, j, ii, k] * dg_only + + (1 - alpha_element) * derivative_split[j, ii] * + (1 - isequal(j, ii)) * # set diagonal elements to zeros + volume_flux_arr[i, j, ii, k] * (1 - dg_only) + end + + @inbounds du[i, j, k] += alpha_element * inverse_weights[j] * + (fstar1_L[i, j + 1, k] - fstar1_R[i, j, k]) * (1 - dg_only) + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating pure DG and DG-FV volume integrals without conservative terms +function volume_flux_integral_dgfv_kernel!(du, u, alpha, atol, derivative_split, inverse_weights, + equations::AbstractEquations{1}, + volume_flux_dg::Any, volume_flux_fv::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_fstar1 = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width + 1), offset) + offset += sizeof(eltype(du)) * size(du, 1) * (tile_width + 1) + shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width), offset) + + # Get thread and block indices only we need to save registers + ty = threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + # Load global `derivative_split` into shared memory + for ty2 in axes(du, 2) + # Transposed load + @inbounds shmem_split[ty2, ty] = derivative_split[ty, ty2] + end + + # Get variables for computation + @inbounds alpha_element = alpha[k] + dg_only = isapprox(alpha_element, 0, atol = atol) + + # Compute FV volume fluxes + u_node = get_node_vars(u, equations, ty, k) + if ty + 1 <= tile_width + flux_fv_node = volume_flux_fv(u_node, + get_node_vars(u, equations, ty + 1, k), + 1, equations) + end + + # Initialize the values + for tx in axes(du, 1) + @inbounds begin + # Initialize `du` with zeros + shmem_value[tx, ty] = zero(eltype(du)) + # Initialize `fstar` side columes with zeros + shmem_fstar1[tx, 1] = zero(eltype(du)) + shmem_fstar1[tx, tile_width + 1] = zero(eltype(du)) + end + + if ty + 1 <= tile_width + # Set with FV volume fluxes + @inbounds shmem_fstar1[tx, ty + 1] = flux_fv_node[tx] * (1 - dg_only) + end + end + + sync_threads() + + # Contribute FV to the volume integrals + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty] += alpha_element * inverse_weights[ty] * + (shmem_fstar1[tx, ty + 1] - shmem_fstar1[tx, ty]) * (1 - dg_only) + end + + # Compute DG volume fluxes + for thread in 1:tile_width + volume_flux_node = volume_flux_dg(u_node, + get_node_vars(u, equations, thread, k), + 1, equations) + + # Contribute DG to the volume integrals + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty] += shmem_split[thread, ty] * + (1 - isequal(ty, thread)) * # set diagonal elements to zeros + volume_flux_node[tx] * dg_only + + (1 - alpha_element) * shmem_split[thread, ty] * + (1 - isequal(ty, thread)) * # set diagonal elements to zeros + volume_flux_node[tx] * (1 - dg_only) + end + end + + # Finalize the values + for tx in axes(du, 1) + @inbounds du[tx, ty, k] = shmem_value[tx, ty] + end + + return nothing +end + +# Kernel for calculating pure DG and DG-FV volume fluxes +function volume_flux_dgfv_kernel!(volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, + u, alpha, atol, derivative_split, + equations::AbstractEquations{1}, + volume_flux_dg::Any, noncons_flux_dg::Any, + volume_flux_fv::Any, noncons_flux_fv::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^2 && k <= size(u, 3)) + j1 = div(j - 1, size(u, 2)) + 1 + j2 = rem(j - 1, size(u, 2)) + 1 + + dg_only = isapprox(alpha[k], 0, atol = atol) + + u_node = get_node_vars(u, equations, j1, k) + u_node1 = get_node_vars(u, equations, j2, k) + + volume_flux_node = volume_flux_dg(u_node, u_node1, 1, equations) + noncons_flux_node = noncons_flux_dg(u_node, u_node1, 1, equations) + + for ii in axes(u, 1) + @inbounds begin + volume_flux_arr[ii, j1, j2, k] = volume_flux_node[ii] * derivative_split[j1, j2] * + (1 - isequal(j1, j2)) # set diagonal elements to zeros + noncons_flux_arr[ii, j1, j2, k] = noncons_flux_node[ii] + end + + # Small optimization, no much performance gain + if isequal(j1 + 1, j2) # avoid race condition + f1_node = volume_flux_fv(u_node, u_node1, 1, equations) + f1_L_node = noncons_flux_fv(u_node, u_node1, 1, equations) + f1_R_node = noncons_flux_fv(u_node1, u_node, 1, equations) + + @inbounds begin + fstar1_L[ii, j2, k] = (f1_node[ii] + 0.5f0 * f1_L_node[ii]) * (1 - dg_only) + fstar1_R[ii, j2, k] = (f1_node[ii] + 0.5f0 * f1_R_node[ii]) * (1 - dg_only) + end + end + end + + # if j1 != 1 && j2 == 1 # bad + # u_ll = get_node_vars(u, equations, j1 - 1, k) + # u_rr = get_node_vars(u, equations, j1, k) + + # f1_node = volume_flux_fv(u_ll, u_rr, 1, equations) + + # f1_L_node = noncons_flux_fv(u_ll, u_rr, 1, equations) + # f1_R_node = noncons_flux_fv(u_rr, u_ll, 1, equations) + + # for ii in axes(u, 1) + # @inbounds begin + # fstar1_L[ii, j1, k] = (f1_node[ii] + 0.5f0 * f1_L_node[ii]) * (1 - dg_only) + # fstar1_R[ii, j1, k] = (f1_node[ii] + 0.5f0 * f1_R_node[ii]) * (1 - dg_only) + # end + # end + # end + end + + return nothing +end + +# Kernel for calculating pure DG and DG-FV volume integrals +function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights, + volume_flux_arr, noncons_flux_arr, fstar1_L, fstar1_R, + atol, equations::AbstractEquations{1}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) + @inbounds begin + du[i, j, k] = zero(eltype(du)) # initialize `du` with zeros + alpha_element = alpha[k] + end + + dg_only = isapprox(alpha_element, 0, atol = atol) + + for ii in axes(du, 2) + @inbounds du[i, j, k] += (volume_flux_arr[i, j, ii, k] + + 0.5f0 * + derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k]) * dg_only + + ((1 - alpha_element) * volume_flux_arr[i, j, ii, k] + + 0.5f0 * (1 - alpha_element) * + derivative_split[j, ii] * noncons_flux_arr[i, j, ii, k]) * (1 - dg_only) + end + + @inbounds du[i, j, k] += alpha_element * inverse_weights[j] * + (fstar1_L[i, j + 1, k] - fstar1_R[i, j, k]) * (1 - dg_only) + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating pure DG and DG-FV volume integrals with conservative terms +function volume_flux_integral_dgfv_kernel!(du, u, alpha, atol, derivative_split, inverse_weights, + equations::AbstractEquations{1}, + volume_flux_dg::Any, noncons_flux_dg::Any, + volume_flux_fv::Any, noncons_flux_fv::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_fstar1 = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width + 1, 2), offset) + offset += sizeof(eltype(du)) * size(du, 1) * (tile_width + 1) * 2 + shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width), offset) + + # Get thread and block indices only we need to save registers + ty = threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + # Load global `derivative_split` into shared memory + for ty2 in axes(du, 2) + # Transposed load + @inbounds shmem_split[ty2, ty] = derivative_split[ty, ty2] + end + + # Get variables for computation + @inbounds alpha_element = alpha[k] + dg_only = isapprox(alpha_element, 0, atol = atol) + + # Compute FV volume fluxes + u_node = get_node_vars(u, equations, ty, k) + if ty + 1 <= tile_width + f1_node = volume_flux_fv(u_node, + get_node_vars(u, equations, ty + 1, k), + 1, equations) + f1_L_node = noncons_flux_fv(u_node, + get_node_vars(u, equations, ty + 1, k), + 1, equations) + f1_R_node = noncons_flux_fv(get_node_vars(u, equations, ty + 1, k), + u_node, + 1, equations) + end + + # Initialize the values + for tx in axes(du, 1) + @inbounds begin + # Initialize `du` with zeros + shmem_value[tx, ty] = zero(eltype(du)) + + # TODO: Remove shared memory for `fstar` and use local memory + + # Initialize `fstar` side columes with zeros (1: left) + shmem_fstar1[tx, 1, 1] = zero(eltype(du)) + shmem_fstar1[tx, tile_width + 1, 1] = zero(eltype(du)) + + # Initialize `fstar` side columes with zeros (2: right) + shmem_fstar1[tx, 1, 2] = zero(eltype(du)) + shmem_fstar1[tx, tile_width + 1, 2] = zero(eltype(du)) + end + + if ty + 1 <= tile_width + # Set with FV volume fluxes + @inbounds begin + shmem_fstar1[tx, ty + 1, 1] = (f1_node[tx] + 0.5f0 * f1_L_node[tx]) * (1 - dg_only) + shmem_fstar1[tx, ty + 1, 2] = (f1_node[tx] + 0.5f0 * f1_R_node[tx]) * (1 - dg_only) + end + end + end + + sync_threads() + + # Contribute FV to the volume integrals + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty] += alpha_element * inverse_weights[ty] * + (shmem_fstar1[tx, ty + 1, 1] - shmem_fstar1[tx, ty, 2]) * (1 - dg_only) + end + + # Compute DG volume fluxes + for thread in 1:tile_width + volume_flux_node = volume_flux_dg(u_node, + get_node_vars(u, equations, thread, k), + 1, equations) + noncons_flux_node = noncons_flux_dg(u_node, + get_node_vars(u, equations, thread, k), + 1, equations) + + # Contribute DG to the volume integrals + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty] += (volume_flux_node[tx] * shmem_split[thread, ty] * + (1 - isequal(ty, thread)) + # set diagonal elements to zeros + 0.5f0 * + shmem_split[thread, ty] * noncons_flux_node[tx]) * dg_only + + ((1 - alpha_element) * volume_flux_node[tx] * shmem_split[thread, ty] * + (1 - isequal(ty, thread)) + # set diagonal elements to zeros + 0.5f0 * (1 - alpha_element) * + shmem_split[thread, ty] * noncons_flux_node[tx]) * (1 - dg_only) + end + end + + # Finalize the values + for tx in axes(du, 1) + @inbounds du[tx, ty, k] = shmem_value[tx, ty] + end + + return nothing +end + +# Kernel for prolonging two interfaces +function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(interfaces_u, 2) && k <= size(interfaces_u, 3)) + @inbounds begin + left_element = neighbor_ids[1, k] + right_element = neighbor_ids[2, k] + + interfaces_u[1, j, k] = u[j, size(u, 2), left_element] + interfaces_u[2, j, k] = u[j, 1, right_element] + end + end + + return nothing +end + +# Kernel for calculating surface fluxes +function surface_flux_kernel!(surface_flux_arr, interfaces_u, equations::AbstractEquations{1}, + surface_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + if (j <= size(surface_flux_arr, 2)) + u_ll, u_rr = get_surface_node_vars(interfaces_u, equations, j) + + surface_flux_node = surface_flux(u_ll, u_rr, 1, equations) + + for ii in axes(surface_flux_arr, 1) + @inbounds surface_flux_arr[ii, j] = surface_flux_node[ii] + end + end + + return nothing +end + +# Kernel for calculating surface and both nonconservative fluxes +function surface_noncons_flux_kernel!(surface_flux_arr, noncons_left_arr, noncons_right_arr, + interfaces_u, equations::AbstractEquations{1}, + surface_flux::Any, nonconservative_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + if (j <= size(surface_flux_arr, 2)) + u_ll, u_rr = get_surface_node_vars(interfaces_u, equations, j) + + surface_flux_node = surface_flux(u_ll, u_rr, 1, equations) + noncons_left_node = nonconservative_flux(u_ll, u_rr, 1, equations) + noncons_right_node = nonconservative_flux(u_rr, u_ll, 1, equations) + + for ii in axes(surface_flux_arr, 1) + @inbounds begin + surface_flux_arr[ii, j] = surface_flux_node[ii] + noncons_left_arr[ii, j] = noncons_left_node[ii] + noncons_right_arr[ii, j] = noncons_right_node[ii] + end + end + end + + return nothing +end + +# Kernel for setting interface fluxes +function interface_flux_kernel!(surface_flux_values, surface_flux_arr, neighbor_ids) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_arr, 2)) + @inbounds begin + left_id = neighbor_ids[1, j] + right_id = neighbor_ids[2, j] + + surface_flux_values[i, 2, left_id] = surface_flux_arr[i, j] + surface_flux_values[i, 1, right_id] = surface_flux_arr[i, j] + end + end + + return nothing +end + +# Kernel for setting interface fluxes +function interface_flux_kernel!(surface_flux_values, surface_flux_arr, noncons_left_arr, + noncons_right_arr, neighbor_ids) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_arr, 2)) + @inbounds begin + left_id = neighbor_ids[1, j] + right_id = neighbor_ids[2, j] + + surface_flux_values[i, 2, left_id] = surface_flux_arr[i, j] + + 0.5f0 * noncons_left_arr[i, j] + surface_flux_values[i, 1, right_id] = surface_flux_arr[i, j] + + 0.5f0 * noncons_right_arr[i, j] + end + end + + return nothing +end + +# Kernel for prolonging two boundaries +function prolong_boundaries_kernel!(boundaries_u, u, neighbor_ids, neighbor_sides) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(boundaries_u, 2) && k <= size(boundaries_u, 3)) + @inbounds begin + element = neighbor_ids[k] + side = neighbor_sides[k] + + boundaries_u[1, j, k] = u[j, size(u, 2), element] * (2 - side) # set to 0 instead of NaN + boundaries_u[2, j, k] = u[j, 1, element] * (side - 1) # set to 0 instead of NaN + end + end + + return nothing +end + +# Kernel for calculating boundary fluxes +function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, + indices_arr, neighbor_ids, neighbor_sides, orientations, + boundary_conditions::NamedTuple, equations::AbstractEquations{1}, + surface_flux::Any) + k = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + if (k <= length(boundary_arr)) + @inbounds begin + boundary = boundary_arr[k] + direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) + + neighbor = neighbor_ids[boundary] + side = neighbor_sides[boundary] + orientation = orientations[boundary] + end + + u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, boundary) + u_inner = (2 - side) * u_ll + (side - 1) * u_rr + x = get_node_coords(node_coordinates, equations, boundary) + + # TODO: Improve this part + if direction == 1 + boundary_flux_node = boundary_conditions[1](u_inner, orientation, + direction, x, t, surface_flux, equations) + else + boundary_flux_node = boundary_conditions[2](u_inner, orientation, + direction, x, t, surface_flux, equations) + end + + for ii in axes(surface_flux_values, 1) + # `boundary_flux_node` can be nothing if periodic boundary condition is applied + @inbounds surface_flux_values[ii, direction, neighbor] = isnothing(boundary_flux_node) ? # bad + surface_flux_values[ii, + direction, + neighbor] : + boundary_flux_node[ii] + end + end + + return nothing +end + +# Kernel for calculating boundary fluxes +function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, + indices_arr, neighbor_ids, neighbor_sides, orientations, + boundary_conditions::NamedTuple, equations::AbstractEquations{1}, + surface_flux::Any, nonconservative_flux::Any) + k = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + if (k <= length(boundary_arr)) + @inbounds begin + boundary = boundary_arr[k] + direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) + + neighbor = neighbor_ids[boundary] + side = neighbor_sides[boundary] + orientation = orientations[boundary] + end + + u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, boundary) + u_inner = (2 - side) * u_ll + (side - 1) * u_rr + x = get_node_coords(node_coordinates, equations, boundary) + + # TODO: Improve this part + if direction == 1 + flux_node = boundary_conditions[1](u_inner, orientation, direction, x, t, surface_flux, + equations) + noncons_flux_node = boundary_conditions[1](u_inner, orientation, direction, x, t, + nonconservative_flux, equations) + else + flux_node = boundary_conditions[2](u_inner, orientation, direction, x, t, surface_flux, + equations) + noncons_flux_node = boundary_conditions[2](u_inner, orientation, direction, x, t, + nonconservative_flux, equations) + end + + for ii in axes(surface_flux_values, 1) + @inbounds surface_flux_values[ii, direction, neighbor] = flux_node[ii] + + 0.5f0 * noncons_flux_node[ii] + end + end + + return nothing +end + +# Kernel for calculating surface integrals +function surface_integral_kernel!(du, factor_arr, surface_flux_values, + equations::AbstractEquations{1}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) + @inbounds begin + du[i, j, k] -= surface_flux_values[i, 1, k] * isequal(j, 1) * factor_arr[1] + du[i, j, k] += surface_flux_values[i, 2, k] * isequal(j, size(du, 2)) * factor_arr[2] + end + end + + return nothing +end + +# Kernel for applying inverse Jacobian +function jacobian_kernel!(du, inverse_jacobian, equations::AbstractEquations{1}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2) && k <= size(du, 3)) + @inbounds du[i, j, k] *= -inverse_jacobian[k] + end + + return nothing +end + +# Kernel for calculating source terms +function source_terms_kernel!(du, u, node_coordinates, t, equations::AbstractEquations{1}, + source_terms::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(du, 2) && k <= size(du, 3)) + u_local = get_node_vars(u, equations, j, k) + x_local = get_node_coords(node_coordinates, equations, j, k) + + source_terms_node = source_terms(u_local, x_local, t, equations) + + for ii in axes(du, 1) + @inbounds du[ii, j, k] += source_terms_node[ii] + end + end + + return nothing +end diff --git a/src/solvers/dg_2d.jl b/src/solvers/dg_2d.jl index 2487539..12a5180 100644 --- a/src/solvers/dg_2d.jl +++ b/src/solvers/dg_2d.jl @@ -1,1474 +1,7 @@ # Everything related to a DG semidiscretization in 2D. -################################################################################################# -# Functions that end with `_kernel` are CUDA kernels that are going to be launched by -# the @cuda macro with parameters from the kernel configurator. They are purely run on -# the device (i.e., GPU). - -# Kernel for calculating fluxes along normal directions -function flux_kernel!(flux_arr1, flux_arr2, u, equations::AbstractEquations{2}, flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^2 && k <= size(u, 4)) - j1 = div(j - 1, size(u, 2)) + 1 - j2 = rem(j - 1, size(u, 2)) + 1 - - u_node = get_node_vars(u, equations, j1, j2, k) - - flux_node1 = flux(u_node, 1, equations) - flux_node2 = flux(u_node, 2, equations) - - for ii in axes(u, 1) - @inbounds begin - flux_arr1[ii, j1, j2, k] = flux_node1[ii] - flux_arr2[ii, j1, j2, k] = flux_node2[ii] - end - end - end - - return nothing -end - -# Kernel for calculating weak form -function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) - j1 = div(j - 1, size(du, 2)) + 1 - j2 = rem(j - 1, size(du, 2)) + 1 - - @inbounds du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros - - for ii in axes(du, 2) - @inbounds du[i, j1, j2, k] += derivative_dhat[j1, ii] * flux_arr1[i, ii, j2, k] + - derivative_dhat[j2, ii] * flux_arr2[i, j1, ii, k] - end - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating volume integrals with weak form -function flux_weak_form_kernel!(du, u, derivative_dhat, - equations::AbstractEquations{2}, flux::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_dhat = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_flux = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width, 2), offset) - - # Get thread and block indices only we need to save registers - tx, ty = threadIdx().x, threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - ty1 = div(ty - 1, tile_width) + 1 - ty2 = rem(ty - 1, tile_width) + 1 - - # Tile the computation (restrict to one tile here) - value = zero(eltype(du)) - - # Load global `derivative_dhat` into shared memory - # Transposed load - @inbounds shmem_dhat[ty1, ty2] = derivative_dhat[ty2, ty1] - - # Compute flux values - u_node = get_node_vars(u, equations, ty1, ty2, k) - flux_node1 = flux(u_node, 1, equations) - flux_node2 = flux(u_node, 2, equations) - - @inbounds begin - shmem_flux[tx, ty1, ty2, 1] = flux_node1[tx] - shmem_flux[tx, ty1, ty2, 2] = flux_node2[tx] - end - - sync_threads() - - # Loop within one block to get weak form - # TODO: Avoid potential bank conflicts - for thread in 1:tile_width - @inbounds value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread, ty2, 1] + - shmem_dhat[thread, ty2] * shmem_flux[tx, ty1, thread, 2] - end - - # Synchronization is not needed here if we use only one tile - # sync_threads() - - # Finalize the weak form - @inbounds du[tx, ty1, ty2, k] = value - - return nothing -end - -# Kernel for calculating volume fluxes -function volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, u, equations::AbstractEquations{2}, - volume_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^3 && k <= size(u, 4)) - u2 = size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - u_node = get_node_vars(u, equations, j1, j2, k) - u_node1 = get_node_vars(u, equations, j3, j2, k) - u_node2 = get_node_vars(u, equations, j1, j3, k) - - volume_flux_node1 = volume_flux(u_node, u_node1, 1, equations) - volume_flux_node2 = volume_flux(u_node, u_node2, 2, equations) - - for ii in axes(u, 1) - @inbounds begin - volume_flux_arr1[ii, j1, j3, j2, k] = volume_flux_node1[ii] - volume_flux_arr2[ii, j1, j2, j3, k] = volume_flux_node2[ii] - end - end - end - - return nothing -end - -# Kernel for calculating volume integrals -function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_flux_arr2, - equations::AbstractEquations{2}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) - j1 = div(j - 1, size(du, 2)) + 1 - j2 = rem(j - 1, size(du, 2)) + 1 - - @inbounds du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros - - for ii in axes(du, 2) - @inbounds du[i, j1, j2, k] += volume_flux_arr1[i, j1, ii, j2, k] * derivative_split[j1, ii] * - (1 - isequal(j1, ii)) + # set diagonal elements to zeros - volume_flux_arr2[i, j1, j2, ii, k] * derivative_split[j2, ii] * - (1 - isequal(j2, ii)) # set diagonal elements to zeros - end - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating volume integrals without conservative terms -function volume_flux_integral_kernel!(du, u, derivative_split, - equations::AbstractEquations{2}, volume_flux::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width), offset) - - # Get thread and block indices only we need save registers - ty = threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - ty1 = div(ty - 1, tile_width) + 1 - ty2 = rem(ty - 1, tile_width) + 1 - - # Tile the computation (set to one tile here) - # Initialize the values - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2] = zero(eltype(du)) - end - - # Load global `derivative_split` into shared memory - # Transposed load - @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] * - (1 - isequal(ty1, ty2)) # set diagonal elements to zeros - - sync_threads() - - # Compute volume fluxes - # How to store nodes in shared memory? - for thread in 1:tile_width - # Volume flux is heavy in computation so we should try best to avoid redundant - # computation, i.e., use for loop along x direction here - u_node = get_node_vars(u, equations, ty1, ty2, k) - volume_flux_node1 = volume_flux(u_node, - get_node_vars(u, equations, thread, ty2, k), - 1, equations) - volume_flux_node2 = volume_flux(u_node, - get_node_vars(u, equations, ty1, thread, k), - 2, equations) - - # TODO: Avoid potential bank conflicts - # Try another way to parallelize (ty1, ty2) with threads to ty3, then - # consolidate each computation back to (ty1, ty2) - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2] += shmem_split[thread, ty1] * volume_flux_node1[tx] + - shmem_split[thread, ty2] * volume_flux_node2[tx] - end - end - - # Synchronization is not needed here if we use only one tile - # sync_threads() - - # Finalize the values - for tx in axes(du, 1) - @inbounds du[tx, ty1, ty2, k] = shmem_value[tx, ty1, ty2] - end - - return nothing -end - -# Kernel for calculating symmetric and nonconservative fluxes -function noncons_volume_flux_kernel!(symmetric_flux_arr1, symmetric_flux_arr2, noncons_flux_arr1, - noncons_flux_arr2, u, derivative_split, - equations::AbstractEquations{2}, symmetric_flux::Any, - nonconservative_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^3 && k <= size(u, 4)) - u2 = size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - u_node = get_node_vars(u, equations, j1, j2, k) - u_node1 = get_node_vars(u, equations, j3, j2, k) - u_node2 = get_node_vars(u, equations, j1, j3, k) - - symmetric_flux_node1 = symmetric_flux(u_node, u_node1, 1, equations) - symmetric_flux_node2 = symmetric_flux(u_node, u_node2, 2, equations) - - noncons_flux_node1 = nonconservative_flux(u_node, u_node1, 1, equations) - noncons_flux_node2 = nonconservative_flux(u_node, u_node2, 2, equations) - - for ii in axes(u, 1) - @inbounds begin - symmetric_flux_arr1[ii, j1, j3, j2, k] = symmetric_flux_node1[ii] * derivative_split[j1, j3] * - (1 - isequal(j1, j3)) # set diagonal elements to zeros - symmetric_flux_arr2[ii, j1, j2, j3, k] = symmetric_flux_node2[ii] * derivative_split[j2, j3] * - (1 - isequal(j2, j3)) # set diagonal elements to zeros - - noncons_flux_arr1[ii, j1, j3, j2, k] = noncons_flux_node1[ii] - noncons_flux_arr2[ii, j1, j2, j3, k] = noncons_flux_node2[ii] - end - end - end - - return nothing -end - -# Kernel for calculating symmetric and nonconservative volume integrals -function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr1, symmetric_flux_arr2, - noncons_flux_arr1, noncons_flux_arr2) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) - j1 = div(j - 1, size(du, 2)) + 1 - j2 = rem(j - 1, size(du, 2)) + 1 - - @inbounds du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros - - for ii in axes(du, 2) - @inbounds du[i, j1, j2, k] += symmetric_flux_arr1[i, j1, ii, j2, k] + - symmetric_flux_arr2[i, j1, j2, ii, k] + - 0.5f0 * - derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, k] + - 0.5f0 * - derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, k] - end - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating volume integrals with conservative terms -function volume_flux_integral_kernel!(du, u, derivative_split, - equations::AbstractEquations{2}, - symmetric_flux::Any, nonconservative_flux::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width), offset) - - # Get thread and block indices only we need save registers - ty = threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - ty1 = div(ty - 1, tile_width) + 1 - ty2 = rem(ty - 1, tile_width) + 1 - - # Tile the computation (set to one tile here) - # Initialize the values - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2] = zero(eltype(du)) - end - - # Load data from global memory into shared memory - # Transposed load - @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] - - sync_threads() - - # Compute volume fluxes - # How to store nodes in shared memory? - for thread in 1:tile_width - # Volume flux is heavy in computation so we should try best to avoid redundant - # computation, i.e., use for loop along x direction here - u_node = get_node_vars(u, equations, ty1, ty2, k) - symmetric_flux_node1 = symmetric_flux(u_node, - get_node_vars(u, equations, thread, ty2, k), - 1, equations) - symmetric_flux_node2 = symmetric_flux(u_node, - get_node_vars(u, equations, ty1, thread, k), - 2, equations) - noncons_flux_node1 = nonconservative_flux(u_node, - get_node_vars(u, equations, thread, ty2, k), - 1, equations) - noncons_flux_node2 = nonconservative_flux(u_node, - get_node_vars(u, equations, ty1, thread, k), - 2, equations) - - # TODO: Avoid potential bank conflicts - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2] += symmetric_flux_node1[tx] * shmem_split[thread, ty1] * - (1 - isequal(ty1, thread)) + # set diagonal elements to zeros - symmetric_flux_node2[tx] * shmem_split[thread, ty2] * - (1 - isequal(ty2, thread)) + # set diagonal elements to zeros - 0.5f0 * - noncons_flux_node1[tx] * shmem_split[thread, ty1] + - 0.5f0 * - noncons_flux_node2[tx] * shmem_split[thread, ty2] - end - end - - # Synchronization is not needed here if we use only one tile - # sync_threads() - - # Finalize the values - for tx in axes(du, 1) - @inbounds du[tx, ty1, ty2, k] = shmem_value[tx, ty1, ty2] - end - - return nothing -end - -# Kernel for calculating pure DG and DG-FV volume fluxes -function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, fstar1_L, fstar1_R, - fstar2_L, fstar2_R, u, alpha, atol, - equations::AbstractEquations{2}, - volume_flux_dg::Any, volume_flux_fv::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^3 && k <= size(u, 4)) - u2 = size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - dg_only = isapprox(alpha[k], 0, atol = atol) - - u_node = get_node_vars(u, equations, j1, j2, k) - u_node1 = get_node_vars(u, equations, j3, j2, k) - u_node2 = get_node_vars(u, equations, j1, j3, k) - - volume_flux_node1 = volume_flux_dg(u_node, u_node1, 1, equations) - volume_flux_node2 = volume_flux_dg(u_node, u_node2, 2, equations) - - for ii in axes(u, 1) - @inbounds begin - volume_flux_arr1[ii, j1, j3, j2, k] = volume_flux_node1[ii] - volume_flux_arr2[ii, j1, j2, j3, k] = volume_flux_node2[ii] - end - - # Small optimization, no much performance gain - if isequal(j1 + 1, j3) # avoid race condition - flux_fv_node1 = volume_flux_fv(u_node, u_node1, 1, equations) - - @inbounds begin - fstar1_L[ii, j3, j2, k] = flux_fv_node1[ii] * (1 - dg_only) - fstar1_R[ii, j3, j2, k] = flux_fv_node1[ii] * (1 - dg_only) - end - end - - if isequal(j2 + 1, j3) # avoid race condition - flux_fv_node2 = volume_flux_fv(u_node, u_node2, 2, equations) - - @inbounds begin - fstar2_L[ii, j1, j3, k] = flux_fv_node2[ii] * (1 - dg_only) - fstar2_R[ii, j1, j3, k] = flux_fv_node2[ii] * (1 - dg_only) - end - end - end - - # if j1 != 1 && j3 == 1 # bad - # u_ll = get_node_vars(u, equations, j1 - 1, j2, k) - # u_rr = get_node_vars(u, equations, j1, j2, k) - # flux_fv_node1 = volume_flux_fv(u_ll, u_rr, 1, equations) - - # for ii in axes(u, 1) - # @inbounds begin - # fstar1_L[ii, j1, j2, k] = flux_fv_node1[ii] * (1 - dg_only) - # fstar1_R[ii, j1, j2, k] = flux_fv_node1[ii] * (1 - dg_only) - # end - # end - # end - - # if j2 != 1 && j3 == 1 # bad - # u_ll = get_node_vars(u, equations, j1, j2 - 1, k) - # u_rr = get_node_vars(u, equations, j1, j2, k) - # flux_fv_node2 = volume_flux_fv(u_ll, u_rr, 2, equations) - - # for ii in axes(u, 1) - # @inbounds begin - # fstar2_L[ii, j1, j2, k] = flux_fv_node2[ii] * (1 - dg_only) - # fstar2_R[ii, j1, j2, k] = flux_fv_node2[ii] * (1 - dg_only) - # end - # end - # end - end - - return nothing -end - -# Kernel for calculating pure DG and DG-FV volume integrals -function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights, - volume_flux_arr1, volume_flux_arr2, - fstar1_L, fstar1_R, fstar2_L, fstar2_R, atol, - equations::AbstractEquations{2}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) - j1 = div(j - 1, size(du, 2)) + 1 - j2 = rem(j - 1, size(du, 2)) + 1 - - @inbounds begin - du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros - alpha_element = alpha[k] - end - - dg_only = isapprox(alpha_element, 0, atol = atol) - - for ii in axes(du, 2) - @inbounds du[i, j1, j2, k] += (derivative_split[j1, ii] * - (1 - isequal(j1, ii)) * # set diagonal elements to zeros - volume_flux_arr1[i, j1, ii, j2, k] + - derivative_split[j2, ii] * - (1 - isequal(j2, ii)) * # set diagonal elements to zeros - volume_flux_arr2[i, j1, j2, ii, k]) * dg_only + - ((1 - alpha_element) * derivative_split[j1, ii] * - (1 - isequal(j1, ii)) * # set diagonal elements to zeros - volume_flux_arr1[i, j1, ii, j2, k] + - (1 - alpha_element) * derivative_split[j2, ii] * - (1 - isequal(j2, ii)) * # set diagonal elements to zeros - volume_flux_arr2[i, j1, j2, ii, k]) * (1 - dg_only) - end - - @inbounds du[i, j1, j2, k] += alpha_element * - (inverse_weights[j1] * - (fstar1_L[i, j1 + 1, j2, k] - fstar1_R[i, j1, j2, k]) + - inverse_weights[j2] * - (fstar2_L[i, j1, j2 + 1, k] - fstar2_R[i, j1, j2, k])) * (1 - dg_only) - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating pure DG and DG-FV volume integrals without conservative terms -function volume_flux_integral_dgfv_kernel!(du, u, alpha, atol, derivative_split, inverse_weights, - equations::AbstractEquations{2}, - volume_flux_dg::Any, volume_flux_fv::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - # TODO: Combine `fstar` into single allocation - shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_fstar1 = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width + 1, tile_width), offset) - offset += sizeof(eltype(du)) * size(du, 1) * (tile_width + 1) * tile_width - shmem_fstar2 = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width + 1), offset) - offset += sizeof(eltype(du)) * size(du, 1) * tile_width * (tile_width + 1) - shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width), offset) - - # Get thread and block indices only we need save registers - ty = threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - ty1 = div(ty - 1, tile_width) + 1 - ty2 = rem(ty - 1, tile_width) + 1 - - # Load global `derivative_split` into shared memory - # Transposed load - @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] - - # Get variables for computation - @inbounds alpha_element = alpha[k] - dg_only = isapprox(alpha_element, 0, atol = atol) - - # Compute FV volume fluxes - u_node = get_node_vars(u, equations, ty1, ty2, k) - if ty1 + 1 <= tile_width - flux_fv_node1 = volume_flux_fv(u_node, - get_node_vars(u, equations, ty1 + 1, ty2, k), - 1, equations) - end - if ty2 + 1 <= tile_width - flux_fv_node2 = volume_flux_fv(u_node, - get_node_vars(u, equations, ty1, ty2 + 1, k), - 2, equations) - end - - # Initialize the values - for tx in axes(du, 1) - @inbounds begin - # Initialize `du` with zeros - shmem_value[tx, ty1, ty2] = zero(eltype(du)) - # Initialize `fstar` side columes with zeros - shmem_fstar1[tx, 1, ty2] = zero(eltype(du)) - shmem_fstar1[tx, tile_width + 1, ty2] = zero(eltype(du)) - shmem_fstar2[tx, ty1, 1] = zero(eltype(du)) - shmem_fstar2[tx, ty1, tile_width + 1] = zero(eltype(du)) - end - - if ty1 + 1 <= tile_width - # Set with FV volume fluxes - @inbounds shmem_fstar1[tx, ty1 + 1, ty2] = flux_fv_node1[tx] * (1 - dg_only) - end - if ty2 + 1 <= tile_width - # Set with FV volume fluxes - @inbounds shmem_fstar2[tx, ty1, ty2 + 1] = flux_fv_node2[tx] * (1 - dg_only) - end - end - - sync_threads() - - # Contribute FV to the volume integrals - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2] += alpha_element * - (inverse_weights[ty1] * - (shmem_fstar1[tx, ty1 + 1, ty2] - shmem_fstar1[tx, ty1, ty2]) + - inverse_weights[ty2] * - (shmem_fstar2[tx, ty1, ty2 + 1] - shmem_fstar2[tx, ty1, ty2])) * - (1 - dg_only) - end - - # Compute DG volume fluxes - for thread in 1:tile_width - volume_flux_node1 = volume_flux_dg(u_node, - get_node_vars(u, equations, thread, ty2, k), - 1, equations) - volume_flux_node2 = volume_flux_dg(u_node, - get_node_vars(u, equations, ty1, thread, k), - 2, equations) - - # Contribute DG to the volume integrals - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2] += (shmem_split[thread, ty1] * - (1 - isequal(ty1, thread)) * # set diagonal elements to zeros - volume_flux_node1[tx] + - shmem_split[thread, ty2] * - (1 - isequal(ty2, thread)) * # set diagonal elements to zeros - volume_flux_node2[tx]) * dg_only + - ((1 - alpha_element) * shmem_split[thread, ty1] * - (1 - isequal(ty1, thread)) * # set diagonal elements to zeros - volume_flux_node1[tx] + - (1 - alpha_element) * shmem_split[thread, ty2] * - (1 - isequal(ty2, thread)) * # set diagonal elements to zeros - volume_flux_node2[tx]) * (1 - dg_only) - end - end - - # Finalize the values - for tx in axes(du, 1) - @inbounds du[tx, ty1, ty2, k] = shmem_value[tx, ty1, ty2] - end - - return nothing -end - -# Kernel for calculating pure DG and DG-FV volume fluxes -function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, noncons_flux_arr1, - noncons_flux_arr2, fstar1_L, fstar1_R, fstar2_L, fstar2_R, - u, alpha, atol, derivative_split, - equations::AbstractEquations{2}, - volume_flux_dg::Any, noncons_flux_dg::Any, - volume_flux_fv::Any, noncons_flux_fv::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^3 && k <= size(u, 4)) - u2 = size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - dg_only = isapprox(alpha[k], 0, atol = atol) - - u_node = get_node_vars(u, equations, j1, j2, k) - u_node1 = get_node_vars(u, equations, j3, j2, k) - u_node2 = get_node_vars(u, equations, j1, j3, k) - - volume_flux_node1 = volume_flux_dg(u_node, u_node1, 1, equations) - volume_flux_node2 = volume_flux_dg(u_node, u_node2, 2, equations) - - noncons_flux_node1 = noncons_flux_dg(u_node, u_node1, 1, equations) - noncons_flux_node2 = noncons_flux_dg(u_node, u_node2, 2, equations) - - for ii in axes(u, 1) - @inbounds begin - volume_flux_arr1[ii, j1, j3, j2, k] = volume_flux_node1[ii] * derivative_split[j1, j3] * - (1 - isequal(j1, j3)) # set diagonal elements to zeros - volume_flux_arr2[ii, j1, j2, j3, k] = volume_flux_node2[ii] * derivative_split[j2, j3] * - (1 - isequal(j2, j3)) # set diagonal elements to zeros - noncons_flux_arr1[ii, j1, j3, j2, k] = noncons_flux_node1[ii] - noncons_flux_arr2[ii, j1, j2, j3, k] = noncons_flux_node2[ii] - end - - # Small optimization, no much performance gain - if isequal(j1 + 1, j3) # avoid race condition - f1_node = volume_flux_fv(u_node, u_node1, 1, equations) - f1_L_node = noncons_flux_fv(u_node, u_node1, 1, equations) - f1_R_node = noncons_flux_fv(u_node1, u_node, 1, equations) - - @inbounds begin - fstar1_L[ii, j3, j2, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only) - fstar1_R[ii, j3, j2, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only) - end - end - - if isequal(j2 + 1, j3) # avoid race condition - f2_node = volume_flux_fv(u_node, u_node2, 2, equations) - f2_L_node = noncons_flux_fv(u_node, u_node2, 2, equations) - f2_R_node = noncons_flux_fv(u_node2, u_node, 2, equations) - - @inbounds begin - fstar2_L[ii, j1, j3, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only) - fstar2_R[ii, j1, j3, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only) - end - end - end - - # if j1 != 1 && j3 == 1 # bad - # u_ll = get_node_vars(u, equations, j1 - 1, j2, k) - # u_rr = get_node_vars(u, equations, j1, j2, k) - - # f1_node = volume_flux_fv(u_ll, u_rr, 1, equations) - - # f1_L_node = noncons_flux_fv(u_ll, u_rr, 1, equations) - # f1_R_node = noncons_flux_fv(u_rr, u_ll, 1, equations) - - # for ii in axes(u, 1) - # @inbounds begin - # fstar1_L[ii, j1, j2, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only) - # fstar1_R[ii, j1, j2, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only) - # end - # end - # end - - # if j2 != 1 && j3 == 1 # bad - # u_ll = get_node_vars(u, equations, j1, j2 - 1, k) - # u_rr = get_node_vars(u, equations, j1, j2, k) - - # f2_node = volume_flux_fv(u_ll, u_rr, 2, equations) - - # f2_L_node = noncons_flux_fv(u_ll, u_rr, 2, equations) - # f2_R_node = noncons_flux_fv(u_rr, u_ll, 2, equations) - - # for ii in axes(u, 1) - # @inbounds begin - # fstar2_L[ii, j1, j2, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only) - # fstar2_R[ii, j1, j2, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only) - # end - # end - # end - end - - return nothing -end - -# Kernel for calculating pure DG and DG-FV volume integrals -function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights, - volume_flux_arr1, volume_flux_arr2, - noncons_flux_arr1, noncons_flux_arr2, - fstar1_L, fstar1_R, fstar2_L, fstar2_R, atol, - equations::AbstractEquations{2}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) - j1 = div(j - 1, size(du, 2)) + 1 - j2 = rem(j - 1, size(du, 2)) + 1 - - @inbounds begin - du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros - alpha_element = alpha[k] - end - - dg_only = isapprox(alpha_element, 0, atol = atol) - - for ii in axes(du, 2) - @inbounds du[i, j1, j2, k] += (volume_flux_arr1[i, j1, ii, j2, k] + - volume_flux_arr2[i, j1, j2, ii, k] + - 0.5f0 * - (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, k] + - derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, k])) * dg_only + - ((1 - alpha_element) * - volume_flux_arr1[i, j1, ii, j2, k] + - (1 - alpha_element) * - volume_flux_arr2[i, j1, j2, ii, k] + - 0.5f0 * (1 - alpha_element) * - (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, k] + - derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, k])) * (1 - dg_only) - end - - @inbounds du[i, j1, j2, k] += alpha_element * - (inverse_weights[j1] * - (fstar1_L[i, j1 + 1, j2, k] - fstar1_R[i, j1, j2, k]) + - inverse_weights[j2] * - (fstar2_L[i, j1, j2 + 1, k] - fstar2_R[i, j1, j2, k])) * (1 - dg_only) - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating pure DG and DG-FV volume integrals with conservative terms -function volume_flux_integral_dgfv_kernel!(du, u, alpha, atol, derivative_split, inverse_weights, - equations::AbstractEquations{2}, - volume_flux_dg::Any, noncons_flux_dg::Any, - volume_flux_fv::Any, noncons_flux_fv::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_fstar1 = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width + 1, tile_width, 2), offset) - offset += sizeof(eltype(du)) * size(du, 1) * (tile_width + 1) * tile_width * 2 - shmem_fstar2 = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width + 1, 2), offset) - offset += sizeof(eltype(du)) * size(du, 1) * tile_width * (tile_width + 1) * 2 - shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width), offset) - - # Get thread and block indices only we need save registers - ty = threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - ty1 = div(ty - 1, tile_width) + 1 - ty2 = rem(ty - 1, tile_width) + 1 - - # Load global `derivative_split` into shared memory - # Transposed load - @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] - - # Get variables for computation - @inbounds alpha_element = alpha[k] - dg_only = isapprox(alpha_element, 0, atol = atol) - - # Compute FV volume fluxes - u_node = get_node_vars(u, equations, ty1, ty2, k) - if ty1 + 1 <= tile_width - f1_node = volume_flux_fv(u_node, - get_node_vars(u, equations, ty1 + 1, ty2, k), - 1, equations) - f1_L_node = noncons_flux_fv(u_node, - get_node_vars(u, equations, ty1 + 1, ty2, k), - 1, equations) - f1_R_node = noncons_flux_fv(get_node_vars(u, equations, ty1 + 1, ty2, k), - u_node, - 1, equations) - end - if ty2 + 1 <= tile_width - f2_node = volume_flux_fv(u_node, - get_node_vars(u, equations, ty1, ty2 + 1, k), - 2, equations) - f2_L_node = noncons_flux_fv(u_node, - get_node_vars(u, equations, ty1, ty2 + 1, k), - 2, equations) - f2_R_node = noncons_flux_fv(get_node_vars(u, equations, ty1, ty2 + 1, k), - u_node, - 2, equations) - end - - # Initialize the values - for tx in axes(du, 1) - @inbounds begin - # Initialize `du` with zeros - shmem_value[tx, ty1, ty2] = zero(eltype(du)) - - # TODO: Remove shared memory for `fstar` and use local memory - - # Initialize `fstar` side columes with zeros (1: left) - shmem_fstar1[tx, 1, ty2, 1] = zero(eltype(du)) - shmem_fstar1[tx, tile_width + 1, ty2, 1] = zero(eltype(du)) - shmem_fstar2[tx, ty1, 1, 1] = zero(eltype(du)) - shmem_fstar2[tx, ty1, tile_width + 1, 1] = zero(eltype(du)) - - # Initialize `fstar` side columes with zeros (2: right) - shmem_fstar1[tx, 1, ty2, 2] = zero(eltype(du)) - shmem_fstar1[tx, tile_width + 1, ty2, 2] = zero(eltype(du)) - shmem_fstar2[tx, ty1, 1, 2] = zero(eltype(du)) - shmem_fstar2[tx, ty1, tile_width + 1, 2] = zero(eltype(du)) - end - - if ty1 + 1 <= tile_width - # Set with FV volume fluxes - @inbounds begin - shmem_fstar1[tx, ty1 + 1, ty2, 1] = f1_node[tx] + 0.5f0 * f1_L_node[tx] * (1 - dg_only) - shmem_fstar1[tx, ty1 + 1, ty2, 2] = f1_node[tx] + 0.5f0 * f1_R_node[tx] * (1 - dg_only) - end - end - if ty2 + 1 <= tile_width - # Set with FV volume fluxes - @inbounds begin - shmem_fstar2[tx, ty1, ty2 + 1, 1] = f2_node[tx] + 0.5f0 * f2_L_node[tx] * (1 - dg_only) - shmem_fstar2[tx, ty1, ty2 + 1, 2] = f2_node[tx] + 0.5f0 * f2_R_node[tx] * (1 - dg_only) - end - end - end - - sync_threads() - - # Contribute FV to the volume integrals - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2] += alpha_element * - (inverse_weights[ty1] * - (shmem_fstar1[tx, ty1 + 1, ty2, 1] - shmem_fstar1[tx, ty1, ty2, 2]) + - inverse_weights[ty2] * - (shmem_fstar2[tx, ty1, ty2 + 1, 1] - shmem_fstar2[tx, ty1, ty2, 2])) * (1 - dg_only) - end - - # Compute DG volume fluxes - for thread in 1:tile_width - volume_flux_node1 = volume_flux_dg(u_node, - get_node_vars(u, equations, thread, ty2, k), - 1, equations) - volume_flux_node2 = volume_flux_dg(u_node, - get_node_vars(u, equations, ty1, thread, k), - 2, equations) - - noncons_flux_node1 = noncons_flux_dg(u_node, - get_node_vars(u, equations, thread, ty2, k), - 1, equations) - noncons_flux_node2 = noncons_flux_dg(u_node, - get_node_vars(u, equations, ty1, thread, k), - 2, equations) - - # Contribute DG to the volume integrals - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2] += (volume_flux_node1[tx] * shmem_split[thread, ty1] * - (1 - isequal(ty1, thread)) + # set diagonal elements to zeros - volume_flux_node2[tx] * shmem_split[thread, ty2] * - (1 - isequal(ty2, thread)) + - 0.5f0 * - (shmem_split[thread, ty1] * noncons_flux_node1[tx] + - shmem_split[thread, ty2] * noncons_flux_node2[tx])) * dg_only + - ((1 - alpha_element) * - volume_flux_node1[tx] * shmem_split[thread, ty1] * - (1 - isequal(ty1, thread)) + # set diagonal elements to zeros - (1 - alpha_element) * - volume_flux_node2[tx] * shmem_split[thread, ty2] * - (1 - isequal(ty2, thread)) + # set diagonal elements to zeros - 0.5f0 * (1 - alpha_element) * - (shmem_split[thread, ty1] * noncons_flux_node1[tx] + - shmem_split[thread, ty2] * noncons_flux_node2[tx])) * (1 - dg_only) - end - end - - # Finalize the values - for tx in axes(du, 1) - @inbounds du[tx, ty1, ty2, k] = shmem_value[tx, ty1, ty2] - end - - return nothing -end - -# Kernel for prolonging two interfaces -function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids, orientations, - euqations::AbstractEquations{2}) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(interfaces_u, 2) * size(interfaces_u, 3) && k <= size(interfaces_u, 4)) - u2 = size(u, 2) # size(interfaces_u, 3) == size(u, 2) - - j1 = div(j - 1, u2) + 1 - j2 = rem(j - 1, u2) + 1 - - @inbounds begin - orientation = orientations[k] - left_element = neighbor_ids[1, k] - right_element = neighbor_ids[2, k] - - interfaces_u[1, j1, j2, k] = u[j1, - (2 - orientation) * u2 + (orientation - 1) * j2, - (2 - orientation) * j2 + (orientation - 1) * u2, - left_element] - interfaces_u[2, j1, j2, k] = u[j1, - (2 - orientation) + (orientation - 1) * j2, - (2 - orientation) * j2 + (orientation - 1), - right_element] - end - end - - return nothing -end - -# Kernel for calculating surface fluxes -function surface_flux_kernel!(surface_flux_arr, interfaces_u, orientations, - equations::AbstractEquations{2}, surface_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(surface_flux_arr, 2) && k <= size(surface_flux_arr, 3)) - u_ll, u_rr = get_surface_node_vars(interfaces_u, equations, j, k) - @inbounds orientation = orientations[k] - - surface_flux_node = surface_flux(u_ll, u_rr, orientation, equations) - - for ii in axes(surface_flux_arr, 1) - @inbounds surface_flux_arr[ii, j, k] = surface_flux_node[ii] - end - end - - return nothing -end - -# Kernel for calculating surface and both nonconservative fluxes -function surface_noncons_flux_kernel!(surface_flux_arr, noncons_left_arr, noncons_right_arr, - interfaces_u, orientations, equations::AbstractEquations{2}, - surface_flux::Any, nonconservative_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(surface_flux_arr, 2) && k <= size(surface_flux_arr, 3)) - u_ll, u_rr = get_surface_node_vars(interfaces_u, equations, j, k) - @inbounds orientation = orientations[k] - - surface_flux_node = surface_flux(u_ll, u_rr, orientation, equations) - noncons_left_node = nonconservative_flux(u_ll, u_rr, orientation, equations) - noncons_right_node = nonconservative_flux(u_rr, u_ll, orientation, equations) - - for ii in axes(surface_flux_arr, 1) - @inbounds begin - surface_flux_arr[ii, j, k] = surface_flux_node[ii] - noncons_left_arr[ii, j, k] = noncons_left_node[ii] - noncons_right_arr[ii, j, k] = noncons_right_node[ii] - end - end - end - - return nothing -end - -# Kernel for setting interface fluxes -function interface_flux_kernel!(surface_flux_values, surface_flux_arr, neighbor_ids, orientations, - equations::AbstractEquations{2}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_arr, 2) && - k <= size(surface_flux_arr, 3)) - @inbounds begin - left_id = neighbor_ids[1, k] - right_id = neighbor_ids[2, k] - - left_direction = 2 * orientations[k] - right_direction = 2 * orientations[k] - 1 - - surface_flux_values[i, j, left_direction, left_id] = surface_flux_arr[i, j, k] - surface_flux_values[i, j, right_direction, right_id] = surface_flux_arr[i, j, k] - end - end - - return nothing -end - -# Kernel for setting interface fluxes -function interface_flux_kernel!(surface_flux_values, surface_flux_arr, noncons_left_arr, - noncons_right_arr, neighbor_ids, orientations, - equations::AbstractEquations{2}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_arr, 2) && - k <= size(surface_flux_arr, 3)) - @inbounds begin - left_id = neighbor_ids[1, k] - right_id = neighbor_ids[2, k] - - left_direction = 2 * orientations[k] - right_direction = 2 * orientations[k] - 1 - - surface_flux_values[i, j, left_direction, left_id] = surface_flux_arr[i, j, k] + - 0.5f0 * - noncons_left_arr[i, j, k] - surface_flux_values[i, j, right_direction, right_id] = surface_flux_arr[i, j, k] + - 0.5f0 * - noncons_right_arr[i, j, k] - end - end - - return nothing -end - -# Kernel for prolonging two boundaries -function prolong_boundaries_kernel!(boundaries_u, u, neighbor_ids, neighbor_sides, orientations, - equations::AbstractEquations{2}) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(boundaries_u, 2) * size(boundaries_u, 3) && k <= size(boundaries_u, 4)) - u2 = size(u, 2) # size(boundaries_u, 3) == size(u, 2) - - j1 = div(j - 1, u2) + 1 - j2 = rem(j - 1, u2) + 1 - - @inbounds begin - element = neighbor_ids[k] - side = neighbor_sides[k] - orientation = orientations[k] - - boundaries_u[1, j1, j2, k] = u[j1, - (2 - orientation) * u2 + (orientation - 1) * j2, - (2 - orientation) * j2 + (orientation - 1) * u2, - element] * (2 - side) # Set to 0 instead of NaN - boundaries_u[2, j1, j2, k] = u[j1, - (2 - orientation) + (orientation - 1) * j2, - (2 - orientation) * j2 + (orientation - 1), - element] * (side - 1) # Set to 0 instead of NaN - end - end - - return nothing -end - -# Kernel for calculating boundary fluxes -function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, - indices_arr, neighbor_ids, neighbor_sides, orientations, - boundary_conditions::NamedTuple, equations::AbstractEquations{2}, - surface_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(surface_flux_values, 2) && k <= length(boundary_arr)) - @inbounds begin - boundary = boundary_arr[k] - direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) + - (indices_arr[3] <= boundary) + (indices_arr[4] <= boundary) - - neighbor = neighbor_ids[boundary] - side = neighbor_sides[boundary] - orientation = orientations[boundary] - end - - u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, j, boundary) - u_inner = (2 - side) * u_ll + (side - 1) * u_rr - x = get_node_coords(node_coordinates, equations, j, boundary) - - # TODO: Improve this part - if direction == 1 - boundary_flux_node = boundary_conditions[1](u_inner, orientation, - direction, x, t, surface_flux, equations) - elseif direction == 2 - boundary_flux_node = boundary_conditions[2](u_inner, orientation, - direction, x, t, surface_flux, equations) - elseif direction == 3 - boundary_flux_node = boundary_conditions[3](u_inner, orientation, - direction, x, t, surface_flux, equations) - else - boundary_flux_node = boundary_conditions[4](u_inner, orientation, - direction, x, t, surface_flux, equations) - end - - for ii in axes(surface_flux_values, 1) - # `boundary_flux_node` can be nothing if periodic boundary condition is applied - @inbounds surface_flux_values[ii, j, direction, neighbor] = isnothing(boundary_flux_node) ? # bad - surface_flux_values[ii, j, - direction, - neighbor] : - boundary_flux_node[ii] - end - end - - return nothing -end - -# Kernel for calculating boundary fluxes -function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, - indices_arr, neighbor_ids, neighbor_sides, orientations, - boundary_conditions::NamedTuple, equations::AbstractEquations{2}, - surface_flux::Any, nonconservative_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(surface_flux_values, 2) && k <= length(boundary_arr)) - @inbounds begin - boundary = boundary_arr[k] - direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) + - (indices_arr[3] <= boundary) + (indices_arr[4] <= boundary) - - neighbor = neighbor_ids[boundary] - side = neighbor_sides[boundary] - orientation = orientations[boundary] - end - - u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, j, boundary) - u_inner = (2 - side) * u_ll + (side - 1) * u_rr - x = get_node_coords(node_coordinates, equations, j, boundary) - - # TODO: Improve this part - if direction == 1 - flux_node = boundary_conditions[1](u_inner, orientation, direction, x, t, surface_flux, - equations) - noncons_flux_node = boundary_conditions[1](u_inner, orientation, direction, x, t, - nonconservative_flux, equations) - elseif direction == 2 - flux_node = boundary_conditions[2](u_inner, orientation, direction, x, t, surface_flux, - equations) - noncons_flux_node = boundary_conditions[2](u_inner, orientation, direction, x, t, - nonconservative_flux, equations) - elseif direction == 3 - flux_node = boundary_conditions[3](u_inner, orientation, direction, x, t, surface_flux, - equations) - noncons_flux_node = boundary_conditions[3](u_inner, orientation, direction, x, t, - nonconservative_flux, equations) - else - flux_node = boundary_conditions[4](u_inner, orientation, direction, x, t, surface_flux, - equations) - noncons_flux_node = boundary_conditions[4](u_inner, orientation, direction, x, t, - nonconservative_flux, equations) - end - - for ii in axes(surface_flux_values, 1) - @inbounds surface_flux_values[ii, j, direction, neighbor] = flux_node[ii] + - 0.5f0 * noncons_flux_node[ii] - end - end - - return nothing -end - -# Kernel for copying data small to small on mortars -function prolong_mortars_small2small_kernel!(u_upper, u_lower, u, neighbor_ids, large_sides, - orientations) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(u_upper, 2) && j <= size(u_upper, 3) && k <= size(u_upper, 4)) - @inbounds begin - large_side = large_sides[k] - orientation = orientations[k] - - lower_element = neighbor_ids[1, k] - upper_element = neighbor_ids[2, k] - end - - u2 = size(u, 2) - - @inbounds begin - u_upper[2, i, j, k] = u[i, - (2 - orientation) + (orientation - 1) * j, - (2 - orientation) * j + (orientation - 1), - upper_element] * (2 - large_side) - - u_lower[2, i, j, k] = u[i, - (2 - orientation) + (orientation - 1) * j, - (2 - orientation) * j + (orientation - 1), - lower_element] * (2 - large_side) - - u_upper[1, i, j, k] = u[i, - (2 - orientation) * u2 + (orientation - 1) * j, - (2 - orientation) * j + (orientation - 1) * u2, - upper_element] * (large_side - 1) - - u_lower[1, i, j, k] = u[i, - (2 - orientation) * u2 + (orientation - 1) * j, - (2 - orientation) * j + (orientation - 1) * u2, - lower_element] * (large_side - 1) - end - end - - return nothing -end - -# Kernel for interpolating data large to small on mortars -function prolong_mortars_large2small_kernel!(u_upper, u_lower, u, forward_upper, forward_lower, - neighbor_ids, large_sides, orientations) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(u_upper, 2) && j <= size(u_upper, 3) && k <= size(u_upper, 4)) - @inbounds begin - large_side = large_sides[k] - orientation = orientations[k] - large_element = neighbor_ids[3, k] - end - - leftright = large_side - u2 = size(u, 2) - - for jj in axes(forward_upper, 2) - @inbounds begin - u_upper[leftright, i, j, k] += forward_upper[j, jj] * - u[i, - (2 - orientation) * u2 + (orientation - 1) * jj, - (2 - orientation) * jj + (orientation - 1) * u2, - large_element] * (2 - large_side) - u_lower[leftright, i, j, k] += forward_lower[j, jj] * - u[i, - (2 - orientation) * u2 + (orientation - 1) * jj, - (2 - orientation) * jj + (orientation - 1) * u2, - large_element] * (2 - large_side) - end - end - - for jj in axes(forward_lower, 2) - @inbounds begin - u_upper[leftright, i, j, k] += forward_upper[j, jj] * - u[i, - (2 - orientation) + (orientation - 1) * jj, - (2 - orientation) * jj + (orientation - 1), - large_element] * (large_side - 1) - u_lower[leftright, i, j, k] += forward_lower[j, jj] * - u[i, - (2 - orientation) + (orientation - 1) * jj, - (2 - orientation) * jj + (orientation - 1), - large_element] * (large_side - 1) - end - end - end - - return nothing -end - -# Kernel for calculating mortar fluxes -function mortar_flux_kernel!(fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, - fstar_secondary_lower, u_upper, u_lower, orientations, - equations::AbstractEquations{2}, surface_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u_upper, 3) && k <= length(orientations)) - u_upper_ll, u_upper_rr = get_surface_node_vars(u_upper, equations, j, k) - u_lower_ll, u_lower_rr = get_surface_node_vars(u_lower, equations, j, k) - @inbounds orientation = orientations[k] - - flux_upper_node = surface_flux(u_upper_ll, u_upper_rr, orientation, equations) - flux_lower_node = surface_flux(u_lower_ll, u_lower_rr, orientation, equations) - - for ii in axes(fstar_primary_upper, 1) - @inbounds begin - fstar_primary_upper[ii, j, k] = flux_upper_node[ii] - fstar_primary_lower[ii, j, k] = flux_lower_node[ii] - fstar_secondary_upper[ii, j, k] = flux_upper_node[ii] - fstar_secondary_lower[ii, j, k] = flux_lower_node[ii] - end - end - end - - return nothing -end - -# Kernel for calculating mortar fluxes and adding nonconservative fluxes -function mortar_flux_kernel!(fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, - fstar_secondary_lower, u_upper, u_lower, orientations, large_sides, - equations::AbstractEquations{2}, surface_flux::Any, - nonconservative_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u_upper, 3) && k <= length(orientations)) - u_upper_ll, u_upper_rr = get_surface_node_vars(u_upper, equations, j, k) - u_lower_ll, u_lower_rr = get_surface_node_vars(u_lower, equations, j, k) - - @inbounds begin - orientation = orientations[k] - large_side = large_sides[k] - end - - flux_upper_node = surface_flux(u_upper_ll, u_upper_rr, orientation, equations) - flux_lower_node = surface_flux(u_lower_ll, u_lower_rr, orientation, equations) - - for ii in axes(fstar_primary_upper, 1) - @inbounds begin - fstar_primary_upper[ii, j, k] = flux_upper_node[ii] - fstar_primary_lower[ii, j, k] = flux_lower_node[ii] - fstar_secondary_upper[ii, j, k] = flux_upper_node[ii] - fstar_secondary_lower[ii, j, k] = flux_lower_node[ii] - end - end - - u_upper1 = (2 - large_side) * u_upper_ll + (large_side - 1) * u_upper_rr - u_upper2 = (large_side - 1) * u_upper_ll + (2 - large_side) * u_upper_rr - - u_lower1 = (2 - large_side) * u_lower_ll + (large_side - 1) * u_lower_rr - u_lower2 = (large_side - 1) * u_lower_ll + (2 - large_side) * u_lower_rr - - noncons_flux_primary_upper = nonconservative_flux(u_upper1, u_upper2, orientation, - equations) - noncons_flux_primary_lower = nonconservative_flux(u_lower1, u_lower2, orientation, - equations) - noncons_flux_secondary_upper = nonconservative_flux(u_upper2, u_upper1, orientation, - equations) - noncons_flux_secondary_lower = nonconservative_flux(u_lower2, u_lower1, orientation, - equations) - - for ii in axes(fstar_primary_upper, 1) - @inbounds begin - fstar_primary_upper[ii, j, k] += 0.5f0 * noncons_flux_primary_upper[ii] - fstar_primary_lower[ii, j, k] += 0.5f0 * noncons_flux_primary_lower[ii] - fstar_secondary_upper[ii, j, k] += 0.5f0 * noncons_flux_secondary_upper[ii] - fstar_secondary_lower[ii, j, k] += 0.5f0 * noncons_flux_secondary_lower[ii] - end - end - end - - return nothing -end - -# Kernel for copying mortar fluxes small to small and small to large -function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, - fstar_primary_upper, fstar_primary_lower, - fstar_secondary_upper, fstar_secondary_lower, - reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2) && - k <= length(orientations)) - @inbounds begin - large_element = neighbor_ids[3, k] - upper_element = neighbor_ids[2, k] - lower_element = neighbor_ids[1, k] - - large_side = large_sides[k] - orientation = orientations[k] - - # Use math expression to enhance performance (against control flow), it is equivalent to, - # `(2 - large_side) * (2 - orientation) * 1 + - # (2 - large_side) * (orientation - 1) * 3 + - # (large_side - 1) * (2 - orientation) * 2 + - # (large_side - 1) * (orientation - 1) * 4`. - direction = large_side + 2 * orientation - 2 - - surface_flux_values[i, j, direction, upper_element] = fstar_primary_upper[i, j, k] - surface_flux_values[i, j, direction, lower_element] = fstar_primary_lower[i, j, k] - - # Use math expression to enhance performance (against control flow), it is equivalent to, - # `(2 - large_side) * (2 - orientation) * 2 + - # (2 - large_side) * (orientation - 1) * 4 + - # (large_side - 1) * (2 - orientation) * 1 + - # (large_side - 1) * (orientation - 1) * 3`. - direction = 2 * orientation - large_side + 1 - end - - for ii in axes(reverse_upper, 2) # i.e., ` for ii in axes(reverse_lower, 2)` - @inbounds tmp_surface_flux_values[i, j, direction, large_element] += fstar_secondary_upper[i, ii, k] * - reverse_upper[j, ii] + - fstar_secondary_lower[i, ii, k] * - reverse_lower[j, ii] - end - - @inbounds surface_flux_values[i, j, direction, large_element] = tmp_surface_flux_values[i, j, - direction, - large_element] - end - - return nothing -end - -# Kernel for calculating surface integrals -function surface_integral_kernel!(du, factor_arr, surface_flux_values, - equations::AbstractEquations{2}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) - u2 = size(du, 2) - - j1 = div(j - 1, u2) + 1 - j2 = rem(j - 1, u2) + 1 - - @inbounds begin - du[i, j1, j2, k] -= (surface_flux_values[i, j2, 1, k] * isequal(j1, 1) + - surface_flux_values[i, j1, 3, k] * isequal(j2, 1)) * factor_arr[1] - du[i, j1, j2, k] += (surface_flux_values[i, j2, 2, k] * isequal(j1, u2) + - surface_flux_values[i, j1, 4, k] * isequal(j2, u2)) * factor_arr[2] - end - end - - return nothing -end - -# Kernel for applying inverse Jacobian -function jacobian_kernel!(du, inverse_jacobian, equations::AbstractEquations{2}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) - j1 = div(j - 1, size(du, 2)) + 1 - j2 = rem(j - 1, size(du, 2)) + 1 - - @inbounds du[i, j1, j2, k] *= -inverse_jacobian[k] - end - - return nothing -end - -# CUDA kernel for calculating source terms -function source_terms_kernel!(du, u, node_coordinates, t, equations::AbstractEquations{2}, - source_terms::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(du, 2)^2 && k <= size(du, 4)) - j1 = div(j - 1, size(du, 2)) + 1 - j2 = rem(j - 1, size(du, 2)) + 1 - - u_local = get_node_vars(u, equations, j1, j2, k) - x_local = get_node_coords(node_coordinates, equations, j1, j2, k) - - source_terms_node = source_terms(u_local, x_local, t, equations) - - for ii in axes(du, 1) - @inbounds du[ii, j1, j2, k] += source_terms_node[ii] - end - end - - return nothing -end +include("dg_2d_kernel.jl") -################################################################################################# # Functions that begin with `cuda_` are the functions that pack CUDA kernels together to do # partial work in semidiscretization. They are used to invoke kernels from the host (i.e., CPU) # and run them on the device (i.e., GPU). diff --git a/src/solvers/dg_2d_kernel.jl b/src/solvers/dg_2d_kernel.jl new file mode 100644 index 0000000..faafa12 --- /dev/null +++ b/src/solvers/dg_2d_kernel.jl @@ -0,0 +1,1468 @@ +# GPU kernels related to a DG semidiscretization in 2D. + +# Functions that end with `_kernel` are CUDA kernels that are going to be launched by +# the @cuda macro with parameters from the kernel configurator. They are purely run on +# the device (i.e., GPU). + +# Kernel for calculating fluxes along normal directions +function flux_kernel!(flux_arr1, flux_arr2, u, equations::AbstractEquations{2}, flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^2 && k <= size(u, 4)) + j1 = div(j - 1, size(u, 2)) + 1 + j2 = rem(j - 1, size(u, 2)) + 1 + + u_node = get_node_vars(u, equations, j1, j2, k) + + flux_node1 = flux(u_node, 1, equations) + flux_node2 = flux(u_node, 2, equations) + + for ii in axes(u, 1) + @inbounds begin + flux_arr1[ii, j1, j2, k] = flux_node1[ii] + flux_arr2[ii, j1, j2, k] = flux_node2[ii] + end + end + end + + return nothing +end + +# Kernel for calculating weak form +function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) + j1 = div(j - 1, size(du, 2)) + 1 + j2 = rem(j - 1, size(du, 2)) + 1 + + @inbounds du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros + + for ii in axes(du, 2) + @inbounds du[i, j1, j2, k] += derivative_dhat[j1, ii] * flux_arr1[i, ii, j2, k] + + derivative_dhat[j2, ii] * flux_arr2[i, j1, ii, k] + end + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating volume integrals with weak form +function flux_weak_form_kernel!(du, u, derivative_dhat, + equations::AbstractEquations{2}, flux::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_dhat = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_flux = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width, 2), offset) + + # Get thread and block indices only we need to save registers + tx, ty = threadIdx().x, threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + ty1 = div(ty - 1, tile_width) + 1 + ty2 = rem(ty - 1, tile_width) + 1 + + # Tile the computation (restrict to one tile here) + value = zero(eltype(du)) + + # Load global `derivative_dhat` into shared memory + # Transposed load + @inbounds shmem_dhat[ty1, ty2] = derivative_dhat[ty2, ty1] + + # Compute flux values + u_node = get_node_vars(u, equations, ty1, ty2, k) + flux_node1 = flux(u_node, 1, equations) + flux_node2 = flux(u_node, 2, equations) + + @inbounds begin + shmem_flux[tx, ty1, ty2, 1] = flux_node1[tx] + shmem_flux[tx, ty1, ty2, 2] = flux_node2[tx] + end + + sync_threads() + + # Loop within one block to get weak form + # TODO: Avoid potential bank conflicts + for thread in 1:tile_width + @inbounds value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread, ty2, 1] + + shmem_dhat[thread, ty2] * shmem_flux[tx, ty1, thread, 2] + end + + # Synchronization is not needed here if we use only one tile + # sync_threads() + + # Finalize the weak form + @inbounds du[tx, ty1, ty2, k] = value + + return nothing +end + +# Kernel for calculating volume fluxes +function volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, u, equations::AbstractEquations{2}, + volume_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^3 && k <= size(u, 4)) + u2 = size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + u_node = get_node_vars(u, equations, j1, j2, k) + u_node1 = get_node_vars(u, equations, j3, j2, k) + u_node2 = get_node_vars(u, equations, j1, j3, k) + + volume_flux_node1 = volume_flux(u_node, u_node1, 1, equations) + volume_flux_node2 = volume_flux(u_node, u_node2, 2, equations) + + for ii in axes(u, 1) + @inbounds begin + volume_flux_arr1[ii, j1, j3, j2, k] = volume_flux_node1[ii] + volume_flux_arr2[ii, j1, j2, j3, k] = volume_flux_node2[ii] + end + end + end + + return nothing +end + +# Kernel for calculating volume integrals +function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_flux_arr2, + equations::AbstractEquations{2}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) + j1 = div(j - 1, size(du, 2)) + 1 + j2 = rem(j - 1, size(du, 2)) + 1 + + @inbounds du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros + + for ii in axes(du, 2) + @inbounds du[i, j1, j2, k] += volume_flux_arr1[i, j1, ii, j2, k] * derivative_split[j1, ii] * + (1 - isequal(j1, ii)) + # set diagonal elements to zeros + volume_flux_arr2[i, j1, j2, ii, k] * derivative_split[j2, ii] * + (1 - isequal(j2, ii)) # set diagonal elements to zeros + end + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating volume integrals without conservative terms +function volume_flux_integral_kernel!(du, u, derivative_split, + equations::AbstractEquations{2}, volume_flux::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width), offset) + + # Get thread and block indices only we need save registers + ty = threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + ty1 = div(ty - 1, tile_width) + 1 + ty2 = rem(ty - 1, tile_width) + 1 + + # Tile the computation (set to one tile here) + # Initialize the values + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2] = zero(eltype(du)) + end + + # Load global `derivative_split` into shared memory + # Transposed load + @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] * + (1 - isequal(ty1, ty2)) # set diagonal elements to zeros + + sync_threads() + + # Compute volume fluxes + # How to store nodes in shared memory? + for thread in 1:tile_width + # Volume flux is heavy in computation so we should try best to avoid redundant + # computation, i.e., use for loop along x direction here + u_node = get_node_vars(u, equations, ty1, ty2, k) + volume_flux_node1 = volume_flux(u_node, + get_node_vars(u, equations, thread, ty2, k), + 1, equations) + volume_flux_node2 = volume_flux(u_node, + get_node_vars(u, equations, ty1, thread, k), + 2, equations) + + # TODO: Avoid potential bank conflicts + # Try another way to parallelize (ty1, ty2) with threads to ty3, then + # consolidate each computation back to (ty1, ty2) + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2] += shmem_split[thread, ty1] * volume_flux_node1[tx] + + shmem_split[thread, ty2] * volume_flux_node2[tx] + end + end + + # Synchronization is not needed here if we use only one tile + # sync_threads() + + # Finalize the values + for tx in axes(du, 1) + @inbounds du[tx, ty1, ty2, k] = shmem_value[tx, ty1, ty2] + end + + return nothing +end + +# Kernel for calculating symmetric and nonconservative fluxes +function noncons_volume_flux_kernel!(symmetric_flux_arr1, symmetric_flux_arr2, noncons_flux_arr1, + noncons_flux_arr2, u, derivative_split, + equations::AbstractEquations{2}, symmetric_flux::Any, + nonconservative_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^3 && k <= size(u, 4)) + u2 = size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + u_node = get_node_vars(u, equations, j1, j2, k) + u_node1 = get_node_vars(u, equations, j3, j2, k) + u_node2 = get_node_vars(u, equations, j1, j3, k) + + symmetric_flux_node1 = symmetric_flux(u_node, u_node1, 1, equations) + symmetric_flux_node2 = symmetric_flux(u_node, u_node2, 2, equations) + + noncons_flux_node1 = nonconservative_flux(u_node, u_node1, 1, equations) + noncons_flux_node2 = nonconservative_flux(u_node, u_node2, 2, equations) + + for ii in axes(u, 1) + @inbounds begin + symmetric_flux_arr1[ii, j1, j3, j2, k] = symmetric_flux_node1[ii] * derivative_split[j1, j3] * + (1 - isequal(j1, j3)) # set diagonal elements to zeros + symmetric_flux_arr2[ii, j1, j2, j3, k] = symmetric_flux_node2[ii] * derivative_split[j2, j3] * + (1 - isequal(j2, j3)) # set diagonal elements to zeros + + noncons_flux_arr1[ii, j1, j3, j2, k] = noncons_flux_node1[ii] + noncons_flux_arr2[ii, j1, j2, j3, k] = noncons_flux_node2[ii] + end + end + end + + return nothing +end + +# Kernel for calculating symmetric and nonconservative volume integrals +function volume_integral_kernel!(du, derivative_split, symmetric_flux_arr1, symmetric_flux_arr2, + noncons_flux_arr1, noncons_flux_arr2) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) + j1 = div(j - 1, size(du, 2)) + 1 + j2 = rem(j - 1, size(du, 2)) + 1 + + @inbounds du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros + + for ii in axes(du, 2) + @inbounds du[i, j1, j2, k] += symmetric_flux_arr1[i, j1, ii, j2, k] + + symmetric_flux_arr2[i, j1, j2, ii, k] + + 0.5f0 * + derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, k] + + 0.5f0 * + derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, k] + end + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating volume integrals with conservative terms +function volume_flux_integral_kernel!(du, u, derivative_split, + equations::AbstractEquations{2}, + symmetric_flux::Any, nonconservative_flux::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width), offset) + + # Get thread and block indices only we need save registers + ty = threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + ty1 = div(ty - 1, tile_width) + 1 + ty2 = rem(ty - 1, tile_width) + 1 + + # Tile the computation (set to one tile here) + # Initialize the values + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2] = zero(eltype(du)) + end + + # Load data from global memory into shared memory + # Transposed load + @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] + + sync_threads() + + # Compute volume fluxes + # How to store nodes in shared memory? + for thread in 1:tile_width + # Volume flux is heavy in computation so we should try best to avoid redundant + # computation, i.e., use for loop along x direction here + u_node = get_node_vars(u, equations, ty1, ty2, k) + symmetric_flux_node1 = symmetric_flux(u_node, + get_node_vars(u, equations, thread, ty2, k), + 1, equations) + symmetric_flux_node2 = symmetric_flux(u_node, + get_node_vars(u, equations, ty1, thread, k), + 2, equations) + noncons_flux_node1 = nonconservative_flux(u_node, + get_node_vars(u, equations, thread, ty2, k), + 1, equations) + noncons_flux_node2 = nonconservative_flux(u_node, + get_node_vars(u, equations, ty1, thread, k), + 2, equations) + + # TODO: Avoid potential bank conflicts + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2] += symmetric_flux_node1[tx] * shmem_split[thread, ty1] * + (1 - isequal(ty1, thread)) + # set diagonal elements to zeros + symmetric_flux_node2[tx] * shmem_split[thread, ty2] * + (1 - isequal(ty2, thread)) + # set diagonal elements to zeros + 0.5f0 * + noncons_flux_node1[tx] * shmem_split[thread, ty1] + + 0.5f0 * + noncons_flux_node2[tx] * shmem_split[thread, ty2] + end + end + + # Synchronization is not needed here if we use only one tile + # sync_threads() + + # Finalize the values + for tx in axes(du, 1) + @inbounds du[tx, ty1, ty2, k] = shmem_value[tx, ty1, ty2] + end + + return nothing +end + +# Kernel for calculating pure DG and DG-FV volume fluxes +function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, fstar1_L, fstar1_R, + fstar2_L, fstar2_R, u, alpha, atol, + equations::AbstractEquations{2}, + volume_flux_dg::Any, volume_flux_fv::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^3 && k <= size(u, 4)) + u2 = size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + dg_only = isapprox(alpha[k], 0, atol = atol) + + u_node = get_node_vars(u, equations, j1, j2, k) + u_node1 = get_node_vars(u, equations, j3, j2, k) + u_node2 = get_node_vars(u, equations, j1, j3, k) + + volume_flux_node1 = volume_flux_dg(u_node, u_node1, 1, equations) + volume_flux_node2 = volume_flux_dg(u_node, u_node2, 2, equations) + + for ii in axes(u, 1) + @inbounds begin + volume_flux_arr1[ii, j1, j3, j2, k] = volume_flux_node1[ii] + volume_flux_arr2[ii, j1, j2, j3, k] = volume_flux_node2[ii] + end + + # Small optimization, no much performance gain + if isequal(j1 + 1, j3) # avoid race condition + flux_fv_node1 = volume_flux_fv(u_node, u_node1, 1, equations) + + @inbounds begin + fstar1_L[ii, j3, j2, k] = flux_fv_node1[ii] * (1 - dg_only) + fstar1_R[ii, j3, j2, k] = flux_fv_node1[ii] * (1 - dg_only) + end + end + + if isequal(j2 + 1, j3) # avoid race condition + flux_fv_node2 = volume_flux_fv(u_node, u_node2, 2, equations) + + @inbounds begin + fstar2_L[ii, j1, j3, k] = flux_fv_node2[ii] * (1 - dg_only) + fstar2_R[ii, j1, j3, k] = flux_fv_node2[ii] * (1 - dg_only) + end + end + end + + # if j1 != 1 && j3 == 1 # bad + # u_ll = get_node_vars(u, equations, j1 - 1, j2, k) + # u_rr = get_node_vars(u, equations, j1, j2, k) + # flux_fv_node1 = volume_flux_fv(u_ll, u_rr, 1, equations) + + # for ii in axes(u, 1) + # @inbounds begin + # fstar1_L[ii, j1, j2, k] = flux_fv_node1[ii] * (1 - dg_only) + # fstar1_R[ii, j1, j2, k] = flux_fv_node1[ii] * (1 - dg_only) + # end + # end + # end + + # if j2 != 1 && j3 == 1 # bad + # u_ll = get_node_vars(u, equations, j1, j2 - 1, k) + # u_rr = get_node_vars(u, equations, j1, j2, k) + # flux_fv_node2 = volume_flux_fv(u_ll, u_rr, 2, equations) + + # for ii in axes(u, 1) + # @inbounds begin + # fstar2_L[ii, j1, j2, k] = flux_fv_node2[ii] * (1 - dg_only) + # fstar2_R[ii, j1, j2, k] = flux_fv_node2[ii] * (1 - dg_only) + # end + # end + # end + end + + return nothing +end + +# Kernel for calculating pure DG and DG-FV volume integrals +function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights, + volume_flux_arr1, volume_flux_arr2, + fstar1_L, fstar1_R, fstar2_L, fstar2_R, atol, + equations::AbstractEquations{2}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) + j1 = div(j - 1, size(du, 2)) + 1 + j2 = rem(j - 1, size(du, 2)) + 1 + + @inbounds begin + du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros + alpha_element = alpha[k] + end + + dg_only = isapprox(alpha_element, 0, atol = atol) + + for ii in axes(du, 2) + @inbounds du[i, j1, j2, k] += (derivative_split[j1, ii] * + (1 - isequal(j1, ii)) * # set diagonal elements to zeros + volume_flux_arr1[i, j1, ii, j2, k] + + derivative_split[j2, ii] * + (1 - isequal(j2, ii)) * # set diagonal elements to zeros + volume_flux_arr2[i, j1, j2, ii, k]) * dg_only + + ((1 - alpha_element) * derivative_split[j1, ii] * + (1 - isequal(j1, ii)) * # set diagonal elements to zeros + volume_flux_arr1[i, j1, ii, j2, k] + + (1 - alpha_element) * derivative_split[j2, ii] * + (1 - isequal(j2, ii)) * # set diagonal elements to zeros + volume_flux_arr2[i, j1, j2, ii, k]) * (1 - dg_only) + end + + @inbounds du[i, j1, j2, k] += alpha_element * + (inverse_weights[j1] * + (fstar1_L[i, j1 + 1, j2, k] - fstar1_R[i, j1, j2, k]) + + inverse_weights[j2] * + (fstar2_L[i, j1, j2 + 1, k] - fstar2_R[i, j1, j2, k])) * (1 - dg_only) + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating pure DG and DG-FV volume integrals without conservative terms +function volume_flux_integral_dgfv_kernel!(du, u, alpha, atol, derivative_split, inverse_weights, + equations::AbstractEquations{2}, + volume_flux_dg::Any, volume_flux_fv::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + # TODO: Combine `fstar` into single allocation + shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_fstar1 = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width + 1, tile_width), offset) + offset += sizeof(eltype(du)) * size(du, 1) * (tile_width + 1) * tile_width + shmem_fstar2 = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width + 1), offset) + offset += sizeof(eltype(du)) * size(du, 1) * tile_width * (tile_width + 1) + shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width), offset) + + # Get thread and block indices only we need save registers + ty = threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + ty1 = div(ty - 1, tile_width) + 1 + ty2 = rem(ty - 1, tile_width) + 1 + + # Load global `derivative_split` into shared memory + # Transposed load + @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] + + # Get variables for computation + @inbounds alpha_element = alpha[k] + dg_only = isapprox(alpha_element, 0, atol = atol) + + # Compute FV volume fluxes + u_node = get_node_vars(u, equations, ty1, ty2, k) + if ty1 + 1 <= tile_width + flux_fv_node1 = volume_flux_fv(u_node, + get_node_vars(u, equations, ty1 + 1, ty2, k), + 1, equations) + end + if ty2 + 1 <= tile_width + flux_fv_node2 = volume_flux_fv(u_node, + get_node_vars(u, equations, ty1, ty2 + 1, k), + 2, equations) + end + + # Initialize the values + for tx in axes(du, 1) + @inbounds begin + # Initialize `du` with zeros + shmem_value[tx, ty1, ty2] = zero(eltype(du)) + # Initialize `fstar` side columes with zeros + shmem_fstar1[tx, 1, ty2] = zero(eltype(du)) + shmem_fstar1[tx, tile_width + 1, ty2] = zero(eltype(du)) + shmem_fstar2[tx, ty1, 1] = zero(eltype(du)) + shmem_fstar2[tx, ty1, tile_width + 1] = zero(eltype(du)) + end + + if ty1 + 1 <= tile_width + # Set with FV volume fluxes + @inbounds shmem_fstar1[tx, ty1 + 1, ty2] = flux_fv_node1[tx] * (1 - dg_only) + end + if ty2 + 1 <= tile_width + # Set with FV volume fluxes + @inbounds shmem_fstar2[tx, ty1, ty2 + 1] = flux_fv_node2[tx] * (1 - dg_only) + end + end + + sync_threads() + + # Contribute FV to the volume integrals + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2] += alpha_element * + (inverse_weights[ty1] * + (shmem_fstar1[tx, ty1 + 1, ty2] - shmem_fstar1[tx, ty1, ty2]) + + inverse_weights[ty2] * + (shmem_fstar2[tx, ty1, ty2 + 1] - shmem_fstar2[tx, ty1, ty2])) * + (1 - dg_only) + end + + # Compute DG volume fluxes + for thread in 1:tile_width + volume_flux_node1 = volume_flux_dg(u_node, + get_node_vars(u, equations, thread, ty2, k), + 1, equations) + volume_flux_node2 = volume_flux_dg(u_node, + get_node_vars(u, equations, ty1, thread, k), + 2, equations) + + # Contribute DG to the volume integrals + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2] += (shmem_split[thread, ty1] * + (1 - isequal(ty1, thread)) * # set diagonal elements to zeros + volume_flux_node1[tx] + + shmem_split[thread, ty2] * + (1 - isequal(ty2, thread)) * # set diagonal elements to zeros + volume_flux_node2[tx]) * dg_only + + ((1 - alpha_element) * shmem_split[thread, ty1] * + (1 - isequal(ty1, thread)) * # set diagonal elements to zeros + volume_flux_node1[tx] + + (1 - alpha_element) * shmem_split[thread, ty2] * + (1 - isequal(ty2, thread)) * # set diagonal elements to zeros + volume_flux_node2[tx]) * (1 - dg_only) + end + end + + # Finalize the values + for tx in axes(du, 1) + @inbounds du[tx, ty1, ty2, k] = shmem_value[tx, ty1, ty2] + end + + return nothing +end + +# Kernel for calculating pure DG and DG-FV volume fluxes +function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, noncons_flux_arr1, + noncons_flux_arr2, fstar1_L, fstar1_R, fstar2_L, fstar2_R, + u, alpha, atol, derivative_split, + equations::AbstractEquations{2}, + volume_flux_dg::Any, noncons_flux_dg::Any, + volume_flux_fv::Any, noncons_flux_fv::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^3 && k <= size(u, 4)) + u2 = size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + dg_only = isapprox(alpha[k], 0, atol = atol) + + u_node = get_node_vars(u, equations, j1, j2, k) + u_node1 = get_node_vars(u, equations, j3, j2, k) + u_node2 = get_node_vars(u, equations, j1, j3, k) + + volume_flux_node1 = volume_flux_dg(u_node, u_node1, 1, equations) + volume_flux_node2 = volume_flux_dg(u_node, u_node2, 2, equations) + + noncons_flux_node1 = noncons_flux_dg(u_node, u_node1, 1, equations) + noncons_flux_node2 = noncons_flux_dg(u_node, u_node2, 2, equations) + + for ii in axes(u, 1) + @inbounds begin + volume_flux_arr1[ii, j1, j3, j2, k] = volume_flux_node1[ii] * derivative_split[j1, j3] * + (1 - isequal(j1, j3)) # set diagonal elements to zeros + volume_flux_arr2[ii, j1, j2, j3, k] = volume_flux_node2[ii] * derivative_split[j2, j3] * + (1 - isequal(j2, j3)) # set diagonal elements to zeros + noncons_flux_arr1[ii, j1, j3, j2, k] = noncons_flux_node1[ii] + noncons_flux_arr2[ii, j1, j2, j3, k] = noncons_flux_node2[ii] + end + + # Small optimization, no much performance gain + if isequal(j1 + 1, j3) # avoid race condition + f1_node = volume_flux_fv(u_node, u_node1, 1, equations) + f1_L_node = noncons_flux_fv(u_node, u_node1, 1, equations) + f1_R_node = noncons_flux_fv(u_node1, u_node, 1, equations) + + @inbounds begin + fstar1_L[ii, j3, j2, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only) + fstar1_R[ii, j3, j2, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only) + end + end + + if isequal(j2 + 1, j3) # avoid race condition + f2_node = volume_flux_fv(u_node, u_node2, 2, equations) + f2_L_node = noncons_flux_fv(u_node, u_node2, 2, equations) + f2_R_node = noncons_flux_fv(u_node2, u_node, 2, equations) + + @inbounds begin + fstar2_L[ii, j1, j3, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only) + fstar2_R[ii, j1, j3, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only) + end + end + end + + # if j1 != 1 && j3 == 1 # bad + # u_ll = get_node_vars(u, equations, j1 - 1, j2, k) + # u_rr = get_node_vars(u, equations, j1, j2, k) + + # f1_node = volume_flux_fv(u_ll, u_rr, 1, equations) + + # f1_L_node = noncons_flux_fv(u_ll, u_rr, 1, equations) + # f1_R_node = noncons_flux_fv(u_rr, u_ll, 1, equations) + + # for ii in axes(u, 1) + # @inbounds begin + # fstar1_L[ii, j1, j2, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only) + # fstar1_R[ii, j1, j2, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only) + # end + # end + # end + + # if j2 != 1 && j3 == 1 # bad + # u_ll = get_node_vars(u, equations, j1, j2 - 1, k) + # u_rr = get_node_vars(u, equations, j1, j2, k) + + # f2_node = volume_flux_fv(u_ll, u_rr, 2, equations) + + # f2_L_node = noncons_flux_fv(u_ll, u_rr, 2, equations) + # f2_R_node = noncons_flux_fv(u_rr, u_ll, 2, equations) + + # for ii in axes(u, 1) + # @inbounds begin + # fstar2_L[ii, j1, j2, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only) + # fstar2_R[ii, j1, j2, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only) + # end + # end + # end + end + + return nothing +end + +# Kernel for calculating pure DG and DG-FV volume integrals +function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights, + volume_flux_arr1, volume_flux_arr2, + noncons_flux_arr1, noncons_flux_arr2, + fstar1_L, fstar1_R, fstar2_L, fstar2_R, atol, + equations::AbstractEquations{2}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) + j1 = div(j - 1, size(du, 2)) + 1 + j2 = rem(j - 1, size(du, 2)) + 1 + + @inbounds begin + du[i, j1, j2, k] = zero(eltype(du)) # initialize `du` with zeros + alpha_element = alpha[k] + end + + dg_only = isapprox(alpha_element, 0, atol = atol) + + for ii in axes(du, 2) + @inbounds du[i, j1, j2, k] += (volume_flux_arr1[i, j1, ii, j2, k] + + volume_flux_arr2[i, j1, j2, ii, k] + + 0.5f0 * + (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, k] + + derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, k])) * dg_only + + ((1 - alpha_element) * + volume_flux_arr1[i, j1, ii, j2, k] + + (1 - alpha_element) * + volume_flux_arr2[i, j1, j2, ii, k] + + 0.5f0 * (1 - alpha_element) * + (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, k] + + derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, k])) * (1 - dg_only) + end + + @inbounds du[i, j1, j2, k] += alpha_element * + (inverse_weights[j1] * + (fstar1_L[i, j1 + 1, j2, k] - fstar1_R[i, j1, j2, k]) + + inverse_weights[j2] * + (fstar2_L[i, j1, j2 + 1, k] - fstar2_R[i, j1, j2, k])) * (1 - dg_only) + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating pure DG and DG-FV volume integrals with conservative terms +function volume_flux_integral_dgfv_kernel!(du, u, alpha, atol, derivative_split, inverse_weights, + equations::AbstractEquations{2}, + volume_flux_dg::Any, noncons_flux_dg::Any, + volume_flux_fv::Any, noncons_flux_fv::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_fstar1 = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width + 1, tile_width, 2), offset) + offset += sizeof(eltype(du)) * size(du, 1) * (tile_width + 1) * tile_width * 2 + shmem_fstar2 = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width + 1, 2), offset) + offset += sizeof(eltype(du)) * size(du, 1) * tile_width * (tile_width + 1) * 2 + shmem_value = CuDynamicSharedArray(eltype(du), (size(du, 1), tile_width, tile_width), offset) + + # Get thread and block indices only we need save registers + ty = threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + ty1 = div(ty - 1, tile_width) + 1 + ty2 = rem(ty - 1, tile_width) + 1 + + # Load global `derivative_split` into shared memory + # Transposed load + @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] + + # Get variables for computation + @inbounds alpha_element = alpha[k] + dg_only = isapprox(alpha_element, 0, atol = atol) + + # Compute FV volume fluxes + u_node = get_node_vars(u, equations, ty1, ty2, k) + if ty1 + 1 <= tile_width + f1_node = volume_flux_fv(u_node, + get_node_vars(u, equations, ty1 + 1, ty2, k), + 1, equations) + f1_L_node = noncons_flux_fv(u_node, + get_node_vars(u, equations, ty1 + 1, ty2, k), + 1, equations) + f1_R_node = noncons_flux_fv(get_node_vars(u, equations, ty1 + 1, ty2, k), + u_node, + 1, equations) + end + if ty2 + 1 <= tile_width + f2_node = volume_flux_fv(u_node, + get_node_vars(u, equations, ty1, ty2 + 1, k), + 2, equations) + f2_L_node = noncons_flux_fv(u_node, + get_node_vars(u, equations, ty1, ty2 + 1, k), + 2, equations) + f2_R_node = noncons_flux_fv(get_node_vars(u, equations, ty1, ty2 + 1, k), + u_node, + 2, equations) + end + + # Initialize the values + for tx in axes(du, 1) + @inbounds begin + # Initialize `du` with zeros + shmem_value[tx, ty1, ty2] = zero(eltype(du)) + + # TODO: Remove shared memory for `fstar` and use local memory + + # Initialize `fstar` side columes with zeros (1: left) + shmem_fstar1[tx, 1, ty2, 1] = zero(eltype(du)) + shmem_fstar1[tx, tile_width + 1, ty2, 1] = zero(eltype(du)) + shmem_fstar2[tx, ty1, 1, 1] = zero(eltype(du)) + shmem_fstar2[tx, ty1, tile_width + 1, 1] = zero(eltype(du)) + + # Initialize `fstar` side columes with zeros (2: right) + shmem_fstar1[tx, 1, ty2, 2] = zero(eltype(du)) + shmem_fstar1[tx, tile_width + 1, ty2, 2] = zero(eltype(du)) + shmem_fstar2[tx, ty1, 1, 2] = zero(eltype(du)) + shmem_fstar2[tx, ty1, tile_width + 1, 2] = zero(eltype(du)) + end + + if ty1 + 1 <= tile_width + # Set with FV volume fluxes + @inbounds begin + shmem_fstar1[tx, ty1 + 1, ty2, 1] = f1_node[tx] + 0.5f0 * f1_L_node[tx] * (1 - dg_only) + shmem_fstar1[tx, ty1 + 1, ty2, 2] = f1_node[tx] + 0.5f0 * f1_R_node[tx] * (1 - dg_only) + end + end + if ty2 + 1 <= tile_width + # Set with FV volume fluxes + @inbounds begin + shmem_fstar2[tx, ty1, ty2 + 1, 1] = f2_node[tx] + 0.5f0 * f2_L_node[tx] * (1 - dg_only) + shmem_fstar2[tx, ty1, ty2 + 1, 2] = f2_node[tx] + 0.5f0 * f2_R_node[tx] * (1 - dg_only) + end + end + end + + sync_threads() + + # Contribute FV to the volume integrals + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2] += alpha_element * + (inverse_weights[ty1] * + (shmem_fstar1[tx, ty1 + 1, ty2, 1] - shmem_fstar1[tx, ty1, ty2, 2]) + + inverse_weights[ty2] * + (shmem_fstar2[tx, ty1, ty2 + 1, 1] - shmem_fstar2[tx, ty1, ty2, 2])) * (1 - dg_only) + end + + # Compute DG volume fluxes + for thread in 1:tile_width + volume_flux_node1 = volume_flux_dg(u_node, + get_node_vars(u, equations, thread, ty2, k), + 1, equations) + volume_flux_node2 = volume_flux_dg(u_node, + get_node_vars(u, equations, ty1, thread, k), + 2, equations) + + noncons_flux_node1 = noncons_flux_dg(u_node, + get_node_vars(u, equations, thread, ty2, k), + 1, equations) + noncons_flux_node2 = noncons_flux_dg(u_node, + get_node_vars(u, equations, ty1, thread, k), + 2, equations) + + # Contribute DG to the volume integrals + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2] += (volume_flux_node1[tx] * shmem_split[thread, ty1] * + (1 - isequal(ty1, thread)) + # set diagonal elements to zeros + volume_flux_node2[tx] * shmem_split[thread, ty2] * + (1 - isequal(ty2, thread)) + + 0.5f0 * + (shmem_split[thread, ty1] * noncons_flux_node1[tx] + + shmem_split[thread, ty2] * noncons_flux_node2[tx])) * dg_only + + ((1 - alpha_element) * + volume_flux_node1[tx] * shmem_split[thread, ty1] * + (1 - isequal(ty1, thread)) + # set diagonal elements to zeros + (1 - alpha_element) * + volume_flux_node2[tx] * shmem_split[thread, ty2] * + (1 - isequal(ty2, thread)) + # set diagonal elements to zeros + 0.5f0 * (1 - alpha_element) * + (shmem_split[thread, ty1] * noncons_flux_node1[tx] + + shmem_split[thread, ty2] * noncons_flux_node2[tx])) * (1 - dg_only) + end + end + + # Finalize the values + for tx in axes(du, 1) + @inbounds du[tx, ty1, ty2, k] = shmem_value[tx, ty1, ty2] + end + + return nothing +end + +# Kernel for prolonging two interfaces +function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids, orientations, + euqations::AbstractEquations{2}) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(interfaces_u, 2) * size(interfaces_u, 3) && k <= size(interfaces_u, 4)) + u2 = size(u, 2) # size(interfaces_u, 3) == size(u, 2) + + j1 = div(j - 1, u2) + 1 + j2 = rem(j - 1, u2) + 1 + + @inbounds begin + orientation = orientations[k] + left_element = neighbor_ids[1, k] + right_element = neighbor_ids[2, k] + + interfaces_u[1, j1, j2, k] = u[j1, + (2 - orientation) * u2 + (orientation - 1) * j2, + (2 - orientation) * j2 + (orientation - 1) * u2, + left_element] + interfaces_u[2, j1, j2, k] = u[j1, + (2 - orientation) + (orientation - 1) * j2, + (2 - orientation) * j2 + (orientation - 1), + right_element] + end + end + + return nothing +end + +# Kernel for calculating surface fluxes +function surface_flux_kernel!(surface_flux_arr, interfaces_u, orientations, + equations::AbstractEquations{2}, surface_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(surface_flux_arr, 2) && k <= size(surface_flux_arr, 3)) + u_ll, u_rr = get_surface_node_vars(interfaces_u, equations, j, k) + @inbounds orientation = orientations[k] + + surface_flux_node = surface_flux(u_ll, u_rr, orientation, equations) + + for ii in axes(surface_flux_arr, 1) + @inbounds surface_flux_arr[ii, j, k] = surface_flux_node[ii] + end + end + + return nothing +end + +# Kernel for calculating surface and both nonconservative fluxes +function surface_noncons_flux_kernel!(surface_flux_arr, noncons_left_arr, noncons_right_arr, + interfaces_u, orientations, equations::AbstractEquations{2}, + surface_flux::Any, nonconservative_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(surface_flux_arr, 2) && k <= size(surface_flux_arr, 3)) + u_ll, u_rr = get_surface_node_vars(interfaces_u, equations, j, k) + @inbounds orientation = orientations[k] + + surface_flux_node = surface_flux(u_ll, u_rr, orientation, equations) + noncons_left_node = nonconservative_flux(u_ll, u_rr, orientation, equations) + noncons_right_node = nonconservative_flux(u_rr, u_ll, orientation, equations) + + for ii in axes(surface_flux_arr, 1) + @inbounds begin + surface_flux_arr[ii, j, k] = surface_flux_node[ii] + noncons_left_arr[ii, j, k] = noncons_left_node[ii] + noncons_right_arr[ii, j, k] = noncons_right_node[ii] + end + end + end + + return nothing +end + +# Kernel for setting interface fluxes +function interface_flux_kernel!(surface_flux_values, surface_flux_arr, neighbor_ids, orientations, + equations::AbstractEquations{2}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_arr, 2) && + k <= size(surface_flux_arr, 3)) + @inbounds begin + left_id = neighbor_ids[1, k] + right_id = neighbor_ids[2, k] + + left_direction = 2 * orientations[k] + right_direction = 2 * orientations[k] - 1 + + surface_flux_values[i, j, left_direction, left_id] = surface_flux_arr[i, j, k] + surface_flux_values[i, j, right_direction, right_id] = surface_flux_arr[i, j, k] + end + end + + return nothing +end + +# Kernel for setting interface fluxes +function interface_flux_kernel!(surface_flux_values, surface_flux_arr, noncons_left_arr, + noncons_right_arr, neighbor_ids, orientations, + equations::AbstractEquations{2}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_arr, 2) && + k <= size(surface_flux_arr, 3)) + @inbounds begin + left_id = neighbor_ids[1, k] + right_id = neighbor_ids[2, k] + + left_direction = 2 * orientations[k] + right_direction = 2 * orientations[k] - 1 + + surface_flux_values[i, j, left_direction, left_id] = surface_flux_arr[i, j, k] + + 0.5f0 * + noncons_left_arr[i, j, k] + surface_flux_values[i, j, right_direction, right_id] = surface_flux_arr[i, j, k] + + 0.5f0 * + noncons_right_arr[i, j, k] + end + end + + return nothing +end + +# Kernel for prolonging two boundaries +function prolong_boundaries_kernel!(boundaries_u, u, neighbor_ids, neighbor_sides, orientations, + equations::AbstractEquations{2}) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(boundaries_u, 2) * size(boundaries_u, 3) && k <= size(boundaries_u, 4)) + u2 = size(u, 2) # size(boundaries_u, 3) == size(u, 2) + + j1 = div(j - 1, u2) + 1 + j2 = rem(j - 1, u2) + 1 + + @inbounds begin + element = neighbor_ids[k] + side = neighbor_sides[k] + orientation = orientations[k] + + boundaries_u[1, j1, j2, k] = u[j1, + (2 - orientation) * u2 + (orientation - 1) * j2, + (2 - orientation) * j2 + (orientation - 1) * u2, + element] * (2 - side) # Set to 0 instead of NaN + boundaries_u[2, j1, j2, k] = u[j1, + (2 - orientation) + (orientation - 1) * j2, + (2 - orientation) * j2 + (orientation - 1), + element] * (side - 1) # Set to 0 instead of NaN + end + end + + return nothing +end + +# Kernel for calculating boundary fluxes +function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, + indices_arr, neighbor_ids, neighbor_sides, orientations, + boundary_conditions::NamedTuple, equations::AbstractEquations{2}, + surface_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(surface_flux_values, 2) && k <= length(boundary_arr)) + @inbounds begin + boundary = boundary_arr[k] + direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) + + (indices_arr[3] <= boundary) + (indices_arr[4] <= boundary) + + neighbor = neighbor_ids[boundary] + side = neighbor_sides[boundary] + orientation = orientations[boundary] + end + + u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, j, boundary) + u_inner = (2 - side) * u_ll + (side - 1) * u_rr + x = get_node_coords(node_coordinates, equations, j, boundary) + + # TODO: Improve this part + if direction == 1 + boundary_flux_node = boundary_conditions[1](u_inner, orientation, + direction, x, t, surface_flux, equations) + elseif direction == 2 + boundary_flux_node = boundary_conditions[2](u_inner, orientation, + direction, x, t, surface_flux, equations) + elseif direction == 3 + boundary_flux_node = boundary_conditions[3](u_inner, orientation, + direction, x, t, surface_flux, equations) + else + boundary_flux_node = boundary_conditions[4](u_inner, orientation, + direction, x, t, surface_flux, equations) + end + + for ii in axes(surface_flux_values, 1) + # `boundary_flux_node` can be nothing if periodic boundary condition is applied + @inbounds surface_flux_values[ii, j, direction, neighbor] = isnothing(boundary_flux_node) ? # bad + surface_flux_values[ii, j, + direction, + neighbor] : + boundary_flux_node[ii] + end + end + + return nothing +end + +# Kernel for calculating boundary fluxes +function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, + indices_arr, neighbor_ids, neighbor_sides, orientations, + boundary_conditions::NamedTuple, equations::AbstractEquations{2}, + surface_flux::Any, nonconservative_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(surface_flux_values, 2) && k <= length(boundary_arr)) + @inbounds begin + boundary = boundary_arr[k] + direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) + + (indices_arr[3] <= boundary) + (indices_arr[4] <= boundary) + + neighbor = neighbor_ids[boundary] + side = neighbor_sides[boundary] + orientation = orientations[boundary] + end + + u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, j, boundary) + u_inner = (2 - side) * u_ll + (side - 1) * u_rr + x = get_node_coords(node_coordinates, equations, j, boundary) + + # TODO: Improve this part + if direction == 1 + flux_node = boundary_conditions[1](u_inner, orientation, direction, x, t, surface_flux, + equations) + noncons_flux_node = boundary_conditions[1](u_inner, orientation, direction, x, t, + nonconservative_flux, equations) + elseif direction == 2 + flux_node = boundary_conditions[2](u_inner, orientation, direction, x, t, surface_flux, + equations) + noncons_flux_node = boundary_conditions[2](u_inner, orientation, direction, x, t, + nonconservative_flux, equations) + elseif direction == 3 + flux_node = boundary_conditions[3](u_inner, orientation, direction, x, t, surface_flux, + equations) + noncons_flux_node = boundary_conditions[3](u_inner, orientation, direction, x, t, + nonconservative_flux, equations) + else + flux_node = boundary_conditions[4](u_inner, orientation, direction, x, t, surface_flux, + equations) + noncons_flux_node = boundary_conditions[4](u_inner, orientation, direction, x, t, + nonconservative_flux, equations) + end + + for ii in axes(surface_flux_values, 1) + @inbounds surface_flux_values[ii, j, direction, neighbor] = flux_node[ii] + + 0.5f0 * noncons_flux_node[ii] + end + end + + return nothing +end + +# Kernel for copying data small to small on mortars +function prolong_mortars_small2small_kernel!(u_upper, u_lower, u, neighbor_ids, large_sides, + orientations) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(u_upper, 2) && j <= size(u_upper, 3) && k <= size(u_upper, 4)) + @inbounds begin + large_side = large_sides[k] + orientation = orientations[k] + + lower_element = neighbor_ids[1, k] + upper_element = neighbor_ids[2, k] + end + + u2 = size(u, 2) + + @inbounds begin + u_upper[2, i, j, k] = u[i, + (2 - orientation) + (orientation - 1) * j, + (2 - orientation) * j + (orientation - 1), + upper_element] * (2 - large_side) + + u_lower[2, i, j, k] = u[i, + (2 - orientation) + (orientation - 1) * j, + (2 - orientation) * j + (orientation - 1), + lower_element] * (2 - large_side) + + u_upper[1, i, j, k] = u[i, + (2 - orientation) * u2 + (orientation - 1) * j, + (2 - orientation) * j + (orientation - 1) * u2, + upper_element] * (large_side - 1) + + u_lower[1, i, j, k] = u[i, + (2 - orientation) * u2 + (orientation - 1) * j, + (2 - orientation) * j + (orientation - 1) * u2, + lower_element] * (large_side - 1) + end + end + + return nothing +end + +# Kernel for interpolating data large to small on mortars +function prolong_mortars_large2small_kernel!(u_upper, u_lower, u, forward_upper, forward_lower, + neighbor_ids, large_sides, orientations) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(u_upper, 2) && j <= size(u_upper, 3) && k <= size(u_upper, 4)) + @inbounds begin + large_side = large_sides[k] + orientation = orientations[k] + large_element = neighbor_ids[3, k] + end + + leftright = large_side + u2 = size(u, 2) + + for jj in axes(forward_upper, 2) + @inbounds begin + u_upper[leftright, i, j, k] += forward_upper[j, jj] * + u[i, + (2 - orientation) * u2 + (orientation - 1) * jj, + (2 - orientation) * jj + (orientation - 1) * u2, + large_element] * (2 - large_side) + u_lower[leftright, i, j, k] += forward_lower[j, jj] * + u[i, + (2 - orientation) * u2 + (orientation - 1) * jj, + (2 - orientation) * jj + (orientation - 1) * u2, + large_element] * (2 - large_side) + end + end + + for jj in axes(forward_lower, 2) + @inbounds begin + u_upper[leftright, i, j, k] += forward_upper[j, jj] * + u[i, + (2 - orientation) + (orientation - 1) * jj, + (2 - orientation) * jj + (orientation - 1), + large_element] * (large_side - 1) + u_lower[leftright, i, j, k] += forward_lower[j, jj] * + u[i, + (2 - orientation) + (orientation - 1) * jj, + (2 - orientation) * jj + (orientation - 1), + large_element] * (large_side - 1) + end + end + end + + return nothing +end + +# Kernel for calculating mortar fluxes +function mortar_flux_kernel!(fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, + fstar_secondary_lower, u_upper, u_lower, orientations, + equations::AbstractEquations{2}, surface_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u_upper, 3) && k <= length(orientations)) + u_upper_ll, u_upper_rr = get_surface_node_vars(u_upper, equations, j, k) + u_lower_ll, u_lower_rr = get_surface_node_vars(u_lower, equations, j, k) + @inbounds orientation = orientations[k] + + flux_upper_node = surface_flux(u_upper_ll, u_upper_rr, orientation, equations) + flux_lower_node = surface_flux(u_lower_ll, u_lower_rr, orientation, equations) + + for ii in axes(fstar_primary_upper, 1) + @inbounds begin + fstar_primary_upper[ii, j, k] = flux_upper_node[ii] + fstar_primary_lower[ii, j, k] = flux_lower_node[ii] + fstar_secondary_upper[ii, j, k] = flux_upper_node[ii] + fstar_secondary_lower[ii, j, k] = flux_lower_node[ii] + end + end + end + + return nothing +end + +# Kernel for calculating mortar fluxes and adding nonconservative fluxes +function mortar_flux_kernel!(fstar_primary_upper, fstar_primary_lower, fstar_secondary_upper, + fstar_secondary_lower, u_upper, u_lower, orientations, large_sides, + equations::AbstractEquations{2}, surface_flux::Any, + nonconservative_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u_upper, 3) && k <= length(orientations)) + u_upper_ll, u_upper_rr = get_surface_node_vars(u_upper, equations, j, k) + u_lower_ll, u_lower_rr = get_surface_node_vars(u_lower, equations, j, k) + + @inbounds begin + orientation = orientations[k] + large_side = large_sides[k] + end + + flux_upper_node = surface_flux(u_upper_ll, u_upper_rr, orientation, equations) + flux_lower_node = surface_flux(u_lower_ll, u_lower_rr, orientation, equations) + + for ii in axes(fstar_primary_upper, 1) + @inbounds begin + fstar_primary_upper[ii, j, k] = flux_upper_node[ii] + fstar_primary_lower[ii, j, k] = flux_lower_node[ii] + fstar_secondary_upper[ii, j, k] = flux_upper_node[ii] + fstar_secondary_lower[ii, j, k] = flux_lower_node[ii] + end + end + + u_upper1 = (2 - large_side) * u_upper_ll + (large_side - 1) * u_upper_rr + u_upper2 = (large_side - 1) * u_upper_ll + (2 - large_side) * u_upper_rr + + u_lower1 = (2 - large_side) * u_lower_ll + (large_side - 1) * u_lower_rr + u_lower2 = (large_side - 1) * u_lower_ll + (2 - large_side) * u_lower_rr + + noncons_flux_primary_upper = nonconservative_flux(u_upper1, u_upper2, orientation, + equations) + noncons_flux_primary_lower = nonconservative_flux(u_lower1, u_lower2, orientation, + equations) + noncons_flux_secondary_upper = nonconservative_flux(u_upper2, u_upper1, orientation, + equations) + noncons_flux_secondary_lower = nonconservative_flux(u_lower2, u_lower1, orientation, + equations) + + for ii in axes(fstar_primary_upper, 1) + @inbounds begin + fstar_primary_upper[ii, j, k] += 0.5f0 * noncons_flux_primary_upper[ii] + fstar_primary_lower[ii, j, k] += 0.5f0 * noncons_flux_primary_lower[ii] + fstar_secondary_upper[ii, j, k] += 0.5f0 * noncons_flux_secondary_upper[ii] + fstar_secondary_lower[ii, j, k] += 0.5f0 * noncons_flux_secondary_lower[ii] + end + end + end + + return nothing +end + +# Kernel for copying mortar fluxes small to small and small to large +function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, + fstar_primary_upper, fstar_primary_lower, + fstar_secondary_upper, fstar_secondary_lower, + reverse_upper, reverse_lower, neighbor_ids, large_sides, + orientations) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2) && + k <= length(orientations)) + @inbounds begin + large_element = neighbor_ids[3, k] + upper_element = neighbor_ids[2, k] + lower_element = neighbor_ids[1, k] + + large_side = large_sides[k] + orientation = orientations[k] + + # Use math expression to enhance performance (against control flow), it is equivalent to, + # `(2 - large_side) * (2 - orientation) * 1 + + # (2 - large_side) * (orientation - 1) * 3 + + # (large_side - 1) * (2 - orientation) * 2 + + # (large_side - 1) * (orientation - 1) * 4`. + direction = large_side + 2 * orientation - 2 + + surface_flux_values[i, j, direction, upper_element] = fstar_primary_upper[i, j, k] + surface_flux_values[i, j, direction, lower_element] = fstar_primary_lower[i, j, k] + + # Use math expression to enhance performance (against control flow), it is equivalent to, + # `(2 - large_side) * (2 - orientation) * 2 + + # (2 - large_side) * (orientation - 1) * 4 + + # (large_side - 1) * (2 - orientation) * 1 + + # (large_side - 1) * (orientation - 1) * 3`. + direction = 2 * orientation - large_side + 1 + end + + for ii in axes(reverse_upper, 2) # i.e., ` for ii in axes(reverse_lower, 2)` + @inbounds tmp_surface_flux_values[i, j, direction, large_element] += fstar_secondary_upper[i, ii, k] * + reverse_upper[j, ii] + + fstar_secondary_lower[i, ii, k] * + reverse_lower[j, ii] + end + + @inbounds surface_flux_values[i, j, direction, large_element] = tmp_surface_flux_values[i, j, + direction, + large_element] + end + + return nothing +end + +# Kernel for calculating surface integrals +function surface_integral_kernel!(du, factor_arr, surface_flux_values, + equations::AbstractEquations{2}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) + u2 = size(du, 2) + + j1 = div(j - 1, u2) + 1 + j2 = rem(j - 1, u2) + 1 + + @inbounds begin + du[i, j1, j2, k] -= (surface_flux_values[i, j2, 1, k] * isequal(j1, 1) + + surface_flux_values[i, j1, 3, k] * isequal(j2, 1)) * factor_arr[1] + du[i, j1, j2, k] += (surface_flux_values[i, j2, 2, k] * isequal(j1, u2) + + surface_flux_values[i, j1, 4, k] * isequal(j2, u2)) * factor_arr[2] + end + end + + return nothing +end + +# Kernel for applying inverse Jacobian +function jacobian_kernel!(du, inverse_jacobian, equations::AbstractEquations{2}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^2 && k <= size(du, 4)) + j1 = div(j - 1, size(du, 2)) + 1 + j2 = rem(j - 1, size(du, 2)) + 1 + + @inbounds du[i, j1, j2, k] *= -inverse_jacobian[k] + end + + return nothing +end + +# CUDA kernel for calculating source terms +function source_terms_kernel!(du, u, node_coordinates, t, equations::AbstractEquations{2}, + source_terms::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(du, 2)^2 && k <= size(du, 4)) + j1 = div(j - 1, size(du, 2)) + 1 + j2 = rem(j - 1, size(du, 2)) + 1 + + u_local = get_node_vars(u, equations, j1, j2, k) + x_local = get_node_coords(node_coordinates, equations, j1, j2, k) + + source_terms_node = source_terms(u_local, x_local, t, equations) + + for ii in axes(du, 1) + @inbounds du[ii, j1, j2, k] += source_terms_node[ii] + end + end + + return nothing +end diff --git a/src/solvers/dg_3d.jl b/src/solvers/dg_3d.jl index 92404f8..f860501 100644 --- a/src/solvers/dg_3d.jl +++ b/src/solvers/dg_3d.jl @@ -1,2144 +1,7 @@ # Everything related to a DG semidiscretization in 3D. -################################################################################################# -# Functions that end with `_kernel` are CUDA kernels that are going to be launched by -# the @cuda macro with parameters from the kernel configurator. They are purely run on -# the device (i.e., GPU). - -# Kernel for calculating fluxes along normal directions -function flux_kernel!(flux_arr1, flux_arr2, flux_arr3, u, equations::AbstractEquations{3}, - flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^3 && k <= size(u, 5)) - u2 = size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - u_node = get_node_vars(u, equations, j1, j2, j3, k) - - flux_node1 = flux(u_node, 1, equations) - flux_node2 = flux(u_node, 2, equations) - flux_node3 = flux(u_node, 3, equations) - - for ii in axes(u, 1) - @inbounds begin - flux_arr1[ii, j1, j2, j3, k] = flux_node1[ii] - flux_arr2[ii, j1, j2, j3, k] = flux_node2[ii] - flux_arr3[ii, j1, j2, j3, k] = flux_node3[ii] - end - end - end - - return nothing -end - -# Kernel for calculating weak form -function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2, flux_arr3) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) - u2 = size(du, 2) # size(du, 2) == size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - @inbounds du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros - - for ii in axes(du, 2) - @inbounds du[i, j1, j2, j3, k] += derivative_dhat[j1, ii] * flux_arr1[i, ii, j2, j3, k] + - derivative_dhat[j2, ii] * flux_arr2[i, j1, ii, j3, k] + - derivative_dhat[j3, ii] * flux_arr3[i, j1, j2, ii, k] - end - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating volume integrals with weak form -function flux_weak_form_kernel!(du, u, derivative_dhat, - equations::AbstractEquations{3}, flux::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_dhat = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_flux = CuDynamicSharedArray(eltype(du), - (size(du, 1), tile_width, tile_width, tile_width, 3), offset) - - # Get thread and block indices only we need save registers - tx, ty = threadIdx().x, threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - ty1 = div(ty - 1, tile_width^2) + 1 - ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 - ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 - - # Tile the computation (restrict to one tile here) - value = zero(eltype(du)) - - # Load global `derivative_dhat` into shared memory - # Transposed load - @inbounds shmem_dhat[ty1, ty2] = derivative_dhat[ty2, ty1] - - # Compute flux values - u_node = get_node_vars(u, equations, ty1, ty2, ty3, k) - flux_node1 = flux(u_node, 1, equations) - flux_node2 = flux(u_node, 2, equations) - flux_node3 = flux(u_node, 3, equations) - - @inbounds begin - shmem_flux[tx, ty1, ty2, ty3, 1] = flux_node1[tx] - shmem_flux[tx, ty1, ty2, ty3, 2] = flux_node2[tx] - shmem_flux[tx, ty1, ty2, ty3, 3] = flux_node3[tx] - end - - sync_threads() - - # Loop within one block to get weak form - # TODO: Avoid potential bank conflicts - for thread in 1:tile_width - @inbounds value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread, ty2, ty3, 1] + - shmem_dhat[thread, ty2] * shmem_flux[tx, ty1, thread, ty3, 2] + - shmem_dhat[thread, ty3] * shmem_flux[tx, ty1, ty2, thread, 3] - end - - # Synchronization is not needed here if we use only one tile - # sync_threads() - - # Finalize the weak form - @inbounds du[tx, ty1, ty2, ty3, k] = value - - return nothing -end - -# CUDA kernel for calculating volume fluxes -function volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, u, - equations::AbstractEquations{3}, volume_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^4 && k <= size(u, 5)) - u2 = size(u, 2) - - j1 = div(j - 1, u2^3) + 1 - j2 = div(rem(j - 1, u2^3), u2^2) + 1 - j3 = div(rem(j - 1, u2^2), u2) + 1 - j4 = rem(j - 1, u2) + 1 - - u_node = get_node_vars(u, equations, j1, j2, j3, k) - u_node1 = get_node_vars(u, equations, j4, j2, j3, k) - u_node2 = get_node_vars(u, equations, j1, j4, j3, k) - u_node3 = get_node_vars(u, equations, j1, j2, j4, k) - - volume_flux_node1 = volume_flux(u_node, u_node1, 1, equations) - volume_flux_node2 = volume_flux(u_node, u_node2, 2, equations) - volume_flux_node3 = volume_flux(u_node, u_node3, 3, equations) - - for ii in axes(u, 1) - @inbounds begin - volume_flux_arr1[ii, j1, j4, j2, j3, k] = volume_flux_node1[ii] - volume_flux_arr2[ii, j1, j2, j4, j3, k] = volume_flux_node2[ii] - volume_flux_arr3[ii, j1, j2, j3, j4, k] = volume_flux_node3[ii] - end - end - end - - return nothing -end - -# Kernel for calculating volume integrals -function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_flux_arr2, - volume_flux_arr3, equations::AbstractEquations{3}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) - u2 = size(du, 2) # size(du, 2) == size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - @inbounds du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros - - for ii in axes(du, 2) - @inbounds du[i, j1, j2, j3, k] += volume_flux_arr1[i, j1, ii, j2, j3, k] * derivative_split[j1, ii] * - (1 - isequal(j1, ii)) + # set diagonal elements to zeros - volume_flux_arr2[i, j1, j2, ii, j3, k] * derivative_split[j2, ii] * - (1 - isequal(j2, ii)) + # set diagonal elements to zeros - volume_flux_arr3[i, j1, j2, j3, ii, k] * derivative_split[j3, ii] * - (1 - isequal(j3, ii)) # set diagonal elements to zeros - end - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating volume integrals without conservative terms -function volume_flux_integral_kernel!(du, u, derivative_split, - equations::AbstractEquations{3}, volume_flux::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_value = CuDynamicSharedArray(eltype(du), - (size(du, 1), tile_width, tile_width, tile_width), offset) - - # Get thread and block indices only we need save registers - ty = threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - ty1 = div(ty - 1, tile_width^2) + 1 - ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 - ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 - - # Tile the computation (set to one tile here) - # Initialize the values - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2, ty3] = zero(eltype(du)) - end - - # Load global `derivative_split` into shared memory - # Transposed load - @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] * - (1 - isequal(ty1, ty2)) # set diagonal elements to zeros - - sync_threads() - - # Compute volume fluxes - # How to store nodes in shared memory? - for thread in 1:tile_width - # Volume flux is heavy in computation so we should try best to avoid redundant - # computation, i.e., use for loop along x direction here - u_node = get_node_vars(u, equations, ty1, ty2, ty3, k) - volume_flux_node1 = volume_flux(u_node, - get_node_vars(u, equations, thread, ty2, ty3, k), - 1, equations) - volume_flux_node2 = volume_flux(u_node, - get_node_vars(u, equations, ty1, thread, ty3, k), - 2, equations) - volume_flux_node3 = volume_flux(u_node, - get_node_vars(u, equations, ty1, ty2, thread, k), - 3, equations) - - # TODO: Avoid potential bank conflicts - # Try another way to parallelize (ty1, ty2, ty3) with threads to ty4, - # then consolidate each computation back to (ty1, ty2, ty3) - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2, ty3] += shmem_split[thread, ty1] * volume_flux_node1[tx] + - shmem_split[thread, ty2] * volume_flux_node2[tx] + - shmem_split[thread, ty3] * volume_flux_node3[tx] - end - end - - # Synchronization is not needed here if we use only one tile - # sync_threads() - - # Finalize the values - for tx in axes(du, 1) - @inbounds du[tx, ty1, ty2, ty3, k] = shmem_value[tx, ty1, ty2, ty3] - end - - return nothing -end - -# Kernel for calculating symmetric and nonconservative fluxes -function noncons_volume_flux_kernel!(symmetric_flux_arr1, symmetric_flux_arr2, symmetric_flux_arr3, - noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3, - u, derivative_split, equations::AbstractEquations{3}, - symmetric_flux::Any, nonconservative_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^4 && k <= size(u, 5)) - u2 = size(u, 2) - - j1 = div(j - 1, u2^3) + 1 - j2 = div(rem(j - 1, u2^3), u2^2) + 1 - j3 = div(rem(j - 1, u2^2), u2) + 1 - j4 = rem(j - 1, u2) + 1 - - u_node = get_node_vars(u, equations, j1, j2, j3, k) - u_node1 = get_node_vars(u, equations, j4, j2, j3, k) - u_node2 = get_node_vars(u, equations, j1, j4, j3, k) - u_node3 = get_node_vars(u, equations, j1, j2, j4, k) - - symmetric_flux_node1 = symmetric_flux(u_node, u_node1, 1, equations) - symmetric_flux_node2 = symmetric_flux(u_node, u_node2, 2, equations) - symmetric_flux_node3 = symmetric_flux(u_node, u_node3, 3, equations) - - noncons_flux_node1 = nonconservative_flux(u_node, u_node1, 1, equations) - noncons_flux_node2 = nonconservative_flux(u_node, u_node2, 2, equations) - noncons_flux_node3 = nonconservative_flux(u_node, u_node3, 3, equations) - - for ii in axes(u, 1) - @inbounds begin - symmetric_flux_arr1[ii, j1, j4, j2, j3, k] = symmetric_flux_node1[ii] * derivative_split[j1, j4] * - (1 - isequal(j1, j4)) # set diagonal elements to zeros - symmetric_flux_arr2[ii, j1, j2, j4, j3, k] = symmetric_flux_node2[ii] * derivative_split[j2, j4] * - (1 - isequal(j2, j4)) # set diagonal elements to zeros - symmetric_flux_arr3[ii, j1, j2, j3, j4, k] = symmetric_flux_node3[ii] * derivative_split[j3, j4] * - (1 - isequal(j3, j4)) # set diagonal elements to zeros - - noncons_flux_arr1[ii, j1, j4, j2, j3, k] = noncons_flux_node1[ii] - noncons_flux_arr2[ii, j1, j2, j4, j3, k] = noncons_flux_node2[ii] - noncons_flux_arr3[ii, j1, j2, j3, j4, k] = noncons_flux_node3[ii] - end - end - end - - return nothing -end - -# Kernel for calculating symmetric and nonconservative volume integrals -function volume_integral_kernel!(du, derivative_split, - symmetric_flux_arr1, symmetric_flux_arr2, symmetric_flux_arr3, - noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) - u2 = size(du, 2) # size(du, 2) == size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - @inbounds du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros - - for ii in axes(du, 2) - @inbounds du[i, j1, j2, j3, k] += symmetric_flux_arr1[i, j1, ii, j2, j3, k] + - symmetric_flux_arr2[i, j1, j2, ii, j3, k] + - symmetric_flux_arr3[i, j1, j2, j3, ii, k] + - 0.5f0 * - derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, j3, k] + - 0.5f0 * - derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, j3, k] + - 0.5f0 * - derivative_split[j3, ii] * noncons_flux_arr3[i, j1, j2, j3, ii, k] - end - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating volume integrals with conservative terms -function volume_flux_integral_kernel!(du, u, derivative_split, - equations::AbstractEquations{3}, - symmetric_flux::Any, nonconservative_flux::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_value = CuDynamicSharedArray(eltype(du), - (size(du, 1), tile_width, tile_width, tile_width), offset) - - # Get thread and block indices only we need save registers - ty = threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - ty1 = div(ty - 1, tile_width^2) + 1 - ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 - ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 - - # Tile the computation (set to one tile here) - # Initialize the values - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2, ty3] = zero(eltype(du)) - end - - # Load data from global memory into shared memory - # Transposed load - @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] - - sync_threads() - - # Compute volume fluxes - # How to store nodes in shared memory? - for thread in 1:tile_width - # Volume flux is heavy in computation so we should try best to avoid redundant - # computation, i.e., use for loop along x direction here - u_node = get_node_vars(u, equations, ty1, ty2, ty3, k) - symmetric_flux_node1 = symmetric_flux(u_node, - get_node_vars(u, equations, thread, ty2, ty3, k), - 1, equations) - symmetric_flux_node2 = symmetric_flux(u_node, - get_node_vars(u, equations, ty1, thread, ty3, k), - 2, equations) - symmetric_flux_node3 = symmetric_flux(u_node, - get_node_vars(u, equations, ty1, ty2, thread, k), - 3, equations) - noncons_flux_node1 = nonconservative_flux(u_node, - get_node_vars(u, equations, thread, ty2, ty3, k), - 1, equations) - noncons_flux_node2 = nonconservative_flux(u_node, - get_node_vars(u, equations, ty1, thread, ty3, k), - 2, equations) - noncons_flux_node3 = nonconservative_flux(u_node, - get_node_vars(u, equations, ty1, ty2, thread, k), - 3, equations) - - # TODO: Avoid potential bank conflicts - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2, ty3] += symmetric_flux_node1[tx] * shmem_split[thread, ty1] * - (1 - isequal(ty1, thread)) + # set diagonal elements to zeros - symmetric_flux_node2[tx] * shmem_split[thread, ty2] * - (1 - isequal(ty2, thread)) + # set diagonal elements to zeros - symmetric_flux_node3[tx] * shmem_split[thread, ty3] * - (1 - isequal(ty3, thread)) + # set diagonal elements to zeros - 0.5f0 * - noncons_flux_node1[tx] * shmem_split[thread, ty1] + - 0.5f0 * - noncons_flux_node2[tx] * shmem_split[thread, ty2] + - 0.5f0 * - noncons_flux_node3[tx] * shmem_split[thread, ty3] - end - end - - # Synchronization is not needed here if we use only one tile - # sync_threads() - - # Finalize the values - for tx in axes(du, 1) - @inbounds du[tx, ty1, ty2, ty3, k] = shmem_value[tx, ty1, ty2, ty3] - end - - return nothing -end - -# Kernel for calculating pure DG and DG-FV volume fluxes -function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, - fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, - u, alpha, atol, equations::AbstractEquations{3}, - volume_flux_dg::Any, volume_flux_fv::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^4 && k <= size(u, 5)) - u2 = size(u, 2) - - j1 = div(j - 1, u2^3) + 1 - j2 = div(rem(j - 1, u2^3), u2^2) + 1 - j3 = div(rem(j - 1, u2^2), u2) + 1 - j4 = rem(j - 1, u2) + 1 - - dg_only = isapprox(alpha[k], 0, atol = atol) - - u_node = get_node_vars(u, equations, j1, j2, j3, k) - u_node1 = get_node_vars(u, equations, j4, j2, j3, k) - u_node2 = get_node_vars(u, equations, j1, j4, j3, k) - u_node3 = get_node_vars(u, equations, j1, j2, j4, k) - - volume_flux_node1 = volume_flux_dg(u_node, u_node1, 1, equations) - volume_flux_node2 = volume_flux_dg(u_node, u_node2, 2, equations) - volume_flux_node3 = volume_flux_dg(u_node, u_node3, 3, equations) - - for ii in axes(u, 1) - @inbounds begin - volume_flux_arr1[ii, j1, j4, j2, j3, k] = volume_flux_node1[ii] - volume_flux_arr2[ii, j1, j2, j4, j3, k] = volume_flux_node2[ii] - volume_flux_arr3[ii, j1, j2, j3, j4, k] = volume_flux_node3[ii] - end - - # Small optimization, no much performance gain - if isequal(j1 + 1, j4) # avoid race condition - flux_fv_node1 = volume_flux_fv(u_node, u_node1, 1, equations) - - @inbounds begin - fstar1_L[ii, j4, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only) - fstar1_R[ii, j4, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only) - end - end - - if isequal(j2 + 1, j4) # avoid race condition - flux_fv_node2 = volume_flux_fv(u_node, u_node2, 2, equations) - - @inbounds begin - fstar2_L[ii, j1, j4, j3, k] = flux_fv_node2[ii] * (1 - dg_only) - fstar2_R[ii, j1, j4, j3, k] = flux_fv_node2[ii] * (1 - dg_only) - end - end - - if isequal(j3 + 1, j4) # avoid race condition - flux_fv_node3 = volume_flux_fv(u_node, u_node3, 3, equations) - - @inbounds begin - fstar3_L[ii, j1, j2, j4, k] = flux_fv_node3[ii] * (1 - dg_only) - fstar3_R[ii, j1, j2, j4, k] = flux_fv_node3[ii] * (1 - dg_only) - end - end - end - - # if j1 != 1 && j4 == 1 # bad - # u_ll = get_node_vars(u, equations, j1 - 1, j2, j3, k) - # u_rr = get_node_vars(u, equations, j1, j2, j3, k) - # flux_fv_node1 = volume_flux_fv(u_ll, u_rr, 1, equations) - - # for ii in axes(u, 1) - # @inbounds begin - # fstar1_L[ii, j1, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only) - # fstar1_R[ii, j1, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only) - # end - # end - # end - - # if j2 != 1 && j4 == 1 # bad - # u_ll = get_node_vars(u, equations, j1, j2 - 1, j3, k) - # u_rr = get_node_vars(u, equations, j1, j2, j3, k) - # flux_fv_node2 = volume_flux_fv(u_ll, u_rr, 2, equations) - - # for ii in axes(u, 1) - # @inbounds begin - # fstar2_L[ii, j1, j2, j3, k] = flux_fv_node2[ii] * (1 - dg_only) - # fstar2_R[ii, j1, j2, j3, k] = flux_fv_node2[ii] * (1 - dg_only) - # end - # end - # end - - # if j3 != 1 && j4 == 1 # bad - # u_ll = get_node_vars(u, equations, j1, j2, j3 - 1, k) - # u_rr = get_node_vars(u, equations, j1, j2, j3, k) - # flux_fv_node3 = volume_flux_fv(u_ll, u_rr, 3, equations) - - # for ii in axes(u, 1) - # @inbounds begin - # fstar3_L[ii, j1, j2, j3, k] = flux_fv_node3[ii] * (1 - dg_only) - # fstar3_R[ii, j1, j2, j3, k] = flux_fv_node3[ii] * (1 - dg_only) - # end - # end - # end - end - - return nothing -end - -# Kernel for calculating pure DG and DG-FV volume integrals -function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights, - volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, - fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, - atol, equations::AbstractEquations{3}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) - u2 = size(du, 2) # size(du, 2) == size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - @inbounds begin - du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros - alpha_element = alpha[k] - end - - dg_only = isapprox(alpha_element, 0, atol = atol) - - for ii in axes(du, 2) - @inbounds du[i, j1, j2, j3, k] += (derivative_split[j1, ii] * - (1 - isequal(j1, ii)) * # set diagonal elements to zeros - volume_flux_arr1[i, j1, ii, j2, j3, k] + - derivative_split[j2, ii] * - (1 - isequal(j2, ii)) * # set diagonal elements to zeros - volume_flux_arr2[i, j1, j2, ii, j3, k] + - derivative_split[j3, ii] * - (1 - isequal(j3, ii)) * # set diagonal elements to zeros - volume_flux_arr3[i, j1, j2, j3, ii, k]) * dg_only + - ((1 - alpha_element) * derivative_split[j1, ii] * - (1 - isequal(j1, ii)) * # set diagonal elements to zeros - volume_flux_arr1[i, j1, ii, j2, j3, k] + - (1 - alpha_element) * derivative_split[j2, ii] * - (1 - isequal(j2, ii)) * # set diagonal elements to zeros - volume_flux_arr2[i, j1, j2, ii, j3, k] + - (1 - alpha_element) * derivative_split[j3, ii] * - (1 - isequal(j3, ii)) * # set diagonal elements to zeros - volume_flux_arr3[i, j1, j2, j3, ii, k]) * (1 - dg_only) - end - - @inbounds du[i, j1, j2, j3, k] += alpha_element * - (inverse_weights[j1] * - (fstar1_L[i, j1 + 1, j2, j3, k] - fstar1_R[i, j1, j2, j3, k]) + - inverse_weights[j2] * - (fstar2_L[i, j1, j2 + 1, j3, k] - fstar2_R[i, j1, j2, j3, k]) + - inverse_weights[j3] * - (fstar3_L[i, j1, j2, j3 + 1, k] - fstar3_R[i, j1, j2, j3, k])) * (1 - dg_only) - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating pure DG and DG-FV volume integrals without conservative terms -function volume_flux_integral_dgfv_kernel!(du, u, alpha, atol, derivative_split, inverse_weights, - equations::AbstractEquations{3}, - volume_flux_dg::Any, volume_flux_fv::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - # TODO: Combine `fstar` into single allocation - shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_fstar1 = CuDynamicSharedArray(eltype(du), - (size(du, 1), tile_width + 1, tile_width, tile_width), offset) - offset += sizeof(eltype(du)) * size(du, 1) * (tile_width + 1) * tile_width * tile_width - shmem_fstar2 = CuDynamicSharedArray(eltype(du), - (size(du, 1), tile_width, tile_width + 1, tile_width), offset) - offset += sizeof(eltype(du)) * size(du, 1) * tile_width * (tile_width + 1) * tile_width - shmem_fstar3 = CuDynamicSharedArray(eltype(du), - (size(du, 1), tile_width, tile_width, tile_width + 1), offset) - offset += sizeof(eltype(du)) * size(du, 1) * tile_width * tile_width * (tile_width + 1) - shmem_value = CuDynamicSharedArray(eltype(du), - (size(du, 1), tile_width, tile_width, tile_width), offset) - - # Get thread and block indices only we need save registers - ty = threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - ty1 = div(ty - 1, tile_width^2) + 1 - ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 - ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 - - # Load global `derivative_split` into shared memory - # Transposed load - @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] - - # Get variables for computation - @inbounds alpha_element = alpha[k] - dg_only = isapprox(alpha_element, 0, atol = atol) - - # Compute FV volume fluxes - u_node = get_node_vars(u, equations, ty1, ty2, ty3, k) - if ty1 + 1 <= tile_width - flux_fv_node1 = volume_flux_fv(u_node, - get_node_vars(u, equations, ty1 + 1, ty2, ty3, k), - 1, equations) - end - if ty2 + 1 <= tile_width - flux_fv_node2 = volume_flux_fv(u_node, - get_node_vars(u, equations, ty1, ty2 + 1, ty3, k), - 2, equations) - end - if ty3 + 1 <= tile_width - flux_fv_node3 = volume_flux_fv(u_node, - get_node_vars(u, equations, ty1, ty2, ty3 + 1, k), - 3, equations) - end - - # Initialize the values - for tx in axes(du, 1) - @inbounds begin - # Initialize `du` with zeros - shmem_value[tx, ty1, ty2, ty3] = zero(eltype(du)) - # Initialize `fstar` side columes with zeros - shmem_fstar1[tx, 1, ty2, ty3] = zero(eltype(du)) - shmem_fstar1[tx, tile_width + 1, ty2, ty3] = zero(eltype(du)) - shmem_fstar2[tx, ty1, 1, ty3] = zero(eltype(du)) - shmem_fstar2[tx, ty1, tile_width + 1, ty3] = zero(eltype(du)) - shmem_fstar3[tx, ty1, ty2, 1] = zero(eltype(du)) - shmem_fstar3[tx, ty1, ty2, tile_width + 1] = zero(eltype(du)) - end - - if ty1 + 1 <= tile_width - # Set with FV volume fluxes - @inbounds shmem_fstar1[tx, ty1 + 1, ty2, ty3] = flux_fv_node1[tx] * (1 - dg_only) - end - if ty2 + 1 <= tile_width - # Set with FV volume fluxes - @inbounds shmem_fstar2[tx, ty1, ty2 + 1, ty3] = flux_fv_node2[tx] * (1 - dg_only) - end - if ty3 + 1 <= tile_width - # Set with FV volume fluxes - @inbounds shmem_fstar3[tx, ty1, ty2, ty3 + 1] = flux_fv_node3[tx] * (1 - dg_only) - end - end - - sync_threads() - - # Contribute FV to the volume integrals - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2, ty3] += alpha_element * - (inverse_weights[ty1] * - (shmem_fstar1[tx, ty1 + 1, ty2, ty3] - shmem_fstar1[tx, ty1, ty2, ty3]) + - inverse_weights[ty2] * - (shmem_fstar2[tx, ty1, ty2 + 1, ty3] - shmem_fstar2[tx, ty1, ty2, ty3]) + - inverse_weights[ty3] * - (shmem_fstar3[tx, ty1, ty2, ty3 + 1] - shmem_fstar3[tx, ty1, ty2, ty3])) * - (1 - dg_only) - end - - # Compute DG volume fluxes - for thread in 1:tile_width - volume_flux_node1 = volume_flux_dg(u_node, - get_node_vars(u, equations, thread, ty2, ty3, k), - 1, equations) - volume_flux_node2 = volume_flux_dg(u_node, - get_node_vars(u, equations, ty1, thread, ty3, k), - 2, equations) - volume_flux_node3 = volume_flux_dg(u_node, - get_node_vars(u, equations, ty1, ty2, thread, k), - 3, equations) - - # Contribute DG to the volume integrals - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2, ty3] += (shmem_split[thread, ty1] * - (1 - isequal(ty1, thread)) * # set diagonal elements to zeros - volume_flux_node1[tx] + - shmem_split[thread, ty2] * - (1 - isequal(ty2, thread)) * # set diagonal elements to zeros - volume_flux_node2[tx] + - shmem_split[thread, ty3] * - (1 - isequal(ty3, thread)) * # set diagonal elements to zeros - volume_flux_node3[tx]) * dg_only + - ((1 - alpha_element) * shmem_split[thread, ty1] * - (1 - isequal(ty1, thread)) * # set diagonal elements to zeros - volume_flux_node1[tx] + - (1 - alpha_element) * shmem_split[thread, ty2] * - (1 - isequal(ty2, thread)) * # set diagonal elements to zeros - volume_flux_node2[tx] + - (1 - alpha_element) * shmem_split[thread, ty3] * - (1 - isequal(ty3, thread)) * # set diagonal elements to zeros - volume_flux_node3[tx]) * (1 - dg_only) - end - end - - # Finalize the values - for tx in axes(du, 1) - @inbounds du[tx, ty1, ty2, ty3, k] = shmem_value[tx, ty1, ty2, ty3] - end - - return nothing -end - -# Kernel for calculating pure DG and DG-FV volume fluxes -function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, - noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3, - fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, - u, alpha, atol, derivative_split, - equations::AbstractEquations{3}, - volume_flux_dg::Any, noncons_flux_dg::Any, - volume_flux_fv::Any, noncons_flux_fv::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(u, 2)^4 && k <= size(u, 5)) - u2 = size(u, 2) - - j1 = div(j - 1, u2^3) + 1 - j2 = div(rem(j - 1, u2^3), u2^2) + 1 - j3 = div(rem(j - 1, u2^2), u2) + 1 - j4 = rem(j - 1, u2) + 1 - - dg_only = isapprox(alpha[k], 0, atol = atol) - - u_node = get_node_vars(u, equations, j1, j2, j3, k) - u_node1 = get_node_vars(u, equations, j4, j2, j3, k) - u_node2 = get_node_vars(u, equations, j1, j4, j3, k) - u_node3 = get_node_vars(u, equations, j1, j2, j4, k) - - volume_flux_node1 = volume_flux_dg(u_node, u_node1, 1, equations) - volume_flux_node2 = volume_flux_dg(u_node, u_node2, 2, equations) - volume_flux_node3 = volume_flux_dg(u_node, u_node3, 3, equations) - - noncons_flux_node1 = noncons_flux_dg(u_node, u_node1, 1, equations) - noncons_flux_node2 = noncons_flux_dg(u_node, u_node2, 2, equations) - noncons_flux_node3 = noncons_flux_dg(u_node, u_node3, 3, equations) - - for ii in axes(u, 1) - @inbounds begin - volume_flux_arr1[ii, j1, j4, j2, j3, k] = volume_flux_node1[ii] * derivative_split[j1, j4] * - (1 - isequal(j1, j4)) # set diagonal elements to zeros - volume_flux_arr2[ii, j1, j2, j4, j3, k] = volume_flux_node2[ii] * derivative_split[j2, j4] * - (1 - isequal(j2, j4)) # set diagonal elements to zeros - volume_flux_arr3[ii, j1, j2, j3, j4, k] = volume_flux_node3[ii] * derivative_split[j3, j4] * - (1 - isequal(j3, j4)) # set diagonal elements to zeros - - noncons_flux_arr1[ii, j1, j4, j2, j3, k] = noncons_flux_node1[ii] - noncons_flux_arr2[ii, j1, j2, j4, j3, k] = noncons_flux_node2[ii] - noncons_flux_arr3[ii, j1, j2, j3, j4, k] = noncons_flux_node3[ii] - end - - # Small optimization, no much performance gain - if isequal(j1 + 1, j4) # avoid race condition - f1_node = volume_flux_fv(u_node, u_node1, 1, equations) - f1_L_node = noncons_flux_fv(u_node, u_node1, 1, equations) - f1_R_node = noncons_flux_fv(u_node1, u_node, 1, equations) - - @inbounds begin - fstar1_L[ii, j4, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only) - fstar1_R[ii, j4, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only) - end - end - - if isequal(j2 + 1, j4) # avoid race condition - f2_node = volume_flux_fv(u_node, u_node2, 2, equations) - f2_L_node = noncons_flux_fv(u_node, u_node2, 2, equations) - f2_R_node = noncons_flux_fv(u_node2, u_node, 2, equations) - - @inbounds begin - fstar2_L[ii, j1, j4, j3, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only) - fstar2_R[ii, j1, j4, j3, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only) - end - end - - if isequal(j3 + 1, j4) # avoid race condition - f3_node = volume_flux_fv(u_node, u_node3, 3, equations) - f3_L_node = noncons_flux_fv(u_node, u_node3, 3, equations) - f3_R_node = noncons_flux_fv(u_node3, u_node, 3, equations) - - @inbounds begin - fstar3_L[ii, j1, j2, j4, k] = f3_node[ii] + 0.5f0 * f3_L_node[ii] * (1 - dg_only) - fstar3_R[ii, j1, j2, j4, k] = f3_node[ii] + 0.5f0 * f3_R_node[ii] * (1 - dg_only) - end - end - end - - # if j1 != 1 && j4 == 1 # bad - # u_ll = get_node_vars(u, equations, j1 - 1, j2, j3, k) - # u_rr = get_node_vars(u, equations, j1, j2, j3, k) - - # f1_node = volume_flux_fv(u_ll, u_rr, 1, equations) - - # f1_L_node = noncons_flux_fv(u_ll, u_rr, 1, equations) - # f1_R_node = noncons_flux_fv(u_rr, u_ll, 1, equations) - - # for ii in axes(u, 1) - # @inbounds begin - # fstar1_L[ii, j1, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only) - # fstar1_R[ii, j1, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only) - # end - # end - # end - - # if j2 != 1 && j4 == 1 # bad - # u_ll = get_node_vars(u, equations, j1, j2 - 1, j3, k) - # u_rr = get_node_vars(u, equations, j1, j2, j3, k) - - # f2_node = volume_flux_fv(u_ll, u_rr, 2, equations) - - # f2_L_node = noncons_flux_fv(u_ll, u_rr, 2, equations) - # f2_R_node = noncons_flux_fv(u_rr, u_ll, 2, equations) - - # for ii in axes(u, 1) - # @inbounds begin - # fstar2_L[ii, j1, j2, j3, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only) - # fstar2_R[ii, j1, j2, j3, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only) - # end - # end - # end - - # if j3 != 1 && j4 == 1 # bad - # u_ll = get_node_vars(u, equations, j1, j2, j3 - 1, k) - # u_rr = get_node_vars(u, equations, j1, j2, j3, k) - - # f3_node = volume_flux_fv(u_ll, u_rr, 3, equations) - - # f3_L_node = noncons_flux_fv(u_ll, u_rr, 3, equations) - # f3_R_node = noncons_flux_fv(u_rr, u_ll, 3, equations) - - # for ii in axes(u, 1) - # @inbounds begin - # fstar3_L[ii, j1, j2, j3, k] = f3_node[ii] + 0.5f0 * f3_L_node[ii] * (1 - dg_only) - # fstar3_R[ii, j1, j2, j3, k] = f3_node[ii] + 0.5f0 * f3_R_node[ii] * (1 - dg_only) - # end - # end - # end - end - - return nothing -end - -# Kernel for calculating pure DG and DG-FV volume integrals -function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights, - volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, - noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3, - fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, - atol, equations::AbstractEquations{3}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) - u2 = size(du, 2) # size(du, 2) == size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - @inbounds begin - du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros - alpha_element = alpha[k] - end - - dg_only = isapprox(alpha_element, 0, atol = atol) - - for ii in axes(du, 2) - @inbounds du[i, j1, j2, j3, k] += (volume_flux_arr1[i, j1, ii, j2, j3, k] + - volume_flux_arr2[i, j1, j2, ii, j3, k] + - volume_flux_arr3[i, j1, j2, j3, ii, k] + - 0.5f0 * - (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, j3, k] + - derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, j3, k] + - derivative_split[j3, ii] * noncons_flux_arr3[i, j1, j2, j3, ii, k])) * dg_only + - ((1 - alpha_element) * - volume_flux_arr1[i, j1, ii, j2, j3, k] + - (1 - alpha_element) * - volume_flux_arr2[i, j1, j2, ii, j3, k] + - (1 - alpha_element) * - volume_flux_arr3[i, j1, j2, j3, ii, k] + - 0.5f0 * (1 - alpha_element) * - (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, j3, k] + - derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, j3, k] + - derivative_split[j3, ii] * noncons_flux_arr3[i, j1, j2, j3, ii, k])) * (1 - dg_only) - end - - @inbounds du[i, j1, j2, j3, k] += alpha_element * - (inverse_weights[j1] * - (fstar1_L[i, j1 + 1, j2, j3, k] - fstar1_R[i, j1, j2, j3, k]) + - inverse_weights[j2] * - (fstar2_L[i, j1, j2 + 1, j3, k] - fstar2_R[i, j1, j2, j3, k]) + - inverse_weights[j3] * - (fstar3_L[i, j1, j2, j3 + 1, k] - fstar3_R[i, j1, j2, j3, k])) * (1 - dg_only) - end - - return nothing -end - -############################################################################## New optimization -# Kernel for calculating pure DG and DG-FV volume integrals with conservative terms -function volume_flux_integral_dgfv_kernel!(du, u, alpha, atol, derivative_split, inverse_weights, - equations::AbstractEquations{3}, - volume_flux_dg::Any, noncons_flux_dg::Any, - volume_flux_fv::Any, noncons_flux_fv::Any) - # Set tile width - tile_width = size(du, 2) - offset = 0 # offset bytes for shared memory - - # Allocate dynamic shared memory - shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) - offset += sizeof(eltype(du)) * tile_width^2 - shmem_fstar1 = CuDynamicSharedArray(eltype(du), - (size(du, 1), tile_width + 1, tile_width, tile_width, 2), offset) - offset += sizeof(eltype(du)) * size(du, 1) * (tile_width + 1) * tile_width * tile_width * 2 - shmem_fstar2 = CuDynamicSharedArray(eltype(du), - (size(du, 1), tile_width, tile_width + 1, tile_width, 2), offset) - offset += sizeof(eltype(du)) * size(du, 1) * tile_width * (tile_width + 1) * tile_width * 2 - shmem_fstar3 = CuDynamicSharedArray(eltype(du), - (size(du, 1), tile_width, tile_width, tile_width + 1, 2), offset) - offset += sizeof(eltype(du)) * size(du, 1) * tile_width * tile_width * (tile_width + 1) * 2 - shmem_value = CuDynamicSharedArray(eltype(du), - (size(du, 1), tile_width, tile_width, tile_width), offset) - - # Get thread and block indices only we need save registers - ty = threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - ty1 = div(ty - 1, tile_width^2) + 1 - ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 - ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 - - # Load global `derivative_split` into shared memory - # Transposed load - @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] - - # Get variables for computation - @inbounds alpha_element = alpha[k] - dg_only = isapprox(alpha_element, 0, atol = atol) - - # Compute FV volume fluxes - u_node = get_node_vars(u, equations, ty1, ty2, ty3, k) - if ty1 + 1 <= tile_width - f1_node = volume_flux_fv(u_node, - get_node_vars(u, equations, ty1 + 1, ty2, ty3, k), - 1, equations) - f1_L_node = noncons_flux_fv(u_node, - get_node_vars(u, equations, ty1 + 1, ty2, ty3, k), - 1, equations) - f1_R_node = noncons_flux_fv(get_node_vars(u, equations, ty1 + 1, ty2, ty3, k), - u_node, - 1, equations) - end - if ty2 + 1 <= tile_width - f2_node = volume_flux_fv(u_node, - get_node_vars(u, equations, ty1, ty2 + 1, ty3, k), - 2, equations) - f2_L_node = noncons_flux_fv(u_node, - get_node_vars(u, equations, ty1, ty2 + 1, ty3, k), - 2, equations) - f2_R_node = noncons_flux_fv(get_node_vars(u, equations, ty1, ty2 + 1, ty3, k), - u_node, - 2, equations) - end - if ty3 + 1 <= tile_width - f3_node = volume_flux_fv(u_node, - get_node_vars(u, equations, ty1, ty2, ty3 + 1, k), - 3, equations) - f3_L_node = noncons_flux_fv(u_node, - get_node_vars(u, equations, ty1, ty2, ty3 + 1, k), - 3, equations) - f3_R_node = noncons_flux_fv(get_node_vars(u, equations, ty1, ty2, ty3 + 1, k), - u_node, - 3, equations) - end - - # Initialize the values - for tx in axes(du, 1) - @inbounds begin - # Initialize `du` with zeros - shmem_value[tx, ty1, ty2, ty3] = zero(eltype(du)) - - # TODO: Remove shared memory for `fstar` and use local memory - - # Initialize `fstar` side columes with zeros (1: left) - shmem_fstar1[tx, 1, ty2, ty3, 1] = zero(eltype(du)) - shmem_fstar1[tx, tile_width + 1, ty2, ty3, 1] = zero(eltype(du)) - shmem_fstar2[tx, ty1, 1, ty3, 1] = zero(eltype(du)) - shmem_fstar2[tx, ty1, tile_width + 1, ty3, 1] = zero(eltype(du)) - shmem_fstar3[tx, ty1, ty2, 1, 1] = zero(eltype(du)) - shmem_fstar3[tx, ty1, ty2, tile_width + 1, 1] = zero(eltype(du)) - - # Initialize `fstar` side columes with zeros (2: right) - shmem_fstar1[tx, 1, ty2, ty3, 2] = zero(eltype(du)) - shmem_fstar1[tx, tile_width + 1, ty2, ty3, 2] = zero(eltype(du)) - shmem_fstar2[tx, ty1, 1, ty3, 2] = zero(eltype(du)) - shmem_fstar2[tx, ty1, tile_width + 1, ty3, 2] = zero(eltype(du)) - shmem_fstar3[tx, ty1, ty2, 1, 2] = zero(eltype(du)) - shmem_fstar3[tx, ty1, ty2, tile_width + 1, 2] = zero(eltype(du)) - end - - if ty1 + 1 <= tile_width - # Set with FV volume fluxes - @inbounds begin - shmem_fstar1[tx, ty1 + 1, ty2, ty3, 1] = f1_node[tx] + 0.5f0 * f1_L_node[tx] * (1 - dg_only) - shmem_fstar1[tx, ty1 + 1, ty2, ty3, 2] = f1_node[tx] + 0.5f0 * f1_R_node[tx] * (1 - dg_only) - end - end - if ty2 + 1 <= tile_width - # Set with FV volume fluxes - @inbounds begin - shmem_fstar2[tx, ty1, ty2 + 1, ty3, 1] = f2_node[tx] + 0.5f0 * f2_L_node[tx] * (1 - dg_only) - shmem_fstar2[tx, ty1, ty2 + 1, ty3, 2] = f2_node[tx] + 0.5f0 * f2_R_node[tx] * (1 - dg_only) - end - end - if ty3 + 1 <= tile_width - # Set with FV volume fluxes - @inbounds begin - shmem_fstar3[tx, ty1, ty2, ty3 + 1, 1] = f3_node[tx] + 0.5f0 * f3_L_node[tx] * (1 - dg_only) - shmem_fstar3[tx, ty1, ty2, ty3 + 1, 2] = f3_node[tx] + 0.5f0 * f3_R_node[tx] * (1 - dg_only) - end - end - end - - sync_threads() - - # Contribute FV to the volume integrals - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2, ty3] += alpha_element * - (inverse_weights[ty1] * - (shmem_fstar1[tx, ty1 + 1, ty2, ty3, 1] - shmem_fstar1[tx, ty1, ty2, ty3, 2]) + - inverse_weights[ty2] * - (shmem_fstar2[tx, ty1, ty2 + 1, ty3, 1] - shmem_fstar2[tx, ty1, ty2, ty3, 2]) + - inverse_weights[ty3] * - (shmem_fstar3[tx, ty1, ty2, ty3 + 1, 1] - shmem_fstar3[tx, ty1, ty2, ty3, 2])) * - (1 - dg_only) - end - - # Compute DG volume fluxes - for thread in 1:tile_width - volume_flux_node1 = volume_flux_dg(u_node, - get_node_vars(u, equations, thread, ty2, ty3, k), - 1, equations) - volume_flux_node2 = volume_flux_dg(u_node, - get_node_vars(u, equations, ty1, thread, ty3, k), - 2, equations) - volume_flux_node3 = volume_flux_dg(u_node, - get_node_vars(u, equations, ty1, ty2, thread, k), - 3, equations) - - noncons_flux_node1 = noncons_flux_dg(u_node, - get_node_vars(u, equations, thread, ty2, ty3, k), - 1, equations) - noncons_flux_node2 = noncons_flux_dg(u_node, - get_node_vars(u, equations, ty1, thread, ty3, k), - 2, equations) - noncons_flux_node3 = noncons_flux_dg(u_node, - get_node_vars(u, equations, ty1, ty2, thread, k), - 3, equations) - - # Contribute DG to the volume integrals - for tx in axes(du, 1) - @inbounds shmem_value[tx, ty1, ty2, ty3] += (volume_flux_node1[tx] * shmem_split[thread, ty1] * - (1 - isequal(ty1, thread)) + - volume_flux_node2[tx] * shmem_split[thread, ty2] * - (1 - isequal(ty2, thread)) + - volume_flux_node3[tx] * shmem_split[thread, ty3] * - (1 - isequal(ty3, thread)) + - 0.5f0 * - (shmem_split[thread, ty1] * noncons_flux_node1[tx] + - shmem_split[thread, ty2] * noncons_flux_node2[tx] + - shmem_split[thread, ty3] * noncons_flux_node3[tx])) * dg_only + - ((1 - alpha_element) * - volume_flux_node1[tx] * shmem_split[thread, ty1] * - (1 - isequal(ty1, thread)) + - (1 - alpha_element) * - volume_flux_node2[tx] * shmem_split[thread, ty2] * - (1 - isequal(ty2, thread)) + - (1 - alpha_element) * - volume_flux_node3[tx] * shmem_split[thread, ty3] * - (1 - isequal(ty3, thread)) + - 0.5f0 * (1 - alpha_element) * - (shmem_split[thread, ty1] * noncons_flux_node1[tx] + - shmem_split[thread, ty2] * noncons_flux_node2[tx] + - shmem_split[thread, ty3] * noncons_flux_node3[tx])) * (1 - dg_only) - end - end - - # Finalize the values - for tx in axes(du, 1) - @inbounds du[tx, ty1, ty2, ty3, k] = shmem_value[tx, ty1, ty2, ty3] - end - - return nothing -end - -# Kernel for prolonging two interfaces -function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids, orientations, - equations::AbstractEquations{3}) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(interfaces_u, 2) * size(interfaces_u, 3)^2 && k <= size(interfaces_u, 5)) - u2 = size(u, 2) # size(interfaces_u, 3) == size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - @inbounds begin - orientation = orientations[k] - left_element = neighbor_ids[1, k] - right_element = neighbor_ids[2, k] - - interfaces_u[1, j1, j2, j3, k] = u[j1, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j3, - isequal(orientation, 1) * j3 + isequal(orientation, 2) * j3 + isequal(orientation, 3) * u2, - left_element] - interfaces_u[2, j1, j2, j3, k] = u[j1, - isequal(orientation, 1) + isequal(orientation, 2) * j2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) + isequal(orientation, 3) * j3, - isequal(orientation, 1) * j3 + isequal(orientation, 2) * j3 + isequal(orientation, 3), - right_element] - end - end - - return nothing -end - -# Kernel for calculating surface fluxes -function surface_flux_kernel!(surface_flux_arr, interfaces_u, orientations, - equations::AbstractEquations{3}, surface_flux::Any) - j1 = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j2 = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (j1 <= size(surface_flux_arr, 2) && j2 <= size(surface_flux_arr, 3) && - k <= size(surface_flux_arr, 4)) - u_ll, u_rr = get_surface_node_vars(interfaces_u, equations, j1, j2, k) - @inbounds orientation = orientations[k] - - surface_flux_node = surface_flux(u_ll, u_rr, orientation, equations) - - for ii in axes(surface_flux_arr, 1) - @inbounds surface_flux_arr[ii, j1, j2, k] = surface_flux_node[ii] - end - end - - return nothing -end - -# Kernel for calculating surface and both nonconservative fluxes -function surface_noncons_flux_kernel!(surface_flux_arr, noncons_left_arr, noncons_right_arr, - interfaces_u, orientations, equations::AbstractEquations{3}, - surface_flux::Any, nonconservative_flux::Any) - j1 = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j2 = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (j1 <= size(surface_flux_arr, 2) && j2 <= size(surface_flux_arr, 3) && - k <= size(surface_flux_arr, 4)) - u_ll, u_rr = get_surface_node_vars(interfaces_u, equations, j1, j2, k) - @inbounds orientation = orientations[k] - - surface_flux_node = surface_flux(u_ll, u_rr, orientation, equations) - noncons_left_node = nonconservative_flux(u_ll, u_rr, orientation, equations) - noncons_right_node = nonconservative_flux(u_rr, u_ll, orientation, equations) - - for ii in axes(surface_flux_arr, 1) - @inbounds begin - surface_flux_arr[ii, j1, j2, k] = surface_flux_node[ii] - noncons_left_arr[ii, j1, j2, k] = noncons_left_node[ii] - noncons_right_arr[ii, j1, j2, k] = noncons_right_node[ii] - end - end - end - - return nothing -end - -# Kernel for setting interface fluxes -function interface_flux_kernel!(surface_flux_values, surface_flux_arr, neighbor_ids, orientations, - equations::AbstractEquations{3}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_arr, 2)^2 && - k <= size(surface_flux_arr, 4)) - j1 = div(j - 1, size(surface_flux_arr, 2)) + 1 - j2 = rem(j - 1, size(surface_flux_arr, 2)) + 1 - - @inbounds begin - left_id = neighbor_ids[1, k] - right_id = neighbor_ids[2, k] - - left_direction = 2 * orientations[k] - right_direction = 2 * orientations[k] - 1 - - surface_flux_values[i, j1, j2, left_direction, left_id] = surface_flux_arr[i, j1, j2, k] - surface_flux_values[i, j1, j2, right_direction, right_id] = surface_flux_arr[i, j1, j2, k] - end - end - - return nothing -end - -# Kernel for setting interface fluxes -function interface_flux_kernel!(surface_flux_values, surface_flux_arr, noncons_left_arr, - noncons_right_arr, neighbor_ids, orientations, - equations::AbstractEquations{3}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_arr, 2)^2 && - k <= size(surface_flux_arr, 4)) - j1 = div(j - 1, size(surface_flux_arr, 2)) + 1 - j2 = rem(j - 1, size(surface_flux_arr, 2)) + 1 - - @inbounds begin - left_id = neighbor_ids[1, k] - right_id = neighbor_ids[2, k] - - left_direction = 2 * orientations[k] - right_direction = 2 * orientations[k] - 1 - - surface_flux_values[i, j1, j2, left_direction, left_id] = surface_flux_arr[i, j1, j2, k] + - 0.5f0 * - noncons_left_arr[i, j1, j2, k] - surface_flux_values[i, j1, j2, right_direction, right_id] = surface_flux_arr[i, j1, j2, k] + - 0.5f0 * - noncons_right_arr[i, j1, j2, k] - end - end - - return nothing -end - -# Kernel for prolonging two boundaries -function prolong_boundaries_kernel!(boundaries_u, u, neighbor_ids, neighbor_sides, orientations, - equations::AbstractEquations{3}) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(boundaries_u, 2) * size(boundaries_u, 3)^2 && k <= size(boundaries_u, 5)) - u2 = size(u, 2) # size(boundaries_u, 3) == size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - @inbounds begin - element = neighbor_ids[k] - side = neighbor_sides[k] - orientation = orientations[k] - - boundaries_u[1, j1, j2, j3, k] = u[j1, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j3, - isequal(orientation, 1) * j3 + isequal(orientation, 2) * j3 + isequal(orientation, 3) * u2, - element] * (2 - side) # Set to 0 instead of NaN - boundaries_u[2, j1, j2, j3, k] = u[j1, - isequal(orientation, 1) + isequal(orientation, 2) * j2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) + isequal(orientation, 3) * j3, - isequal(orientation, 1) * j3 + isequal(orientation, 2) * j3 + isequal(orientation, 3), - element] * (side - 1) # Set to 0 instead of NaN - end - end - - return nothing -end - -# Kernel for calculating boundary fluxes -function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, - indices_arr, neighbor_ids, neighbor_sides, orientations, - boundary_conditions::NamedTuple, equations::AbstractEquations{3}, - surface_flux::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(surface_flux_values, 2)^2 && k <= length(boundary_arr)) - j1 = div(j - 1, size(surface_flux_values, 2)) + 1 - j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 - - @inbounds begin - boundary = boundary_arr[k] - direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) + - (indices_arr[3] <= boundary) + (indices_arr[4] <= boundary) + - (indices_arr[5] <= boundary) + (indices_arr[6] <= boundary) - - neighbor = neighbor_ids[boundary] - side = neighbor_sides[boundary] - orientation = orientations[boundary] - end - - u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, j1, j2, boundary) - u_inner = (2 - side) * u_ll + (side - 1) * u_rr - x = get_node_coords(node_coordinates, equations, j1, j2, boundary) - - # TODO: Improve this part - if direction == 1 - boundary_flux_node = boundary_conditions[1](u_inner, orientation, - direction, x, t, surface_flux, equations) - elseif direction == 2 - boundary_flux_node = boundary_conditions[2](u_inner, orientation, - direction, x, t, surface_flux, equations) - elseif direction == 3 - boundary_flux_node = boundary_conditions[3](u_inner, orientation, - direction, x, t, surface_flux, equations) - elseif direction == 4 - boundary_flux_node = boundary_conditions[4](u_inner, orientation, - direction, x, t, surface_flux, equations) - elseif direction == 5 - boundary_flux_node = boundary_conditions[5](u_inner, orientation, - direction, x, t, surface_flux, equations) - else - boundary_flux_node = boundary_conditions[6](u_inner, orientation, - direction, x, t, surface_flux, equations) - end - - for ii in axes(surface_flux_values, 1) - # `boundary_flux_node` can be nothing if periodic boundary condition is applied - @inbounds surface_flux_values[ii, j1, j2, direction, neighbor] = isnothing(boundary_flux_node) ? # bad - surface_flux_values[ii, j1, - j2, - direction, - neighbor] : - boundary_flux_node[ii] - end - end - - return nothing -end - -# Kernel for copying data small to small on mortars -function prolong_mortars_small2small_kernel!(u_upper_left, u_upper_right, u_lower_left, - u_lower_right, u, neighbor_ids, large_sides, - orientations) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(u_upper_left, 2) && j <= size(u_upper_left, 3)^2 && k <= size(u_upper_left, 5)) - u2 = size(u, 2) # size(u_upper_left, 3) == size(u, 2) - - j1 = div(j - 1, u2) + 1 - j2 = rem(j - 1, u2) + 1 - - @inbounds begin - large_side = large_sides[k] - orientation = orientations[k] - - lower_left_element = neighbor_ids[1, k] - lower_right_element = neighbor_ids[2, k] - upper_left_element = neighbor_ids[3, k] - upper_right_element = neighbor_ids[4, k] - - u_upper_left[2, i, j1, j2, k] = u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, - isequal(orientation, 1) * j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), - upper_left_element] * (2 - large_side) - - u_upper_right[2, i, j1, j2, k] = u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, - isequal(orientation, 1) * j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), - upper_right_element] * (2 - large_side) - - u_lower_left[2, i, j1, j2, k] = u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, - isequal(orientation, 1) * j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), - lower_left_element] * (2 - large_side) - - u_lower_right[2, i, j1, j2, k] = u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, - isequal(orientation, 1) * j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), - lower_right_element] * (2 - large_side) - - u_upper_left[1, i, j1, j2, k] = u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, - isequal(orientation, 1) * j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - upper_left_element] * (large_side - 1) - - u_upper_right[1, i, j1, j2, k] = u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, - isequal(orientation, 1) * j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - upper_right_element] * (large_side - 1) - - u_lower_left[1, i, j1, j2, k] = u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, - isequal(orientation, 1) * j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - lower_left_element] * (large_side - 1) - - u_lower_right[1, i, j1, j2, k] = u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, - isequal(orientation, 1) * j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - lower_right_element] * (large_side - 1) - end - end - - return nothing -end - -# # Kernel for interpolating data large to small on mortars - step 1 -# function prolong_mortars_large2small_kernel!(tmp_upper_left, tmp_upper_right, tmp_lower_left, -# tmp_lower_right, u, forward_upper, -# forward_lower, neighbor_ids, large_sides, orientations) -# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x -# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y -# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - -# if (i <= size(tmp_upper_left, 2) && j <= size(tmp_upper_left, 3)^2 && -# k <= size(tmp_upper_left, 5)) -# u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2) - -# j1 = div(j - 1, u2) + 1 -# j2 = rem(j - 1, u2) + 1 - -# large_side = large_sides[k] -# orientation = orientations[k] -# large_element = neighbor_ids[5, k] - -# leftright = large_side - -# @inbounds begin -# for j1j1 in axes(forward_lower, 2) -# tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * -# u[i, -# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, -# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, -# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, -# large_element] * (2 - large_side) - -# tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * -# u[i, -# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, -# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, -# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, -# large_element] * (2 - large_side) - -# tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * -# u[i, -# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, -# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, -# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, -# large_element] * (2 - large_side) - -# tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * -# u[i, -# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, -# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, -# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, -# large_element] * (2 - large_side) -# end - -# for j1j1 in axes(forward_lower, 2) -# tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * -# u[i, -# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, -# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, -# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), -# large_element] * (large_side - 1) - -# tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * -# u[i, -# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, -# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, -# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), -# large_element] * (large_side - 1) - -# tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * -# u[i, -# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, -# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, -# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), -# large_element] * (large_side - 1) - -# tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * -# u[i, -# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, -# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, -# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), -# large_element] * (large_side - 1) -# end -# end -# end - -# return nothing -# end - -# # Kernel for interpolating data large to small on mortars - step 2 -# function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, -# u_lower_right, tmp_upper_left, tmp_upper_right, -# tmp_lower_left, tmp_lower_right, forward_upper, -# forward_lower, large_sides) -# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x -# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y -# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - -# if (i <= size(u_upper_left, 2) && j <= size(u_upper_left, 3)^2 && -# k <= size(u_upper_left, 5)) -# u2 = size(u_upper_left, 3) # size(u_upper_left, 3) == size(u, 2) - -# j1 = div(j - 1, u2) + 1 -# j2 = rem(j - 1, u2) + 1 - -# leftright = large_sides[k] - -# @inbounds begin -# for j2j2 in axes(forward_upper, 2) -# u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * -# tmp_upper_left[leftright, i, j1, j2j2, k] - -# u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * -# tmp_upper_right[leftright, i, j1, j2j2, k] - -# u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * -# tmp_lower_left[leftright, i, j1, j2j2, k] - -# u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * -# tmp_lower_right[leftright, i, j1, j2j2, k] -# end -# end -# end - -# return nothing -# end - -# Kernel for interpolating data large to small on mortars (optimized) -function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, u_lower_right, - tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, - u, forward_upper, forward_lower, neighbor_ids, large_sides, - orientations) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - # Loop stride for each dimension - stride_x = gridDim().x * blockDim().x - stride_y = gridDim().y * blockDim().y - stride_z = gridDim().z * blockDim().z - - # Cooperative kernel needs stride loops to handle the constrained launch size - while i <= size(tmp_upper_left, 2) - while j <= size(tmp_upper_left, 3)^2 - while k <= size(tmp_upper_left, 5) - u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2) - - j1 = div(j - 1, u2) + 1 - j2 = rem(j - 1, u2) + 1 - - @inbounds begin - large_side = large_sides[k] - orientation = orientations[k] - large_element = neighbor_ids[5, k] - end - - leftright = large_side - - for j1j1 in axes(forward_lower, 2) - @inbounds begin - tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * - u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - large_element] * (2 - large_side) - - tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * - u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - large_element] * (2 - large_side) - - tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * - u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - large_element] * (2 - large_side) - - tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * - u[i, - isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, - large_element] * (2 - large_side) - end - end - - for j1j1 in axes(forward_lower, 2) - @inbounds begin - tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * - u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, - 3), - large_element] * (large_side - 1) - - tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * - u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, - 3), - large_element] * (large_side - 1) - - tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * - u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, - 3), - large_element] * (large_side - 1) - - tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * - u[i, - isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, - isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, - isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, - 3), - large_element] * (large_side - 1) - end - end - - # Grid scope synchronization - grid = CG.this_grid() - CG.sync(grid) - - for j2j2 in axes(forward_upper, 2) - @inbounds begin - u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * - tmp_upper_left[leftright, i, j1, j2j2, k] - - u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * - tmp_upper_right[leftright, i, j1, j2j2, k] - - u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * - tmp_lower_left[leftright, i, j1, j2j2, k] - - u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * - tmp_lower_right[leftright, i, j1, j2j2, k] - end - end - k += stride_z - end - j += stride_y - end - i += stride_x - end - - return nothing -end - -# Kernel for calculating mortar fluxes -function mortar_flux_kernel!(fstar_primary_upper_left, fstar_primary_upper_right, - fstar_primary_lower_left, fstar_primary_lower_right, - fstar_secondary_upper_left, fstar_secondary_upper_right, - fstar_secondary_lower_left, fstar_seondary_lower_right, - u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, - equations::AbstractEquations{3}, surface_flux::Any) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(u_upper_left, 3) && j <= size(u_upper_left, 4) && k <= length(orientations)) - u_upper_left_ll, u_upper_left_rr = get_surface_node_vars(u_upper_left, equations, i, j, k) - u_upper_right_ll, u_upper_right_rr = get_surface_node_vars(u_upper_right, equations, i, j, k) - u_lower_left_ll, u_lower_left_rr = get_surface_node_vars(u_lower_left, equations, i, j, k) - u_lower_right_ll, u_lower_right_rr = get_surface_node_vars(u_lower_right, equations, i, j, k) - - @inbounds orientation = orientations[k] - - flux_upper_left_node = surface_flux(u_upper_left_ll, u_upper_left_rr, orientation, - equations) - flux_upper_right_node = surface_flux(u_upper_right_ll, u_upper_right_rr, orientation, - equations) - flux_lower_left_node = surface_flux(u_lower_left_ll, u_lower_left_rr, orientation, - equations) - flux_lower_right_node = surface_flux(u_lower_right_ll, u_lower_right_rr, orientation, - equations) - - for ii in axes(fstar_primary_upper_left, 1) - @inbounds begin - fstar_primary_upper_left[ii, i, j, k] = flux_upper_left_node[ii] - fstar_primary_upper_right[ii, i, j, k] = flux_upper_right_node[ii] - - fstar_primary_lower_left[ii, i, j, k] = flux_lower_left_node[ii] - fstar_primary_lower_right[ii, i, j, k] = flux_lower_right_node[ii] - - fstar_secondary_upper_left[ii, i, j, k] = flux_upper_left_node[ii] - fstar_secondary_upper_right[ii, i, j, k] = flux_upper_right_node[ii] - - fstar_secondary_lower_left[ii, i, j, k] = flux_lower_left_node[ii] - fstar_seondary_lower_right[ii, i, j, k] = flux_lower_right_node[ii] - end - end - end - - return nothing -end - -# Kernel for calculating mortar fluxes and adding nonconservative fluxes -function mortar_flux_kernel!(fstar_primary_upper_left, fstar_primary_upper_right, - fstar_primary_lower_left, fstar_primary_lower_right, - fstar_secondary_upper_left, fstar_secondary_upper_right, - fstar_secondary_lower_left, fstar_seondary_lower_right, - u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, - large_sides, equations::AbstractEquations{3}, surface_flux::Any, - nonconservative_flux::Any) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(u_upper_left, 3) && j <= size(u_upper_left, 4) && k <= length(orientations)) - u_upper_left_ll, u_upper_left_rr = get_surface_node_vars(u_upper_left, equations, i, j, k) - u_upper_right_ll, u_upper_right_rr = get_surface_node_vars(u_upper_right, equations, i, j, k) - u_lower_left_ll, u_lower_left_rr = get_surface_node_vars(u_lower_left, equations, i, j, k) - u_lower_right_ll, u_lower_right_rr = get_surface_node_vars(u_lower_right, equations, i, j, k) - - @inbounds begin - orientation = orientations[k] - large_side = large_sides[k] - end - - flux_upper_left_node = surface_flux(u_upper_left_ll, u_upper_left_rr, orientation, - equations) - flux_upper_right_node = surface_flux(u_upper_right_ll, u_upper_right_rr, orientation, - equations) - flux_lower_left_node = surface_flux(u_lower_left_ll, u_lower_left_rr, orientation, - equations) - flux_lower_right_node = surface_flux(u_lower_right_ll, u_lower_right_rr, orientation, - equations) - - for ii in axes(fstar_primary_upper_left, 1) - @inbounds begin - fstar_primary_upper_left[ii, i, j, k] = flux_upper_left_node[ii] - fstar_primary_upper_right[ii, i, j, k] = flux_upper_right_node[ii] - - fstar_primary_lower_left[ii, i, j, k] = flux_lower_left_node[ii] - fstar_primary_lower_right[ii, i, j, k] = flux_lower_right_node[ii] - - fstar_secondary_upper_left[ii, i, j, k] = flux_upper_left_node[ii] - fstar_secondary_upper_right[ii, i, j, k] = flux_upper_right_node[ii] - - fstar_secondary_lower_left[ii, i, j, k] = flux_lower_left_node[ii] - fstar_seondary_lower_right[ii, i, j, k] = flux_lower_right_node[ii] - end - end - - u_upper_left1 = (2 - large_side) * u_upper_left_ll + (large_side - 1) * u_upper_left_rr - u_upper_left2 = (large_side - 1) * u_upper_left_ll + (2 - large_side) * u_upper_left_rr - - u_upper_right1 = (2 - large_side) * u_upper_right_ll + (large_side - 1) * u_upper_right_rr - u_upper_right2 = (large_side - 1) * u_upper_right_ll + (2 - large_side) * u_upper_right_rr - - u_lower_left1 = (2 - large_side) * u_lower_left_ll + (large_side - 1) * u_lower_left_rr - u_lower_left2 = (large_side - 1) * u_lower_left_ll + (2 - large_side) * u_lower_left_rr - - u_lower_right1 = (2 - large_side) * u_lower_right_ll + (large_side - 1) * u_lower_right_rr - u_lower_right2 = (large_side - 1) * u_lower_right_ll + (2 - large_side) * u_lower_right_rr - - noncons_flux_primary_upper_left = nonconservative_flux(u_upper_left1, u_upper_left2, - orientation, equations) - noncons_flux_primary_upper_right = nonconservative_flux(u_upper_right1, u_upper_right2, - orientation, equations) - noncons_flux_primary_lower_left = nonconservative_flux(u_lower_left1, u_lower_left2, - orientation, equations) - noncons_flux_primary_lower_right = nonconservative_flux(u_lower_right1, u_lower_right2, - orientation, equations) - noncons_flux_secondary_upper_left = nonconservative_flux(u_upper_left2, u_upper_left1, - orientation, equations) - noncons_flux_secondary_upper_right = nonconservative_flux(u_upper_right2, u_upper_right1, - orientation, equations) - noncons_flux_secondary_lower_left = nonconservative_flux(u_lower_left2, u_lower_left1, - orientation, equations) - noncons_flux_secondary_lower_right = nonconservative_flux(u_lower_right2, u_lower_right1, - orientation, equations) - - for ii in axes(fstar_primary_upper_left, 1) - @inbounds begin - fstar_primary_upper_left[ii, i, j, k] += 0.5f0 * noncons_flux_primary_upper_left[ii] - fstar_primary_upper_right[ii, i, j, k] += 0.5f0 * noncons_flux_primary_upper_right[ii] - - fstar_primary_lower_left[ii, i, j, k] += 0.5f0 * noncons_flux_primary_lower_left[ii] - fstar_primary_lower_right[ii, i, j, k] += 0.5f0 * noncons_flux_primary_lower_right[ii] - - fstar_secondary_upper_left[ii, i, j, k] += 0.5f0 * noncons_flux_secondary_upper_left[ii] - fstar_secondary_upper_right[ii, i, j, k] += 0.5f0 * noncons_flux_secondary_upper_right[ii] - - fstar_secondary_lower_left[ii, i, j, k] += 0.5f0 * noncons_flux_secondary_lower_left[ii] - fstar_seondary_lower_right[ii, i, j, k] += 0.5f0 * noncons_flux_secondary_lower_right[ii] - end - end - end - - return nothing -end - -# # Kernel for copying mortar fluxes small to small and small to large - step 1 -# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right, -# tmp_lower_left, tmp_lower_right, -# fstar_primary_upper_left, fstar_primary_upper_right, -# fstar_primary_lower_left, fstar_primary_lower_right, -# fstar_secondary_upper_left, fstar_secondary_upper_right, -# fstar_secondary_lower_left, fstar_secondary_lower_right, -# reverse_upper, reverse_lower, neighbor_ids, large_sides, -# orientations) -# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x -# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y -# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - -# if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && -# k <= length(orientations)) -# j1 = div(j - 1, size(surface_flux_values, 2)) + 1 -# j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 - -# lower_left_element = neighbor_ids[1, k] -# lower_right_element = neighbor_ids[2, k] -# upper_left_element = neighbor_ids[3, k] -# upper_right_element = neighbor_ids[4, k] -# large_element = neighbor_ids[5, k] - -# large_side = large_sides[k] -# orientation = orientations[k] - -# # Use simple math expression to enhance the performance (against control flow), -# # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + -# # isequal(large_side, 1) * isequal(orientation, 2) * 3 + -# # isequal(large_side, 1) * isequal(orientation, 3) * 5 + -# # isequal(large_side, 2) * isequal(orientation, 1) * 2 + -# # isequal(large_side, 2) * isequal(orientation, 2) * 4 + -# # isequal(large_side, 2) * isequal(orientation, 3) * 6`. -# # Please also check the original code in Trixi.jl when you modify this code. -# direction = 2 * orientation + large_side - 2 - -# surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] -# surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] -# surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] -# surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] - -# # Use simple math expression to enhance the performance (against control flow), -# # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + -# # isequal(large_side, 1) * isequal(orientation, 2) * 4 + -# # isequal(large_side, 1) * isequal(orientation, 3) * 6 + -# # isequal(large_side, 2) * isequal(orientation, 1) * 1 + -# # isequal(large_side, 2) * isequal(orientation, 2) * 3 + -# # isequal(large_side, 2) * isequal(orientation, 3) * 5`. -# # Please also check the original code in Trixi.jl when you modify this code. -# direction = 2 * orientation - large_side + 1 - -# @inbounds begin -# for j1j1 in axes(reverse_upper, 2) -# tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * -# fstar_secondary_upper_left[i, j1j1, j2, k] -# tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * -# fstar_secondary_upper_right[i, j1j1, j2, k] -# tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * -# fstar_secondary_lower_left[i, j1j1, j2, k] -# tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * -# fstar_secondary_lower_right[i, j1j1, j2, k] -# end -# end -# end - -# return nothing -# end - -# # Kernel for copying mortar fluxes small to small and small to large - step 2 -# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, -# tmp_upper_right, tmp_lower_left, tmp_lower_right, -# reverse_upper, reverse_lower, neighbor_ids, large_sides, -# orientations, equations::AbstractEquations{3}) -# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x -# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y -# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - -# if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && -# k <= length(orientations)) -# j1 = div(j - 1, size(surface_flux_values, 2)) + 1 -# j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 - -# large_element = neighbor_ids[5, k] - -# large_side = large_sides[k] -# orientation = orientations[k] - -# # See step 1 for the explanation of the following expression -# direction = 2 * orientation - large_side + 1 - -# @inbounds begin -# for j2j2 in axes(reverse_lower, 2) -# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * -# tmp_upper_left[i, j1, j2j2, -# direction, -# large_element] -# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * -# tmp_upper_right[i, j1, j2j2, -# direction, -# large_element] -# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * -# tmp_lower_left[i, j1, j2j2, -# direction, -# large_element] -# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * -# tmp_lower_right[i, j1, j2j2, -# direction, -# large_element] -# end - -# surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, -# direction, -# large_element] -# end -# end - -# return nothing -# end - -# Kernel for copying mortar fluxes small to small and small to large (optimized) -function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, - tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, - fstar_primary_upper_left, fstar_primary_upper_right, - fstar_primary_lower_left, fstar_primary_lower_right, - fstar_secondary_upper_left, fstar_secondary_upper_right, - fstar_secondary_lower_left, fstar_secondary_lower_right, - reverse_upper, reverse_lower, neighbor_ids, large_sides, - orientations) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - # Loop stride for each dimension - stride_x = gridDim().x * blockDim().x - stride_y = gridDim().y * blockDim().y - stride_z = gridDim().z * blockDim().z - - # Cooperative kernel needs stride loops to handle the constrained launch size - while i <= size(surface_flux_values, 1) - while j <= size(surface_flux_values, 2)^2 - while k <= length(orientations) - j1 = div(j - 1, size(surface_flux_values, 2)) + 1 - j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 - - @inbounds begin - lower_left_element = neighbor_ids[1, k] - lower_right_element = neighbor_ids[2, k] - upper_left_element = neighbor_ids[3, k] - upper_right_element = neighbor_ids[4, k] - large_element = neighbor_ids[5, k] - - large_side = large_sides[k] - orientation = orientations[k] - - # Use simple math expression to enhance the performance (against control flow), - # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + - # isequal(large_side, 1) * isequal(orientation, 2) * 3 + - # isequal(large_side, 1) * isequal(orientation, 3) * 5 + - # isequal(large_side, 2) * isequal(orientation, 1) * 2 + - # isequal(large_side, 2) * isequal(orientation, 2) * 4 + - # isequal(large_side, 2) * isequal(orientation, 3) * 6`. - # Please also check the original code in Trixi.jl when you modify this code. - direction = 2 * orientation + large_side - 2 - - surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] - surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] - surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] - surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] - - # Use simple math expression to enhance the performance (against control flow), - # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + - # isequal(large_side, 1) * isequal(orientation, 2) * 4 + - # isequal(large_side, 1) * isequal(orientation, 3) * 6 + - # isequal(large_side, 2) * isequal(orientation, 1) * 1 + - # isequal(large_side, 2) * isequal(orientation, 2) * 3 + - # isequal(large_side, 2) * isequal(orientation, 3) * 5`. - # Please also check the original code in Trixi.jl when you modify this code. - direction = 2 * orientation - large_side + 1 - end - - for j1j1 in axes(reverse_upper, 2) - @inbounds begin - tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * - fstar_secondary_upper_left[i, j1j1, j2, k] - tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * - fstar_secondary_upper_right[i, j1j1, j2, k] - tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * - fstar_secondary_lower_left[i, j1j1, j2, k] - tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * - fstar_secondary_lower_right[i, j1j1, j2, k] - end - end - - # Grid scope synchronization - grid = CG.this_grid() - CG.sync(grid) - - for j2j2 in axes(reverse_lower, 2) - @inbounds begin - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * - tmp_upper_left[i, j1, j2j2, - direction, - large_element] - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * - tmp_upper_right[i, j1, j2j2, - direction, - large_element] - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * - tmp_lower_left[i, j1, j2j2, - direction, - large_element] - tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * - tmp_lower_right[i, j1, j2j2, - direction, - large_element] - end - end - - @inbounds surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, - direction, - large_element] - k += stride_z - end - j += stride_y - end - i += stride_x - end - - return nothing -end - -# Kernel for calculating surface integrals -function surface_integral_kernel!(du, factor_arr, surface_flux_values, - equations::AbstractEquations{3}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) - u2 = size(du, 2) # size(du, 2) == size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - @inbounds begin - du[i, j1, j2, j3, k] -= (surface_flux_values[i, j2, j3, 1, k] * isequal(j1, 1) + - surface_flux_values[i, j1, j3, 3, k] * isequal(j2, 1) + - surface_flux_values[i, j1, j2, 5, k] * isequal(j3, 1)) * - factor_arr[1] - du[i, j1, j2, j3, k] += (surface_flux_values[i, j2, j3, 2, k] * isequal(j1, u2) + - surface_flux_values[i, j1, j3, 4, k] * isequal(j2, u2) + - surface_flux_values[i, j1, j2, 6, k] * isequal(j3, u2)) * - factor_arr[2] - end - end - - return nothing -end - -# Kernel for applying inverse Jacobian -function jacobian_kernel!(du, inverse_jacobian, equations::AbstractEquations{3}) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - j = (blockIdx().y - 1) * blockDim().y + threadIdx().y - k = (blockIdx().z - 1) * blockDim().z + threadIdx().z - - if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) - u2 = size(du, 2) # size(du, 2) == size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - @inbounds du[i, j1, j2, j3, k] *= -inverse_jacobian[k] - end - - return nothing -end - -# Kernel for calculating source terms -function source_terms_kernel!(du, u, node_coordinates, t, equations::AbstractEquations{3}, - source_terms::Any) - j = (blockIdx().x - 1) * blockDim().x + threadIdx().x - k = (blockIdx().y - 1) * blockDim().y + threadIdx().y - - if (j <= size(du, 2)^3 && k <= size(du, 5)) - u2 = size(u, 2) # size(du, 2) == size(u, 2) - - j1 = div(j - 1, u2^2) + 1 - j2 = div(rem(j - 1, u2^2), u2) + 1 - j3 = rem(rem(j - 1, u2^2), u2) + 1 - - u_local = get_node_vars(u, equations, j1, j2, j3, k) - x_local = get_node_coords(node_coordinates, equations, j1, j2, j3, k) - - source_terms_node = source_terms(u_local, x_local, t, equations) - - for ii in axes(du, 1) - @inbounds du[ii, j1, j2, j3, k] += source_terms_node[ii] - end - end - - return nothing -end +include("dg_3d_kernel.jl") -################################################################################################# # Functions that begin with `cuda_` are the functions that pack CUDA kernels together to do # partial work in semidiscretization. They are used to invoke kernels from the host (i.e., CPU) # and run them on the device (i.e., GPU). diff --git a/src/solvers/dg_3d_kernel.jl b/src/solvers/dg_3d_kernel.jl new file mode 100644 index 0000000..bfb4b40 --- /dev/null +++ b/src/solvers/dg_3d_kernel.jl @@ -0,0 +1,2138 @@ +# GPU kernels related to a DG semidiscretization in 3D. + +# Functions that end with `_kernel` are CUDA kernels that are going to be launched by +# the @cuda macro with parameters from the kernel configurator. They are purely run on +# the device (i.e., GPU). + +# Kernel for calculating fluxes along normal directions +function flux_kernel!(flux_arr1, flux_arr2, flux_arr3, u, equations::AbstractEquations{3}, + flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^3 && k <= size(u, 5)) + u2 = size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + u_node = get_node_vars(u, equations, j1, j2, j3, k) + + flux_node1 = flux(u_node, 1, equations) + flux_node2 = flux(u_node, 2, equations) + flux_node3 = flux(u_node, 3, equations) + + for ii in axes(u, 1) + @inbounds begin + flux_arr1[ii, j1, j2, j3, k] = flux_node1[ii] + flux_arr2[ii, j1, j2, j3, k] = flux_node2[ii] + flux_arr3[ii, j1, j2, j3, k] = flux_node3[ii] + end + end + end + + return nothing +end + +# Kernel for calculating weak form +function weak_form_kernel!(du, derivative_dhat, flux_arr1, flux_arr2, flux_arr3) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) + u2 = size(du, 2) # size(du, 2) == size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + @inbounds du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros + + for ii in axes(du, 2) + @inbounds du[i, j1, j2, j3, k] += derivative_dhat[j1, ii] * flux_arr1[i, ii, j2, j3, k] + + derivative_dhat[j2, ii] * flux_arr2[i, j1, ii, j3, k] + + derivative_dhat[j3, ii] * flux_arr3[i, j1, j2, ii, k] + end + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating volume integrals with weak form +function flux_weak_form_kernel!(du, u, derivative_dhat, + equations::AbstractEquations{3}, flux::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_dhat = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_flux = CuDynamicSharedArray(eltype(du), + (size(du, 1), tile_width, tile_width, tile_width, 3), offset) + + # Get thread and block indices only we need save registers + tx, ty = threadIdx().x, threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + ty1 = div(ty - 1, tile_width^2) + 1 + ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 + ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 + + # Tile the computation (restrict to one tile here) + value = zero(eltype(du)) + + # Load global `derivative_dhat` into shared memory + # Transposed load + @inbounds shmem_dhat[ty1, ty2] = derivative_dhat[ty2, ty1] + + # Compute flux values + u_node = get_node_vars(u, equations, ty1, ty2, ty3, k) + flux_node1 = flux(u_node, 1, equations) + flux_node2 = flux(u_node, 2, equations) + flux_node3 = flux(u_node, 3, equations) + + @inbounds begin + shmem_flux[tx, ty1, ty2, ty3, 1] = flux_node1[tx] + shmem_flux[tx, ty1, ty2, ty3, 2] = flux_node2[tx] + shmem_flux[tx, ty1, ty2, ty3, 3] = flux_node3[tx] + end + + sync_threads() + + # Loop within one block to get weak form + # TODO: Avoid potential bank conflicts + for thread in 1:tile_width + @inbounds value += shmem_dhat[thread, ty1] * shmem_flux[tx, thread, ty2, ty3, 1] + + shmem_dhat[thread, ty2] * shmem_flux[tx, ty1, thread, ty3, 2] + + shmem_dhat[thread, ty3] * shmem_flux[tx, ty1, ty2, thread, 3] + end + + # Synchronization is not needed here if we use only one tile + # sync_threads() + + # Finalize the weak form + @inbounds du[tx, ty1, ty2, ty3, k] = value + + return nothing +end + +# CUDA kernel for calculating volume fluxes +function volume_flux_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, u, + equations::AbstractEquations{3}, volume_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^4 && k <= size(u, 5)) + u2 = size(u, 2) + + j1 = div(j - 1, u2^3) + 1 + j2 = div(rem(j - 1, u2^3), u2^2) + 1 + j3 = div(rem(j - 1, u2^2), u2) + 1 + j4 = rem(j - 1, u2) + 1 + + u_node = get_node_vars(u, equations, j1, j2, j3, k) + u_node1 = get_node_vars(u, equations, j4, j2, j3, k) + u_node2 = get_node_vars(u, equations, j1, j4, j3, k) + u_node3 = get_node_vars(u, equations, j1, j2, j4, k) + + volume_flux_node1 = volume_flux(u_node, u_node1, 1, equations) + volume_flux_node2 = volume_flux(u_node, u_node2, 2, equations) + volume_flux_node3 = volume_flux(u_node, u_node3, 3, equations) + + for ii in axes(u, 1) + @inbounds begin + volume_flux_arr1[ii, j1, j4, j2, j3, k] = volume_flux_node1[ii] + volume_flux_arr2[ii, j1, j2, j4, j3, k] = volume_flux_node2[ii] + volume_flux_arr3[ii, j1, j2, j3, j4, k] = volume_flux_node3[ii] + end + end + end + + return nothing +end + +# Kernel for calculating volume integrals +function volume_integral_kernel!(du, derivative_split, volume_flux_arr1, volume_flux_arr2, + volume_flux_arr3, equations::AbstractEquations{3}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) + u2 = size(du, 2) # size(du, 2) == size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + @inbounds du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros + + for ii in axes(du, 2) + @inbounds du[i, j1, j2, j3, k] += volume_flux_arr1[i, j1, ii, j2, j3, k] * derivative_split[j1, ii] * + (1 - isequal(j1, ii)) + # set diagonal elements to zeros + volume_flux_arr2[i, j1, j2, ii, j3, k] * derivative_split[j2, ii] * + (1 - isequal(j2, ii)) + # set diagonal elements to zeros + volume_flux_arr3[i, j1, j2, j3, ii, k] * derivative_split[j3, ii] * + (1 - isequal(j3, ii)) # set diagonal elements to zeros + end + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating volume integrals without conservative terms +function volume_flux_integral_kernel!(du, u, derivative_split, + equations::AbstractEquations{3}, volume_flux::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_value = CuDynamicSharedArray(eltype(du), + (size(du, 1), tile_width, tile_width, tile_width), offset) + + # Get thread and block indices only we need save registers + ty = threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + ty1 = div(ty - 1, tile_width^2) + 1 + ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 + ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 + + # Tile the computation (set to one tile here) + # Initialize the values + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2, ty3] = zero(eltype(du)) + end + + # Load global `derivative_split` into shared memory + # Transposed load + @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] * + (1 - isequal(ty1, ty2)) # set diagonal elements to zeros + + sync_threads() + + # Compute volume fluxes + # How to store nodes in shared memory? + for thread in 1:tile_width + # Volume flux is heavy in computation so we should try best to avoid redundant + # computation, i.e., use for loop along x direction here + u_node = get_node_vars(u, equations, ty1, ty2, ty3, k) + volume_flux_node1 = volume_flux(u_node, + get_node_vars(u, equations, thread, ty2, ty3, k), + 1, equations) + volume_flux_node2 = volume_flux(u_node, + get_node_vars(u, equations, ty1, thread, ty3, k), + 2, equations) + volume_flux_node3 = volume_flux(u_node, + get_node_vars(u, equations, ty1, ty2, thread, k), + 3, equations) + + # TODO: Avoid potential bank conflicts + # Try another way to parallelize (ty1, ty2, ty3) with threads to ty4, + # then consolidate each computation back to (ty1, ty2, ty3) + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2, ty3] += shmem_split[thread, ty1] * volume_flux_node1[tx] + + shmem_split[thread, ty2] * volume_flux_node2[tx] + + shmem_split[thread, ty3] * volume_flux_node3[tx] + end + end + + # Synchronization is not needed here if we use only one tile + # sync_threads() + + # Finalize the values + for tx in axes(du, 1) + @inbounds du[tx, ty1, ty2, ty3, k] = shmem_value[tx, ty1, ty2, ty3] + end + + return nothing +end + +# Kernel for calculating symmetric and nonconservative fluxes +function noncons_volume_flux_kernel!(symmetric_flux_arr1, symmetric_flux_arr2, symmetric_flux_arr3, + noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3, + u, derivative_split, equations::AbstractEquations{3}, + symmetric_flux::Any, nonconservative_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^4 && k <= size(u, 5)) + u2 = size(u, 2) + + j1 = div(j - 1, u2^3) + 1 + j2 = div(rem(j - 1, u2^3), u2^2) + 1 + j3 = div(rem(j - 1, u2^2), u2) + 1 + j4 = rem(j - 1, u2) + 1 + + u_node = get_node_vars(u, equations, j1, j2, j3, k) + u_node1 = get_node_vars(u, equations, j4, j2, j3, k) + u_node2 = get_node_vars(u, equations, j1, j4, j3, k) + u_node3 = get_node_vars(u, equations, j1, j2, j4, k) + + symmetric_flux_node1 = symmetric_flux(u_node, u_node1, 1, equations) + symmetric_flux_node2 = symmetric_flux(u_node, u_node2, 2, equations) + symmetric_flux_node3 = symmetric_flux(u_node, u_node3, 3, equations) + + noncons_flux_node1 = nonconservative_flux(u_node, u_node1, 1, equations) + noncons_flux_node2 = nonconservative_flux(u_node, u_node2, 2, equations) + noncons_flux_node3 = nonconservative_flux(u_node, u_node3, 3, equations) + + for ii in axes(u, 1) + @inbounds begin + symmetric_flux_arr1[ii, j1, j4, j2, j3, k] = symmetric_flux_node1[ii] * derivative_split[j1, j4] * + (1 - isequal(j1, j4)) # set diagonal elements to zeros + symmetric_flux_arr2[ii, j1, j2, j4, j3, k] = symmetric_flux_node2[ii] * derivative_split[j2, j4] * + (1 - isequal(j2, j4)) # set diagonal elements to zeros + symmetric_flux_arr3[ii, j1, j2, j3, j4, k] = symmetric_flux_node3[ii] * derivative_split[j3, j4] * + (1 - isequal(j3, j4)) # set diagonal elements to zeros + + noncons_flux_arr1[ii, j1, j4, j2, j3, k] = noncons_flux_node1[ii] + noncons_flux_arr2[ii, j1, j2, j4, j3, k] = noncons_flux_node2[ii] + noncons_flux_arr3[ii, j1, j2, j3, j4, k] = noncons_flux_node3[ii] + end + end + end + + return nothing +end + +# Kernel for calculating symmetric and nonconservative volume integrals +function volume_integral_kernel!(du, derivative_split, + symmetric_flux_arr1, symmetric_flux_arr2, symmetric_flux_arr3, + noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) + u2 = size(du, 2) # size(du, 2) == size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + @inbounds du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros + + for ii in axes(du, 2) + @inbounds du[i, j1, j2, j3, k] += symmetric_flux_arr1[i, j1, ii, j2, j3, k] + + symmetric_flux_arr2[i, j1, j2, ii, j3, k] + + symmetric_flux_arr3[i, j1, j2, j3, ii, k] + + 0.5f0 * + derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, j3, k] + + 0.5f0 * + derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, j3, k] + + 0.5f0 * + derivative_split[j3, ii] * noncons_flux_arr3[i, j1, j2, j3, ii, k] + end + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating volume integrals with conservative terms +function volume_flux_integral_kernel!(du, u, derivative_split, + equations::AbstractEquations{3}, + symmetric_flux::Any, nonconservative_flux::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_value = CuDynamicSharedArray(eltype(du), + (size(du, 1), tile_width, tile_width, tile_width), offset) + + # Get thread and block indices only we need save registers + ty = threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + ty1 = div(ty - 1, tile_width^2) + 1 + ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 + ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 + + # Tile the computation (set to one tile here) + # Initialize the values + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2, ty3] = zero(eltype(du)) + end + + # Load data from global memory into shared memory + # Transposed load + @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] + + sync_threads() + + # Compute volume fluxes + # How to store nodes in shared memory? + for thread in 1:tile_width + # Volume flux is heavy in computation so we should try best to avoid redundant + # computation, i.e., use for loop along x direction here + u_node = get_node_vars(u, equations, ty1, ty2, ty3, k) + symmetric_flux_node1 = symmetric_flux(u_node, + get_node_vars(u, equations, thread, ty2, ty3, k), + 1, equations) + symmetric_flux_node2 = symmetric_flux(u_node, + get_node_vars(u, equations, ty1, thread, ty3, k), + 2, equations) + symmetric_flux_node3 = symmetric_flux(u_node, + get_node_vars(u, equations, ty1, ty2, thread, k), + 3, equations) + noncons_flux_node1 = nonconservative_flux(u_node, + get_node_vars(u, equations, thread, ty2, ty3, k), + 1, equations) + noncons_flux_node2 = nonconservative_flux(u_node, + get_node_vars(u, equations, ty1, thread, ty3, k), + 2, equations) + noncons_flux_node3 = nonconservative_flux(u_node, + get_node_vars(u, equations, ty1, ty2, thread, k), + 3, equations) + + # TODO: Avoid potential bank conflicts + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2, ty3] += symmetric_flux_node1[tx] * shmem_split[thread, ty1] * + (1 - isequal(ty1, thread)) + # set diagonal elements to zeros + symmetric_flux_node2[tx] * shmem_split[thread, ty2] * + (1 - isequal(ty2, thread)) + # set diagonal elements to zeros + symmetric_flux_node3[tx] * shmem_split[thread, ty3] * + (1 - isequal(ty3, thread)) + # set diagonal elements to zeros + 0.5f0 * + noncons_flux_node1[tx] * shmem_split[thread, ty1] + + 0.5f0 * + noncons_flux_node2[tx] * shmem_split[thread, ty2] + + 0.5f0 * + noncons_flux_node3[tx] * shmem_split[thread, ty3] + end + end + + # Synchronization is not needed here if we use only one tile + # sync_threads() + + # Finalize the values + for tx in axes(du, 1) + @inbounds du[tx, ty1, ty2, ty3, k] = shmem_value[tx, ty1, ty2, ty3] + end + + return nothing +end + +# Kernel for calculating pure DG and DG-FV volume fluxes +function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, + fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, + u, alpha, atol, equations::AbstractEquations{3}, + volume_flux_dg::Any, volume_flux_fv::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^4 && k <= size(u, 5)) + u2 = size(u, 2) + + j1 = div(j - 1, u2^3) + 1 + j2 = div(rem(j - 1, u2^3), u2^2) + 1 + j3 = div(rem(j - 1, u2^2), u2) + 1 + j4 = rem(j - 1, u2) + 1 + + dg_only = isapprox(alpha[k], 0, atol = atol) + + u_node = get_node_vars(u, equations, j1, j2, j3, k) + u_node1 = get_node_vars(u, equations, j4, j2, j3, k) + u_node2 = get_node_vars(u, equations, j1, j4, j3, k) + u_node3 = get_node_vars(u, equations, j1, j2, j4, k) + + volume_flux_node1 = volume_flux_dg(u_node, u_node1, 1, equations) + volume_flux_node2 = volume_flux_dg(u_node, u_node2, 2, equations) + volume_flux_node3 = volume_flux_dg(u_node, u_node3, 3, equations) + + for ii in axes(u, 1) + @inbounds begin + volume_flux_arr1[ii, j1, j4, j2, j3, k] = volume_flux_node1[ii] + volume_flux_arr2[ii, j1, j2, j4, j3, k] = volume_flux_node2[ii] + volume_flux_arr3[ii, j1, j2, j3, j4, k] = volume_flux_node3[ii] + end + + # Small optimization, no much performance gain + if isequal(j1 + 1, j4) # avoid race condition + flux_fv_node1 = volume_flux_fv(u_node, u_node1, 1, equations) + + @inbounds begin + fstar1_L[ii, j4, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only) + fstar1_R[ii, j4, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only) + end + end + + if isequal(j2 + 1, j4) # avoid race condition + flux_fv_node2 = volume_flux_fv(u_node, u_node2, 2, equations) + + @inbounds begin + fstar2_L[ii, j1, j4, j3, k] = flux_fv_node2[ii] * (1 - dg_only) + fstar2_R[ii, j1, j4, j3, k] = flux_fv_node2[ii] * (1 - dg_only) + end + end + + if isequal(j3 + 1, j4) # avoid race condition + flux_fv_node3 = volume_flux_fv(u_node, u_node3, 3, equations) + + @inbounds begin + fstar3_L[ii, j1, j2, j4, k] = flux_fv_node3[ii] * (1 - dg_only) + fstar3_R[ii, j1, j2, j4, k] = flux_fv_node3[ii] * (1 - dg_only) + end + end + end + + # if j1 != 1 && j4 == 1 # bad + # u_ll = get_node_vars(u, equations, j1 - 1, j2, j3, k) + # u_rr = get_node_vars(u, equations, j1, j2, j3, k) + # flux_fv_node1 = volume_flux_fv(u_ll, u_rr, 1, equations) + + # for ii in axes(u, 1) + # @inbounds begin + # fstar1_L[ii, j1, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only) + # fstar1_R[ii, j1, j2, j3, k] = flux_fv_node1[ii] * (1 - dg_only) + # end + # end + # end + + # if j2 != 1 && j4 == 1 # bad + # u_ll = get_node_vars(u, equations, j1, j2 - 1, j3, k) + # u_rr = get_node_vars(u, equations, j1, j2, j3, k) + # flux_fv_node2 = volume_flux_fv(u_ll, u_rr, 2, equations) + + # for ii in axes(u, 1) + # @inbounds begin + # fstar2_L[ii, j1, j2, j3, k] = flux_fv_node2[ii] * (1 - dg_only) + # fstar2_R[ii, j1, j2, j3, k] = flux_fv_node2[ii] * (1 - dg_only) + # end + # end + # end + + # if j3 != 1 && j4 == 1 # bad + # u_ll = get_node_vars(u, equations, j1, j2, j3 - 1, k) + # u_rr = get_node_vars(u, equations, j1, j2, j3, k) + # flux_fv_node3 = volume_flux_fv(u_ll, u_rr, 3, equations) + + # for ii in axes(u, 1) + # @inbounds begin + # fstar3_L[ii, j1, j2, j3, k] = flux_fv_node3[ii] * (1 - dg_only) + # fstar3_R[ii, j1, j2, j3, k] = flux_fv_node3[ii] * (1 - dg_only) + # end + # end + # end + end + + return nothing +end + +# Kernel for calculating pure DG and DG-FV volume integrals +function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights, + volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, + fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, + atol, equations::AbstractEquations{3}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) + u2 = size(du, 2) # size(du, 2) == size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + @inbounds begin + du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros + alpha_element = alpha[k] + end + + dg_only = isapprox(alpha_element, 0, atol = atol) + + for ii in axes(du, 2) + @inbounds du[i, j1, j2, j3, k] += (derivative_split[j1, ii] * + (1 - isequal(j1, ii)) * # set diagonal elements to zeros + volume_flux_arr1[i, j1, ii, j2, j3, k] + + derivative_split[j2, ii] * + (1 - isequal(j2, ii)) * # set diagonal elements to zeros + volume_flux_arr2[i, j1, j2, ii, j3, k] + + derivative_split[j3, ii] * + (1 - isequal(j3, ii)) * # set diagonal elements to zeros + volume_flux_arr3[i, j1, j2, j3, ii, k]) * dg_only + + ((1 - alpha_element) * derivative_split[j1, ii] * + (1 - isequal(j1, ii)) * # set diagonal elements to zeros + volume_flux_arr1[i, j1, ii, j2, j3, k] + + (1 - alpha_element) * derivative_split[j2, ii] * + (1 - isequal(j2, ii)) * # set diagonal elements to zeros + volume_flux_arr2[i, j1, j2, ii, j3, k] + + (1 - alpha_element) * derivative_split[j3, ii] * + (1 - isequal(j3, ii)) * # set diagonal elements to zeros + volume_flux_arr3[i, j1, j2, j3, ii, k]) * (1 - dg_only) + end + + @inbounds du[i, j1, j2, j3, k] += alpha_element * + (inverse_weights[j1] * + (fstar1_L[i, j1 + 1, j2, j3, k] - fstar1_R[i, j1, j2, j3, k]) + + inverse_weights[j2] * + (fstar2_L[i, j1, j2 + 1, j3, k] - fstar2_R[i, j1, j2, j3, k]) + + inverse_weights[j3] * + (fstar3_L[i, j1, j2, j3 + 1, k] - fstar3_R[i, j1, j2, j3, k])) * (1 - dg_only) + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating pure DG and DG-FV volume integrals without conservative terms +function volume_flux_integral_dgfv_kernel!(du, u, alpha, atol, derivative_split, inverse_weights, + equations::AbstractEquations{3}, + volume_flux_dg::Any, volume_flux_fv::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + # TODO: Combine `fstar` into single allocation + shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_fstar1 = CuDynamicSharedArray(eltype(du), + (size(du, 1), tile_width + 1, tile_width, tile_width), offset) + offset += sizeof(eltype(du)) * size(du, 1) * (tile_width + 1) * tile_width * tile_width + shmem_fstar2 = CuDynamicSharedArray(eltype(du), + (size(du, 1), tile_width, tile_width + 1, tile_width), offset) + offset += sizeof(eltype(du)) * size(du, 1) * tile_width * (tile_width + 1) * tile_width + shmem_fstar3 = CuDynamicSharedArray(eltype(du), + (size(du, 1), tile_width, tile_width, tile_width + 1), offset) + offset += sizeof(eltype(du)) * size(du, 1) * tile_width * tile_width * (tile_width + 1) + shmem_value = CuDynamicSharedArray(eltype(du), + (size(du, 1), tile_width, tile_width, tile_width), offset) + + # Get thread and block indices only we need save registers + ty = threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + ty1 = div(ty - 1, tile_width^2) + 1 + ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 + ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 + + # Load global `derivative_split` into shared memory + # Transposed load + @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] + + # Get variables for computation + @inbounds alpha_element = alpha[k] + dg_only = isapprox(alpha_element, 0, atol = atol) + + # Compute FV volume fluxes + u_node = get_node_vars(u, equations, ty1, ty2, ty3, k) + if ty1 + 1 <= tile_width + flux_fv_node1 = volume_flux_fv(u_node, + get_node_vars(u, equations, ty1 + 1, ty2, ty3, k), + 1, equations) + end + if ty2 + 1 <= tile_width + flux_fv_node2 = volume_flux_fv(u_node, + get_node_vars(u, equations, ty1, ty2 + 1, ty3, k), + 2, equations) + end + if ty3 + 1 <= tile_width + flux_fv_node3 = volume_flux_fv(u_node, + get_node_vars(u, equations, ty1, ty2, ty3 + 1, k), + 3, equations) + end + + # Initialize the values + for tx in axes(du, 1) + @inbounds begin + # Initialize `du` with zeros + shmem_value[tx, ty1, ty2, ty3] = zero(eltype(du)) + # Initialize `fstar` side columes with zeros + shmem_fstar1[tx, 1, ty2, ty3] = zero(eltype(du)) + shmem_fstar1[tx, tile_width + 1, ty2, ty3] = zero(eltype(du)) + shmem_fstar2[tx, ty1, 1, ty3] = zero(eltype(du)) + shmem_fstar2[tx, ty1, tile_width + 1, ty3] = zero(eltype(du)) + shmem_fstar3[tx, ty1, ty2, 1] = zero(eltype(du)) + shmem_fstar3[tx, ty1, ty2, tile_width + 1] = zero(eltype(du)) + end + + if ty1 + 1 <= tile_width + # Set with FV volume fluxes + @inbounds shmem_fstar1[tx, ty1 + 1, ty2, ty3] = flux_fv_node1[tx] * (1 - dg_only) + end + if ty2 + 1 <= tile_width + # Set with FV volume fluxes + @inbounds shmem_fstar2[tx, ty1, ty2 + 1, ty3] = flux_fv_node2[tx] * (1 - dg_only) + end + if ty3 + 1 <= tile_width + # Set with FV volume fluxes + @inbounds shmem_fstar3[tx, ty1, ty2, ty3 + 1] = flux_fv_node3[tx] * (1 - dg_only) + end + end + + sync_threads() + + # Contribute FV to the volume integrals + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2, ty3] += alpha_element * + (inverse_weights[ty1] * + (shmem_fstar1[tx, ty1 + 1, ty2, ty3] - shmem_fstar1[tx, ty1, ty2, ty3]) + + inverse_weights[ty2] * + (shmem_fstar2[tx, ty1, ty2 + 1, ty3] - shmem_fstar2[tx, ty1, ty2, ty3]) + + inverse_weights[ty3] * + (shmem_fstar3[tx, ty1, ty2, ty3 + 1] - shmem_fstar3[tx, ty1, ty2, ty3])) * + (1 - dg_only) + end + + # Compute DG volume fluxes + for thread in 1:tile_width + volume_flux_node1 = volume_flux_dg(u_node, + get_node_vars(u, equations, thread, ty2, ty3, k), + 1, equations) + volume_flux_node2 = volume_flux_dg(u_node, + get_node_vars(u, equations, ty1, thread, ty3, k), + 2, equations) + volume_flux_node3 = volume_flux_dg(u_node, + get_node_vars(u, equations, ty1, ty2, thread, k), + 3, equations) + + # Contribute DG to the volume integrals + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2, ty3] += (shmem_split[thread, ty1] * + (1 - isequal(ty1, thread)) * # set diagonal elements to zeros + volume_flux_node1[tx] + + shmem_split[thread, ty2] * + (1 - isequal(ty2, thread)) * # set diagonal elements to zeros + volume_flux_node2[tx] + + shmem_split[thread, ty3] * + (1 - isequal(ty3, thread)) * # set diagonal elements to zeros + volume_flux_node3[tx]) * dg_only + + ((1 - alpha_element) * shmem_split[thread, ty1] * + (1 - isequal(ty1, thread)) * # set diagonal elements to zeros + volume_flux_node1[tx] + + (1 - alpha_element) * shmem_split[thread, ty2] * + (1 - isequal(ty2, thread)) * # set diagonal elements to zeros + volume_flux_node2[tx] + + (1 - alpha_element) * shmem_split[thread, ty3] * + (1 - isequal(ty3, thread)) * # set diagonal elements to zeros + volume_flux_node3[tx]) * (1 - dg_only) + end + end + + # Finalize the values + for tx in axes(du, 1) + @inbounds du[tx, ty1, ty2, ty3, k] = shmem_value[tx, ty1, ty2, ty3] + end + + return nothing +end + +# Kernel for calculating pure DG and DG-FV volume fluxes +function volume_flux_dgfv_kernel!(volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, + noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3, + fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, + u, alpha, atol, derivative_split, + equations::AbstractEquations{3}, + volume_flux_dg::Any, noncons_flux_dg::Any, + volume_flux_fv::Any, noncons_flux_fv::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(u, 2)^4 && k <= size(u, 5)) + u2 = size(u, 2) + + j1 = div(j - 1, u2^3) + 1 + j2 = div(rem(j - 1, u2^3), u2^2) + 1 + j3 = div(rem(j - 1, u2^2), u2) + 1 + j4 = rem(j - 1, u2) + 1 + + dg_only = isapprox(alpha[k], 0, atol = atol) + + u_node = get_node_vars(u, equations, j1, j2, j3, k) + u_node1 = get_node_vars(u, equations, j4, j2, j3, k) + u_node2 = get_node_vars(u, equations, j1, j4, j3, k) + u_node3 = get_node_vars(u, equations, j1, j2, j4, k) + + volume_flux_node1 = volume_flux_dg(u_node, u_node1, 1, equations) + volume_flux_node2 = volume_flux_dg(u_node, u_node2, 2, equations) + volume_flux_node3 = volume_flux_dg(u_node, u_node3, 3, equations) + + noncons_flux_node1 = noncons_flux_dg(u_node, u_node1, 1, equations) + noncons_flux_node2 = noncons_flux_dg(u_node, u_node2, 2, equations) + noncons_flux_node3 = noncons_flux_dg(u_node, u_node3, 3, equations) + + for ii in axes(u, 1) + @inbounds begin + volume_flux_arr1[ii, j1, j4, j2, j3, k] = volume_flux_node1[ii] * derivative_split[j1, j4] * + (1 - isequal(j1, j4)) # set diagonal elements to zeros + volume_flux_arr2[ii, j1, j2, j4, j3, k] = volume_flux_node2[ii] * derivative_split[j2, j4] * + (1 - isequal(j2, j4)) # set diagonal elements to zeros + volume_flux_arr3[ii, j1, j2, j3, j4, k] = volume_flux_node3[ii] * derivative_split[j3, j4] * + (1 - isequal(j3, j4)) # set diagonal elements to zeros + + noncons_flux_arr1[ii, j1, j4, j2, j3, k] = noncons_flux_node1[ii] + noncons_flux_arr2[ii, j1, j2, j4, j3, k] = noncons_flux_node2[ii] + noncons_flux_arr3[ii, j1, j2, j3, j4, k] = noncons_flux_node3[ii] + end + + # Small optimization, no much performance gain + if isequal(j1 + 1, j4) # avoid race condition + f1_node = volume_flux_fv(u_node, u_node1, 1, equations) + f1_L_node = noncons_flux_fv(u_node, u_node1, 1, equations) + f1_R_node = noncons_flux_fv(u_node1, u_node, 1, equations) + + @inbounds begin + fstar1_L[ii, j4, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only) + fstar1_R[ii, j4, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only) + end + end + + if isequal(j2 + 1, j4) # avoid race condition + f2_node = volume_flux_fv(u_node, u_node2, 2, equations) + f2_L_node = noncons_flux_fv(u_node, u_node2, 2, equations) + f2_R_node = noncons_flux_fv(u_node2, u_node, 2, equations) + + @inbounds begin + fstar2_L[ii, j1, j4, j3, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only) + fstar2_R[ii, j1, j4, j3, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only) + end + end + + if isequal(j3 + 1, j4) # avoid race condition + f3_node = volume_flux_fv(u_node, u_node3, 3, equations) + f3_L_node = noncons_flux_fv(u_node, u_node3, 3, equations) + f3_R_node = noncons_flux_fv(u_node3, u_node, 3, equations) + + @inbounds begin + fstar3_L[ii, j1, j2, j4, k] = f3_node[ii] + 0.5f0 * f3_L_node[ii] * (1 - dg_only) + fstar3_R[ii, j1, j2, j4, k] = f3_node[ii] + 0.5f0 * f3_R_node[ii] * (1 - dg_only) + end + end + end + + # if j1 != 1 && j4 == 1 # bad + # u_ll = get_node_vars(u, equations, j1 - 1, j2, j3, k) + # u_rr = get_node_vars(u, equations, j1, j2, j3, k) + + # f1_node = volume_flux_fv(u_ll, u_rr, 1, equations) + + # f1_L_node = noncons_flux_fv(u_ll, u_rr, 1, equations) + # f1_R_node = noncons_flux_fv(u_rr, u_ll, 1, equations) + + # for ii in axes(u, 1) + # @inbounds begin + # fstar1_L[ii, j1, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_L_node[ii] * (1 - dg_only) + # fstar1_R[ii, j1, j2, j3, k] = f1_node[ii] + 0.5f0 * f1_R_node[ii] * (1 - dg_only) + # end + # end + # end + + # if j2 != 1 && j4 == 1 # bad + # u_ll = get_node_vars(u, equations, j1, j2 - 1, j3, k) + # u_rr = get_node_vars(u, equations, j1, j2, j3, k) + + # f2_node = volume_flux_fv(u_ll, u_rr, 2, equations) + + # f2_L_node = noncons_flux_fv(u_ll, u_rr, 2, equations) + # f2_R_node = noncons_flux_fv(u_rr, u_ll, 2, equations) + + # for ii in axes(u, 1) + # @inbounds begin + # fstar2_L[ii, j1, j2, j3, k] = f2_node[ii] + 0.5f0 * f2_L_node[ii] * (1 - dg_only) + # fstar2_R[ii, j1, j2, j3, k] = f2_node[ii] + 0.5f0 * f2_R_node[ii] * (1 - dg_only) + # end + # end + # end + + # if j3 != 1 && j4 == 1 # bad + # u_ll = get_node_vars(u, equations, j1, j2, j3 - 1, k) + # u_rr = get_node_vars(u, equations, j1, j2, j3, k) + + # f3_node = volume_flux_fv(u_ll, u_rr, 3, equations) + + # f3_L_node = noncons_flux_fv(u_ll, u_rr, 3, equations) + # f3_R_node = noncons_flux_fv(u_rr, u_ll, 3, equations) + + # for ii in axes(u, 1) + # @inbounds begin + # fstar3_L[ii, j1, j2, j3, k] = f3_node[ii] + 0.5f0 * f3_L_node[ii] * (1 - dg_only) + # fstar3_R[ii, j1, j2, j3, k] = f3_node[ii] + 0.5f0 * f3_R_node[ii] * (1 - dg_only) + # end + # end + # end + end + + return nothing +end + +# Kernel for calculating pure DG and DG-FV volume integrals +function volume_integral_dgfv_kernel!(du, alpha, derivative_split, inverse_weights, + volume_flux_arr1, volume_flux_arr2, volume_flux_arr3, + noncons_flux_arr1, noncons_flux_arr2, noncons_flux_arr3, + fstar1_L, fstar1_R, fstar2_L, fstar2_R, fstar3_L, fstar3_R, + atol, equations::AbstractEquations{3}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) + u2 = size(du, 2) # size(du, 2) == size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + @inbounds begin + du[i, j1, j2, j3, k] = zero(eltype(du)) # initialize `du` with zeros + alpha_element = alpha[k] + end + + dg_only = isapprox(alpha_element, 0, atol = atol) + + for ii in axes(du, 2) + @inbounds du[i, j1, j2, j3, k] += (volume_flux_arr1[i, j1, ii, j2, j3, k] + + volume_flux_arr2[i, j1, j2, ii, j3, k] + + volume_flux_arr3[i, j1, j2, j3, ii, k] + + 0.5f0 * + (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, j3, k] + + derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, j3, k] + + derivative_split[j3, ii] * noncons_flux_arr3[i, j1, j2, j3, ii, k])) * dg_only + + ((1 - alpha_element) * + volume_flux_arr1[i, j1, ii, j2, j3, k] + + (1 - alpha_element) * + volume_flux_arr2[i, j1, j2, ii, j3, k] + + (1 - alpha_element) * + volume_flux_arr3[i, j1, j2, j3, ii, k] + + 0.5f0 * (1 - alpha_element) * + (derivative_split[j1, ii] * noncons_flux_arr1[i, j1, ii, j2, j3, k] + + derivative_split[j2, ii] * noncons_flux_arr2[i, j1, j2, ii, j3, k] + + derivative_split[j3, ii] * noncons_flux_arr3[i, j1, j2, j3, ii, k])) * (1 - dg_only) + end + + @inbounds du[i, j1, j2, j3, k] += alpha_element * + (inverse_weights[j1] * + (fstar1_L[i, j1 + 1, j2, j3, k] - fstar1_R[i, j1, j2, j3, k]) + + inverse_weights[j2] * + (fstar2_L[i, j1, j2 + 1, j3, k] - fstar2_R[i, j1, j2, j3, k]) + + inverse_weights[j3] * + (fstar3_L[i, j1, j2, j3 + 1, k] - fstar3_R[i, j1, j2, j3, k])) * (1 - dg_only) + end + + return nothing +end + +############################################################################## New optimization +# Kernel for calculating pure DG and DG-FV volume integrals with conservative terms +function volume_flux_integral_dgfv_kernel!(du, u, alpha, atol, derivative_split, inverse_weights, + equations::AbstractEquations{3}, + volume_flux_dg::Any, noncons_flux_dg::Any, + volume_flux_fv::Any, noncons_flux_fv::Any) + # Set tile width + tile_width = size(du, 2) + offset = 0 # offset bytes for shared memory + + # Allocate dynamic shared memory + shmem_split = CuDynamicSharedArray(eltype(du), (tile_width, tile_width)) + offset += sizeof(eltype(du)) * tile_width^2 + shmem_fstar1 = CuDynamicSharedArray(eltype(du), + (size(du, 1), tile_width + 1, tile_width, tile_width, 2), offset) + offset += sizeof(eltype(du)) * size(du, 1) * (tile_width + 1) * tile_width * tile_width * 2 + shmem_fstar2 = CuDynamicSharedArray(eltype(du), + (size(du, 1), tile_width, tile_width + 1, tile_width, 2), offset) + offset += sizeof(eltype(du)) * size(du, 1) * tile_width * (tile_width + 1) * tile_width * 2 + shmem_fstar3 = CuDynamicSharedArray(eltype(du), + (size(du, 1), tile_width, tile_width, tile_width + 1, 2), offset) + offset += sizeof(eltype(du)) * size(du, 1) * tile_width * tile_width * (tile_width + 1) * 2 + shmem_value = CuDynamicSharedArray(eltype(du), + (size(du, 1), tile_width, tile_width, tile_width), offset) + + # Get thread and block indices only we need save registers + ty = threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + ty1 = div(ty - 1, tile_width^2) + 1 + ty2 = div(rem(ty - 1, tile_width^2), tile_width) + 1 + ty3 = rem(rem(ty - 1, tile_width^2), tile_width) + 1 + + # Load global `derivative_split` into shared memory + # Transposed load + @inbounds shmem_split[ty1, ty2] = derivative_split[ty2, ty1] + + # Get variables for computation + @inbounds alpha_element = alpha[k] + dg_only = isapprox(alpha_element, 0, atol = atol) + + # Compute FV volume fluxes + u_node = get_node_vars(u, equations, ty1, ty2, ty3, k) + if ty1 + 1 <= tile_width + f1_node = volume_flux_fv(u_node, + get_node_vars(u, equations, ty1 + 1, ty2, ty3, k), + 1, equations) + f1_L_node = noncons_flux_fv(u_node, + get_node_vars(u, equations, ty1 + 1, ty2, ty3, k), + 1, equations) + f1_R_node = noncons_flux_fv(get_node_vars(u, equations, ty1 + 1, ty2, ty3, k), + u_node, + 1, equations) + end + if ty2 + 1 <= tile_width + f2_node = volume_flux_fv(u_node, + get_node_vars(u, equations, ty1, ty2 + 1, ty3, k), + 2, equations) + f2_L_node = noncons_flux_fv(u_node, + get_node_vars(u, equations, ty1, ty2 + 1, ty3, k), + 2, equations) + f2_R_node = noncons_flux_fv(get_node_vars(u, equations, ty1, ty2 + 1, ty3, k), + u_node, + 2, equations) + end + if ty3 + 1 <= tile_width + f3_node = volume_flux_fv(u_node, + get_node_vars(u, equations, ty1, ty2, ty3 + 1, k), + 3, equations) + f3_L_node = noncons_flux_fv(u_node, + get_node_vars(u, equations, ty1, ty2, ty3 + 1, k), + 3, equations) + f3_R_node = noncons_flux_fv(get_node_vars(u, equations, ty1, ty2, ty3 + 1, k), + u_node, + 3, equations) + end + + # Initialize the values + for tx in axes(du, 1) + @inbounds begin + # Initialize `du` with zeros + shmem_value[tx, ty1, ty2, ty3] = zero(eltype(du)) + + # TODO: Remove shared memory for `fstar` and use local memory + + # Initialize `fstar` side columes with zeros (1: left) + shmem_fstar1[tx, 1, ty2, ty3, 1] = zero(eltype(du)) + shmem_fstar1[tx, tile_width + 1, ty2, ty3, 1] = zero(eltype(du)) + shmem_fstar2[tx, ty1, 1, ty3, 1] = zero(eltype(du)) + shmem_fstar2[tx, ty1, tile_width + 1, ty3, 1] = zero(eltype(du)) + shmem_fstar3[tx, ty1, ty2, 1, 1] = zero(eltype(du)) + shmem_fstar3[tx, ty1, ty2, tile_width + 1, 1] = zero(eltype(du)) + + # Initialize `fstar` side columes with zeros (2: right) + shmem_fstar1[tx, 1, ty2, ty3, 2] = zero(eltype(du)) + shmem_fstar1[tx, tile_width + 1, ty2, ty3, 2] = zero(eltype(du)) + shmem_fstar2[tx, ty1, 1, ty3, 2] = zero(eltype(du)) + shmem_fstar2[tx, ty1, tile_width + 1, ty3, 2] = zero(eltype(du)) + shmem_fstar3[tx, ty1, ty2, 1, 2] = zero(eltype(du)) + shmem_fstar3[tx, ty1, ty2, tile_width + 1, 2] = zero(eltype(du)) + end + + if ty1 + 1 <= tile_width + # Set with FV volume fluxes + @inbounds begin + shmem_fstar1[tx, ty1 + 1, ty2, ty3, 1] = f1_node[tx] + 0.5f0 * f1_L_node[tx] * (1 - dg_only) + shmem_fstar1[tx, ty1 + 1, ty2, ty3, 2] = f1_node[tx] + 0.5f0 * f1_R_node[tx] * (1 - dg_only) + end + end + if ty2 + 1 <= tile_width + # Set with FV volume fluxes + @inbounds begin + shmem_fstar2[tx, ty1, ty2 + 1, ty3, 1] = f2_node[tx] + 0.5f0 * f2_L_node[tx] * (1 - dg_only) + shmem_fstar2[tx, ty1, ty2 + 1, ty3, 2] = f2_node[tx] + 0.5f0 * f2_R_node[tx] * (1 - dg_only) + end + end + if ty3 + 1 <= tile_width + # Set with FV volume fluxes + @inbounds begin + shmem_fstar3[tx, ty1, ty2, ty3 + 1, 1] = f3_node[tx] + 0.5f0 * f3_L_node[tx] * (1 - dg_only) + shmem_fstar3[tx, ty1, ty2, ty3 + 1, 2] = f3_node[tx] + 0.5f0 * f3_R_node[tx] * (1 - dg_only) + end + end + end + + sync_threads() + + # Contribute FV to the volume integrals + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2, ty3] += alpha_element * + (inverse_weights[ty1] * + (shmem_fstar1[tx, ty1 + 1, ty2, ty3, 1] - shmem_fstar1[tx, ty1, ty2, ty3, 2]) + + inverse_weights[ty2] * + (shmem_fstar2[tx, ty1, ty2 + 1, ty3, 1] - shmem_fstar2[tx, ty1, ty2, ty3, 2]) + + inverse_weights[ty3] * + (shmem_fstar3[tx, ty1, ty2, ty3 + 1, 1] - shmem_fstar3[tx, ty1, ty2, ty3, 2])) * + (1 - dg_only) + end + + # Compute DG volume fluxes + for thread in 1:tile_width + volume_flux_node1 = volume_flux_dg(u_node, + get_node_vars(u, equations, thread, ty2, ty3, k), + 1, equations) + volume_flux_node2 = volume_flux_dg(u_node, + get_node_vars(u, equations, ty1, thread, ty3, k), + 2, equations) + volume_flux_node3 = volume_flux_dg(u_node, + get_node_vars(u, equations, ty1, ty2, thread, k), + 3, equations) + + noncons_flux_node1 = noncons_flux_dg(u_node, + get_node_vars(u, equations, thread, ty2, ty3, k), + 1, equations) + noncons_flux_node2 = noncons_flux_dg(u_node, + get_node_vars(u, equations, ty1, thread, ty3, k), + 2, equations) + noncons_flux_node3 = noncons_flux_dg(u_node, + get_node_vars(u, equations, ty1, ty2, thread, k), + 3, equations) + + # Contribute DG to the volume integrals + for tx in axes(du, 1) + @inbounds shmem_value[tx, ty1, ty2, ty3] += (volume_flux_node1[tx] * shmem_split[thread, ty1] * + (1 - isequal(ty1, thread)) + + volume_flux_node2[tx] * shmem_split[thread, ty2] * + (1 - isequal(ty2, thread)) + + volume_flux_node3[tx] * shmem_split[thread, ty3] * + (1 - isequal(ty3, thread)) + + 0.5f0 * + (shmem_split[thread, ty1] * noncons_flux_node1[tx] + + shmem_split[thread, ty2] * noncons_flux_node2[tx] + + shmem_split[thread, ty3] * noncons_flux_node3[tx])) * dg_only + + ((1 - alpha_element) * + volume_flux_node1[tx] * shmem_split[thread, ty1] * + (1 - isequal(ty1, thread)) + + (1 - alpha_element) * + volume_flux_node2[tx] * shmem_split[thread, ty2] * + (1 - isequal(ty2, thread)) + + (1 - alpha_element) * + volume_flux_node3[tx] * shmem_split[thread, ty3] * + (1 - isequal(ty3, thread)) + + 0.5f0 * (1 - alpha_element) * + (shmem_split[thread, ty1] * noncons_flux_node1[tx] + + shmem_split[thread, ty2] * noncons_flux_node2[tx] + + shmem_split[thread, ty3] * noncons_flux_node3[tx])) * (1 - dg_only) + end + end + + # Finalize the values + for tx in axes(du, 1) + @inbounds du[tx, ty1, ty2, ty3, k] = shmem_value[tx, ty1, ty2, ty3] + end + + return nothing +end + +# Kernel for prolonging two interfaces +function prolong_interfaces_kernel!(interfaces_u, u, neighbor_ids, orientations, + equations::AbstractEquations{3}) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(interfaces_u, 2) * size(interfaces_u, 3)^2 && k <= size(interfaces_u, 5)) + u2 = size(u, 2) # size(interfaces_u, 3) == size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + @inbounds begin + orientation = orientations[k] + left_element = neighbor_ids[1, k] + right_element = neighbor_ids[2, k] + + interfaces_u[1, j1, j2, j3, k] = u[j1, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j3, + isequal(orientation, 1) * j3 + isequal(orientation, 2) * j3 + isequal(orientation, 3) * u2, + left_element] + interfaces_u[2, j1, j2, j3, k] = u[j1, + isequal(orientation, 1) + isequal(orientation, 2) * j2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) + isequal(orientation, 3) * j3, + isequal(orientation, 1) * j3 + isequal(orientation, 2) * j3 + isequal(orientation, 3), + right_element] + end + end + + return nothing +end + +# Kernel for calculating surface fluxes +function surface_flux_kernel!(surface_flux_arr, interfaces_u, orientations, + equations::AbstractEquations{3}, surface_flux::Any) + j1 = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j2 = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (j1 <= size(surface_flux_arr, 2) && j2 <= size(surface_flux_arr, 3) && + k <= size(surface_flux_arr, 4)) + u_ll, u_rr = get_surface_node_vars(interfaces_u, equations, j1, j2, k) + @inbounds orientation = orientations[k] + + surface_flux_node = surface_flux(u_ll, u_rr, orientation, equations) + + for ii in axes(surface_flux_arr, 1) + @inbounds surface_flux_arr[ii, j1, j2, k] = surface_flux_node[ii] + end + end + + return nothing +end + +# Kernel for calculating surface and both nonconservative fluxes +function surface_noncons_flux_kernel!(surface_flux_arr, noncons_left_arr, noncons_right_arr, + interfaces_u, orientations, equations::AbstractEquations{3}, + surface_flux::Any, nonconservative_flux::Any) + j1 = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j2 = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (j1 <= size(surface_flux_arr, 2) && j2 <= size(surface_flux_arr, 3) && + k <= size(surface_flux_arr, 4)) + u_ll, u_rr = get_surface_node_vars(interfaces_u, equations, j1, j2, k) + @inbounds orientation = orientations[k] + + surface_flux_node = surface_flux(u_ll, u_rr, orientation, equations) + noncons_left_node = nonconservative_flux(u_ll, u_rr, orientation, equations) + noncons_right_node = nonconservative_flux(u_rr, u_ll, orientation, equations) + + for ii in axes(surface_flux_arr, 1) + @inbounds begin + surface_flux_arr[ii, j1, j2, k] = surface_flux_node[ii] + noncons_left_arr[ii, j1, j2, k] = noncons_left_node[ii] + noncons_right_arr[ii, j1, j2, k] = noncons_right_node[ii] + end + end + end + + return nothing +end + +# Kernel for setting interface fluxes +function interface_flux_kernel!(surface_flux_values, surface_flux_arr, neighbor_ids, orientations, + equations::AbstractEquations{3}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_arr, 2)^2 && + k <= size(surface_flux_arr, 4)) + j1 = div(j - 1, size(surface_flux_arr, 2)) + 1 + j2 = rem(j - 1, size(surface_flux_arr, 2)) + 1 + + @inbounds begin + left_id = neighbor_ids[1, k] + right_id = neighbor_ids[2, k] + + left_direction = 2 * orientations[k] + right_direction = 2 * orientations[k] - 1 + + surface_flux_values[i, j1, j2, left_direction, left_id] = surface_flux_arr[i, j1, j2, k] + surface_flux_values[i, j1, j2, right_direction, right_id] = surface_flux_arr[i, j1, j2, k] + end + end + + return nothing +end + +# Kernel for setting interface fluxes +function interface_flux_kernel!(surface_flux_values, surface_flux_arr, noncons_left_arr, + noncons_right_arr, neighbor_ids, orientations, + equations::AbstractEquations{3}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_arr, 2)^2 && + k <= size(surface_flux_arr, 4)) + j1 = div(j - 1, size(surface_flux_arr, 2)) + 1 + j2 = rem(j - 1, size(surface_flux_arr, 2)) + 1 + + @inbounds begin + left_id = neighbor_ids[1, k] + right_id = neighbor_ids[2, k] + + left_direction = 2 * orientations[k] + right_direction = 2 * orientations[k] - 1 + + surface_flux_values[i, j1, j2, left_direction, left_id] = surface_flux_arr[i, j1, j2, k] + + 0.5f0 * + noncons_left_arr[i, j1, j2, k] + surface_flux_values[i, j1, j2, right_direction, right_id] = surface_flux_arr[i, j1, j2, k] + + 0.5f0 * + noncons_right_arr[i, j1, j2, k] + end + end + + return nothing +end + +# Kernel for prolonging two boundaries +function prolong_boundaries_kernel!(boundaries_u, u, neighbor_ids, neighbor_sides, orientations, + equations::AbstractEquations{3}) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(boundaries_u, 2) * size(boundaries_u, 3)^2 && k <= size(boundaries_u, 5)) + u2 = size(u, 2) # size(boundaries_u, 3) == size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + @inbounds begin + element = neighbor_ids[k] + side = neighbor_sides[k] + orientation = orientations[k] + + boundaries_u[1, j1, j2, j3, k] = u[j1, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j3, + isequal(orientation, 1) * j3 + isequal(orientation, 2) * j3 + isequal(orientation, 3) * u2, + element] * (2 - side) # Set to 0 instead of NaN + boundaries_u[2, j1, j2, j3, k] = u[j1, + isequal(orientation, 1) + isequal(orientation, 2) * j2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) + isequal(orientation, 3) * j3, + isequal(orientation, 1) * j3 + isequal(orientation, 2) * j3 + isequal(orientation, 3), + element] * (side - 1) # Set to 0 instead of NaN + end + end + + return nothing +end + +# Kernel for calculating boundary fluxes +function boundary_flux_kernel!(surface_flux_values, boundaries_u, node_coordinates, t, boundary_arr, + indices_arr, neighbor_ids, neighbor_sides, orientations, + boundary_conditions::NamedTuple, equations::AbstractEquations{3}, + surface_flux::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(surface_flux_values, 2)^2 && k <= length(boundary_arr)) + j1 = div(j - 1, size(surface_flux_values, 2)) + 1 + j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + + @inbounds begin + boundary = boundary_arr[k] + direction = (indices_arr[1] <= boundary) + (indices_arr[2] <= boundary) + + (indices_arr[3] <= boundary) + (indices_arr[4] <= boundary) + + (indices_arr[5] <= boundary) + (indices_arr[6] <= boundary) + + neighbor = neighbor_ids[boundary] + side = neighbor_sides[boundary] + orientation = orientations[boundary] + end + + u_ll, u_rr = get_surface_node_vars(boundaries_u, equations, j1, j2, boundary) + u_inner = (2 - side) * u_ll + (side - 1) * u_rr + x = get_node_coords(node_coordinates, equations, j1, j2, boundary) + + # TODO: Improve this part + if direction == 1 + boundary_flux_node = boundary_conditions[1](u_inner, orientation, + direction, x, t, surface_flux, equations) + elseif direction == 2 + boundary_flux_node = boundary_conditions[2](u_inner, orientation, + direction, x, t, surface_flux, equations) + elseif direction == 3 + boundary_flux_node = boundary_conditions[3](u_inner, orientation, + direction, x, t, surface_flux, equations) + elseif direction == 4 + boundary_flux_node = boundary_conditions[4](u_inner, orientation, + direction, x, t, surface_flux, equations) + elseif direction == 5 + boundary_flux_node = boundary_conditions[5](u_inner, orientation, + direction, x, t, surface_flux, equations) + else + boundary_flux_node = boundary_conditions[6](u_inner, orientation, + direction, x, t, surface_flux, equations) + end + + for ii in axes(surface_flux_values, 1) + # `boundary_flux_node` can be nothing if periodic boundary condition is applied + @inbounds surface_flux_values[ii, j1, j2, direction, neighbor] = isnothing(boundary_flux_node) ? # bad + surface_flux_values[ii, j1, + j2, + direction, + neighbor] : + boundary_flux_node[ii] + end + end + + return nothing +end + +# Kernel for copying data small to small on mortars +function prolong_mortars_small2small_kernel!(u_upper_left, u_upper_right, u_lower_left, + u_lower_right, u, neighbor_ids, large_sides, + orientations) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(u_upper_left, 2) && j <= size(u_upper_left, 3)^2 && k <= size(u_upper_left, 5)) + u2 = size(u, 2) # size(u_upper_left, 3) == size(u, 2) + + j1 = div(j - 1, u2) + 1 + j2 = rem(j - 1, u2) + 1 + + @inbounds begin + large_side = large_sides[k] + orientation = orientations[k] + + lower_left_element = neighbor_ids[1, k] + lower_right_element = neighbor_ids[2, k] + upper_left_element = neighbor_ids[3, k] + upper_right_element = neighbor_ids[4, k] + + u_upper_left[2, i, j1, j2, k] = u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, + isequal(orientation, 1) * j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), + upper_left_element] * (2 - large_side) + + u_upper_right[2, i, j1, j2, k] = u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, + isequal(orientation, 1) * j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), + upper_right_element] * (2 - large_side) + + u_lower_left[2, i, j1, j2, k] = u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, + isequal(orientation, 1) * j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), + lower_left_element] * (2 - large_side) + + u_lower_right[2, i, j1, j2, k] = u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, + isequal(orientation, 1) * j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), + lower_right_element] * (2 - large_side) + + u_upper_left[1, i, j1, j2, k] = u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, + isequal(orientation, 1) * j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + upper_left_element] * (large_side - 1) + + u_upper_right[1, i, j1, j2, k] = u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, + isequal(orientation, 1) * j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + upper_right_element] * (large_side - 1) + + u_lower_left[1, i, j1, j2, k] = u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, + isequal(orientation, 1) * j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + lower_left_element] * (large_side - 1) + + u_lower_right[1, i, j1, j2, k] = u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1 + isequal(orientation, 3) * j1, + isequal(orientation, 1) * j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + lower_right_element] * (large_side - 1) + end + end + + return nothing +end + +# # Kernel for interpolating data large to small on mortars - step 1 +# function prolong_mortars_large2small_kernel!(tmp_upper_left, tmp_upper_right, tmp_lower_left, +# tmp_lower_right, u, forward_upper, +# forward_lower, neighbor_ids, large_sides, orientations) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(tmp_upper_left, 2) && j <= size(tmp_upper_left, 3)^2 && +# k <= size(tmp_upper_left, 5)) +# u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2) + +# j1 = div(j - 1, u2) + 1 +# j2 = rem(j - 1, u2) + 1 + +# large_side = large_sides[k] +# orientation = orientations[k] +# large_element = neighbor_ids[5, k] + +# leftright = large_side + +# @inbounds begin +# for j1j1 in axes(forward_lower, 2) +# tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * +# u[i, +# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, +# large_element] * (2 - large_side) + +# tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * +# u[i, +# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, +# large_element] * (2 - large_side) + +# tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * +# u[i, +# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, +# large_element] * (2 - large_side) + +# tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * +# u[i, +# isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, +# large_element] * (2 - large_side) +# end + +# for j1j1 in axes(forward_lower, 2) +# tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * +# u[i, +# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), +# large_element] * (large_side - 1) + +# tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * +# u[i, +# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), +# large_element] * (large_side - 1) + +# tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * +# u[i, +# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), +# large_element] * (large_side - 1) + +# tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * +# u[i, +# isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, +# isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, +# isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3), +# large_element] * (large_side - 1) +# end +# end +# end + +# return nothing +# end + +# # Kernel for interpolating data large to small on mortars - step 2 +# function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, +# u_lower_right, tmp_upper_left, tmp_upper_right, +# tmp_lower_left, tmp_lower_right, forward_upper, +# forward_lower, large_sides) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(u_upper_left, 2) && j <= size(u_upper_left, 3)^2 && +# k <= size(u_upper_left, 5)) +# u2 = size(u_upper_left, 3) # size(u_upper_left, 3) == size(u, 2) + +# j1 = div(j - 1, u2) + 1 +# j2 = rem(j - 1, u2) + 1 + +# leftright = large_sides[k] + +# @inbounds begin +# for j2j2 in axes(forward_upper, 2) +# u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * +# tmp_upper_left[leftright, i, j1, j2j2, k] + +# u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * +# tmp_upper_right[leftright, i, j1, j2j2, k] + +# u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * +# tmp_lower_left[leftright, i, j1, j2j2, k] + +# u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * +# tmp_lower_right[leftright, i, j1, j2j2, k] +# end +# end +# end + +# return nothing +# end + +# Kernel for interpolating data large to small on mortars (optimized) +function prolong_mortars_large2small_kernel!(u_upper_left, u_upper_right, u_lower_left, u_lower_right, + tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, + u, forward_upper, forward_lower, neighbor_ids, large_sides, + orientations) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + # Loop stride for each dimension + stride_x = gridDim().x * blockDim().x + stride_y = gridDim().y * blockDim().y + stride_z = gridDim().z * blockDim().z + + # Cooperative kernel needs stride loops to handle the constrained launch size + while i <= size(tmp_upper_left, 2) + while j <= size(tmp_upper_left, 3)^2 + while k <= size(tmp_upper_left, 5) + u2 = size(tmp_upper_left, 3) # size(tmp_upper_left, 3) == size(u, 2) + + j1 = div(j - 1, u2) + 1 + j2 = rem(j - 1, u2) + 1 + + @inbounds begin + large_side = large_sides[k] + orientation = orientations[k] + large_element = neighbor_ids[5, k] + end + + leftright = large_side + + for j1j1 in axes(forward_lower, 2) + @inbounds begin + tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * + u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + large_element] * (2 - large_side) + + tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * + u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + large_element] * (2 - large_side) + + tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * + u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + large_element] * (2 - large_side) + + tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * + u[i, + isequal(orientation, 1) * u2 + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) * u2 + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, 3) * u2, + large_element] * (2 - large_side) + end + end + + for j1j1 in axes(forward_lower, 2) + @inbounds begin + tmp_upper_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * + u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, + 3), + large_element] * (large_side - 1) + + tmp_upper_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * + u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, + 3), + large_element] * (large_side - 1) + + tmp_lower_left[leftright, i, j1, j2, k] += forward_lower[j1, j1j1] * + u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, + 3), + large_element] * (large_side - 1) + + tmp_lower_right[leftright, i, j1, j2, k] += forward_upper[j1, j1j1] * + u[i, + isequal(orientation, 1) + isequal(orientation, 2) * j1j1 + isequal(orientation, 3) * j1j1, + isequal(orientation, 1) * j1j1 + isequal(orientation, 2) + isequal(orientation, 3) * j2, + isequal(orientation, 1) * j2 + isequal(orientation, 2) * j2 + isequal(orientation, + 3), + large_element] * (large_side - 1) + end + end + + # Grid scope synchronization + grid = CG.this_grid() + CG.sync(grid) + + for j2j2 in axes(forward_upper, 2) + @inbounds begin + u_upper_left[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * + tmp_upper_left[leftright, i, j1, j2j2, k] + + u_upper_right[leftright, i, j1, j2, k] += forward_upper[j2, j2j2] * + tmp_upper_right[leftright, i, j1, j2j2, k] + + u_lower_left[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * + tmp_lower_left[leftright, i, j1, j2j2, k] + + u_lower_right[leftright, i, j1, j2, k] += forward_lower[j2, j2j2] * + tmp_lower_right[leftright, i, j1, j2j2, k] + end + end + k += stride_z + end + j += stride_y + end + i += stride_x + end + + return nothing +end + +# Kernel for calculating mortar fluxes +function mortar_flux_kernel!(fstar_primary_upper_left, fstar_primary_upper_right, + fstar_primary_lower_left, fstar_primary_lower_right, + fstar_secondary_upper_left, fstar_secondary_upper_right, + fstar_secondary_lower_left, fstar_seondary_lower_right, + u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, + equations::AbstractEquations{3}, surface_flux::Any) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(u_upper_left, 3) && j <= size(u_upper_left, 4) && k <= length(orientations)) + u_upper_left_ll, u_upper_left_rr = get_surface_node_vars(u_upper_left, equations, i, j, k) + u_upper_right_ll, u_upper_right_rr = get_surface_node_vars(u_upper_right, equations, i, j, k) + u_lower_left_ll, u_lower_left_rr = get_surface_node_vars(u_lower_left, equations, i, j, k) + u_lower_right_ll, u_lower_right_rr = get_surface_node_vars(u_lower_right, equations, i, j, k) + + @inbounds orientation = orientations[k] + + flux_upper_left_node = surface_flux(u_upper_left_ll, u_upper_left_rr, orientation, + equations) + flux_upper_right_node = surface_flux(u_upper_right_ll, u_upper_right_rr, orientation, + equations) + flux_lower_left_node = surface_flux(u_lower_left_ll, u_lower_left_rr, orientation, + equations) + flux_lower_right_node = surface_flux(u_lower_right_ll, u_lower_right_rr, orientation, + equations) + + for ii in axes(fstar_primary_upper_left, 1) + @inbounds begin + fstar_primary_upper_left[ii, i, j, k] = flux_upper_left_node[ii] + fstar_primary_upper_right[ii, i, j, k] = flux_upper_right_node[ii] + + fstar_primary_lower_left[ii, i, j, k] = flux_lower_left_node[ii] + fstar_primary_lower_right[ii, i, j, k] = flux_lower_right_node[ii] + + fstar_secondary_upper_left[ii, i, j, k] = flux_upper_left_node[ii] + fstar_secondary_upper_right[ii, i, j, k] = flux_upper_right_node[ii] + + fstar_secondary_lower_left[ii, i, j, k] = flux_lower_left_node[ii] + fstar_seondary_lower_right[ii, i, j, k] = flux_lower_right_node[ii] + end + end + end + + return nothing +end + +# Kernel for calculating mortar fluxes and adding nonconservative fluxes +function mortar_flux_kernel!(fstar_primary_upper_left, fstar_primary_upper_right, + fstar_primary_lower_left, fstar_primary_lower_right, + fstar_secondary_upper_left, fstar_secondary_upper_right, + fstar_secondary_lower_left, fstar_seondary_lower_right, + u_upper_left, u_upper_right, u_lower_left, u_lower_right, orientations, + large_sides, equations::AbstractEquations{3}, surface_flux::Any, + nonconservative_flux::Any) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(u_upper_left, 3) && j <= size(u_upper_left, 4) && k <= length(orientations)) + u_upper_left_ll, u_upper_left_rr = get_surface_node_vars(u_upper_left, equations, i, j, k) + u_upper_right_ll, u_upper_right_rr = get_surface_node_vars(u_upper_right, equations, i, j, k) + u_lower_left_ll, u_lower_left_rr = get_surface_node_vars(u_lower_left, equations, i, j, k) + u_lower_right_ll, u_lower_right_rr = get_surface_node_vars(u_lower_right, equations, i, j, k) + + @inbounds begin + orientation = orientations[k] + large_side = large_sides[k] + end + + flux_upper_left_node = surface_flux(u_upper_left_ll, u_upper_left_rr, orientation, + equations) + flux_upper_right_node = surface_flux(u_upper_right_ll, u_upper_right_rr, orientation, + equations) + flux_lower_left_node = surface_flux(u_lower_left_ll, u_lower_left_rr, orientation, + equations) + flux_lower_right_node = surface_flux(u_lower_right_ll, u_lower_right_rr, orientation, + equations) + + for ii in axes(fstar_primary_upper_left, 1) + @inbounds begin + fstar_primary_upper_left[ii, i, j, k] = flux_upper_left_node[ii] + fstar_primary_upper_right[ii, i, j, k] = flux_upper_right_node[ii] + + fstar_primary_lower_left[ii, i, j, k] = flux_lower_left_node[ii] + fstar_primary_lower_right[ii, i, j, k] = flux_lower_right_node[ii] + + fstar_secondary_upper_left[ii, i, j, k] = flux_upper_left_node[ii] + fstar_secondary_upper_right[ii, i, j, k] = flux_upper_right_node[ii] + + fstar_secondary_lower_left[ii, i, j, k] = flux_lower_left_node[ii] + fstar_seondary_lower_right[ii, i, j, k] = flux_lower_right_node[ii] + end + end + + u_upper_left1 = (2 - large_side) * u_upper_left_ll + (large_side - 1) * u_upper_left_rr + u_upper_left2 = (large_side - 1) * u_upper_left_ll + (2 - large_side) * u_upper_left_rr + + u_upper_right1 = (2 - large_side) * u_upper_right_ll + (large_side - 1) * u_upper_right_rr + u_upper_right2 = (large_side - 1) * u_upper_right_ll + (2 - large_side) * u_upper_right_rr + + u_lower_left1 = (2 - large_side) * u_lower_left_ll + (large_side - 1) * u_lower_left_rr + u_lower_left2 = (large_side - 1) * u_lower_left_ll + (2 - large_side) * u_lower_left_rr + + u_lower_right1 = (2 - large_side) * u_lower_right_ll + (large_side - 1) * u_lower_right_rr + u_lower_right2 = (large_side - 1) * u_lower_right_ll + (2 - large_side) * u_lower_right_rr + + noncons_flux_primary_upper_left = nonconservative_flux(u_upper_left1, u_upper_left2, + orientation, equations) + noncons_flux_primary_upper_right = nonconservative_flux(u_upper_right1, u_upper_right2, + orientation, equations) + noncons_flux_primary_lower_left = nonconservative_flux(u_lower_left1, u_lower_left2, + orientation, equations) + noncons_flux_primary_lower_right = nonconservative_flux(u_lower_right1, u_lower_right2, + orientation, equations) + noncons_flux_secondary_upper_left = nonconservative_flux(u_upper_left2, u_upper_left1, + orientation, equations) + noncons_flux_secondary_upper_right = nonconservative_flux(u_upper_right2, u_upper_right1, + orientation, equations) + noncons_flux_secondary_lower_left = nonconservative_flux(u_lower_left2, u_lower_left1, + orientation, equations) + noncons_flux_secondary_lower_right = nonconservative_flux(u_lower_right2, u_lower_right1, + orientation, equations) + + for ii in axes(fstar_primary_upper_left, 1) + @inbounds begin + fstar_primary_upper_left[ii, i, j, k] += 0.5f0 * noncons_flux_primary_upper_left[ii] + fstar_primary_upper_right[ii, i, j, k] += 0.5f0 * noncons_flux_primary_upper_right[ii] + + fstar_primary_lower_left[ii, i, j, k] += 0.5f0 * noncons_flux_primary_lower_left[ii] + fstar_primary_lower_right[ii, i, j, k] += 0.5f0 * noncons_flux_primary_lower_right[ii] + + fstar_secondary_upper_left[ii, i, j, k] += 0.5f0 * noncons_flux_secondary_upper_left[ii] + fstar_secondary_upper_right[ii, i, j, k] += 0.5f0 * noncons_flux_secondary_upper_right[ii] + + fstar_secondary_lower_left[ii, i, j, k] += 0.5f0 * noncons_flux_secondary_lower_left[ii] + fstar_seondary_lower_right[ii, i, j, k] += 0.5f0 * noncons_flux_secondary_lower_right[ii] + end + end + end + + return nothing +end + +# # Kernel for copying mortar fluxes small to small and small to large - step 1 +# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_upper_left, tmp_upper_right, +# tmp_lower_left, tmp_lower_right, +# fstar_primary_upper_left, fstar_primary_upper_right, +# fstar_primary_lower_left, fstar_primary_lower_right, +# fstar_secondary_upper_left, fstar_secondary_upper_right, +# fstar_secondary_lower_left, fstar_secondary_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && +# k <= length(orientations)) +# j1 = div(j - 1, size(surface_flux_values, 2)) + 1 +# j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + +# lower_left_element = neighbor_ids[1, k] +# lower_right_element = neighbor_ids[2, k] +# upper_left_element = neighbor_ids[3, k] +# upper_right_element = neighbor_ids[4, k] +# large_element = neighbor_ids[5, k] + +# large_side = large_sides[k] +# orientation = orientations[k] + +# # Use simple math expression to enhance the performance (against control flow), +# # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + +# # isequal(large_side, 1) * isequal(orientation, 2) * 3 + +# # isequal(large_side, 1) * isequal(orientation, 3) * 5 + +# # isequal(large_side, 2) * isequal(orientation, 1) * 2 + +# # isequal(large_side, 2) * isequal(orientation, 2) * 4 + +# # isequal(large_side, 2) * isequal(orientation, 3) * 6`. +# # Please also check the original code in Trixi.jl when you modify this code. +# direction = 2 * orientation + large_side - 2 + +# surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] +# surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] +# surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] +# surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] + +# # Use simple math expression to enhance the performance (against control flow), +# # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + +# # isequal(large_side, 1) * isequal(orientation, 2) * 4 + +# # isequal(large_side, 1) * isequal(orientation, 3) * 6 + +# # isequal(large_side, 2) * isequal(orientation, 1) * 1 + +# # isequal(large_side, 2) * isequal(orientation, 2) * 3 + +# # isequal(large_side, 2) * isequal(orientation, 3) * 5`. +# # Please also check the original code in Trixi.jl when you modify this code. +# direction = 2 * orientation - large_side + 1 + +# @inbounds begin +# for j1j1 in axes(reverse_upper, 2) +# tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * +# fstar_secondary_upper_left[i, j1j1, j2, k] +# tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * +# fstar_secondary_upper_right[i, j1j1, j2, k] +# tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * +# fstar_secondary_lower_left[i, j1j1, j2, k] +# tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * +# fstar_secondary_lower_right[i, j1j1, j2, k] +# end +# end +# end + +# return nothing +# end + +# # Kernel for copying mortar fluxes small to small and small to large - step 2 +# function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, tmp_upper_left, +# tmp_upper_right, tmp_lower_left, tmp_lower_right, +# reverse_upper, reverse_lower, neighbor_ids, large_sides, +# orientations, equations::AbstractEquations{3}) +# i = (blockIdx().x - 1) * blockDim().x + threadIdx().x +# j = (blockIdx().y - 1) * blockDim().y + threadIdx().y +# k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + +# if (i <= size(surface_flux_values, 1) && j <= size(surface_flux_values, 2)^2 && +# k <= length(orientations)) +# j1 = div(j - 1, size(surface_flux_values, 2)) + 1 +# j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + +# large_element = neighbor_ids[5, k] + +# large_side = large_sides[k] +# orientation = orientations[k] + +# # See step 1 for the explanation of the following expression +# direction = 2 * orientation - large_side + 1 + +# @inbounds begin +# for j2j2 in axes(reverse_lower, 2) +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * +# tmp_upper_left[i, j1, j2j2, +# direction, +# large_element] +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * +# tmp_upper_right[i, j1, j2j2, +# direction, +# large_element] +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * +# tmp_lower_left[i, j1, j2j2, +# direction, +# large_element] +# tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * +# tmp_lower_right[i, j1, j2j2, +# direction, +# large_element] +# end + +# surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, +# direction, +# large_element] +# end +# end + +# return nothing +# end + +# Kernel for copying mortar fluxes small to small and small to large (optimized) +function mortar_flux_copy_to_kernel!(surface_flux_values, tmp_surface_flux_values, + tmp_upper_left, tmp_upper_right, tmp_lower_left, tmp_lower_right, + fstar_primary_upper_left, fstar_primary_upper_right, + fstar_primary_lower_left, fstar_primary_lower_right, + fstar_secondary_upper_left, fstar_secondary_upper_right, + fstar_secondary_lower_left, fstar_secondary_lower_right, + reverse_upper, reverse_lower, neighbor_ids, large_sides, + orientations) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + # Loop stride for each dimension + stride_x = gridDim().x * blockDim().x + stride_y = gridDim().y * blockDim().y + stride_z = gridDim().z * blockDim().z + + # Cooperative kernel needs stride loops to handle the constrained launch size + while i <= size(surface_flux_values, 1) + while j <= size(surface_flux_values, 2)^2 + while k <= length(orientations) + j1 = div(j - 1, size(surface_flux_values, 2)) + 1 + j2 = rem(j - 1, size(surface_flux_values, 2)) + 1 + + @inbounds begin + lower_left_element = neighbor_ids[1, k] + lower_right_element = neighbor_ids[2, k] + upper_left_element = neighbor_ids[3, k] + upper_right_element = neighbor_ids[4, k] + large_element = neighbor_ids[5, k] + + large_side = large_sides[k] + orientation = orientations[k] + + # Use simple math expression to enhance the performance (against control flow), + # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 1 + + # isequal(large_side, 1) * isequal(orientation, 2) * 3 + + # isequal(large_side, 1) * isequal(orientation, 3) * 5 + + # isequal(large_side, 2) * isequal(orientation, 1) * 2 + + # isequal(large_side, 2) * isequal(orientation, 2) * 4 + + # isequal(large_side, 2) * isequal(orientation, 3) * 6`. + # Please also check the original code in Trixi.jl when you modify this code. + direction = 2 * orientation + large_side - 2 + + surface_flux_values[i, j1, j2, direction, upper_left_element] = fstar_primary_upper_left[i, j1, j2, k] + surface_flux_values[i, j1, j2, direction, upper_right_element] = fstar_primary_upper_right[i, j1, j2, k] + surface_flux_values[i, j1, j2, direction, lower_left_element] = fstar_primary_lower_left[i, j1, j2, k] + surface_flux_values[i, j1, j2, direction, lower_right_element] = fstar_primary_lower_right[i, j1, j2, k] + + # Use simple math expression to enhance the performance (against control flow), + # it is equivalent to, `isequal(large_side, 1) * isequal(orientation, 1) * 2 + + # isequal(large_side, 1) * isequal(orientation, 2) * 4 + + # isequal(large_side, 1) * isequal(orientation, 3) * 6 + + # isequal(large_side, 2) * isequal(orientation, 1) * 1 + + # isequal(large_side, 2) * isequal(orientation, 2) * 3 + + # isequal(large_side, 2) * isequal(orientation, 3) * 5`. + # Please also check the original code in Trixi.jl when you modify this code. + direction = 2 * orientation - large_side + 1 + end + + for j1j1 in axes(reverse_upper, 2) + @inbounds begin + tmp_upper_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * + fstar_secondary_upper_left[i, j1j1, j2, k] + tmp_upper_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * + fstar_secondary_upper_right[i, j1j1, j2, k] + tmp_lower_left[i, j1, j2, direction, large_element] += reverse_lower[j1, j1j1] * + fstar_secondary_lower_left[i, j1j1, j2, k] + tmp_lower_right[i, j1, j2, direction, large_element] += reverse_upper[j1, j1j1] * + fstar_secondary_lower_right[i, j1j1, j2, k] + end + end + + # Grid scope synchronization + grid = CG.this_grid() + CG.sync(grid) + + for j2j2 in axes(reverse_lower, 2) + @inbounds begin + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * + tmp_upper_left[i, j1, j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_upper[j2, j2j2] * + tmp_upper_right[i, j1, j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * + tmp_lower_left[i, j1, j2j2, + direction, + large_element] + tmp_surface_flux_values[i, j1, j2, direction, large_element] += reverse_lower[j2, j2j2] * + tmp_lower_right[i, j1, j2j2, + direction, + large_element] + end + end + + @inbounds surface_flux_values[i, j1, j2, direction, large_element] = tmp_surface_flux_values[i, j1, j2, + direction, + large_element] + k += stride_z + end + j += stride_y + end + i += stride_x + end + + return nothing +end + +# Kernel for calculating surface integrals +function surface_integral_kernel!(du, factor_arr, surface_flux_values, + equations::AbstractEquations{3}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) + u2 = size(du, 2) # size(du, 2) == size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + @inbounds begin + du[i, j1, j2, j3, k] -= (surface_flux_values[i, j2, j3, 1, k] * isequal(j1, 1) + + surface_flux_values[i, j1, j3, 3, k] * isequal(j2, 1) + + surface_flux_values[i, j1, j2, 5, k] * isequal(j3, 1)) * + factor_arr[1] + du[i, j1, j2, j3, k] += (surface_flux_values[i, j2, j3, 2, k] * isequal(j1, u2) + + surface_flux_values[i, j1, j3, 4, k] * isequal(j2, u2) + + surface_flux_values[i, j1, j2, 6, k] * isequal(j3, u2)) * + factor_arr[2] + end + end + + return nothing +end + +# Kernel for applying inverse Jacobian +function jacobian_kernel!(du, inverse_jacobian, equations::AbstractEquations{3}) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + j = (blockIdx().y - 1) * blockDim().y + threadIdx().y + k = (blockIdx().z - 1) * blockDim().z + threadIdx().z + + if (i <= size(du, 1) && j <= size(du, 2)^3 && k <= size(du, 5)) + u2 = size(du, 2) # size(du, 2) == size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + @inbounds du[i, j1, j2, j3, k] *= -inverse_jacobian[k] + end + + return nothing +end + +# Kernel for calculating source terms +function source_terms_kernel!(du, u, node_coordinates, t, equations::AbstractEquations{3}, + source_terms::Any) + j = (blockIdx().x - 1) * blockDim().x + threadIdx().x + k = (blockIdx().y - 1) * blockDim().y + threadIdx().y + + if (j <= size(du, 2)^3 && k <= size(du, 5)) + u2 = size(u, 2) # size(du, 2) == size(u, 2) + + j1 = div(j - 1, u2^2) + 1 + j2 = div(rem(j - 1, u2^2), u2) + 1 + j3 = rem(rem(j - 1, u2^2), u2) + 1 + + u_local = get_node_vars(u, equations, j1, j2, j3, k) + x_local = get_node_coords(node_coordinates, equations, j1, j2, j3, k) + + source_terms_node = source_terms(u_local, x_local, t, equations) + + for ii in axes(du, 1) + @inbounds du[ii, j1, j2, j3, k] += source_terms_node[ii] + end + end + + return nothing +end