Skip to content

Commit

Permalink
Small optimization patch for flux differencing kernels (#122)
Browse files Browse the repository at this point in the history
  • Loading branch information
huiyuxie authored Jan 24, 2025
1 parent e4aa19d commit f29d3d4
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 36 deletions.
15 changes: 4 additions & 11 deletions src/solvers/dg_1d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -249,9 +249,6 @@ function volume_flux_integral_kernel!(du, u, derivative_split,
# Allocate dynamic shared memory
shmem_split = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width))
offset += sizeof(eltype(du)) * tile_width^2
shmem_szero = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width),
offset)
offset += sizeof(eltype(du)) * tile_width^2
shmem_value = @cuDynamicSharedMem(eltype(du), (size(du, 1), tile_width),
offset)

Expand All @@ -268,12 +265,7 @@ function volume_flux_integral_kernel!(du, u, derivative_split,
# Load data from global memory into shared memory
for ty2 in axes(du, 2)
# Transposed access
# TODO: Combine into single shared memory
@inbounds begin
shmem_split[ty2, ty] = derivative_split[ty, ty2]
shmem_szero[ty2, ty] = derivative_split[ty, ty2] *
(1 - isequal(ty, ty2)) # set diagonal elements to zeros
end
@inbounds shmem_split[ty2, ty] = derivative_split[ty, ty2]
end

# Synchronization is not needed here given the access pattern
Expand All @@ -294,7 +286,8 @@ function volume_flux_integral_kernel!(du, u, derivative_split,

# TODO: Avoid potential bank conflicts
for tx in axes(du, 1)
@inbounds shmem_value[tx, ty] += symmetric_flux_node[tx] * shmem_szero[thread, ty] +
@inbounds shmem_value[tx, ty] += symmetric_flux_node[tx] * shmem_split[thread, ty] *
(1 - isequal(ty, thread)) + # set diagonal elements to zeros
0.5f0 *
noncons_flux_node[tx] * shmem_split[thread, ty]
end
Expand Down Expand Up @@ -945,7 +938,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{1}, nonconservative_terms::
volume_integral_kernel(du, derivative_split, symmetric_flux_arr, noncons_flux_arr;
kernel_configurator_3d(volume_integral_kernel, size(du)...)...)
else
shmem_size = (size(du, 2)^2 * 2 + size(du, 1) * size(du, 2)) * sizeof(RealT)
shmem_size = (size(du, 2)^2 + size(du, 1) * size(du, 2)) * sizeof(RealT)
threads = (1, size(du, 2), 1)
blocks = (1, 1, size(du, 3))
@cuda threads=threads blocks=blocks shmem=shmem_size volume_flux_integral_kernel!(du, u,
Expand Down
18 changes: 6 additions & 12 deletions src/solvers/dg_2d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -307,9 +307,6 @@ function volume_flux_integral_kernel!(du, u, derivative_split,
# Allocate dynamic shared memory
shmem_split = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width))
offset += sizeof(eltype(du)) * tile_width^2
shmem_szero = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width),
offset)
offset += sizeof(eltype(du)) * tile_width^2
shmem_value = @cuDynamicSharedMem(eltype(du), (size(du, 1), tile_width, tile_width),
offset)

Expand All @@ -327,12 +324,7 @@ function volume_flux_integral_kernel!(du, u, derivative_split,

# Load data from global memory into shared memory
# Transposed access
# TODO: Combine into single shared memory
@inbounds begin
shmem_split[ty2, ty1] = derivative_split[ty1, ty2]
shmem_szero[ty2, ty1] = derivative_split[ty1, ty2] *
(1 - isequal(ty1, ty2)) # set diagonal elements to zeros
end
@inbounds shmem_split[ty2, ty1] = derivative_split[ty1, ty2]

sync_threads()

Expand All @@ -357,8 +349,10 @@ function volume_flux_integral_kernel!(du, u, derivative_split,

# TODO: Avoid potential bank conflicts
for tx in axes(du, 1)
@inbounds shmem_value[tx, ty1, ty2] += symmetric_flux_node1[tx] * shmem_szero[thread, ty1] +
symmetric_flux_node2[tx] * shmem_szero[thread, ty2] +
@inbounds shmem_value[tx, ty1, ty2] += symmetric_flux_node1[tx] * shmem_split[thread, ty1] *
(1 - isequal(ty1, thread)) + # set diagonal elements to zeros
symmetric_flux_node2[tx] * shmem_split[thread, ty2] *
(1 - isequal(ty2, thread)) + # set diagonal elements to zeros
0.5f0 *
noncons_flux_node1[tx] * shmem_split[thread, ty1] +
0.5f0 *
Expand Down Expand Up @@ -1464,7 +1458,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{2}, nonconservative_terms::
kernel_configurator_3d(volume_integral_kernel, size(du, 1),
size(du, 2)^2, size(du, 4))...)
else
shmem_size = (size(du, 2)^2 * 2 + size(du, 1) * size(du, 2)^2) * sizeof(RealT)
shmem_size = (size(du, 2)^2 + size(du, 1) * size(du, 2)^2) * sizeof(RealT)
threads = (1, size(du, 2)^2, 1)
blocks = (1, 1, size(du, 4))
@cuda threads=threads blocks=blocks shmem=shmem_size volume_flux_integral_kernel!(du, u,
Expand Down
21 changes: 8 additions & 13 deletions src/solvers/dg_3d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -349,9 +349,6 @@ function volume_flux_integral_kernel!(du, u, derivative_split,
# Allocate dynamic shared memory
shmem_split = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width))
offset += sizeof(eltype(du)) * tile_width^2
shmem_szero = @cuDynamicSharedMem(eltype(du), (tile_width, tile_width),
offset)
offset += sizeof(eltype(du)) * tile_width^2
shmem_value = @cuDynamicSharedMem(eltype(du), (size(du, 1), tile_width, tile_width, tile_width),
offset)

Expand All @@ -370,12 +367,7 @@ function volume_flux_integral_kernel!(du, u, derivative_split,

# Load data from global memory into shared memory
# Transposed access
# TODO: Combine into single shared memory
@inbounds begin
shmem_split[ty2, ty1] = derivative_split[ty1, ty2]
shmem_szero[ty2, ty1] = derivative_split[ty1, ty2] *
(1 - isequal(ty1, ty2)) # set diagonal elements to zeros
end
@inbounds shmem_split[ty2, ty1] = derivative_split[ty1, ty2]

sync_threads()

Expand Down Expand Up @@ -406,9 +398,12 @@ function volume_flux_integral_kernel!(du, u, derivative_split,

# TODO: Avoid potential bank conflicts
for tx in axes(du, 1)
@inbounds shmem_value[tx, ty1, ty2, ty3] += symmetric_flux_node1[tx] * shmem_szero[thread, ty1] +
symmetric_flux_node2[tx] * shmem_szero[thread, ty2] +
symmetric_flux_node3[tx] * shmem_szero[thread, ty3] +
@inbounds shmem_value[tx, ty1, ty2, ty3] += symmetric_flux_node1[tx] * shmem_split[thread, ty1] *
(1 - isequal(ty1, thread)) + # set diagonal elements to zeros
symmetric_flux_node2[tx] * shmem_split[thread, ty2] *
(1 - isequal(ty2, thread)) + # set diagonal elements to zeros
symmetric_flux_node3[tx] * shmem_split[thread, ty3] *
(1 - isequal(ty3, thread)) + # set diagonal elements to zeros
0.5f0 *
noncons_flux_node1[tx] * shmem_split[thread, ty1] +
0.5f0 *
Expand Down Expand Up @@ -2098,7 +2093,7 @@ function cuda_volume_integral!(du, u, mesh::TreeMesh{3}, nonconservative_terms::
kernel_configurator_3d(volume_integral_kernel, size(du, 1),
size(du, 2)^3, size(du, 5))...)
else
shmem_size = (size(du, 2)^2 * 2 + size(du, 1) * size(du, 2)^3) * sizeof(RealT)
shmem_size = (size(du, 2)^2 + size(du, 1) * size(du, 2)^3) * sizeof(RealT)
threads = (1, size(du, 2)^3, 1)
blocks = (1, 1, size(du, 5))
@cuda threads=threads blocks=blocks shmem=shmem_size volume_flux_integral_kernel!(du, u,
Expand Down

0 comments on commit f29d3d4

Please sign in to comment.