Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
fedc44e
Add support for tiling in RedistributeGPU
atmyers Apr 3, 2026
936c109
remove unused
atmyers Apr 3, 2026
60c4106
add [[nodiscard]]
atmyers Apr 3, 2026
97d1f75
fix nodiscard
atmyers Apr 3, 2026
f469e7b
add test with tiling
atmyers Apr 4, 2026
0ace095
Make cpu execution go through RedistributeGPU as well and rename the …
atmyers Apr 7, 2026
f2fec69
OpenMP-ize Redistribute_impl
atmyers Apr 7, 2026
39f8bbd
auto -> auto*
atmyers Apr 7, 2026
0da1ea7
fix particle initialization in ParallelContext test
atmyers Apr 7, 2026
7eb9218
fix narrowing
atmyers Apr 7, 2026
c2c9e6f
OMP another loop
atmyers Apr 8, 2026
3108286
don't need to partition when building on CPU
atmyers Apr 8, 2026
30f019e
use one-pass instead of partitioning for the CPU path
atmyers Apr 8, 2026
3174abf
fix ptd redefinition
atmyers Apr 8, 2026
090f35e
openmp for cpu path
atmyers Apr 10, 2026
164032c
use data duplication strategy for OpenMP
atmyers Apr 10, 2026
f6a0ae3
parallelize inner loop too
atmyers Apr 10, 2026
0d3f089
push openmp up to tiling level for ParticleCopyPlan::build
atmyers Apr 10, 2026
2abd4ae
move omp threading up a level in unpackBuffer
atmyers Apr 10, 2026
7ee2168
push omp work up a level in unpackBuffer
atmyers Apr 10, 2026
a21b752
Don't need ParallelForOMP anymore
atmyers Apr 10, 2026
cb5a2e8
remove another ParallelForOMP
atmyers Apr 10, 2026
a5c63ee
don't do omp setup work unless needed
atmyers Apr 13, 2026
e891205
remove timers
atmyers Apr 13, 2026
bdc8584
use a helper function rather than a lambda
atmyers Apr 13, 2026
fec8362
fix unused
atmyers Apr 13, 2026
c401070
remove another unused
atmyers Apr 13, 2026
630dc36
more unused
atmyers Apr 13, 2026
d036975
only build the needed metadata for CPU/GPU path
atmyers Apr 14, 2026
a46447c
Refactor ParticleCopyPlan::build for clarity
atmyers Apr 14, 2026
dc3b3e2
Merge branch 'development' into redist_gpu_tiling
atmyers Apr 14, 2026
7e9dfec
fix tabs
atmyers Apr 14, 2026
d7f5074
fix whitespace
atmyers Apr 14, 2026
ba2c404
fix forwarding warning
atmyers Apr 14, 2026
ae00215
Don't narrow int -> bool for the local variable
atmyers Apr 15, 2026
5d37a2f
Make sure to reset m_rcv_box_tids
atmyers Apr 15, 2026
e381baa
add back in postLocate for FHDeX
atmyers Apr 15, 2026
10ff0f6
Remove old RedistributeCPU
atmyers Apr 15, 2026
1d8ba40
add another short-circuit to speed up the serial algorithm
atmyers Apr 15, 2026
bb1551c
tabs
atmyers Apr 15, 2026
a8f0e81
don't take early exit on non-local redistribute
atmyers Apr 16, 2026
18bfd6e
Fix the short circuit logic
atmyers Apr 16, 2026
b98c864
tweak check
atmyers Apr 16, 2026
426e741
move assignment out of condition
atmyers Apr 16, 2026
93329eb
fix stray )
atmyers Apr 16, 2026
3870c0d
Assert that tiling is off for neighbor particles on the GPU
atmyers Apr 21, 2026
f2b2b94
reduce duplication
atmyers Apr 21, 2026
0e8877d
fix assert
atmyers Apr 21, 2026
06612bf
fix assert
atmyers Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions Src/Base/AMReX_GpuAtomic.H
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ namespace Gpu::Atomic {
// For LogicalOr and LogicalAnd, the data type is int.
// For Exch and CAS, the data type is generic.
// All these functions are non-atomic in host code!!!
// If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only
// HostDevice::Atomic::Add is supported. We could certainly add more.
// If one needs them to be atomic in host code, use HostDevice::Atomic::*.
// If we add more types for atomicAdd, we also need to update HasAtomicAdd in AMReX_TypeTraits.H.

/// \cond DOXYGEN_IGNORE
Expand Down Expand Up @@ -617,6 +616,21 @@ namespace HostDevice::Atomic {
*sum += value;
}

// Atomically add `value` to `*sum` on the host and return the value that
// `*sum` held before the addition (fetch-and-add).
// The read and the update are paired atomically only when compiled with
// OpenMP, via `omp atomic capture`; in a serial build this is a plain
// (non-atomic) read-modify-write, consistent with the other host atomics
// in this namespace.
template <class T>
AMREX_FORCE_INLINE
T FetchAdd_Host (T* const sum, T const value) noexcept
{
    T old;
#ifdef AMREX_USE_OMP
#pragma omp atomic capture
#endif
    {
        old = *sum;   // capture the pre-update value
        *sum += value;
    }
    return old;
}

template <class T>
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
void Add (T* const sum, T const value) noexcept
Expand All @@ -625,6 +639,14 @@ namespace HostDevice::Atomic {
AMREX_IF_ON_HOST((Add_Host(sum,value);))
}

// Fetch-and-add usable in both host and device code: adds `value` to `*sum`
// and returns the previous value of `*sum`.
// Device builds forward to Gpu::Atomic::Add (atomic); host builds forward to
// FetchAdd_Host (atomic only when OpenMP is enabled — see note above).
template <class T>
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
T FetchAdd (T* const sum, T const value) noexcept
{
    AMREX_IF_ON_DEVICE((return Gpu::Atomic::Add(sum,value);))
    AMREX_IF_ON_HOST((return FetchAdd_Host(sum,value);))
}

}

#ifdef AMREX_USE_GPU
Expand Down
1 change: 1 addition & 0 deletions Src/Particle/AMReX_NeighborParticles.H
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ namespace amrex {
// Describes one neighbor grid a particle may be copied to/from.
struct NeighborCode
{
    int grid_id;             // index of the neighbor grid in the BoxArray
    Box grid_box;            // box of that grid (ba[grid_id]); used when computing a tile index
    IntVect periodic_shift;  // periodic shift associated with this neighbor grid
};

Expand Down
21 changes: 16 additions & 5 deletions Src/Particle/AMReX_NeighborParticlesGPUImpl.H
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ buildNeighborMask ()
{
NeighborCode code;
code.grid_id = nbor_grid.grid_id;
code.grid_box = ba[nbor_grid.grid_id];
code.periodic_shift = nbor_grid.periodic_shift;
h_code_arr.push_back(code);
h_isec_boxes.push_back(nbor_grid.box);
Expand Down Expand Up @@ -170,6 +171,8 @@ buildNeighborCopyOp (bool use_boundary_neighbor)
auto p_code_array = m_code_array[gid].dataPtr();
auto p_isec_boxes = m_isec_boxes[gid].dataPtr();
const int nisec_box = m_isec_boxes[gid].size();
const bool do_tiling = this->do_tiling;
const IntVect tile_size = this->tile_size;
// auto p_code_offsets = m_code_offsets[gid].dataPtr();

AMREX_FOR_1D ( np, i,
Expand All @@ -194,12 +197,14 @@ buildNeighborCopyOp (bool use_boundary_neighbor)
Gpu::dtoh_memcpy_async(&num_copies, offsets.data()+np, sizeof(int));
Gpu::streamSynchronize();

neighbor_copy_op.resize(gid, lev, num_copies);
neighbor_copy_op.resize(gid, tid, lev, num_copies);

auto p_boxes = neighbor_copy_op.m_boxes[lev][gid].dataPtr();
auto p_levs = neighbor_copy_op.m_levels[lev][gid].dataPtr();
auto p_src_indices = neighbor_copy_op.m_src_indices[lev][gid].dataPtr();
auto p_periodic_shift = neighbor_copy_op.m_periodic_shift[lev][gid].dataPtr();
auto tile_index = std::make_pair(gid, tid);
auto p_boxes = neighbor_copy_op.m_boxes[lev][tile_index].dataPtr();
auto p_levs = neighbor_copy_op.m_levels[lev][tile_index].dataPtr();
auto p_tiles = neighbor_copy_op.m_tiles[lev][tile_index].dataPtr();
auto p_src_indices = neighbor_copy_op.m_src_indices[lev][tile_index].dataPtr();
auto p_periodic_shift = neighbor_copy_op.m_periodic_shift[lev][tile_index].dataPtr();

Gpu::streamSynchronize();
AMREX_FOR_1D ( np, i,
Expand All @@ -213,6 +218,9 @@ buildNeighborCopyOp (bool use_boundary_neighbor)
for (int j=0; j<nisec_box; ++j) {
if (p_isec_boxes[j].contains(iv)) {
p_boxes[k] = p_code_array[j].grid_id;
Box tbx;
p_tiles[k] = getTileIndex(iv, p_code_array[j].grid_box,
do_tiling, tile_size, tbx);
p_levs[k] = 0;
p_periodic_shift[k] = p_code_array[j].periodic_shift;
p_src_indices[k] = pid;
Expand All @@ -235,6 +243,9 @@ fillNeighborsGPU ()

AMREX_ASSERT(numParticlesOutOfRange(*this, 0) == 0);

AMREX_ALWAYS_ASSERT_WITH_MESSAGE(this->do_tiling == 0,
"Tiling on the GPU is not supported for neighbor particles.");

buildNeighborMask();
this->defineBufferMap();

Expand Down
83 changes: 61 additions & 22 deletions Src/Particle/AMReX_ParticleBufferMap.H
Original file line number Diff line number Diff line change
Expand Up @@ -13,39 +13,45 @@ namespace amrex {
// Device-copyable functor mapping (level, grid, tile) to the processor id
// (MPI rank) owning the corresponding communication bucket.
//
// Defect fixed: the scraped diff retained both the pre-tiling lines
// (member `m_lev_gid_to_bucket`, the two-argument constructor and
// operator()) and their tiled replacements, producing duplicate
// declarations that cannot compile. Only the post-change (tiled) version
// is kept here.
//
// Lookup is a two-level flattening: m_lev_offsets[lev] + gid selects the
// grid entry; m_gid_offsets maps that entry to the first tile slot of the
// grid; adding tid selects the tile's bucket, which m_bucket_to_pid then
// translates to a rank.
struct GetPID
{
    const int* m_bucket_to_pid;          // bucket -> owning rank
    const int* m_lev_gid_tid_to_bucket;  // flattened (lev,gid,tid) -> bucket
    const int* m_lev_offsets;            // per-level offsets into the grid table
    const int* m_gid_offsets;            // per-grid offsets into the tile table

    GetPID (const Gpu::DeviceVector<int>& bucket_to_pid,
            const Gpu::DeviceVector<int>& lev_gid_tid_to_bucket,
            const Gpu::DeviceVector<int>& lev_offsets,
            const Gpu::DeviceVector<int>& gid_offsets)
        : m_bucket_to_pid(bucket_to_pid.dataPtr()),
          m_lev_gid_tid_to_bucket(lev_gid_tid_to_bucket.dataPtr()),
          m_lev_offsets(lev_offsets.dataPtr()),
          m_gid_offsets(gid_offsets.dataPtr())
    {}

    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    int operator() (const int lev, const int gid, const int tid) const noexcept
    {
        return m_bucket_to_pid[m_lev_gid_tid_to_bucket[m_gid_offsets[m_lev_offsets[lev]+gid] + tid]];
    }
};

// Device-copyable functor mapping (level, grid, tile) to its
// communication-bucket index.
//
// Defect fixed: the scraped diff retained both the pre-tiling lines
// (member `m_lev_gid_to_bucket`, the two-pointer constructor and the
// two-argument operator()) and their tiled replacements, producing
// duplicate declarations that cannot compile. Only the post-change
// (tiled) version is kept here.
struct GetBucket
{
    const int* m_lev_gid_tid_to_bucket;  // flattened (lev,gid,tid) -> bucket
    const int* m_lev_offsets;            // per-level offsets into the grid table
    const int* m_gid_offsets;            // per-grid offsets into the tile table

    GetBucket (const int* lev_gid_tid_to_bucket_ptr,
               const int* lev_offsets_ptr,
               const int* gid_offsets_ptr)
        : m_lev_gid_tid_to_bucket(lev_gid_tid_to_bucket_ptr),
          m_lev_offsets(lev_offsets_ptr),
          m_gid_offsets(gid_offsets_ptr)
    {}

    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
    int operator() (const int lev, const int gid, const int tid) const noexcept
    {
        return m_lev_gid_tid_to_bucket[m_gid_offsets[m_lev_offsets[lev]+gid] + tid];
    }
};

Expand All @@ -56,28 +62,40 @@ class ParticleBufferMap
Vector<DistributionMapping> m_dm;

Vector<int> m_bucket_to_gid;
Vector<int> m_bucket_to_tid;
Vector<int> m_bucket_to_lev;
Vector<int> m_bucket_to_pid;

Vector<int> m_lev_gid_to_bucket;
Vector<int> m_lev_gid_tid_to_bucket;
Vector<int> m_lev_offsets;
Vector<int> m_gid_offsets;

Vector<int> m_proc_box_counts;
Vector<int> m_proc_box_offsets;

bool m_do_tiling{false};
IntVect m_tile_size{AMREX_D_DECL(1024000, 1024000, 1024000)};

Gpu::DeviceVector<int> d_bucket_to_pid;
Gpu::DeviceVector<int> d_lev_gid_to_bucket;
Gpu::DeviceVector<int> d_lev_gid_tid_to_bucket;
Gpu::DeviceVector<int> d_lev_offsets;
Gpu::DeviceVector<int> d_gid_offsets;

public:
ParticleBufferMap () = default;

ParticleBufferMap (const ParGDBBase* a_gdb);

ParticleBufferMap (const ParGDBBase* a_gdb, bool a_do_tiling, const IntVect& a_tile_size);

void define (const ParGDBBase* a_gdb);

void define (const ParGDBBase* a_gdb, bool a_do_tiling, const IntVect& a_tile_size);

bool isValid (const ParGDBBase* a_gdb) const;

bool isValid (const ParGDBBase* a_gdb, bool a_do_tiling, const IntVect& a_tile_size) const;

[[nodiscard]] AMREX_FORCE_INLINE
int numLevels () const
{
Expand All @@ -99,6 +117,13 @@ public:
return m_bucket_to_gid[bid];
}

// Return the local tile index associated with communication bucket `bid`.
[[nodiscard]] AMREX_FORCE_INLINE
int bucketToTile (int bid) const
{
    AMREX_ASSERT(m_defined);
    return m_bucket_to_tid[bid];
}

[[nodiscard]] AMREX_FORCE_INLINE
int bucketToLevel (int bid) const
{
Expand All @@ -115,9 +140,16 @@ public:

// Return the bucket for (grid, level) when tiling is not in use.
// Forwards to the tiled lookup with tile index 0; asserts the map was
// built without tiling, since a (gid, lev) pair alone is ambiguous when
// a grid is split into multiple tiles.
[[nodiscard]] AMREX_FORCE_INLINE
int gridAndLevToBucket (int gid, int lev) const
{
    AMREX_ASSERT(!m_do_tiling);
    return gridAndTileAndLevToBucket(gid, 0, lev);
}

// Return the communication bucket for (grid, tile, level).
// m_lev_offsets[lev] + gid picks the grid entry; m_gid_offsets maps that
// entry to the first tile slot of the grid; adding tid selects the tile.
//
// Defect fixed: the scraped diff retained the removed pre-tiling return
// statement (`return m_lev_gid_to_bucket[m_lev_offsets[lev] + gid];`)
// ahead of the new one — two consecutive returns, the first referencing a
// member that no longer exists. Only the tiled lookup is kept.
[[nodiscard]] AMREX_FORCE_INLINE
int gridAndTileAndLevToBucket (int gid, int tid, int lev) const
{
    AMREX_ASSERT(m_defined);
    return m_lev_gid_tid_to_bucket[m_gid_offsets[m_lev_offsets[lev] + gid] + tid];
}

[[nodiscard]] AMREX_FORCE_INLINE
Expand Down Expand Up @@ -148,14 +180,21 @@ public:

// Return the processor id (rank) owning (grid, level) in the non-tiled
// case. Asserts tiling is off and forwards with tile index 0.
[[nodiscard]] AMREX_FORCE_INLINE
int procID (int gid, int lev) const
{
    AMREX_ASSERT(!m_do_tiling);
    return procID(gid, 0, lev);
}

// Return the processor id (rank) owning (grid, tile, level), looked up
// through the bucket tables so the result stays consistent with the
// (possibly tiled) bucketing.
//
// Defect fixed: the scraped diff retained the removed pre-tiling body
// (`return m_dm[lev][gid];`) ahead of the new return, leaving two
// consecutive return statements. Only the bucket-based lookup is kept.
[[nodiscard]] AMREX_FORCE_INLINE
int procID (int gid, int tid, int lev) const
{
    AMREX_ASSERT(m_defined);
    return m_bucket_to_pid[gridAndTileAndLevToBucket(gid, tid, lev)];
}

// Factories for the lookup functors built from this map's flattened
// (lev,gid,tid) -> bucket tables; the first two hand out device pointers,
// the last hands out host pointers.
//
// Defect fixed: the scraped diff retained the removed pre-tiling getters
// (referencing `d_lev_gid_to_bucket` / `m_lev_gid_to_bucket`, members that
// no longer exist) alongside their replacements — duplicate definitions of
// the same three functions. Only the post-change getters are kept.
[[nodiscard]] GetPID getPIDFunctor () const noexcept { return GetPID(d_bucket_to_pid, d_lev_gid_tid_to_bucket, d_lev_offsets, d_gid_offsets);}
[[nodiscard]] GetBucket getBucketFunctor () const noexcept { return GetBucket(d_lev_gid_tid_to_bucket.data(), d_lev_offsets.data(), d_gid_offsets.data());}
[[nodiscard]] GetBucket getHostBucketFunctor () const noexcept { return GetBucket(m_lev_gid_tid_to_bucket.data(), m_lev_offsets.data(), m_gid_offsets.data());}

};

Expand Down
Loading
Loading